diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { - "best_metric": 0.6068801160501502, - "best_model_checkpoint": "/m/triton/scratch/elec/puhe/p/palp3/MUCS/mucs_language_segregated_data/trainwithtags_warmup1500_s300_shuff100/output/checkpoint-1000", - "epoch": 5.6, - "eval_steps": 100, - "global_step": 3500, + "best_metric": 0.6194798466480157, + "best_model_checkpoint": "/m/triton/scratch/elec/puhe/p/palp3/MUCS/mucs_language_segregated_data/trainwithtags_warmup2000_s300_shuff100/output/checkpoint-1000", + "epoch": 9.6, + "eval_steps": 1000, + "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -17,24862 +17,42072 @@ }, { "epoch": 0.0032, - "grad_norm": 19.480770111083984, - "learning_rate": 3.9999999999999993e-07, + "grad_norm": 19.478151321411133, + "learning_rate": 3e-07, "loss": 28.9967, "step": 2 }, { "epoch": 0.0048, - "grad_norm": 21.03929901123047, - "learning_rate": 7.999999999999999e-07, + "grad_norm": 21.032878875732422, + "learning_rate": 6e-07, "loss": 25.1809, "step": 3 }, { "epoch": 0.0064, - "grad_norm": 13.241178512573242, - "learning_rate": 1.2e-06, + "grad_norm": 13.343757629394531, + "learning_rate": 9e-07, "loss": 20.6383, "step": 4 }, { "epoch": 0.008, "grad_norm": NaN, - "learning_rate": 1.2e-06, - "loss": 19.1797, + "learning_rate": 9e-07, + "loss": 19.1799, "step": 5 }, { "epoch": 0.0096, - "grad_norm": 13.427491188049316, - "learning_rate": 1.5999999999999997e-06, + "grad_norm": 12.849465370178223, + "learning_rate": 1.2e-06, "loss": 20.0215, "step": 6 }, { "epoch": 0.0112, - "grad_norm": 13.29790210723877, - "learning_rate": 2e-06, - "loss": 20.0473, + "grad_norm": 13.198436737060547, + "learning_rate": 1.4999999999999998e-06, + "loss": 20.0485, "step": 7 }, { "epoch": 0.0128, - "grad_norm": 11.687248229980469, - "learning_rate": 2.4e-06, - "loss": 18.6201, + "grad_norm": 11.687944412231445, + "learning_rate": 1.8e-06, + "loss": 18.6217, "step": 8 }, { "epoch": 0.0144, - "grad_norm": 10.248677253723145, - "learning_rate": 2.8e-06, - "loss": 16.4545, + "grad_norm": 10.567115783691406, + "learning_rate": 2.1e-06, + "loss": 16.4561, "step": 9 }, { "epoch": 0.016, - "grad_norm": 10.466910362243652, - "learning_rate": 3.1999999999999994e-06, - "loss": 16.9539, + "grad_norm": 10.45132064819336, + "learning_rate": 2.4e-06, + "loss": 16.9583, "step": 10 }, { "epoch": 0.0176, - "grad_norm": 13.025243759155273, - "learning_rate": 3.6e-06, - "loss": 17.2907, + "grad_norm": 12.070805549621582, + "learning_rate": 2.6999999999999996e-06, + "loss": 17.2946, "step": 11 }, { "epoch": 0.0192, - "grad_norm": 10.613122940063477, - "learning_rate": 4e-06, - "loss": 17.1058, + "grad_norm": 10.647071838378906, + "learning_rate": 2.9999999999999997e-06, + "loss": 17.1101, "step": 12 }, { "epoch": 0.0208, - "grad_norm": 9.954842567443848, - "learning_rate": 4.399999999999999e-06, - "loss": 16.4296, + "grad_norm": 9.95898723602295, + "learning_rate": 3.2999999999999993e-06, + "loss": 16.4421, "step": 13 }, { "epoch": 0.0224, - "grad_norm": 11.088004112243652, - "learning_rate": 4.8e-06, - "loss": 17.5867, + "grad_norm": 11.099569320678711, + "learning_rate": 3.6e-06, + "loss": 17.5943, "step": 14 }, { "epoch": 0.024, - "grad_norm": 10.448060989379883, - "learning_rate": 5.199999999999999e-06, - "loss": 16.871, + "grad_norm": 10.42167854309082, + "learning_rate": 3.899999999999999e-06, + "loss": 16.8807, "step": 15 }, { "epoch": 0.0256, - "grad_norm": 8.993600845336914, - "learning_rate": 5.6e-06, - "loss": 14.601, + "grad_norm": 9.058366775512695, + "learning_rate": 4.2e-06, + "loss": 14.6257, "step": 16 }, { "epoch": 0.0272, - "grad_norm": 8.905482292175293, - "learning_rate": 5.999999999999999e-06, - "loss": 14.0951, + "grad_norm": 9.066213607788086, + "learning_rate": 4.499999999999999e-06, + "loss": 14.1237, "step": 17 }, { "epoch": 0.0288, - "grad_norm": 9.374123573303223, - "learning_rate": 6.399999999999999e-06, - "loss": 15.2176, + "grad_norm": 9.31420612335205, + "learning_rate": 4.8e-06, + "loss": 15.2356, "step": 18 }, { "epoch": 0.0304, - "grad_norm": 19.181642532348633, - "learning_rate": 6.8e-06, - "loss": 15.4023, + "grad_norm": 10.176087379455566, + "learning_rate": 5.1e-06, + "loss": 15.4251, "step": 19 }, { "epoch": 0.032, - "grad_norm": 8.2308988571167, - "learning_rate": 7.2e-06, - "loss": 13.7388, + "grad_norm": 8.170927047729492, + "learning_rate": 5.399999999999999e-06, + "loss": 13.76, "step": 20 }, { "epoch": 0.0336, - "grad_norm": 8.79343032836914, - "learning_rate": 7.599999999999999e-06, - "loss": 14.4443, + "grad_norm": 8.670867919921875, + "learning_rate": 5.7e-06, + "loss": 14.4806, "step": 21 }, { "epoch": 0.0352, - "grad_norm": 11.329385757446289, - "learning_rate": 8e-06, - "loss": 16.5094, + "grad_norm": 11.31945514678955, + "learning_rate": 5.999999999999999e-06, + "loss": 16.5971, "step": 22 }, { "epoch": 0.0368, - "grad_norm": 9.5508451461792, - "learning_rate": 8.4e-06, - "loss": 14.956, + "grad_norm": 9.18188762664795, + "learning_rate": 6.3e-06, + "loss": 15.0373, "step": 23 }, { "epoch": 0.0384, - "grad_norm": 9.614250183105469, - "learning_rate": 8.799999999999999e-06, - "loss": 14.7454, + "grad_norm": 9.432085990905762, + "learning_rate": 6.599999999999999e-06, + "loss": 14.792, "step": 24 }, { "epoch": 0.04, - "grad_norm": 9.611518859863281, - "learning_rate": 9.199999999999998e-06, - "loss": 14.8943, + "grad_norm": 9.444454193115234, + "learning_rate": 6.899999999999999e-06, + "loss": 14.9425, "step": 25 }, { "epoch": 0.0416, - "grad_norm": 10.2857084274292, - "learning_rate": 9.6e-06, - "loss": 15.5331, + "grad_norm": 10.104069709777832, + "learning_rate": 7.2e-06, + "loss": 15.5962, "step": 26 }, { "epoch": 0.0432, - "grad_norm": 8.22513198852539, - "learning_rate": 9.999999999999999e-06, - "loss": 13.0081, + "grad_norm": 8.144676208496094, + "learning_rate": 7.499999999999999e-06, + "loss": 13.0629, "step": 27 }, { "epoch": 0.0448, - "grad_norm": 11.55008602142334, - "learning_rate": 1.0399999999999999e-05, - "loss": 16.8155, + "grad_norm": 11.250031471252441, + "learning_rate": 7.799999999999998e-06, + "loss": 16.8965, "step": 28 }, { "epoch": 0.0464, - "grad_norm": 9.69379711151123, - "learning_rate": 1.0799999999999998e-05, - "loss": 14.1567, + "grad_norm": 9.27499008178711, + "learning_rate": 8.099999999999999e-06, + "loss": 14.2719, "step": 29 }, { "epoch": 0.048, - "grad_norm": 9.33486557006836, - "learning_rate": 1.12e-05, - "loss": 13.6537, + "grad_norm": 8.996031761169434, + "learning_rate": 8.4e-06, + "loss": 13.7765, "step": 30 }, { "epoch": 0.0496, - "grad_norm": 10.678018569946289, - "learning_rate": 1.1599999999999999e-05, - "loss": 15.083, + "grad_norm": 10.058349609375, + "learning_rate": 8.7e-06, + "loss": 15.2312, "step": 31 }, { "epoch": 0.0512, - "grad_norm": 10.413251876831055, - "learning_rate": 1.1999999999999999e-05, - "loss": 14.4103, + "grad_norm": 9.924124717712402, + "learning_rate": 8.999999999999999e-06, + "loss": 14.5462, "step": 32 }, { "epoch": 0.0528, - "grad_norm": 11.2171630859375, - "learning_rate": 1.2399999999999998e-05, - "loss": 15.0434, + "grad_norm": 10.69217586517334, + "learning_rate": 9.299999999999999e-06, + "loss": 15.1555, "step": 33 }, { "epoch": 0.0544, - "grad_norm": 8.697986602783203, - "learning_rate": 1.2799999999999998e-05, - "loss": 12.6689, + "grad_norm": 8.319705963134766, + "learning_rate": 9.6e-06, + "loss": 12.7821, "step": 34 }, { "epoch": 0.056, - "grad_norm": 10.12214183807373, - "learning_rate": 1.3199999999999997e-05, - "loss": 13.8483, + "grad_norm": 9.608613967895508, + "learning_rate": 9.9e-06, + "loss": 13.9979, "step": 35 }, { "epoch": 0.0576, - "grad_norm": 10.093426704406738, - "learning_rate": 1.36e-05, - "loss": 13.6302, + "grad_norm": 9.609368324279785, + "learning_rate": 1.02e-05, + "loss": 13.7818, "step": 36 }, { "epoch": 0.0592, - "grad_norm": 10.467079162597656, - "learning_rate": 1.4e-05, - "loss": 13.8487, + "grad_norm": 9.907469749450684, + "learning_rate": 1.05e-05, + "loss": 14.0173, "step": 37 }, { "epoch": 0.0608, - "grad_norm": 11.031571388244629, - "learning_rate": 1.44e-05, - "loss": 14.0714, + "grad_norm": 10.462899208068848, + "learning_rate": 1.0799999999999998e-05, + "loss": 14.2503, "step": 38 }, { "epoch": 0.0624, - "grad_norm": 11.238005638122559, - "learning_rate": 1.4799999999999999e-05, - "loss": 14.0722, + "grad_norm": 10.519027709960938, + "learning_rate": 1.1099999999999999e-05, + "loss": 14.2442, "step": 39 }, { "epoch": 0.064, - "grad_norm": 11.471139907836914, - "learning_rate": 1.5199999999999998e-05, - "loss": 13.9102, + "grad_norm": 10.643707275390625, + "learning_rate": 1.14e-05, + "loss": 14.0883, "step": 40 }, { "epoch": 0.0656, - "grad_norm": 13.19694995880127, - "learning_rate": 1.5599999999999996e-05, - "loss": 15.5816, + "grad_norm": 12.463756561279297, + "learning_rate": 1.17e-05, + "loss": 15.8071, "step": 41 }, { "epoch": 0.0672, - "grad_norm": 15.062193870544434, - "learning_rate": 1.6e-05, - "loss": 16.0428, + "grad_norm": 13.493027687072754, + "learning_rate": 1.1999999999999999e-05, + "loss": 16.2933, "step": 42 }, { "epoch": 0.0688, - "grad_norm": 14.18619155883789, - "learning_rate": 1.64e-05, - "loss": 15.99, + "grad_norm": 13.236719131469727, + "learning_rate": 1.2299999999999999e-05, + "loss": 16.2827, "step": 43 }, { "epoch": 0.0704, - "grad_norm": 11.722272872924805, - "learning_rate": 1.68e-05, - "loss": 13.8123, + "grad_norm": 10.915094375610352, + "learning_rate": 1.26e-05, + "loss": 14.0712, "step": 44 }, { "epoch": 0.072, - "grad_norm": 12.653892517089844, - "learning_rate": 1.7199999999999998e-05, - "loss": 14.2928, + "grad_norm": 11.671866416931152, + "learning_rate": 1.2899999999999998e-05, + "loss": 14.5855, "step": 45 }, { "epoch": 0.0736, - "grad_norm": 12.380364418029785, - "learning_rate": 1.7599999999999998e-05, - "loss": 13.9655, + "grad_norm": 11.406953811645508, + "learning_rate": 1.3199999999999997e-05, + "loss": 14.2818, "step": 46 }, { "epoch": 0.0752, - "grad_norm": 10.684913635253906, - "learning_rate": 1.7999999999999997e-05, - "loss": 12.4269, + "grad_norm": 9.801484107971191, + "learning_rate": 1.3499999999999998e-05, + "loss": 12.7244, "step": 47 }, { "epoch": 0.0768, - "grad_norm": 13.753266334533691, - "learning_rate": 1.8399999999999997e-05, - "loss": 14.3082, + "grad_norm": 12.542718887329102, + "learning_rate": 1.3799999999999998e-05, + "loss": 14.6995, "step": 48 }, { "epoch": 0.0784, - "grad_norm": 14.648118019104004, - "learning_rate": 1.8799999999999996e-05, - "loss": 14.7347, + "grad_norm": 13.37395191192627, + "learning_rate": 1.4099999999999999e-05, + "loss": 15.1658, "step": 49 }, { "epoch": 0.08, - "grad_norm": 12.614858627319336, - "learning_rate": 1.92e-05, - "loss": 13.0945, + "grad_norm": 11.486909866333008, + "learning_rate": 1.44e-05, + "loss": 13.4777, "step": 50 }, { "epoch": 0.0816, - "grad_norm": 46.947227478027344, - "learning_rate": 1.9599999999999995e-05, - "loss": 30.2031, + "grad_norm": 41.783447265625, + "learning_rate": 1.47e-05, + "loss": 31.3796, "step": 51 }, { "epoch": 0.0832, - "grad_norm": NaN, - "learning_rate": 1.9599999999999995e-05, - "loss": 24.7875, + "grad_norm": 37.17393112182617, + "learning_rate": 1.4999999999999999e-05, + "loss": 25.7284, "step": 52 }, { "epoch": 0.0848, - "grad_norm": 32.876708984375, - "learning_rate": 1.9999999999999998e-05, - "loss": 25.5323, + "grad_norm": 30.743728637695312, + "learning_rate": 1.53e-05, + "loss": 26.3766, "step": 53 }, { "epoch": 0.0864, - "grad_norm": 22.72123146057129, - "learning_rate": 2.04e-05, - "loss": 18.7279, + "grad_norm": 21.993959426879883, + "learning_rate": 1.5599999999999996e-05, + "loss": 19.3039, "step": 54 }, { "epoch": 0.088, - "grad_norm": 60.50054168701172, - "learning_rate": 2.0799999999999997e-05, - "loss": 25.0974, + "grad_norm": NaN, + "learning_rate": 1.5599999999999996e-05, + "loss": 25.9385, "step": 55 }, { "epoch": 0.0896, - "grad_norm": 19.52132225036621, - "learning_rate": 2.12e-05, - "loss": 16.16, + "grad_norm": 17.90077781677246, + "learning_rate": 1.5899999999999997e-05, + "loss": 16.7533, "step": 56 }, { "epoch": 0.0912, - "grad_norm": 33.881954193115234, - "learning_rate": 2.1599999999999996e-05, - "loss": 23.8675, + "grad_norm": 30.298683166503906, + "learning_rate": 1.6199999999999997e-05, + "loss": 24.8963, "step": 57 }, { "epoch": 0.0928, - "grad_norm": 18.729801177978516, - "learning_rate": 2.2e-05, - "loss": 15.2585, + "grad_norm": 16.790489196777344, + "learning_rate": 1.6499999999999998e-05, + "loss": 15.8818, "step": 58 }, { "epoch": 0.0944, - "grad_norm": 18.455617904663086, - "learning_rate": 2.24e-05, - "loss": 14.5196, + "grad_norm": 16.33099937438965, + "learning_rate": 1.68e-05, + "loss": 15.1584, "step": 59 }, { "epoch": 0.096, - "grad_norm": 19.266185760498047, - "learning_rate": 2.28e-05, - "loss": 15.0203, + "grad_norm": 17.538009643554688, + "learning_rate": 1.71e-05, + "loss": 15.7514, "step": 60 }, { "epoch": 0.0976, - "grad_norm": 21.54351234436035, - "learning_rate": 2.3199999999999998e-05, - "loss": 15.7071, + "grad_norm": 19.184165954589844, + "learning_rate": 1.74e-05, + "loss": 16.5568, "step": 61 }, { "epoch": 0.0992, - "grad_norm": 18.729368209838867, - "learning_rate": 2.3599999999999998e-05, - "loss": 13.9821, + "grad_norm": 16.525760650634766, + "learning_rate": 1.7699999999999997e-05, + "loss": 14.7701, "step": 62 }, { "epoch": 0.1008, - "grad_norm": 19.222980499267578, - "learning_rate": 2.3999999999999997e-05, - "loss": 13.5072, + "grad_norm": 17.058059692382812, + "learning_rate": 1.7999999999999997e-05, + "loss": 14.3009, "step": 63 }, { "epoch": 0.1024, - "grad_norm": 18.127918243408203, - "learning_rate": 2.4399999999999997e-05, - "loss": 13.0847, + "grad_norm": 15.610758781433105, + "learning_rate": 1.8299999999999998e-05, + "loss": 13.9059, "step": 64 }, { "epoch": 0.104, - "grad_norm": 17.083431243896484, - "learning_rate": 2.4799999999999996e-05, - "loss": 12.3284, + "grad_norm": 14.720325469970703, + "learning_rate": 1.8599999999999998e-05, + "loss": 13.1319, "step": 65 }, { "epoch": 0.1056, - "grad_norm": 20.064725875854492, - "learning_rate": 2.52e-05, - "loss": 13.3525, + "grad_norm": 17.151647567749023, + "learning_rate": 1.89e-05, + "loss": 14.3394, "step": 66 }, { "epoch": 0.1072, - "grad_norm": 19.82396697998047, - "learning_rate": 2.5599999999999995e-05, - "loss": 12.8442, + "grad_norm": 16.752134323120117, + "learning_rate": 1.92e-05, + "loss": 13.8388, "step": 67 }, { "epoch": 0.1088, - "grad_norm": 19.43636131286621, - "learning_rate": 2.6e-05, - "loss": 12.5456, + "grad_norm": 16.487157821655273, + "learning_rate": 1.95e-05, + "loss": 13.5623, "step": 68 }, { "epoch": 0.1104, - "grad_norm": 19.312517166137695, - "learning_rate": 2.6399999999999995e-05, - "loss": 12.0877, + "grad_norm": 16.15814781188965, + "learning_rate": 1.98e-05, + "loss": 13.1342, "step": 69 }, { "epoch": 0.112, - "grad_norm": 19.31719970703125, - "learning_rate": 2.6799999999999998e-05, - "loss": 11.4867, + "grad_norm": 15.702254295349121, + "learning_rate": 2.01e-05, + "loss": 12.5287, "step": 70 }, { "epoch": 0.1136, - "grad_norm": 20.121965408325195, - "learning_rate": 2.72e-05, - "loss": 11.5062, + "grad_norm": 16.827510833740234, + "learning_rate": 2.04e-05, + "loss": 12.6514, "step": 71 }, { "epoch": 0.1152, - "grad_norm": 19.37114715576172, - "learning_rate": 2.7599999999999997e-05, - "loss": 11.0737, + "grad_norm": 16.10146713256836, + "learning_rate": 2.07e-05, + "loss": 12.2281, "step": 72 }, { "epoch": 0.1168, - "grad_norm": 21.755847930908203, - "learning_rate": 2.8e-05, - "loss": 11.4482, + "grad_norm": 17.98247528076172, + "learning_rate": 2.1e-05, + "loss": 12.7926, "step": 73 }, { "epoch": 0.1184, - "grad_norm": 22.922266006469727, - "learning_rate": 2.8399999999999996e-05, - "loss": 11.6379, + "grad_norm": 18.706369400024414, + "learning_rate": 2.1299999999999996e-05, + "loss": 13.0467, "step": 74 }, { "epoch": 0.12, - "grad_norm": 25.408145904541016, - "learning_rate": 2.88e-05, - "loss": 11.4125, + "grad_norm": 19.2159423828125, + "learning_rate": 2.1599999999999996e-05, + "loss": 12.9773, "step": 75 }, { "epoch": 0.1216, - "grad_norm": 23.630056381225586, - "learning_rate": 2.9199999999999995e-05, - "loss": 11.1628, + "grad_norm": 18.725379943847656, + "learning_rate": 2.1899999999999997e-05, + "loss": 12.7265, "step": 76 }, { "epoch": 0.1232, - "grad_norm": 24.943918228149414, - "learning_rate": 2.9599999999999998e-05, - "loss": 10.9194, + "grad_norm": 19.681507110595703, + "learning_rate": 2.2199999999999998e-05, + "loss": 12.6436, "step": 77 }, { "epoch": 0.1248, - "grad_norm": 26.104032516479492, - "learning_rate": 2.9999999999999997e-05, - "loss": 10.7166, + "grad_norm": 20.383150100708008, + "learning_rate": 2.2499999999999998e-05, + "loss": 12.5706, "step": 78 }, { "epoch": 0.1264, - "grad_norm": 23.736133575439453, - "learning_rate": 3.0399999999999997e-05, - "loss": 9.6632, + "grad_norm": 18.141130447387695, + "learning_rate": 2.28e-05, + "loss": 11.3421, "step": 79 }, { "epoch": 0.128, - "grad_norm": 29.95717430114746, - "learning_rate": 3.0799999999999996e-05, - "loss": 10.7768, + "grad_norm": 23.76494789123535, + "learning_rate": 2.31e-05, + "loss": 12.8837, "step": 80 }, { "epoch": 0.1296, - "grad_norm": 27.857250213623047, - "learning_rate": 3.119999999999999e-05, - "loss": 9.604, + "grad_norm": 21.412139892578125, + "learning_rate": 2.34e-05, + "loss": 11.5721, "step": 81 }, { "epoch": 0.1312, - "grad_norm": 31.466289520263672, - "learning_rate": 3.1599999999999996e-05, - "loss": 10.2822, + "grad_norm": 24.82501792907715, + "learning_rate": 2.3699999999999997e-05, + "loss": 12.6466, "step": 82 }, { "epoch": 0.1328, - "grad_norm": 25.81043243408203, - "learning_rate": 3.2e-05, - "loss": 8.937, + "grad_norm": 20.29831886291504, + "learning_rate": 2.3999999999999997e-05, + "loss": 10.9449, "step": 83 }, { "epoch": 0.1344, - "grad_norm": 32.71095275878906, - "learning_rate": 3.2399999999999995e-05, - "loss": 9.828, + "grad_norm": 26.979677200317383, + "learning_rate": 2.4299999999999998e-05, + "loss": 12.3749, "step": 84 }, { "epoch": 0.136, - "grad_norm": 29.731168746948242, - "learning_rate": 3.28e-05, - "loss": 8.8936, + "grad_norm": 23.62879180908203, + "learning_rate": 2.4599999999999998e-05, + "loss": 11.1793, "step": 85 }, { "epoch": 0.1376, - "grad_norm": 30.395309448242188, - "learning_rate": 3.3199999999999994e-05, - "loss": 8.8308, + "grad_norm": 27.601375579833984, + "learning_rate": 2.49e-05, + "loss": 11.2504, "step": 86 }, { "epoch": 0.1392, - "grad_norm": 31.029926300048828, - "learning_rate": 3.36e-05, - "loss": 8.7211, + "grad_norm": 26.318252563476562, + "learning_rate": 2.52e-05, + "loss": 11.2328, "step": 87 }, { "epoch": 0.1408, - "grad_norm": 28.081674575805664, - "learning_rate": 3.399999999999999e-05, - "loss": 7.9264, + "grad_norm": 24.130613327026367, + "learning_rate": 2.55e-05, + "loss": 10.2347, "step": 88 }, { "epoch": 0.1424, - "grad_norm": 25.76122283935547, - "learning_rate": 3.4399999999999996e-05, - "loss": 7.4507, + "grad_norm": 22.916906356811523, + "learning_rate": 2.5799999999999997e-05, + "loss": 9.6041, "step": 89 }, { "epoch": 0.144, - "grad_norm": 29.275390625, - "learning_rate": 3.48e-05, - "loss": 7.8001, + "grad_norm": 26.457481384277344, + "learning_rate": 2.6099999999999997e-05, + "loss": 10.3063, "step": 90 }, { "epoch": 0.1456, - "grad_norm": 26.79217529296875, - "learning_rate": 3.5199999999999995e-05, - "loss": 7.3485, + "grad_norm": 24.79904556274414, + "learning_rate": 2.6399999999999995e-05, + "loss": 9.7561, "step": 91 }, { "epoch": 0.1472, - "grad_norm": 26.25724220275879, - "learning_rate": 3.56e-05, - "loss": 7.0524, + "grad_norm": 25.292531967163086, + "learning_rate": 2.6699999999999995e-05, + "loss": 9.4004, "step": 92 }, { "epoch": 0.1488, - "grad_norm": 23.24335289001465, - "learning_rate": 3.5999999999999994e-05, - "loss": 6.5839, + "grad_norm": 23.30132484436035, + "learning_rate": 2.6999999999999996e-05, + "loss": 8.7521, "step": 93 }, { "epoch": 0.1504, - "grad_norm": 22.426977157592773, - "learning_rate": 3.64e-05, - "loss": 6.4155, + "grad_norm": 23.52458381652832, + "learning_rate": 2.7299999999999996e-05, + "loss": 8.5213, "step": 94 }, { "epoch": 0.152, - "grad_norm": 30.2196102142334, - "learning_rate": 3.679999999999999e-05, - "loss": 6.9315, + "grad_norm": 31.594478607177734, + "learning_rate": 2.7599999999999997e-05, + "loss": 9.6137, "step": 95 }, { "epoch": 0.1536, - "grad_norm": 24.466903686523438, - "learning_rate": 3.7199999999999996e-05, - "loss": 6.1573, + "grad_norm": 25.19120979309082, + "learning_rate": 2.7899999999999997e-05, + "loss": 8.2984, "step": 96 }, { "epoch": 0.1552, - "grad_norm": 29.544233322143555, - "learning_rate": 3.759999999999999e-05, - "loss": 6.8333, + "grad_norm": 34.27452850341797, + "learning_rate": 2.8199999999999998e-05, + "loss": 9.7313, "step": 97 }, { "epoch": 0.1568, - "grad_norm": 15.694954872131348, - "learning_rate": 3.8e-05, - "loss": 5.5889, + "grad_norm": 20.038379669189453, + "learning_rate": 2.8499999999999998e-05, + "loss": 7.3048, "step": 98 }, { "epoch": 0.1584, - "grad_norm": 18.473628997802734, - "learning_rate": 3.84e-05, - "loss": 5.7588, + "grad_norm": 24.651748657226562, + "learning_rate": 2.88e-05, + "loss": 7.757, "step": 99 }, { "epoch": 0.16, - "grad_norm": 12.698282241821289, - "learning_rate": 3.8799999999999994e-05, - "loss": 5.2909, - "step": 100 - }, - { - "epoch": 0.16, - "eval_cer": 0.9933272212699805, - "eval_loss": 5.788337707519531, - "eval_runtime": 158.8341, - "eval_samples_per_second": 19.744, - "eval_steps_per_second": 1.234, - "eval_wer": 1.0, + "grad_norm": 19.042896270751953, + "learning_rate": 2.91e-05, + "loss": 6.7935, "step": 100 }, { "epoch": 0.1616, "grad_norm": NaN, - "learning_rate": 3.8799999999999994e-05, - "loss": 9.8624, + "learning_rate": 2.91e-05, + "loss": 16.7497, "step": 101 }, { "epoch": 0.1632, - "grad_norm": 55.51943588256836, - "learning_rate": 3.919999999999999e-05, - "loss": 6.8633, + "grad_norm": 59.27364730834961, + "learning_rate": 2.94e-05, + "loss": 10.7056, "step": 102 }, { "epoch": 0.1648, - "grad_norm": 31.776363372802734, - "learning_rate": 3.96e-05, - "loss": 5.9656, + "grad_norm": 40.90422821044922, + "learning_rate": 2.97e-05, + "loss": 8.9462, "step": 103 }, { "epoch": 0.1664, - "grad_norm": 41.10131072998047, - "learning_rate": 3.9999999999999996e-05, - "loss": 6.4203, + "grad_norm": 95.33076477050781, + "learning_rate": 2.9999999999999997e-05, + "loss": 9.9958, "step": 104 }, { "epoch": 0.168, - "grad_norm": 22.717975616455078, - "learning_rate": 4.039999999999999e-05, - "loss": 5.4663, + "grad_norm": 34.60516357421875, + "learning_rate": 3.0299999999999998e-05, + "loss": 7.8844, "step": 105 }, { "epoch": 0.1696, - "grad_norm": 20.537715911865234, - "learning_rate": 4.08e-05, - "loss": 5.4047, + "grad_norm": 36.03730010986328, + "learning_rate": 3.06e-05, + "loss": 7.8807, "step": 106 }, { "epoch": 0.1712, - "grad_norm": 17.539196014404297, - "learning_rate": 4.12e-05, - "loss": 5.2689, + "grad_norm": 33.82999801635742, + "learning_rate": 3.09e-05, + "loss": 7.5575, "step": 107 }, { "epoch": 0.1728, - "grad_norm": 14.2318754196167, - "learning_rate": 4.1599999999999995e-05, - "loss": 4.9925, + "grad_norm": 30.55006217956543, + "learning_rate": 3.119999999999999e-05, + "loss": 7.0033, "step": 108 }, { "epoch": 0.1744, - "grad_norm": 13.268582344055176, - "learning_rate": 4.2e-05, - "loss": 4.9523, + "grad_norm": 31.026508331298828, + "learning_rate": 3.149999999999999e-05, + "loss": 6.9573, "step": 109 }, { "epoch": 0.176, - "grad_norm": 11.914827346801758, - "learning_rate": 4.24e-05, - "loss": 4.8187, + "grad_norm": 29.27751922607422, + "learning_rate": 3.1799999999999994e-05, + "loss": 6.6252, "step": 110 }, { "epoch": 0.1776, - "grad_norm": 9.080737113952637, - "learning_rate": 4.28e-05, - "loss": 4.7312, + "grad_norm": 27.643978118896484, + "learning_rate": 3.2099999999999994e-05, + "loss": 6.4041, "step": 111 }, { "epoch": 0.1792, - "grad_norm": 7.438036918640137, - "learning_rate": 4.319999999999999e-05, - "loss": 4.655, + "grad_norm": 25.53025245666504, + "learning_rate": 3.2399999999999995e-05, + "loss": 6.1614, "step": 112 }, { "epoch": 0.1808, - "grad_norm": 7.071867942810059, - "learning_rate": 4.3599999999999996e-05, - "loss": 4.6643, + "grad_norm": 26.29717254638672, + "learning_rate": 3.2699999999999995e-05, + "loss": 6.1687, "step": 113 }, { "epoch": 0.1824, - "grad_norm": 4.7864603996276855, - "learning_rate": 4.4e-05, - "loss": 4.5668, + "grad_norm": 22.344379425048828, + "learning_rate": 3.2999999999999996e-05, + "loss": 5.8028, "step": 114 }, { "epoch": 0.184, - "grad_norm": 3.781824827194214, - "learning_rate": 4.4399999999999995e-05, - "loss": 4.503, + "grad_norm": 20.374876022338867, + "learning_rate": 3.3299999999999996e-05, + "loss": 5.5785, "step": 115 }, { "epoch": 0.1856, - "grad_norm": 3.6288676261901855, - "learning_rate": 4.48e-05, - "loss": 4.4637, + "grad_norm": 19.226865768432617, + "learning_rate": 3.36e-05, + "loss": 5.4582, "step": 116 }, { "epoch": 0.1872, - "grad_norm": 3.490495443344116, - "learning_rate": 4.5199999999999994e-05, - "loss": 4.467, + "grad_norm": 19.295761108398438, + "learning_rate": 3.39e-05, + "loss": 5.4439, "step": 117 }, { "epoch": 0.1888, - "grad_norm": 3.827871561050415, - "learning_rate": 4.56e-05, - "loss": 4.4434, + "grad_norm": 16.88180923461914, + "learning_rate": 3.42e-05, + "loss": 5.291, "step": 118 }, { "epoch": 0.1904, - "grad_norm": 3.4929349422454834, - "learning_rate": 4.599999999999999e-05, - "loss": 4.4377, + "grad_norm": 18.576812744140625, + "learning_rate": 3.45e-05, + "loss": 5.3835, "step": 119 }, { "epoch": 0.192, - "grad_norm": 3.696927309036255, - "learning_rate": 4.6399999999999996e-05, - "loss": 4.3986, + "grad_norm": 17.378028869628906, + "learning_rate": 3.48e-05, + "loss": 5.2777, "step": 120 }, { "epoch": 0.1936, - "grad_norm": 3.202934980392456, - "learning_rate": 4.68e-05, - "loss": 4.3563, + "grad_norm": 16.458574295043945, + "learning_rate": 3.51e-05, + "loss": 5.1775, "step": 121 }, { "epoch": 0.1952, - "grad_norm": 4.273110866546631, - "learning_rate": 4.7199999999999995e-05, - "loss": 4.2898, + "grad_norm": 11.267019271850586, + "learning_rate": 3.539999999999999e-05, + "loss": 4.9021, "step": 122 }, { "epoch": 0.1968, - "grad_norm": 4.1607346534729, - "learning_rate": 4.76e-05, - "loss": 4.2119, + "grad_norm": 10.104020118713379, + "learning_rate": 3.5699999999999994e-05, + "loss": 4.812, "step": 123 }, { "epoch": 0.1984, - "grad_norm": 4.52202033996582, - "learning_rate": 4.7999999999999994e-05, - "loss": 4.225, + "grad_norm": 7.518466949462891, + "learning_rate": 3.5999999999999994e-05, + "loss": 4.7303, "step": 124 }, { "epoch": 0.2, - "grad_norm": 4.284320831298828, - "learning_rate": 4.84e-05, - "loss": 4.1765, + "grad_norm": 6.941814422607422, + "learning_rate": 3.6299999999999995e-05, + "loss": 4.6951, "step": 125 }, { "epoch": 0.2016, - "grad_norm": 2.703249454498291, - "learning_rate": 4.8799999999999994e-05, - "loss": 4.1354, + "grad_norm": 8.405250549316406, + "learning_rate": 3.6599999999999995e-05, + "loss": 4.7108, "step": 126 }, { "epoch": 0.2032, - "grad_norm": 2.5615289211273193, - "learning_rate": 4.9199999999999997e-05, - "loss": 4.1119, + "grad_norm": 5.605133056640625, + "learning_rate": 3.6899999999999996e-05, + "loss": 4.6324, "step": 127 }, { "epoch": 0.2048, - "grad_norm": 2.4537241458892822, - "learning_rate": 4.959999999999999e-05, - "loss": 4.0635, + "grad_norm": 4.763094902038574, + "learning_rate": 3.7199999999999996e-05, + "loss": 4.5848, "step": 128 }, { "epoch": 0.2064, - "grad_norm": 2.5134150981903076, - "learning_rate": 4.9999999999999996e-05, - "loss": 4.0521, + "grad_norm": 4.2179646492004395, + "learning_rate": 3.75e-05, + "loss": 4.5576, "step": 129 }, { "epoch": 0.208, - "grad_norm": 3.711264133453369, - "learning_rate": 5.04e-05, - "loss": 4.0572, + "grad_norm": 4.336385726928711, + "learning_rate": 3.78e-05, + "loss": 4.535, "step": 130 }, { "epoch": 0.2096, - "grad_norm": 3.0089399814605713, - "learning_rate": 5.0799999999999995e-05, - "loss": 4.0094, + "grad_norm": 3.6008989810943604, + "learning_rate": 3.81e-05, + "loss": 4.481, "step": 131 }, { "epoch": 0.2112, - "grad_norm": 4.526411533355713, - "learning_rate": 5.119999999999999e-05, - "loss": 4.0729, + "grad_norm": 3.8394198417663574, + "learning_rate": 3.84e-05, + "loss": 4.5259, "step": 132 }, { "epoch": 0.2128, - "grad_norm": 3.459010362625122, - "learning_rate": 5.1599999999999994e-05, - "loss": 4.0024, + "grad_norm": 3.363095998764038, + "learning_rate": 3.87e-05, + "loss": 4.4499, "step": 133 }, { "epoch": 0.2144, - "grad_norm": 4.967794418334961, - "learning_rate": 5.2e-05, - "loss": 4.0761, + "grad_norm": 3.6167714595794678, + "learning_rate": 3.9e-05, + "loss": 4.4849, "step": 134 }, { "epoch": 0.216, - "grad_norm": 1.4037067890167236, - "learning_rate": 5.239999999999999e-05, - "loss": 3.9801, + "grad_norm": 4.084835529327393, + "learning_rate": 3.93e-05, + "loss": 4.4275, "step": 135 }, { "epoch": 0.2176, - "grad_norm": 6.1755781173706055, - "learning_rate": 5.279999999999999e-05, - "loss": 3.9841, + "grad_norm": 3.523148775100708, + "learning_rate": 3.96e-05, + "loss": 4.3896, "step": 136 }, { "epoch": 0.2192, - "grad_norm": 2.2986528873443604, - "learning_rate": 5.32e-05, - "loss": 4.0046, + "grad_norm": 3.151064157485962, + "learning_rate": 3.99e-05, + "loss": 4.3881, "step": 137 }, { "epoch": 0.2208, - "grad_norm": 2.812514066696167, - "learning_rate": 5.3599999999999995e-05, - "loss": 3.993, + "grad_norm": 3.005023717880249, + "learning_rate": 4.02e-05, + "loss": 4.3544, "step": 138 }, { "epoch": 0.2224, - "grad_norm": 1.9895284175872803, - "learning_rate": 5.399999999999999e-05, - "loss": 4.0296, + "grad_norm": 2.9251019954681396, + "learning_rate": 4.05e-05, + "loss": 4.3701, "step": 139 }, { "epoch": 0.224, - "grad_norm": 3.297950506210327, - "learning_rate": 5.44e-05, - "loss": 3.936, + "grad_norm": 4.313841819763184, + "learning_rate": 4.08e-05, + "loss": 4.2844, "step": 140 }, { "epoch": 0.2256, - "grad_norm": 2.8747730255126953, - "learning_rate": 5.48e-05, - "loss": 3.9037, + "grad_norm": 3.625789165496826, + "learning_rate": 4.11e-05, + "loss": 4.2265, "step": 141 }, { "epoch": 0.2272, - "grad_norm": 0.9972714781761169, - "learning_rate": 5.519999999999999e-05, - "loss": 3.9606, + "grad_norm": 2.4950482845306396, + "learning_rate": 4.14e-05, + "loss": 4.2455, "step": 142 }, { "epoch": 0.2288, - "grad_norm": 3.7870514392852783, - "learning_rate": 5.559999999999999e-05, - "loss": 3.878, + "grad_norm": 2.7788918018341064, + "learning_rate": 4.17e-05, + "loss": 4.1592, "step": 143 }, { "epoch": 0.2304, - "grad_norm": 2.812012195587158, - "learning_rate": 5.6e-05, - "loss": 3.8778, + "grad_norm": 2.6029088497161865, + "learning_rate": 4.2e-05, + "loss": 4.1254, "step": 144 }, { "epoch": 0.232, - "grad_norm": 3.2882652282714844, - "learning_rate": 5.6399999999999995e-05, - "loss": 3.7931, + "grad_norm": 2.919585704803467, + "learning_rate": 4.229999999999999e-05, + "loss": 4.0502, "step": 145 }, { "epoch": 0.2336, - "grad_norm": 1.2659672498703003, - "learning_rate": 5.679999999999999e-05, - "loss": 3.8827, + "grad_norm": 2.0354297161102295, + "learning_rate": 4.259999999999999e-05, + "loss": 4.1037, "step": 146 }, { "epoch": 0.2352, - "grad_norm": 3.4999775886535645, - "learning_rate": 5.72e-05, - "loss": 3.8191, + "grad_norm": 2.800039529800415, + "learning_rate": 4.289999999999999e-05, + "loss": 4.0396, "step": 147 }, { "epoch": 0.2368, - "grad_norm": 1.5007144212722778, - "learning_rate": 5.76e-05, - "loss": 3.8964, + "grad_norm": 3.689743995666504, + "learning_rate": 4.319999999999999e-05, + "loss": 4.1239, "step": 148 }, { "epoch": 0.2384, - "grad_norm": 1.7462321519851685, - "learning_rate": 5.7999999999999994e-05, - "loss": 3.9211, + "grad_norm": 3.157559871673584, + "learning_rate": 4.3499999999999993e-05, + "loss": 4.1312, "step": 149 }, { "epoch": 0.24, - "grad_norm": 1.0943163633346558, - "learning_rate": 5.839999999999999e-05, - "loss": 3.9544, + "grad_norm": 1.6889827251434326, + "learning_rate": 4.3799999999999994e-05, + "loss": 4.1446, "step": 150 }, { "epoch": 0.2416, - "grad_norm": 49.87346649169922, - "learning_rate": 5.88e-05, - "loss": 5.3277, + "grad_norm": 43.56467819213867, + "learning_rate": 4.4099999999999995e-05, + "loss": 5.576, "step": 151 }, { "epoch": 0.2432, - "grad_norm": 19.33259391784668, - "learning_rate": 5.9199999999999996e-05, - "loss": 4.1933, + "grad_norm": 17.566373825073242, + "learning_rate": 4.4399999999999995e-05, + "loss": 4.3442, "step": 152 }, { "epoch": 0.2448, - "grad_norm": 42.26847839355469, - "learning_rate": 5.959999999999999e-05, - "loss": 4.3403, + "grad_norm": 19.273839950561523, + "learning_rate": 4.4699999999999996e-05, + "loss": 4.4241, "step": 153 }, { "epoch": 0.2464, - "grad_norm": 12.728076934814453, - "learning_rate": 5.9999999999999995e-05, - "loss": 3.996, + "grad_norm": 43.27154541015625, + "learning_rate": 4.4999999999999996e-05, + "loss": 4.0996, "step": 154 }, { "epoch": 0.248, - "grad_norm": 26.57826805114746, - "learning_rate": 6.04e-05, - "loss": 4.5252, + "grad_norm": 22.586294174194336, + "learning_rate": 4.5299999999999997e-05, + "loss": 4.5675, "step": 155 }, { "epoch": 0.2496, - "grad_norm": 7.899193286895752, - "learning_rate": 6.0799999999999994e-05, - "loss": 3.8146, + "grad_norm": 5.962088108062744, + "learning_rate": 4.56e-05, + "loss": 3.9192, "step": 156 }, { "epoch": 0.2512, - "grad_norm": 7.121301651000977, - "learning_rate": 6.12e-05, - "loss": 3.8425, + "grad_norm": 5.2486701011657715, + "learning_rate": 4.59e-05, + "loss": 3.929, "step": 157 }, { "epoch": 0.2528, - "grad_norm": 17.966394424438477, - "learning_rate": 6.159999999999999e-05, - "loss": 3.8739, + "grad_norm": 6.9419331550598145, + "learning_rate": 4.62e-05, + "loss": 3.9572, "step": 158 }, { "epoch": 0.2544, - "grad_norm": 3.4284541606903076, - "learning_rate": 6.199999999999999e-05, - "loss": 3.817, + "grad_norm": 10.546914100646973, + "learning_rate": 4.65e-05, + "loss": 3.9106, "step": 159 }, { "epoch": 0.256, - "grad_norm": 0.9217383861541748, - "learning_rate": 6.239999999999999e-05, - "loss": 3.7066, + "grad_norm": 1.4478720426559448, + "learning_rate": 4.68e-05, + "loss": 3.816, "step": 160 }, { "epoch": 0.2576, - "grad_norm": 1.034417986869812, - "learning_rate": 6.28e-05, - "loss": 3.7111, + "grad_norm": 1.5371733903884888, + "learning_rate": 4.709999999999999e-05, + "loss": 3.8152, "step": 161 }, { "epoch": 0.2592, - "grad_norm": 1.0513995885849, - "learning_rate": 6.319999999999999e-05, - "loss": 3.7579, + "grad_norm": 1.1498860120773315, + "learning_rate": 4.7399999999999993e-05, + "loss": 3.8535, "step": 162 }, { "epoch": 0.2608, - "grad_norm": 3.502819776535034, - "learning_rate": 6.359999999999999e-05, - "loss": 3.7508, + "grad_norm": 3.339113473892212, + "learning_rate": 4.7699999999999994e-05, + "loss": 3.8388, "step": 163 }, { "epoch": 0.2624, - "grad_norm": 2.819303035736084, - "learning_rate": 6.4e-05, - "loss": 3.697, + "grad_norm": 2.404588460922241, + "learning_rate": 4.7999999999999994e-05, + "loss": 3.7853, "step": 164 }, { "epoch": 0.264, - "grad_norm": 2.330233097076416, - "learning_rate": 6.44e-05, - "loss": 3.7655, + "grad_norm": 1.7241195440292358, + "learning_rate": 4.8299999999999995e-05, + "loss": 3.8405, "step": 165 }, { "epoch": 0.2656, - "grad_norm": 1.2856422662734985, - "learning_rate": 6.479999999999999e-05, - "loss": 3.7024, + "grad_norm": 1.0453760623931885, + "learning_rate": 4.8599999999999995e-05, + "loss": 3.7872, "step": 166 }, { "epoch": 0.2672, - "grad_norm": 0.9377660155296326, - "learning_rate": 6.519999999999999e-05, - "loss": 3.7571, + "grad_norm": 0.8506628274917603, + "learning_rate": 4.8899999999999996e-05, + "loss": 3.8375, "step": 167 }, { "epoch": 0.2688, - "grad_norm": 2.6969499588012695, - "learning_rate": 6.56e-05, - "loss": 3.6941, + "grad_norm": 2.324955940246582, + "learning_rate": 4.9199999999999997e-05, + "loss": 3.7611, "step": 168 }, { "epoch": 0.2704, - "grad_norm": 3.896618604660034, - "learning_rate": 6.599999999999999e-05, - "loss": 3.7757, + "grad_norm": 3.906370162963867, + "learning_rate": 4.95e-05, + "loss": 3.8567, "step": 169 }, { "epoch": 0.272, - "grad_norm": 0.6105890870094299, - "learning_rate": 6.639999999999999e-05, - "loss": 3.7015, + "grad_norm": 0.7771504521369934, + "learning_rate": 4.98e-05, + "loss": 3.7718, "step": 170 }, { "epoch": 0.2736, - "grad_norm": 1.8918788433074951, - "learning_rate": 6.68e-05, - "loss": 3.6959, + "grad_norm": 1.1586111783981323, + "learning_rate": 5.01e-05, + "loss": 3.7604, "step": 171 }, { "epoch": 0.2752, - "grad_norm": 1.5491015911102295, - "learning_rate": 6.72e-05, - "loss": 3.6853, + "grad_norm": 1.0725429058074951, + "learning_rate": 5.04e-05, + "loss": 3.7399, "step": 172 }, { "epoch": 0.2768, - "grad_norm": 2.8067069053649902, - "learning_rate": 6.759999999999999e-05, - "loss": 3.7012, + "grad_norm": 2.3980965614318848, + "learning_rate": 5.07e-05, + "loss": 3.7637, "step": 173 }, { "epoch": 0.2784, - "grad_norm": 17.379011154174805, - "learning_rate": 6.799999999999999e-05, - "loss": 3.7106, + "grad_norm": 1.2434422969818115, + "learning_rate": 5.1e-05, + "loss": 3.7556, "step": 174 }, { "epoch": 0.28, - "grad_norm": 2.3803513050079346, - "learning_rate": 6.84e-05, - "loss": 3.7663, + "grad_norm": 2.1702017784118652, + "learning_rate": 5.13e-05, + "loss": 3.8249, "step": 175 }, { "epoch": 0.2816, - "grad_norm": 0.6946349740028381, - "learning_rate": 6.879999999999999e-05, - "loss": 3.7089, + "grad_norm": 0.7948981523513794, + "learning_rate": 5.1599999999999994e-05, + "loss": 3.7648, "step": 176 }, { "epoch": 0.2832, - "grad_norm": 2.5919649600982666, - "learning_rate": 6.919999999999999e-05, - "loss": 3.6104, + "grad_norm": 2.3048489093780518, + "learning_rate": 5.1899999999999994e-05, + "loss": 3.6571, "step": 177 }, { "epoch": 0.2848, - "grad_norm": 1.1172287464141846, - "learning_rate": 6.96e-05, - "loss": 3.7244, + "grad_norm": 0.9848898649215698, + "learning_rate": 5.2199999999999995e-05, + "loss": 3.7757, "step": 178 }, { "epoch": 0.2864, - "grad_norm": 2.474264144897461, - "learning_rate": 7e-05, - "loss": 3.7115, + "grad_norm": 2.904839515686035, + "learning_rate": 5.2499999999999995e-05, + "loss": 3.7699, "step": 179 }, { "epoch": 0.288, - "grad_norm": 2.681856870651245, - "learning_rate": 7.039999999999999e-05, - "loss": 3.6786, + "grad_norm": 1.1673673391342163, + "learning_rate": 5.279999999999999e-05, + "loss": 3.7191, "step": 180 }, { "epoch": 0.2896, - "grad_norm": 1.6489158868789673, - "learning_rate": 7.079999999999999e-05, - "loss": 3.6661, + "grad_norm": 1.269823670387268, + "learning_rate": 5.309999999999999e-05, + "loss": 3.7021, "step": 181 }, { "epoch": 0.2912, - "grad_norm": 1.1805118322372437, - "learning_rate": 7.12e-05, - "loss": 3.654, + "grad_norm": 0.9047891497612, + "learning_rate": 5.339999999999999e-05, + "loss": 3.7024, "step": 182 }, { "epoch": 0.2928, - "grad_norm": 2.3934733867645264, - "learning_rate": 7.159999999999999e-05, - "loss": 3.7694, + "grad_norm": 2.7220137119293213, + "learning_rate": 5.369999999999999e-05, + "loss": 3.8191, "step": 183 }, { "epoch": 0.2944, - "grad_norm": 1.6547563076019287, - "learning_rate": 7.199999999999999e-05, - "loss": 3.6795, + "grad_norm": 1.4135470390319824, + "learning_rate": 5.399999999999999e-05, + "loss": 3.7193, "step": 184 }, { "epoch": 0.296, - "grad_norm": 3.933615207672119, - "learning_rate": 7.24e-05, - "loss": 3.7822, + "grad_norm": 4.183174133300781, + "learning_rate": 5.429999999999999e-05, + "loss": 3.832, "step": 185 }, { "epoch": 0.2976, - "grad_norm": 2.2447006702423096, - "learning_rate": 7.28e-05, - "loss": 3.7498, + "grad_norm": 2.3090178966522217, + "learning_rate": 5.459999999999999e-05, + "loss": 3.7992, "step": 186 }, { "epoch": 0.2992, - "grad_norm": 1.272269606590271, - "learning_rate": 7.319999999999999e-05, - "loss": 3.7266, + "grad_norm": 1.39043128490448, + "learning_rate": 5.489999999999999e-05, + "loss": 3.7616, "step": 187 }, { "epoch": 0.3008, - "grad_norm": 0.6136051416397095, - "learning_rate": 7.359999999999999e-05, - "loss": 3.6942, + "grad_norm": 0.6333518028259277, + "learning_rate": 5.519999999999999e-05, + "loss": 3.7379, "step": 188 }, { "epoch": 0.3024, - "grad_norm": 0.6757500767707825, - "learning_rate": 7.4e-05, - "loss": 3.6945, + "grad_norm": 0.6841683387756348, + "learning_rate": 5.5499999999999994e-05, + "loss": 3.7347, "step": 189 }, { "epoch": 0.304, - "grad_norm": 1.0322827100753784, - "learning_rate": 7.439999999999999e-05, - "loss": 3.6552, + "grad_norm": 0.857160747051239, + "learning_rate": 5.5799999999999994e-05, + "loss": 3.6922, "step": 190 }, { "epoch": 0.3056, - "grad_norm": 0.9092890620231628, - "learning_rate": 7.479999999999999e-05, - "loss": 3.7145, + "grad_norm": 1.119098424911499, + "learning_rate": 5.6099999999999995e-05, + "loss": 3.7586, "step": 191 }, { "epoch": 0.3072, - "grad_norm": 0.8890368342399597, - "learning_rate": 7.519999999999998e-05, - "loss": 3.765, + "grad_norm": 0.9412086606025696, + "learning_rate": 5.6399999999999995e-05, + "loss": 3.8095, "step": 192 }, { "epoch": 0.3088, - "grad_norm": 1.3483480215072632, - "learning_rate": 7.56e-05, - "loss": 3.771, + "grad_norm": 0.7174120545387268, + "learning_rate": 5.6699999999999996e-05, + "loss": 3.8177, "step": 193 }, { "epoch": 0.3104, - "grad_norm": 3.8980510234832764, - "learning_rate": 7.6e-05, - "loss": 3.7792, + "grad_norm": 3.7620341777801514, + "learning_rate": 5.6999999999999996e-05, + "loss": 3.8253, "step": 194 }, { "epoch": 0.312, - "grad_norm": 0.7139620780944824, - "learning_rate": 7.639999999999999e-05, - "loss": 3.7156, + "grad_norm": 0.6953389644622803, + "learning_rate": 5.73e-05, + "loss": 3.7556, "step": 195 }, { "epoch": 0.3136, - "grad_norm": 1.751401424407959, - "learning_rate": 7.68e-05, - "loss": 3.6977, + "grad_norm": 2.053453207015991, + "learning_rate": 5.76e-05, + "loss": 3.7528, "step": 196 }, { "epoch": 0.3152, - "grad_norm": 1.1341018676757812, - "learning_rate": 7.72e-05, - "loss": 3.6716, + "grad_norm": 0.926834225654602, + "learning_rate": 5.79e-05, + "loss": 3.7177, "step": 197 }, { "epoch": 0.3168, - "grad_norm": 1.0171042680740356, - "learning_rate": 7.759999999999999e-05, - "loss": 3.719, + "grad_norm": 0.9621994495391846, + "learning_rate": 5.82e-05, + "loss": 3.7611, "step": 198 }, { "epoch": 0.3184, - "grad_norm": 2.265624523162842, - "learning_rate": 7.8e-05, - "loss": 3.7401, + "grad_norm": 2.589616537094116, + "learning_rate": 5.85e-05, + "loss": 3.7764, "step": 199 }, { "epoch": 0.32, "grad_norm": NaN, - "learning_rate": 7.8e-05, - "loss": 4.4471, - "step": 200 - }, - { - "epoch": 0.32, - "eval_cer": 0.9811814884202974, - "eval_loss": 3.759681463241577, - "eval_runtime": 157.878, - "eval_samples_per_second": 19.863, - "eval_steps_per_second": 1.241, - "eval_wer": 1.0, + "learning_rate": 5.85e-05, + "loss": 4.5885, "step": 200 }, { "epoch": 0.3216, - "grad_norm": 40.01380920410156, - "learning_rate": 7.839999999999998e-05, - "loss": 4.8451, + "grad_norm": 36.955745697021484, + "learning_rate": 5.88e-05, + "loss": 4.8317, "step": 201 }, { "epoch": 0.3232, - "grad_norm": 16.052330017089844, - "learning_rate": 7.879999999999999e-05, - "loss": 3.9661, + "grad_norm": 14.525715827941895, + "learning_rate": 5.91e-05, + "loss": 3.9233, "step": 202 }, { "epoch": 0.3248, - "grad_norm": 31.00684928894043, - "learning_rate": 7.92e-05, - "loss": 4.454, + "grad_norm": 26.20575714111328, + "learning_rate": 5.94e-05, + "loss": 4.3815, "step": 203 }, { "epoch": 0.3264, - "grad_norm": 6.007049083709717, - "learning_rate": 7.959999999999998e-05, - "loss": 3.7405, + "grad_norm": 6.467894554138184, + "learning_rate": 5.97e-05, + "loss": 3.768, "step": 204 }, { "epoch": 0.328, - "grad_norm": 8.176580429077148, - "learning_rate": 7.999999999999999e-05, - "loss": 3.7768, + "grad_norm": 6.852605819702148, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.7986, "step": 205 }, { "epoch": 0.3296, - "grad_norm": 10.413802146911621, - "learning_rate": 8.04e-05, - "loss": 3.9944, + "grad_norm": 10.890352249145508, + "learning_rate": 6.0299999999999995e-05, + "loss": 4.0176, "step": 206 }, { "epoch": 0.3312, - "grad_norm": 3.2589361667633057, - "learning_rate": 8.079999999999999e-05, - "loss": 3.7885, + "grad_norm": 5.478235721588135, + "learning_rate": 6.0599999999999996e-05, + "loss": 3.8291, "step": 207 }, { "epoch": 0.3328, - "grad_norm": 4.633168697357178, - "learning_rate": 8.12e-05, - "loss": 3.7046, + "grad_norm": 2.3643927574157715, + "learning_rate": 6.0899999999999996e-05, + "loss": 3.692, "step": 208 }, { "epoch": 0.3344, - "grad_norm": 3.281346321105957, - "learning_rate": 8.16e-05, - "loss": 3.8043, + "grad_norm": 0.9052429795265198, + "learning_rate": 6.12e-05, + "loss": 3.8016, "step": 209 }, { "epoch": 0.336, - "grad_norm": 4.552253246307373, - "learning_rate": 8.199999999999999e-05, - "loss": 3.7858, + "grad_norm": 2.090831995010376, + "learning_rate": 6.149999999999999e-05, + "loss": 3.7758, "step": 210 }, { "epoch": 0.3376, - "grad_norm": 2.2564077377319336, - "learning_rate": 8.24e-05, - "loss": 3.8172, + "grad_norm": 1.3076395988464355, + "learning_rate": 6.18e-05, + "loss": 3.8275, "step": 211 }, { "epoch": 0.3392, - "grad_norm": 4.931669235229492, - "learning_rate": 8.28e-05, - "loss": 3.7392, + "grad_norm": 3.603741407394409, + "learning_rate": 6.209999999999999e-05, + "loss": 3.7251, "step": 212 }, { "epoch": 0.3408, - "grad_norm": 5.935787677764893, - "learning_rate": 8.319999999999999e-05, - "loss": 3.6674, + "grad_norm": 5.31221866607666, + "learning_rate": 6.239999999999999e-05, + "loss": 3.6644, "step": 213 }, { "epoch": 0.3424, - "grad_norm": 5.499577045440674, - "learning_rate": 8.36e-05, - "loss": 3.6903, + "grad_norm": 5.733158588409424, + "learning_rate": 6.269999999999999e-05, + "loss": 3.7033, "step": 214 }, { "epoch": 0.344, - "grad_norm": 2.0088694095611572, - "learning_rate": 8.4e-05, - "loss": 3.6526, + "grad_norm": 3.1695661544799805, + "learning_rate": 6.299999999999999e-05, + "loss": 3.6704, "step": 215 }, { "epoch": 0.3456, - "grad_norm": 1.7222843170166016, - "learning_rate": 8.439999999999999e-05, - "loss": 3.6685, + "grad_norm": 1.594991683959961, + "learning_rate": 6.33e-05, + "loss": 3.6788, "step": 216 }, { "epoch": 0.3472, - "grad_norm": 1.0298047065734863, - "learning_rate": 8.48e-05, - "loss": 3.6548, + "grad_norm": 1.6268411874771118, + "learning_rate": 6.359999999999999e-05, + "loss": 3.6691, "step": 217 }, { "epoch": 0.3488, - "grad_norm": 0.6882361173629761, - "learning_rate": 8.519999999999998e-05, - "loss": 3.5886, + "grad_norm": 1.526171326637268, + "learning_rate": 6.39e-05, + "loss": 3.613, "step": 218 }, { "epoch": 0.3504, - "grad_norm": 3.674741744995117, - "learning_rate": 8.56e-05, - "loss": 3.6667, + "grad_norm": 4.093273162841797, + "learning_rate": 6.419999999999999e-05, + "loss": 3.6907, "step": 219 }, { "epoch": 0.352, - "grad_norm": 0.8755781054496765, - "learning_rate": 8.6e-05, - "loss": 3.5641, + "grad_norm": 2.1383774280548096, + "learning_rate": 6.45e-05, + "loss": 3.5904, "step": 220 }, { "epoch": 0.3536, - "grad_norm": 1.6391894817352295, - "learning_rate": 8.639999999999999e-05, - "loss": 3.598, + "grad_norm": 4.359639644622803, + "learning_rate": 6.479999999999999e-05, + "loss": 3.6432, "step": 221 }, { "epoch": 0.3552, - "grad_norm": 4.3577165603637695, - "learning_rate": 8.68e-05, - "loss": 3.7042, + "grad_norm": 7.548089027404785, + "learning_rate": 6.51e-05, + "loss": 3.7759, "step": 222 }, { "epoch": 0.3568, - "grad_norm": 7.5377960205078125, - "learning_rate": 8.719999999999999e-05, - "loss": 3.7538, + "grad_norm": 10.02462100982666, + "learning_rate": 6.539999999999999e-05, + "loss": 3.8324, "step": 223 }, { "epoch": 0.3584, - "grad_norm": 0.9260856509208679, - "learning_rate": 8.759999999999999e-05, - "loss": 3.5839, + "grad_norm": 2.3231656551361084, + "learning_rate": 6.57e-05, + "loss": 3.6139, "step": 224 }, { "epoch": 0.36, - "grad_norm": 1.126509189605713, - "learning_rate": 8.8e-05, - "loss": 3.7167, + "grad_norm": 2.966249942779541, + "learning_rate": 6.599999999999999e-05, + "loss": 3.743, "step": 225 }, { "epoch": 0.3616, - "grad_norm": 0.5898628830909729, - "learning_rate": 8.84e-05, - "loss": 3.6411, + "grad_norm": 1.6161023378372192, + "learning_rate": 6.63e-05, + "loss": 3.6788, "step": 226 }, { "epoch": 0.3632, - "grad_norm": 1.984059453010559, - "learning_rate": 8.879999999999999e-05, - "loss": 3.8212, + "grad_norm": 2.224886655807495, + "learning_rate": 6.659999999999999e-05, + "loss": 3.8758, "step": 227 }, { "epoch": 0.3648, - "grad_norm": 2.239217519760132, - "learning_rate": 8.92e-05, - "loss": 3.6481, + "grad_norm": 3.000509262084961, + "learning_rate": 6.69e-05, + "loss": 3.6772, "step": 228 }, { "epoch": 0.3664, - "grad_norm": 0.8924047946929932, - "learning_rate": 8.96e-05, - "loss": 3.6647, + "grad_norm": 2.5037214756011963, + "learning_rate": 6.72e-05, + "loss": 3.7001, "step": 229 }, { "epoch": 0.368, - "grad_norm": 2.23852801322937, - "learning_rate": 8.999999999999999e-05, - "loss": 3.605, + "grad_norm": 4.560110092163086, + "learning_rate": 6.75e-05, + "loss": 3.6615, "step": 230 }, { "epoch": 0.3696, - "grad_norm": 1.5995547771453857, - "learning_rate": 9.039999999999999e-05, - "loss": 3.6514, + "grad_norm": 5.233952045440674, + "learning_rate": 6.78e-05, + "loss": 3.7162, "step": 231 }, { "epoch": 0.3712, - "grad_norm": 1.6624646186828613, - "learning_rate": 9.079999999999998e-05, - "loss": 3.5983, + "grad_norm": 4.030121803283691, + "learning_rate": 6.81e-05, + "loss": 3.6467, "step": 232 }, { "epoch": 0.3728, - "grad_norm": 2.7629284858703613, - "learning_rate": 9.12e-05, - "loss": 3.6708, + "grad_norm": 3.1937036514282227, + "learning_rate": 6.84e-05, + "loss": 3.693, "step": 233 }, { "epoch": 0.3744, - "grad_norm": 5.317010879516602, - "learning_rate": 9.159999999999999e-05, - "loss": 3.8436, + "grad_norm": 1.4005098342895508, + "learning_rate": 6.87e-05, + "loss": 3.8877, "step": 234 }, { "epoch": 0.376, - "grad_norm": 1.7832276821136475, - "learning_rate": 9.199999999999999e-05, - "loss": 3.5452, + "grad_norm": 3.893272876739502, + "learning_rate": 6.9e-05, + "loss": 3.593, "step": 235 }, { "epoch": 0.3776, - "grad_norm": 0.7123947143554688, - "learning_rate": 9.24e-05, - "loss": 3.5936, + "grad_norm": 0.6252413988113403, + "learning_rate": 6.93e-05, + "loss": 3.6299, "step": 236 }, { "epoch": 0.3792, - "grad_norm": 1.8740249872207642, - "learning_rate": 9.279999999999999e-05, - "loss": 3.5139, + "grad_norm": 0.9029965400695801, + "learning_rate": 6.96e-05, + "loss": 3.5456, "step": 237 }, { "epoch": 0.3808, - "grad_norm": 0.8917979001998901, - "learning_rate": 9.319999999999999e-05, - "loss": 3.6048, + "grad_norm": 1.729459285736084, + "learning_rate": 6.989999999999999e-05, + "loss": 3.6351, "step": 238 }, { "epoch": 0.3824, - "grad_norm": 1.3209342956542969, - "learning_rate": 9.36e-05, - "loss": 3.5569, + "grad_norm": 4.091501235961914, + "learning_rate": 7.02e-05, + "loss": 3.6169, "step": 239 }, { "epoch": 0.384, - "grad_norm": 2.313504934310913, - "learning_rate": 9.4e-05, - "loss": 3.6239, + "grad_norm": 5.327137470245361, + "learning_rate": 7.049999999999999e-05, + "loss": 3.695, "step": 240 }, { "epoch": 0.3856, - "grad_norm": 2.1547036170959473, - "learning_rate": 9.439999999999999e-05, - "loss": 3.6724, + "grad_norm": 5.34423303604126, + "learning_rate": 7.079999999999999e-05, + "loss": 3.7424, "step": 241 }, { "epoch": 0.3872, - "grad_norm": 0.8160881400108337, - "learning_rate": 9.479999999999999e-05, - "loss": 3.6857, + "grad_norm": 3.87858510017395, + "learning_rate": 7.11e-05, + "loss": 3.7498, "step": 242 }, { "epoch": 0.3888, - "grad_norm": 2.1986379623413086, - "learning_rate": 9.52e-05, - "loss": 3.5633, + "grad_norm": 1.0561057329177856, + "learning_rate": 7.139999999999999e-05, + "loss": 3.6034, "step": 243 }, { "epoch": 0.3904, - "grad_norm": 1.8258991241455078, - "learning_rate": 9.559999999999999e-05, - "loss": 3.6461, + "grad_norm": 0.686858594417572, + "learning_rate": 7.17e-05, + "loss": 3.6911, "step": 244 }, { "epoch": 0.392, - "grad_norm": 1.290541410446167, - "learning_rate": 9.599999999999999e-05, - "loss": 3.7378, + "grad_norm": 1.0606696605682373, + "learning_rate": 7.199999999999999e-05, + "loss": 3.7628, "step": 245 }, { "epoch": 0.3936, - "grad_norm": 2.4464797973632812, - "learning_rate": 9.64e-05, - "loss": 3.584, + "grad_norm": 4.318437576293945, + "learning_rate": 7.23e-05, + "loss": 3.6438, "step": 246 }, { "epoch": 0.3952, - "grad_norm": 1.3107831478118896, - "learning_rate": 9.68e-05, - "loss": 3.6602, + "grad_norm": 3.086888074874878, + "learning_rate": 7.259999999999999e-05, + "loss": 3.7093, "step": 247 }, { "epoch": 0.3968, - "grad_norm": 1.3498066663742065, - "learning_rate": 9.719999999999999e-05, - "loss": 3.6242, + "grad_norm": 4.907326698303223, + "learning_rate": 7.29e-05, + "loss": 3.709, "step": 248 }, { "epoch": 0.3984, - "grad_norm": 1.8473575115203857, - "learning_rate": 9.759999999999999e-05, - "loss": 3.6759, + "grad_norm": 5.898931503295898, + "learning_rate": 7.319999999999999e-05, + "loss": 3.775, "step": 249 }, { "epoch": 0.4, - "grad_norm": 1.866666555404663, - "learning_rate": 9.799999999999998e-05, - "loss": 3.753, + "grad_norm": 6.163908004760742, + "learning_rate": 7.35e-05, + "loss": 3.8484, "step": 250 }, { "epoch": 0.4016, - "grad_norm": 100.26708984375, - "learning_rate": 9.839999999999999e-05, - "loss": 7.058, + "grad_norm": 61.644100189208984, + "learning_rate": 7.379999999999999e-05, + "loss": 5.9524, "step": 251 }, { "epoch": 0.4032, - "grad_norm": 24.822065353393555, - "learning_rate": 9.879999999999999e-05, - "loss": 4.1968, + "grad_norm": 12.312813758850098, + "learning_rate": 7.41e-05, + "loss": 3.862, "step": 252 }, { "epoch": 0.4048, - "grad_norm": 22.42405128479004, - "learning_rate": 9.919999999999999e-05, - "loss": 4.3078, + "grad_norm": 11.08806037902832, + "learning_rate": 7.439999999999999e-05, + "loss": 4.0092, "step": 253 }, { "epoch": 0.4064, - "grad_norm": 45.404808044433594, - "learning_rate": 9.96e-05, - "loss": 4.1147, + "grad_norm": 10.701560974121094, + "learning_rate": 7.47e-05, + "loss": 3.9019, "step": 254 }, { "epoch": 0.408, - "grad_norm": 18.83899688720703, - "learning_rate": 9.999999999999999e-05, - "loss": 4.0957, + "grad_norm": 8.322529792785645, + "learning_rate": 7.5e-05, + "loss": 3.8767, "step": 255 }, { "epoch": 0.4096, - "grad_norm": 25.831239700317383, - "learning_rate": 0.00010039999999999999, - "loss": 4.1593, + "grad_norm": 11.361836433410645, + "learning_rate": 7.529999999999999e-05, + "loss": 4.0091, "step": 256 }, { "epoch": 0.4112, - "grad_norm": 8.648876190185547, - "learning_rate": 0.0001008, - "loss": 3.7909, + "grad_norm": 1.967650055885315, + "learning_rate": 7.56e-05, + "loss": 3.7382, "step": 257 }, { "epoch": 0.4128, - "grad_norm": 8.114570617675781, - "learning_rate": 0.0001012, - "loss": 3.9005, + "grad_norm": 2.5838770866394043, + "learning_rate": 7.589999999999999e-05, + "loss": 3.8723, "step": 258 }, { "epoch": 0.4144, - "grad_norm": 4.483002662658691, - "learning_rate": 0.00010159999999999999, - "loss": 3.6031, + "grad_norm": 7.383574962615967, + "learning_rate": 7.62e-05, + "loss": 3.6693, "step": 259 }, { "epoch": 0.416, - "grad_norm": 1.673669457435608, - "learning_rate": 0.000102, - "loss": 3.7067, + "grad_norm": 3.9820754528045654, + "learning_rate": 7.649999999999999e-05, + "loss": 3.74, "step": 260 }, { "epoch": 0.4176, - "grad_norm": 3.5352141857147217, - "learning_rate": 0.00010239999999999998, - "loss": 3.7009, + "grad_norm": 5.922429084777832, + "learning_rate": 7.68e-05, + "loss": 3.7551, "step": 261 }, { "epoch": 0.4192, - "grad_norm": 7.101142883300781, - "learning_rate": 0.00010279999999999999, - "loss": 3.6875, + "grad_norm": 8.945326805114746, + "learning_rate": 7.709999999999999e-05, + "loss": 3.759, "step": 262 }, { "epoch": 0.4208, - "grad_norm": 1.425631046295166, - "learning_rate": 0.00010319999999999999, - "loss": 3.7991, + "grad_norm": 3.62739896774292, + "learning_rate": 7.74e-05, + "loss": 3.8278, "step": 263 }, { "epoch": 0.4224, - "grad_norm": 3.5846240520477295, - "learning_rate": 0.00010359999999999998, - "loss": 3.73, + "grad_norm": 5.89738130569458, + "learning_rate": 7.769999999999999e-05, + "loss": 3.7793, "step": 264 }, { "epoch": 0.424, - "grad_norm": 4.524421215057373, - "learning_rate": 0.000104, - "loss": 3.5848, + "grad_norm": 6.884239673614502, + "learning_rate": 7.8e-05, + "loss": 3.654, "step": 265 }, { "epoch": 0.4256, - "grad_norm": 2.8252696990966797, - "learning_rate": 0.00010439999999999999, - "loss": 3.6126, + "grad_norm": 5.8110857009887695, + "learning_rate": 7.829999999999999e-05, + "loss": 3.6802, "step": 266 }, { "epoch": 0.4272, - "grad_norm": 1.503335952758789, - "learning_rate": 0.00010479999999999999, - "loss": 3.6498, + "grad_norm": 2.915142297744751, + "learning_rate": 7.86e-05, + "loss": 3.6873, "step": 267 }, { "epoch": 0.4288, - "grad_norm": 1.0979737043380737, - "learning_rate": 0.0001052, - "loss": 3.5839, + "grad_norm": 2.7581217288970947, + "learning_rate": 7.89e-05, + "loss": 3.621, "step": 268 }, { "epoch": 0.4304, - "grad_norm": 2.4385807514190674, - "learning_rate": 0.00010559999999999998, - "loss": 3.6874, + "grad_norm": 0.830217719078064, + "learning_rate": 7.92e-05, + "loss": 3.7101, "step": 269 }, { "epoch": 0.432, - "grad_norm": 2.9847817420959473, - "learning_rate": 0.00010599999999999999, - "loss": 3.5842, + "grad_norm": 2.0595314502716064, + "learning_rate": 7.95e-05, + "loss": 3.6151, "step": 270 }, { "epoch": 0.4336, - "grad_norm": 6.495297431945801, - "learning_rate": 0.0001064, - "loss": 3.5658, + "grad_norm": 5.0488739013671875, + "learning_rate": 7.98e-05, + "loss": 3.6021, "step": 271 }, { "epoch": 0.4352, - "grad_norm": 3.128310441970825, - "learning_rate": 0.00010679999999999998, - "loss": 3.6348, + "grad_norm": 5.398313522338867, + "learning_rate": 8.01e-05, + "loss": 3.6869, "step": 272 }, { "epoch": 0.4368, - "grad_norm": 0.8451325297355652, - "learning_rate": 0.00010719999999999999, - "loss": 3.5906, + "grad_norm": 3.0345497131347656, + "learning_rate": 8.04e-05, + "loss": 3.6321, "step": 273 }, { "epoch": 0.4384, - "grad_norm": 1.0812450647354126, - "learning_rate": 0.0001076, - "loss": 3.5742, + "grad_norm": 5.039176940917969, + "learning_rate": 8.07e-05, + "loss": 3.6529, "step": 274 }, { "epoch": 0.44, - "grad_norm": 1.4351269006729126, - "learning_rate": 0.00010799999999999998, - "loss": 3.6606, + "grad_norm": 6.308064937591553, + "learning_rate": 8.1e-05, + "loss": 3.7383, "step": 275 }, { "epoch": 0.4416, - "grad_norm": 2.155651807785034, - "learning_rate": 0.00010839999999999999, - "loss": 3.6149, + "grad_norm": 2.592426300048828, + "learning_rate": 8.13e-05, + "loss": 3.6547, "step": 276 }, { "epoch": 0.4432, - "grad_norm": 1.8064038753509521, - "learning_rate": 0.0001088, - "loss": 3.5758, + "grad_norm": 1.9177695512771606, + "learning_rate": 8.16e-05, + "loss": 3.6161, "step": 277 }, { "epoch": 0.4448, - "grad_norm": 1.7468692064285278, - "learning_rate": 0.00010919999999999998, - "loss": 3.6068, + "grad_norm": 0.7630671858787537, + "learning_rate": 8.19e-05, + "loss": 3.6321, "step": 278 }, { "epoch": 0.4464, - "grad_norm": 1.2160580158233643, - "learning_rate": 0.0001096, - "loss": 3.5516, + "grad_norm": 1.042344093322754, + "learning_rate": 8.22e-05, + "loss": 3.5981, "step": 279 }, { "epoch": 0.448, - "grad_norm": 0.7980257868766785, - "learning_rate": 0.00010999999999999998, - "loss": 3.5646, + "grad_norm": 1.6470465660095215, + "learning_rate": 8.25e-05, + "loss": 3.6142, "step": 280 }, { "epoch": 0.4496, - "grad_norm": 4.852247714996338, - "learning_rate": 0.00011039999999999999, - "loss": 3.643, + "grad_norm": 1.3943268060684204, + "learning_rate": 8.28e-05, + "loss": 3.6655, "step": 281 }, { "epoch": 0.4512, - "grad_norm": 4.469863414764404, - "learning_rate": 0.0001108, - "loss": 3.5545, + "grad_norm": 1.1460942029953003, + "learning_rate": 8.31e-05, + "loss": 3.5895, "step": 282 }, { "epoch": 0.4528, - "grad_norm": 1.6534397602081299, - "learning_rate": 0.00011119999999999998, - "loss": 3.6189, + "grad_norm": 2.365217685699463, + "learning_rate": 8.34e-05, + "loss": 3.6698, "step": 283 }, { "epoch": 0.4544, - "grad_norm": 1.1284641027450562, - "learning_rate": 0.00011159999999999999, - "loss": 3.5333, + "grad_norm": 1.7076328992843628, + "learning_rate": 8.37e-05, + "loss": 3.586, "step": 284 }, { "epoch": 0.456, - "grad_norm": 2.752796173095703, - "learning_rate": 0.000112, - "loss": 3.69, + "grad_norm": 2.5929322242736816, + "learning_rate": 8.4e-05, + "loss": 3.7324, "step": 285 }, { "epoch": 0.4576, - "grad_norm": 0.7017075419425964, - "learning_rate": 0.00011239999999999998, - "loss": 3.6554, + "grad_norm": 2.4930028915405273, + "learning_rate": 8.43e-05, + "loss": 3.7213, "step": 286 }, { "epoch": 0.4592, - "grad_norm": 3.3214011192321777, - "learning_rate": 0.00011279999999999999, - "loss": 3.6023, + "grad_norm": 0.6821871399879456, + "learning_rate": 8.459999999999998e-05, + "loss": 3.624, "step": 287 }, { "epoch": 0.4608, - "grad_norm": 4.091536045074463, - "learning_rate": 0.0001132, - "loss": 3.5451, + "grad_norm": 0.7649940848350525, + "learning_rate": 8.489999999999999e-05, + "loss": 3.5646, "step": 288 }, { "epoch": 0.4624, - "grad_norm": 4.1135358810424805, - "learning_rate": 0.00011359999999999998, - "loss": 3.6137, + "grad_norm": 0.6352983713150024, + "learning_rate": 8.519999999999998e-05, + "loss": 3.6214, "step": 289 }, { "epoch": 0.464, - "grad_norm": 4.408985137939453, - "learning_rate": 0.00011399999999999999, - "loss": 3.5501, + "grad_norm": 1.2602449655532837, + "learning_rate": 8.549999999999999e-05, + "loss": 3.5684, "step": 290 }, { "epoch": 0.4656, - "grad_norm": 1.6145490407943726, - "learning_rate": 0.0001144, - "loss": 3.662, + "grad_norm": 1.568101167678833, + "learning_rate": 8.579999999999998e-05, + "loss": 3.7024, "step": 291 }, { "epoch": 0.4672, - "grad_norm": 1.9789752960205078, - "learning_rate": 0.00011479999999999999, - "loss": 3.493, + "grad_norm": 0.8837398290634155, + "learning_rate": 8.609999999999999e-05, + "loss": 3.5396, "step": 292 }, { "epoch": 0.4688, - "grad_norm": 2.3461720943450928, - "learning_rate": 0.0001152, - "loss": 3.5499, + "grad_norm": 2.6796021461486816, + "learning_rate": 8.639999999999999e-05, + "loss": 3.6308, "step": 293 }, { "epoch": 0.4704, - "grad_norm": 3.17932391166687, - "learning_rate": 0.0001156, - "loss": 3.5999, + "grad_norm": 2.063849925994873, + "learning_rate": 8.669999999999998e-05, + "loss": 3.6546, "step": 294 }, { "epoch": 0.472, - "grad_norm": 3.1920735836029053, - "learning_rate": 0.00011599999999999999, - "loss": 3.534, + "grad_norm": 0.7611846327781677, + "learning_rate": 8.699999999999999e-05, + "loss": 3.5734, "step": 295 }, { "epoch": 0.4736, - "grad_norm": 4.00572395324707, - "learning_rate": 0.0001164, - "loss": 3.5961, + "grad_norm": 0.967029333114624, + "learning_rate": 8.729999999999998e-05, + "loss": 3.6327, "step": 296 }, { "epoch": 0.4752, - "grad_norm": 2.458010673522949, - "learning_rate": 0.00011679999999999998, - "loss": 3.6061, + "grad_norm": 1.4702003002166748, + "learning_rate": 8.759999999999999e-05, + "loss": 3.667, "step": 297 }, { "epoch": 0.4768, - "grad_norm": 2.6859042644500732, - "learning_rate": 0.00011719999999999999, - "loss": 3.7096, + "grad_norm": 1.1918103694915771, + "learning_rate": 8.789999999999998e-05, + "loss": 3.7682, "step": 298 }, { "epoch": 0.4784, - "grad_norm": 1.6110202074050903, - "learning_rate": 0.0001176, - "loss": 3.7597, + "grad_norm": 1.5398286581039429, + "learning_rate": 8.819999999999999e-05, + "loss": 3.8027, "step": 299 }, { "epoch": 0.48, - "grad_norm": 2.9628493785858154, - "learning_rate": 0.00011799999999999998, - "loss": 3.6763, - "step": 300 - }, - { - "epoch": 0.48, - "eval_cer": 0.9768884282696217, - "eval_loss": 3.7717783451080322, - "eval_runtime": 158.6864, - "eval_samples_per_second": 19.762, - "eval_steps_per_second": 1.235, - "eval_wer": 1.0, + "grad_norm": 1.8616193532943726, + "learning_rate": 8.849999999999998e-05, + "loss": 3.7379, "step": 300 }, { "epoch": 0.4816, - "grad_norm": 30.372163772583008, - "learning_rate": 0.00011839999999999999, - "loss": 4.7027, + "grad_norm": 46.695960998535156, + "learning_rate": 8.879999999999999e-05, + "loss": 5.3609, "step": 301 }, { "epoch": 0.4832, - "grad_norm": 48.826873779296875, - "learning_rate": 0.0001188, - "loss": 4.0337, + "grad_norm": 27.103071212768555, + "learning_rate": 8.909999999999998e-05, + "loss": 4.2902, "step": 302 }, { "epoch": 0.4848, - "grad_norm": 1.4793564081192017, - "learning_rate": 0.00011919999999999998, - "loss": 3.6913, + "grad_norm": 3.6255853176116943, + "learning_rate": 8.939999999999999e-05, + "loss": 3.6986, "step": 303 }, { "epoch": 0.4864, - "grad_norm": 3.596323251724243, - "learning_rate": 0.0001196, - "loss": 3.8567, + "grad_norm": 7.375110149383545, + "learning_rate": 8.969999999999998e-05, + "loss": 3.8779, "step": 304 }, { "epoch": 0.488, - "grad_norm": 2.8520336151123047, - "learning_rate": 0.00011999999999999999, - "loss": 3.6953, + "grad_norm": 1.4208751916885376, + "learning_rate": 8.999999999999999e-05, + "loss": 3.7136, "step": 305 }, { "epoch": 0.4896, - "grad_norm": 5.386791706085205, - "learning_rate": 0.00012039999999999999, - "loss": 3.6839, + "grad_norm": 3.40026593208313, + "learning_rate": 9.029999999999999e-05, + "loss": 3.6894, "step": 306 }, { "epoch": 0.4912, - "grad_norm": 2.8725674152374268, - "learning_rate": 0.0001208, - "loss": 3.7514, + "grad_norm": 2.0953195095062256, + "learning_rate": 9.059999999999999e-05, + "loss": 3.7951, "step": 307 }, { "epoch": 0.4928, - "grad_norm": 6.234537124633789, - "learning_rate": 0.00012119999999999999, - "loss": 3.6282, + "grad_norm": 6.6124043464660645, + "learning_rate": 9.089999999999999e-05, + "loss": 3.6771, "step": 308 }, { "epoch": 0.4944, - "grad_norm": 3.3072447776794434, - "learning_rate": 0.00012159999999999999, - "loss": 3.6293, + "grad_norm": 5.1553425788879395, + "learning_rate": 9.12e-05, + "loss": 3.6848, "step": 309 }, { "epoch": 0.496, - "grad_norm": 1.3689359426498413, - "learning_rate": 0.000122, - "loss": 3.5943, + "grad_norm": 3.769348382949829, + "learning_rate": 9.149999999999999e-05, + "loss": 3.6674, "step": 310 }, { "epoch": 0.4976, - "grad_norm": 1.3273299932479858, - "learning_rate": 0.0001224, - "loss": 3.5541, + "grad_norm": 2.442067861557007, + "learning_rate": 9.18e-05, + "loss": 3.6276, "step": 311 }, { "epoch": 0.4992, - "grad_norm": 3.4486923217773438, - "learning_rate": 0.00012279999999999998, - "loss": 3.5964, + "grad_norm": 0.9911152124404907, + "learning_rate": 9.209999999999999e-05, + "loss": 3.6433, "step": 312 }, { "epoch": 0.5008, - "grad_norm": 2.501321792602539, - "learning_rate": 0.00012319999999999999, - "loss": 3.5374, + "grad_norm": 0.9690512418746948, + "learning_rate": 9.24e-05, + "loss": 3.5752, "step": 313 }, { "epoch": 0.5024, - "grad_norm": 5.377409934997559, - "learning_rate": 0.0001236, - "loss": 3.5793, + "grad_norm": 3.90602707862854, + "learning_rate": 9.269999999999999e-05, + "loss": 3.6274, "step": 314 }, { "epoch": 0.504, - "grad_norm": 4.240380764007568, - "learning_rate": 0.00012399999999999998, - "loss": 3.6035, + "grad_norm": 4.008577823638916, + "learning_rate": 9.3e-05, + "loss": 3.6627, "step": 315 }, { "epoch": 0.5056, - "grad_norm": 4.2587714195251465, - "learning_rate": 0.0001244, - "loss": 3.5728, + "grad_norm": 3.5601375102996826, + "learning_rate": 9.329999999999999e-05, + "loss": 3.6389, "step": 316 }, { "epoch": 0.5072, - "grad_norm": 1.2207915782928467, - "learning_rate": 0.00012479999999999997, - "loss": 3.5864, + "grad_norm": 0.9909217357635498, + "learning_rate": 9.36e-05, + "loss": 3.6401, "step": 317 }, { "epoch": 0.5088, - "grad_norm": 1.0006273984909058, - "learning_rate": 0.00012519999999999998, - "loss": 3.5532, + "grad_norm": 1.6349003314971924, + "learning_rate": 9.389999999999999e-05, + "loss": 3.6151, "step": 318 }, { "epoch": 0.5104, - "grad_norm": 2.789165496826172, - "learning_rate": 0.0001256, - "loss": 3.5095, + "grad_norm": 1.3834391832351685, + "learning_rate": 9.419999999999999e-05, + "loss": 3.5776, "step": 319 }, { "epoch": 0.512, - "grad_norm": 1.3580331802368164, - "learning_rate": 0.00012599999999999997, - "loss": 3.6278, + "grad_norm": 0.7489935755729675, + "learning_rate": 9.449999999999999e-05, + "loss": 3.6691, "step": 320 }, { "epoch": 0.5136, - "grad_norm": 2.3157308101654053, - "learning_rate": 0.00012639999999999998, - "loss": 3.5437, + "grad_norm": 0.6968175172805786, + "learning_rate": 9.479999999999999e-05, + "loss": 3.587, "step": 321 }, { "epoch": 0.5152, - "grad_norm": 1.8099067211151123, - "learning_rate": 0.0001268, - "loss": 3.519, + "grad_norm": 1.2773737907409668, + "learning_rate": 9.51e-05, + "loss": 3.5799, "step": 322 }, { "epoch": 0.5168, - "grad_norm": 2.988795042037964, - "learning_rate": 0.00012719999999999997, - "loss": 3.574, + "grad_norm": 3.2610504627227783, + "learning_rate": 9.539999999999999e-05, + "loss": 3.6406, "step": 323 }, { "epoch": 0.5184, - "grad_norm": 1.933131456375122, - "learning_rate": 0.00012759999999999998, - "loss": 3.4918, + "grad_norm": 1.130247712135315, + "learning_rate": 9.57e-05, + "loss": 3.5499, "step": 324 }, { "epoch": 0.52, - "grad_norm": 1.9587433338165283, - "learning_rate": 0.000128, - "loss": 3.5376, + "grad_norm": 0.9368891716003418, + "learning_rate": 9.599999999999999e-05, + "loss": 3.6001, "step": 325 }, { "epoch": 0.5216, - "grad_norm": 3.22497820854187, - "learning_rate": 0.00012839999999999998, - "loss": 3.5487, + "grad_norm": 1.5623962879180908, + "learning_rate": 9.63e-05, + "loss": 3.6115, "step": 326 }, { "epoch": 0.5232, - "grad_norm": 1.520359992980957, - "learning_rate": 0.0001288, - "loss": 3.4637, + "grad_norm": 1.9312142133712769, + "learning_rate": 9.659999999999999e-05, + "loss": 3.5575, "step": 327 }, { "epoch": 0.5248, - "grad_norm": 1.7252830266952515, - "learning_rate": 0.00012919999999999997, - "loss": 3.4904, + "grad_norm": 2.629648208618164, + "learning_rate": 9.69e-05, + "loss": 3.5637, "step": 328 }, { "epoch": 0.5264, - "grad_norm": 2.588486909866333, - "learning_rate": 0.00012959999999999998, - "loss": 3.4088, + "grad_norm": 2.997903823852539, + "learning_rate": 9.719999999999999e-05, + "loss": 3.4773, "step": 329 }, { "epoch": 0.528, - "grad_norm": 2.2423901557922363, - "learning_rate": 0.00013, - "loss": 3.4682, + "grad_norm": 2.4038596153259277, + "learning_rate": 9.75e-05, + "loss": 3.5326, "step": 330 }, { "epoch": 0.5296, - "grad_norm": 2.314682960510254, - "learning_rate": 0.00013039999999999997, - "loss": 3.5424, + "grad_norm": 1.7766273021697998, + "learning_rate": 9.779999999999999e-05, + "loss": 3.5888, "step": 331 }, { "epoch": 0.5312, - "grad_norm": 3.1224074363708496, - "learning_rate": 0.00013079999999999998, - "loss": 3.5728, + "grad_norm": 2.1525938510894775, + "learning_rate": 9.81e-05, + "loss": 3.6005, "step": 332 }, { "epoch": 0.5328, - "grad_norm": 1.9981073141098022, - "learning_rate": 0.0001312, - "loss": 3.4646, + "grad_norm": 1.953926920890808, + "learning_rate": 9.839999999999999e-05, + "loss": 3.5273, "step": 333 }, { "epoch": 0.5344, - "grad_norm": 0.9668769836425781, - "learning_rate": 0.00013159999999999997, - "loss": 3.5032, + "grad_norm": 1.1791383028030396, + "learning_rate": 9.87e-05, + "loss": 3.5676, "step": 334 }, { "epoch": 0.536, - "grad_norm": 2.059154987335205, - "learning_rate": 0.00013199999999999998, - "loss": 3.5411, + "grad_norm": 1.4245538711547852, + "learning_rate": 9.9e-05, + "loss": 3.606, "step": 335 }, { "epoch": 0.5376, - "grad_norm": 1.0898948907852173, - "learning_rate": 0.0001324, - "loss": 3.4957, + "grad_norm": 0.9623693823814392, + "learning_rate": 9.93e-05, + "loss": 3.5484, "step": 336 }, { "epoch": 0.5392, - "grad_norm": 2.375235080718994, - "learning_rate": 0.00013279999999999998, - "loss": 3.5179, + "grad_norm": 2.230302333831787, + "learning_rate": 9.96e-05, + "loss": 3.5715, "step": 337 }, { "epoch": 0.5408, - "grad_norm": 2.4042155742645264, - "learning_rate": 0.00013319999999999999, - "loss": 3.5158, + "grad_norm": 2.5997812747955322, + "learning_rate": 9.99e-05, + "loss": 3.5682, "step": 338 }, { "epoch": 0.5424, - "grad_norm": 2.0467703342437744, - "learning_rate": 0.0001336, - "loss": 3.5472, + "grad_norm": 0.998353123664856, + "learning_rate": 0.0001002, + "loss": 3.6058, "step": 339 }, { "epoch": 0.544, - "grad_norm": 1.2783809900283813, - "learning_rate": 0.00013399999999999998, - "loss": 3.4051, + "grad_norm": 0.9916491508483887, + "learning_rate": 0.0001005, + "loss": 3.4561, "step": 340 }, { "epoch": 0.5456, - "grad_norm": 1.2459137439727783, - "learning_rate": 0.0001344, - "loss": 3.383, + "grad_norm": 1.0415515899658203, + "learning_rate": 0.0001008, + "loss": 3.4528, "step": 341 }, { "epoch": 0.5472, - "grad_norm": 11.236316680908203, - "learning_rate": 0.00013479999999999997, - "loss": 3.8726, + "grad_norm": 10.178709983825684, + "learning_rate": 0.0001011, + "loss": 3.8938, "step": 342 }, { "epoch": 0.5488, - "grad_norm": 5.548703670501709, - "learning_rate": 0.00013519999999999998, - "loss": 3.5908, + "grad_norm": 8.032876014709473, + "learning_rate": 0.0001014, + "loss": 3.7116, "step": 343 }, { "epoch": 0.5504, - "grad_norm": 2.2428252696990967, - "learning_rate": 0.0001356, - "loss": 3.494, + "grad_norm": 4.835402488708496, + "learning_rate": 0.00010169999999999999, + "loss": 3.5765, "step": 344 }, { "epoch": 0.552, - "grad_norm": 1.2952911853790283, - "learning_rate": 0.00013599999999999997, - "loss": 3.5514, + "grad_norm": 4.175968170166016, + "learning_rate": 0.000102, + "loss": 3.6138, "step": 345 }, { "epoch": 0.5536, - "grad_norm": 2.755897283554077, - "learning_rate": 0.00013639999999999998, - "loss": 3.4938, + "grad_norm": 0.8417240977287292, + "learning_rate": 0.00010229999999999999, + "loss": 3.539, "step": 346 }, { "epoch": 0.5552, - "grad_norm": 4.549272537231445, - "learning_rate": 0.0001368, - "loss": 3.5529, + "grad_norm": 2.533202648162842, + "learning_rate": 0.0001026, + "loss": 3.5745, "step": 347 }, { "epoch": 0.5568, - "grad_norm": 5.1981048583984375, - "learning_rate": 0.00013719999999999997, - "loss": 3.4982, + "grad_norm": 3.826181650161743, + "learning_rate": 0.0001029, + "loss": 3.4985, "step": 348 }, { "epoch": 0.5584, - "grad_norm": 1.9575941562652588, - "learning_rate": 0.00013759999999999998, - "loss": 3.5279, + "grad_norm": 1.0143622159957886, + "learning_rate": 0.00010319999999999999, + "loss": 3.559, "step": 349 }, { "epoch": 0.56, - "grad_norm": 3.6745364665985107, - "learning_rate": 0.000138, - "loss": 3.628, + "grad_norm": 3.292059898376465, + "learning_rate": 0.00010349999999999998, + "loss": 3.6784, "step": 350 }, { "epoch": 0.5616, - "grad_norm": 15.095296859741211, - "learning_rate": 0.00013839999999999998, - "loss": 3.8532, + "grad_norm": 14.462305068969727, + "learning_rate": 0.00010379999999999999, + "loss": 3.8307, "step": 351 }, { "epoch": 0.5632, - "grad_norm": 16.068857192993164, - "learning_rate": 0.00013879999999999999, - "loss": 3.9237, + "grad_norm": 15.524165153503418, + "learning_rate": 0.00010409999999999998, + "loss": 3.8937, "step": 352 }, { "epoch": 0.5648, - "grad_norm": 16.43937873840332, - "learning_rate": 0.0001392, - "loss": 3.7986, + "grad_norm": 8.758255004882812, + "learning_rate": 0.00010439999999999999, + "loss": 3.7877, "step": 353 }, { "epoch": 0.5664, - "grad_norm": 16.25771141052246, - "learning_rate": 0.00013959999999999998, - "loss": 4.018, + "grad_norm": 12.151189804077148, + "learning_rate": 0.00010469999999999998, + "loss": 3.8868, "step": 354 }, { "epoch": 0.568, - "grad_norm": 20.026491165161133, - "learning_rate": 0.00014, - "loss": 4.2146, + "grad_norm": 15.403145790100098, + "learning_rate": 0.00010499999999999999, + "loss": 4.0839, "step": 355 }, { "epoch": 0.5696, - "grad_norm": 5.759159564971924, - "learning_rate": 0.0001404, - "loss": 3.5998, + "grad_norm": 5.038736820220947, + "learning_rate": 0.00010529999999999998, + "loss": 3.5888, "step": 356 }, { "epoch": 0.5712, - "grad_norm": 7.332910537719727, - "learning_rate": 0.00014079999999999998, - "loss": 3.7159, + "grad_norm": 2.910677909851074, + "learning_rate": 0.00010559999999999998, + "loss": 3.7264, "step": 357 }, { "epoch": 0.5728, - "grad_norm": 1.8525800704956055, - "learning_rate": 0.0001412, - "loss": 3.6049, + "grad_norm": 2.463589668273926, + "learning_rate": 0.00010589999999999999, + "loss": 3.6665, "step": 358 }, { "epoch": 0.5744, - "grad_norm": 2.778653144836426, - "learning_rate": 0.00014159999999999997, - "loss": 3.5283, + "grad_norm": 4.513500690460205, + "learning_rate": 0.00010619999999999998, + "loss": 3.6046, "step": 359 }, { "epoch": 0.576, - "grad_norm": 1.6119922399520874, - "learning_rate": 0.00014199999999999998, - "loss": 3.5938, + "grad_norm": 2.1371867656707764, + "learning_rate": 0.00010649999999999999, + "loss": 3.6617, "step": 360 }, { "epoch": 0.5776, - "grad_norm": 2.705361843109131, - "learning_rate": 0.0001424, - "loss": 3.568, + "grad_norm": 2.018497943878174, + "learning_rate": 0.00010679999999999998, + "loss": 3.6356, "step": 361 }, { "epoch": 0.5792, - "grad_norm": 4.27663516998291, - "learning_rate": 0.00014279999999999997, - "loss": 3.5577, + "grad_norm": 2.9791243076324463, + "learning_rate": 0.00010709999999999999, + "loss": 3.5828, "step": 362 }, { "epoch": 0.5808, - "grad_norm": 3.09662127494812, - "learning_rate": 0.00014319999999999998, - "loss": 3.5939, + "grad_norm": 0.9730789065361023, + "learning_rate": 0.00010739999999999998, + "loss": 3.6317, "step": 363 }, { "epoch": 0.5824, - "grad_norm": 2.659717559814453, - "learning_rate": 0.0001436, - "loss": 3.5241, + "grad_norm": 0.9338732957839966, + "learning_rate": 0.00010769999999999999, + "loss": 3.5758, "step": 364 }, { "epoch": 0.584, - "grad_norm": 4.375011444091797, - "learning_rate": 0.00014399999999999998, - "loss": 3.4024, + "grad_norm": 2.041814088821411, + "learning_rate": 0.00010799999999999998, + "loss": 3.4225, "step": 365 }, { "epoch": 0.5856, - "grad_norm": 2.0820505619049072, - "learning_rate": 0.00014439999999999999, - "loss": 3.4192, + "grad_norm": 0.7372158765792847, + "learning_rate": 0.00010829999999999999, + "loss": 3.4787, "step": 366 }, { "epoch": 0.5872, - "grad_norm": 3.44874906539917, - "learning_rate": 0.0001448, - "loss": 3.4785, + "grad_norm": 4.640085220336914, + "learning_rate": 0.00010859999999999998, + "loss": 3.566, "step": 367 }, { "epoch": 0.5888, - "grad_norm": 20.90206527709961, - "learning_rate": 0.00014519999999999998, - "loss": 3.7141, + "grad_norm": 10.43756103515625, + "learning_rate": 0.00010889999999999999, + "loss": 3.7959, "step": 368 }, { "epoch": 0.5904, - "grad_norm": 12.979022979736328, - "learning_rate": 0.0001456, - "loss": 3.9909, + "grad_norm": 12.211026191711426, + "learning_rate": 0.00010919999999999998, + "loss": 4.0038, "step": 369 }, { "epoch": 0.592, - "grad_norm": 4.532771110534668, - "learning_rate": 0.000146, - "loss": 3.5721, + "grad_norm": 2.702982187271118, + "learning_rate": 0.00010949999999999999, + "loss": 3.5885, "step": 370 }, { "epoch": 0.5936, - "grad_norm": 6.729887962341309, - "learning_rate": 0.00014639999999999998, - "loss": 3.5496, + "grad_norm": 3.37894868850708, + "learning_rate": 0.00010979999999999999, + "loss": 3.5248, "step": 371 }, { "epoch": 0.5952, - "grad_norm": 1.9874441623687744, - "learning_rate": 0.0001468, - "loss": 3.4444, + "grad_norm": 1.3367834091186523, + "learning_rate": 0.00011009999999999999, + "loss": 3.5093, "step": 372 }, { "epoch": 0.5968, - "grad_norm": 1.2600610256195068, - "learning_rate": 0.00014719999999999997, - "loss": 3.4629, + "grad_norm": 1.7772578001022339, + "learning_rate": 0.00011039999999999999, + "loss": 3.5247, "step": 373 }, { "epoch": 0.5984, - "grad_norm": 1.6816917657852173, - "learning_rate": 0.00014759999999999998, - "loss": 3.5387, + "grad_norm": 1.5332499742507935, + "learning_rate": 0.0001107, + "loss": 3.6032, "step": 374 }, { "epoch": 0.6, - "grad_norm": 3.2985401153564453, - "learning_rate": 0.000148, - "loss": 3.4601, + "grad_norm": 2.7413148880004883, + "learning_rate": 0.00011099999999999999, + "loss": 3.5101, "step": 375 }, { "epoch": 0.6016, - "grad_norm": 2.27606463432312, - "learning_rate": 0.00014839999999999998, - "loss": 3.4595, + "grad_norm": 0.6245549917221069, + "learning_rate": 0.0001113, + "loss": 3.5173, "step": 376 }, { "epoch": 0.6032, - "grad_norm": 2.2081143856048584, - "learning_rate": 0.00014879999999999998, - "loss": 3.5109, + "grad_norm": 0.9614344239234924, + "learning_rate": 0.00011159999999999999, + "loss": 3.5536, "step": 377 }, { "epoch": 0.6048, - "grad_norm": 3.556088924407959, - "learning_rate": 0.0001492, - "loss": 3.3938, + "grad_norm": 1.8567408323287964, + "learning_rate": 0.0001119, + "loss": 3.4189, "step": 378 }, { "epoch": 0.6064, - "grad_norm": 1.6859062910079956, - "learning_rate": 0.00014959999999999998, - "loss": 3.4073, + "grad_norm": 1.1231341361999512, + "learning_rate": 0.00011219999999999999, + "loss": 3.4579, "step": 379 }, { "epoch": 0.608, - "grad_norm": 1.718527913093567, - "learning_rate": 0.00015, - "loss": 3.4326, + "grad_norm": 2.492093086242676, + "learning_rate": 0.0001125, + "loss": 3.4925, "step": 380 }, { "epoch": 0.6096, - "grad_norm": 7.6954522132873535, - "learning_rate": 0.00015039999999999997, - "loss": 3.4145, + "grad_norm": 8.354537963867188, + "learning_rate": 0.00011279999999999999, + "loss": 3.4855, "step": 381 }, { "epoch": 0.6112, - "grad_norm": 3.9803624153137207, - "learning_rate": 0.0001508, - "loss": 3.5334, + "grad_norm": 3.4800968170166016, + "learning_rate": 0.00011309999999999998, + "loss": 3.5702, "step": 382 }, { "epoch": 0.6128, - "grad_norm": 4.626428127288818, - "learning_rate": 0.0001512, - "loss": 3.5431, + "grad_norm": 2.3893015384674072, + "learning_rate": 0.00011339999999999999, + "loss": 3.5403, "step": 383 }, { "epoch": 0.6144, - "grad_norm": 1.796976923942566, - "learning_rate": 0.00015159999999999997, - "loss": 3.3876, + "grad_norm": 1.4556490182876587, + "learning_rate": 0.00011369999999999999, + "loss": 3.4286, "step": 384 }, { "epoch": 0.616, - "grad_norm": 1.869494080543518, - "learning_rate": 0.000152, - "loss": 3.4063, + "grad_norm": 0.9344227910041809, + "learning_rate": 0.00011399999999999999, + "loss": 3.4456, "step": 385 }, { "epoch": 0.6176, - "grad_norm": 1.6276867389678955, - "learning_rate": 0.0001524, - "loss": 3.435, + "grad_norm": 1.8487316370010376, + "learning_rate": 0.00011429999999999999, + "loss": 3.5025, "step": 386 }, { "epoch": 0.6192, - "grad_norm": 1.9487453699111938, - "learning_rate": 0.00015279999999999997, - "loss": 3.4772, + "grad_norm": 2.2494537830352783, + "learning_rate": 0.0001146, + "loss": 3.5277, "step": 387 }, { "epoch": 0.6208, - "grad_norm": 3.8081955909729004, - "learning_rate": 0.0001532, - "loss": 3.4428, + "grad_norm": 1.3514158725738525, + "learning_rate": 0.00011489999999999999, + "loss": 3.4763, "step": 388 }, { "epoch": 0.6224, - "grad_norm": 1.7065116167068481, - "learning_rate": 0.0001536, - "loss": 3.4732, + "grad_norm": 1.518999695777893, + "learning_rate": 0.0001152, + "loss": 3.4946, "step": 389 }, { "epoch": 0.624, - "grad_norm": 4.1957292556762695, - "learning_rate": 0.00015399999999999998, - "loss": 3.4299, + "grad_norm": 1.8156572580337524, + "learning_rate": 0.00011549999999999999, + "loss": 3.4156, "step": 390 }, { "epoch": 0.6256, - "grad_norm": 2.8493101596832275, - "learning_rate": 0.0001544, - "loss": 3.3279, + "grad_norm": 1.6927382946014404, + "learning_rate": 0.0001158, + "loss": 3.3432, "step": 391 }, { "epoch": 0.6272, - "grad_norm": 1.7241597175598145, - "learning_rate": 0.0001548, - "loss": 3.3818, + "grad_norm": 1.9768507480621338, + "learning_rate": 0.00011609999999999999, + "loss": 3.4614, "step": 392 }, { "epoch": 0.6288, - "grad_norm": 1.3103301525115967, - "learning_rate": 0.00015519999999999998, - "loss": 3.4189, + "grad_norm": 2.5529818534851074, + "learning_rate": 0.0001164, + "loss": 3.4839, "step": 393 }, { "epoch": 0.6304, - "grad_norm": 1.0086480379104614, - "learning_rate": 0.00015560000000000001, - "loss": 3.3741, + "grad_norm": 1.4785748720169067, + "learning_rate": 0.00011669999999999999, + "loss": 3.4267, "step": 394 }, { "epoch": 0.632, - "grad_norm": 2.4616684913635254, - "learning_rate": 0.000156, - "loss": 3.3846, + "grad_norm": 0.9147313237190247, + "learning_rate": 0.000117, + "loss": 3.4296, "step": 395 }, { "epoch": 0.6336, - "grad_norm": 4.7007527351379395, - "learning_rate": 0.00015639999999999998, - "loss": 3.4063, + "grad_norm": 1.8749762773513794, + "learning_rate": 0.00011729999999999999, + "loss": 3.3974, "step": 396 }, { "epoch": 0.6352, - "grad_norm": 2.0122146606445312, - "learning_rate": 0.00015679999999999996, - "loss": 3.4581, + "grad_norm": 2.243661403656006, + "learning_rate": 0.0001176, + "loss": 3.4929, "step": 397 }, { "epoch": 0.6368, - "grad_norm": 1.6844937801361084, - "learning_rate": 0.0001572, - "loss": 3.4897, + "grad_norm": 3.580742120742798, + "learning_rate": 0.00011789999999999999, + "loss": 3.5545, "step": 398 }, { "epoch": 0.6384, - "grad_norm": 2.5921356678009033, - "learning_rate": 0.00015759999999999998, - "loss": 3.4199, + "grad_norm": 2.198838472366333, + "learning_rate": 0.0001182, + "loss": 3.4712, "step": 399 }, { "epoch": 0.64, "grad_norm": NaN, - "learning_rate": 0.00015759999999999998, - "loss": 3.1272, - "step": 400 - }, - { - "epoch": 0.64, - "eval_cer": 0.982628452983617, - "eval_loss": 3.464660882949829, - "eval_runtime": 157.7514, - "eval_samples_per_second": 19.879, - "eval_steps_per_second": 1.242, - "eval_wer": 1.0, + "learning_rate": 0.0001182, + "loss": 3.1738, "step": 400 }, { "epoch": 0.6416, - "grad_norm": 14.215404510498047, - "learning_rate": 0.00015799999999999996, - "loss": 3.8237, + "grad_norm": 16.711830139160156, + "learning_rate": 0.0001185, + "loss": 3.9305, "step": 401 }, { "epoch": 0.6432, - "grad_norm": 11.251092910766602, - "learning_rate": 0.0001584, - "loss": 3.6592, + "grad_norm": 15.2301607131958, + "learning_rate": 0.0001188, + "loss": 3.812, "step": 402 }, { "epoch": 0.6448, - "grad_norm": 23.282085418701172, - "learning_rate": 0.00015879999999999998, - "loss": 4.4136, + "grad_norm": 31.35503387451172, + "learning_rate": 0.0001191, + "loss": 4.7454, "step": 403 }, { "epoch": 0.6464, - "grad_norm": 4.128000259399414, - "learning_rate": 0.00015919999999999997, - "loss": 3.5762, + "grad_norm": 10.81604290008545, + "learning_rate": 0.0001194, + "loss": 3.7583, "step": 404 }, { "epoch": 0.648, - "grad_norm": 3.7951207160949707, - "learning_rate": 0.0001596, - "loss": 3.6869, + "grad_norm": 11.12818717956543, + "learning_rate": 0.0001197, + "loss": 3.8342, "step": 405 }, { "epoch": 0.6496, - "grad_norm": 2.6995790004730225, - "learning_rate": 0.00015999999999999999, - "loss": 3.6114, + "grad_norm": 5.6629767417907715, + "learning_rate": 0.00011999999999999999, + "loss": 3.707, "step": 406 }, { "epoch": 0.6512, - "grad_norm": 8.775090217590332, - "learning_rate": 0.00016039999999999997, - "loss": 3.5307, + "grad_norm": 5.830606937408447, + "learning_rate": 0.0001203, + "loss": 3.4846, "step": 407 }, { "epoch": 0.6528, - "grad_norm": 7.154327392578125, - "learning_rate": 0.0001608, - "loss": 3.568, + "grad_norm": 4.803534030914307, + "learning_rate": 0.00012059999999999999, + "loss": 3.5607, "step": 408 }, { "epoch": 0.6544, - "grad_norm": 5.456339359283447, - "learning_rate": 0.0001612, - "loss": 3.5659, + "grad_norm": 4.736687660217285, + "learning_rate": 0.0001209, + "loss": 3.5705, "step": 409 }, { "epoch": 0.656, - "grad_norm": 5.459187030792236, - "learning_rate": 0.00016159999999999997, - "loss": 3.5469, + "grad_norm": 5.9123101234436035, + "learning_rate": 0.00012119999999999999, + "loss": 3.5491, "step": 410 }, { "epoch": 0.6576, - "grad_norm": 4.668329238891602, - "learning_rate": 0.000162, - "loss": 3.5329, + "grad_norm": 4.854342460632324, + "learning_rate": 0.0001215, + "loss": 3.5639, "step": 411 }, { "epoch": 0.6592, - "grad_norm": 2.4972665309906006, - "learning_rate": 0.0001624, - "loss": 3.4671, + "grad_norm": 2.654384136199951, + "learning_rate": 0.00012179999999999999, + "loss": 3.519, "step": 412 }, { "epoch": 0.6608, - "grad_norm": 2.261772394180298, - "learning_rate": 0.00016279999999999997, - "loss": 3.4179, + "grad_norm": 2.97538161277771, + "learning_rate": 0.00012209999999999999, + "loss": 3.4907, "step": 413 }, { "epoch": 0.6624, - "grad_norm": 1.6107017993927002, - "learning_rate": 0.0001632, - "loss": 3.411, + "grad_norm": 1.8297629356384277, + "learning_rate": 0.0001224, + "loss": 3.4962, "step": 414 }, { "epoch": 0.664, - "grad_norm": 3.2576379776000977, - "learning_rate": 0.0001636, - "loss": 3.4519, + "grad_norm": 3.1731834411621094, + "learning_rate": 0.00012269999999999997, + "loss": 3.5136, "step": 415 }, { "epoch": 0.6656, - "grad_norm": 6.433845520019531, - "learning_rate": 0.00016399999999999997, - "loss": 3.5618, + "grad_norm": 7.233218669891357, + "learning_rate": 0.00012299999999999998, + "loss": 3.6006, "step": 416 }, { "epoch": 0.6672, - "grad_norm": 3.0946199893951416, - "learning_rate": 0.0001644, - "loss": 3.4222, + "grad_norm": 3.845116376876831, + "learning_rate": 0.0001233, + "loss": 3.4742, "step": 417 }, { "epoch": 0.6688, - "grad_norm": 4.258899688720703, - "learning_rate": 0.0001648, - "loss": 3.5111, + "grad_norm": 5.4477152824401855, + "learning_rate": 0.0001236, + "loss": 3.5832, "step": 418 }, { "epoch": 0.6704, - "grad_norm": 4.503801345825195, - "learning_rate": 0.00016519999999999998, - "loss": 3.4504, + "grad_norm": 5.551750659942627, + "learning_rate": 0.00012389999999999998, + "loss": 3.5168, "step": 419 }, { "epoch": 0.672, - "grad_norm": 0.8702706694602966, - "learning_rate": 0.0001656, - "loss": 3.3714, + "grad_norm": 1.164810299873352, + "learning_rate": 0.00012419999999999998, + "loss": 3.4397, "step": 420 }, { "epoch": 0.6736, - "grad_norm": 1.4191309213638306, - "learning_rate": 0.000166, - "loss": 3.3781, + "grad_norm": 1.6991772651672363, + "learning_rate": 0.0001245, + "loss": 3.4452, "step": 421 }, { "epoch": 0.6752, - "grad_norm": 1.344421148300171, - "learning_rate": 0.00016639999999999998, - "loss": 3.3594, + "grad_norm": 1.1151283979415894, + "learning_rate": 0.00012479999999999997, + "loss": 3.4537, "step": 422 }, { "epoch": 0.6768, - "grad_norm": 1.4346777200698853, - "learning_rate": 0.0001668, - "loss": 3.3227, + "grad_norm": 1.9064522981643677, + "learning_rate": 0.00012509999999999998, + "loss": 3.4217, "step": 423 }, { "epoch": 0.6784, - "grad_norm": 3.5542514324188232, - "learning_rate": 0.0001672, - "loss": 3.3607, + "grad_norm": 4.329729080200195, + "learning_rate": 0.00012539999999999999, + "loss": 3.4353, "step": 424 }, { "epoch": 0.68, - "grad_norm": 2.4777677059173584, - "learning_rate": 0.00016759999999999998, - "loss": 3.3752, + "grad_norm": 3.2471909523010254, + "learning_rate": 0.0001257, + "loss": 3.4603, "step": 425 }, { "epoch": 0.6816, - "grad_norm": 1.9810409545898438, - "learning_rate": 0.000168, - "loss": 3.3176, + "grad_norm": 2.613004207611084, + "learning_rate": 0.00012599999999999997, + "loss": 3.4278, "step": 426 }, { "epoch": 0.6832, - "grad_norm": 1.3213425874710083, - "learning_rate": 0.0001684, - "loss": 3.3277, + "grad_norm": 0.7707146406173706, + "learning_rate": 0.00012629999999999998, + "loss": 3.4114, "step": 427 }, { "epoch": 0.6848, - "grad_norm": 1.7977039813995361, - "learning_rate": 0.00016879999999999998, - "loss": 3.2458, + "grad_norm": 1.4232515096664429, + "learning_rate": 0.0001266, + "loss": 3.3683, "step": 428 }, { "epoch": 0.6864, - "grad_norm": 1.8572102785110474, - "learning_rate": 0.00016919999999999997, - "loss": 3.2321, + "grad_norm": 2.6852142810821533, + "learning_rate": 0.0001269, + "loss": 3.3575, "step": 429 }, { "epoch": 0.688, - "grad_norm": 1.9749292135238647, - "learning_rate": 0.0001696, - "loss": 3.3227, + "grad_norm": 2.602998733520508, + "learning_rate": 0.00012719999999999997, + "loss": 3.4309, "step": 430 }, { "epoch": 0.6896, - "grad_norm": 1.989938735961914, - "learning_rate": 0.00016999999999999999, - "loss": 3.2361, + "grad_norm": 2.8761863708496094, + "learning_rate": 0.00012749999999999998, + "loss": 3.3435, "step": 431 }, { "epoch": 0.6912, - "grad_norm": 1.1900919675827026, - "learning_rate": 0.00017039999999999997, - "loss": 3.1931, + "grad_norm": 0.779827356338501, + "learning_rate": 0.0001278, + "loss": 3.3467, "step": 432 }, { "epoch": 0.6928, - "grad_norm": 1.588123083114624, - "learning_rate": 0.0001708, - "loss": 3.1711, + "grad_norm": 1.6401891708374023, + "learning_rate": 0.0001281, + "loss": 3.3293, "step": 433 }, { "epoch": 0.6944, - "grad_norm": 2.484400749206543, - "learning_rate": 0.0001712, - "loss": 3.3573, + "grad_norm": 1.1276079416275024, + "learning_rate": 0.00012839999999999998, + "loss": 3.4566, "step": 434 }, { "epoch": 0.696, - "grad_norm": 1.7051153182983398, - "learning_rate": 0.00017159999999999997, - "loss": 3.2167, + "grad_norm": 1.1479798555374146, + "learning_rate": 0.00012869999999999998, + "loss": 3.3803, "step": 435 }, { "epoch": 0.6976, - "grad_norm": 2.2069146633148193, - "learning_rate": 0.000172, - "loss": 3.393, + "grad_norm": 1.1874502897262573, + "learning_rate": 0.000129, + "loss": 3.4799, "step": 436 }, { "epoch": 0.6992, - "grad_norm": 1.0689940452575684, - "learning_rate": 0.0001724, - "loss": 3.3359, + "grad_norm": 1.1631622314453125, + "learning_rate": 0.0001293, + "loss": 3.4652, "step": 437 }, { "epoch": 0.7008, - "grad_norm": 2.242213487625122, - "learning_rate": 0.00017279999999999997, - "loss": 3.2326, + "grad_norm": 1.8987886905670166, + "learning_rate": 0.00012959999999999998, + "loss": 3.3634, "step": 438 }, { "epoch": 0.7024, - "grad_norm": 1.2001672983169556, - "learning_rate": 0.00017319999999999998, - "loss": 3.1571, + "grad_norm": 1.0175995826721191, + "learning_rate": 0.00012989999999999999, + "loss": 3.3244, "step": 439 }, { "epoch": 0.704, - "grad_norm": 1.493927001953125, - "learning_rate": 0.0001736, - "loss": 3.281, + "grad_norm": 2.3897902965545654, + "learning_rate": 0.0001302, + "loss": 3.4436, "step": 440 }, { "epoch": 0.7056, - "grad_norm": 1.6951879262924194, - "learning_rate": 0.00017399999999999997, - "loss": 3.2278, + "grad_norm": 2.532571792602539, + "learning_rate": 0.0001305, + "loss": 3.4136, "step": 441 }, { "epoch": 0.7072, - "grad_norm": 1.2256603240966797, - "learning_rate": 0.00017439999999999998, - "loss": 3.2015, + "grad_norm": 1.2601231336593628, + "learning_rate": 0.00013079999999999998, + "loss": 3.3457, "step": 442 }, { "epoch": 0.7088, - "grad_norm": 2.6825082302093506, - "learning_rate": 0.0001748, - "loss": 3.3138, + "grad_norm": 2.287309408187866, + "learning_rate": 0.0001311, + "loss": 3.4499, "step": 443 }, { "epoch": 0.7104, - "grad_norm": 2.056058645248413, - "learning_rate": 0.00017519999999999998, - "loss": 3.1577, + "grad_norm": 2.2662341594696045, + "learning_rate": 0.0001314, + "loss": 3.3269, "step": 444 }, { "epoch": 0.712, - "grad_norm": 2.27656888961792, - "learning_rate": 0.00017559999999999999, - "loss": 3.2605, + "grad_norm": 3.020737886428833, + "learning_rate": 0.00013169999999999998, + "loss": 3.4164, "step": 445 }, { "epoch": 0.7136, - "grad_norm": 1.663690209388733, - "learning_rate": 0.000176, - "loss": 3.1057, + "grad_norm": 2.3452391624450684, + "learning_rate": 0.00013199999999999998, + "loss": 3.3253, "step": 446 }, { "epoch": 0.7152, - "grad_norm": 1.4355462789535522, - "learning_rate": 0.00017639999999999998, - "loss": 3.3227, + "grad_norm": 2.1812047958374023, + "learning_rate": 0.0001323, + "loss": 3.462, "step": 447 }, { "epoch": 0.7168, - "grad_norm": 1.0722546577453613, - "learning_rate": 0.0001768, - "loss": 3.1746, + "grad_norm": 1.294961929321289, + "learning_rate": 0.0001326, + "loss": 3.3657, "step": 448 }, { "epoch": 0.7184, - "grad_norm": 3.61966872215271, - "learning_rate": 0.0001772, - "loss": 3.2868, + "grad_norm": 4.165415287017822, + "learning_rate": 0.00013289999999999998, + "loss": 3.4706, "step": 449 }, { "epoch": 0.72, - "grad_norm": 3.237584114074707, - "learning_rate": 0.00017759999999999998, - "loss": 3.5749, + "grad_norm": 3.625424861907959, + "learning_rate": 0.00013319999999999999, + "loss": 3.6857, "step": 450 }, { "epoch": 0.7216, - "grad_norm": 42.27267074584961, - "learning_rate": 0.000178, - "loss": 5.2106, + "grad_norm": 42.7130012512207, + "learning_rate": 0.0001335, + "loss": 5.3364, "step": 451 }, { "epoch": 0.7232, - "grad_norm": 17.318639755249023, - "learning_rate": 0.0001784, - "loss": 3.8636, + "grad_norm": 18.547584533691406, + "learning_rate": 0.0001338, + "loss": 4.0571, "step": 452 }, { "epoch": 0.7248, - "grad_norm": 8.741928100585938, - "learning_rate": 0.00017879999999999998, - "loss": 3.4172, + "grad_norm": 11.535635948181152, + "learning_rate": 0.00013409999999999998, + "loss": 3.7247, "step": 453 }, { "epoch": 0.7264, - "grad_norm": 5.296689510345459, - "learning_rate": 0.0001792, - "loss": 3.4227, + "grad_norm": 6.823582649230957, + "learning_rate": 0.0001344, + "loss": 3.6886, "step": 454 }, { "epoch": 0.728, - "grad_norm": 1.3165295124053955, - "learning_rate": 0.0001796, - "loss": 3.2189, + "grad_norm": 2.5420644283294678, + "learning_rate": 0.0001347, + "loss": 3.5274, "step": 455 }, { "epoch": 0.7296, - "grad_norm": 1.799149990081787, - "learning_rate": 0.00017999999999999998, - "loss": 3.1344, + "grad_norm": 0.9652547240257263, + "learning_rate": 0.000135, + "loss": 3.4573, "step": 456 }, { "epoch": 0.7312, - "grad_norm": 1.491166591644287, - "learning_rate": 0.0001804, - "loss": 3.1914, + "grad_norm": 1.7442926168441772, + "learning_rate": 0.00013529999999999998, + "loss": 3.4887, "step": 457 }, { "epoch": 0.7328, - "grad_norm": 3.692824363708496, - "learning_rate": 0.00018079999999999998, - "loss": 3.1351, + "grad_norm": 5.671680927276611, + "learning_rate": 0.0001356, + "loss": 3.4587, "step": 458 }, { "epoch": 0.7344, - "grad_norm": 2.3572800159454346, - "learning_rate": 0.00018119999999999999, - "loss": 3.0732, + "grad_norm": 5.160175800323486, + "learning_rate": 0.0001359, + "loss": 3.4867, "step": 459 }, { "epoch": 0.736, - "grad_norm": 1.9129135608673096, - "learning_rate": 0.00018159999999999997, - "loss": 3.1997, + "grad_norm": 4.00103235244751, + "learning_rate": 0.0001362, + "loss": 3.5526, "step": 460 }, { "epoch": 0.7376, - "grad_norm": 1.8856866359710693, - "learning_rate": 0.00018199999999999998, - "loss": 3.0315, + "grad_norm": 4.9756975173950195, + "learning_rate": 0.00013649999999999998, + "loss": 3.4155, "step": 461 }, { "epoch": 0.7392, - "grad_norm": 2.009660243988037, - "learning_rate": 0.0001824, - "loss": 3.0865, + "grad_norm": 2.167109489440918, + "learning_rate": 0.0001368, + "loss": 3.4686, "step": 462 }, { "epoch": 0.7408, - "grad_norm": 2.5454518795013428, - "learning_rate": 0.00018279999999999997, - "loss": 2.9954, + "grad_norm": 1.0356065034866333, + "learning_rate": 0.0001371, + "loss": 3.4003, "step": 463 }, { "epoch": 0.7424, - "grad_norm": 0.8470408916473389, - "learning_rate": 0.00018319999999999998, - "loss": 2.9569, + "grad_norm": 1.7771834135055542, + "learning_rate": 0.0001374, + "loss": 3.3392, "step": 464 }, { "epoch": 0.744, - "grad_norm": 1.2759836912155151, - "learning_rate": 0.0001836, - "loss": 2.9149, + "grad_norm": 1.7832553386688232, + "learning_rate": 0.00013769999999999999, + "loss": 3.3132, "step": 465 }, { "epoch": 0.7456, - "grad_norm": 0.9253954291343689, - "learning_rate": 0.00018399999999999997, - "loss": 2.8104, + "grad_norm": 2.8095998764038086, + "learning_rate": 0.000138, + "loss": 3.2934, "step": 466 }, { "epoch": 0.7472, - "grad_norm": 1.043556809425354, - "learning_rate": 0.00018439999999999998, - "loss": 2.8224, + "grad_norm": 2.455730438232422, + "learning_rate": 0.0001383, + "loss": 3.3035, "step": 467 }, { "epoch": 0.7488, - "grad_norm": 2.264782190322876, - "learning_rate": 0.0001848, - "loss": 2.7933, + "grad_norm": 2.5177114009857178, + "learning_rate": 0.0001386, + "loss": 3.3005, "step": 468 }, { "epoch": 0.7504, - "grad_norm": 1.1183011531829834, - "learning_rate": 0.00018519999999999998, - "loss": 2.8228, + "grad_norm": 0.8732132315635681, + "learning_rate": 0.0001389, + "loss": 3.2456, "step": 469 }, { "epoch": 0.752, - "grad_norm": 0.8123898506164551, - "learning_rate": 0.00018559999999999998, - "loss": 2.7909, + "grad_norm": 1.003593921661377, + "learning_rate": 0.0001392, + "loss": 3.2672, "step": 470 }, { "epoch": 0.7536, - "grad_norm": 0.9366122484207153, - "learning_rate": 0.000186, - "loss": 2.7708, + "grad_norm": 1.1762638092041016, + "learning_rate": 0.0001395, + "loss": 3.2424, "step": 471 }, { "epoch": 0.7552, - "grad_norm": 1.1672213077545166, - "learning_rate": 0.00018639999999999998, - "loss": 2.8047, + "grad_norm": 1.4990063905715942, + "learning_rate": 0.00013979999999999998, + "loss": 3.2615, "step": 472 }, { "epoch": 0.7568, - "grad_norm": 2.68989634513855, - "learning_rate": 0.0001868, - "loss": 2.7845, + "grad_norm": 2.9979848861694336, + "learning_rate": 0.0001401, + "loss": 3.3328, "step": 473 }, { "epoch": 0.7584, - "grad_norm": 1.9162228107452393, - "learning_rate": 0.0001872, - "loss": 2.6011, + "grad_norm": 1.6674293279647827, + "learning_rate": 0.0001404, + "loss": 3.1763, "step": 474 }, { "epoch": 0.76, - "grad_norm": 2.0587878227233887, - "learning_rate": 0.00018759999999999998, - "loss": 2.7066, + "grad_norm": 1.2908185720443726, + "learning_rate": 0.00014069999999999998, + "loss": 3.2605, "step": 475 }, { "epoch": 0.7616, - "grad_norm": 0.9546549320220947, - "learning_rate": 0.000188, - "loss": 2.7525, + "grad_norm": 0.7701535820960999, + "learning_rate": 0.00014099999999999998, + "loss": 3.2527, "step": 476 }, { "epoch": 0.7632, - "grad_norm": 1.305992841720581, - "learning_rate": 0.00018839999999999997, - "loss": 2.8766, + "grad_norm": 1.2732120752334595, + "learning_rate": 0.0001413, + "loss": 3.393, "step": 477 }, { "epoch": 0.7648, - "grad_norm": 1.25698721408844, - "learning_rate": 0.00018879999999999998, - "loss": 2.7792, + "grad_norm": 1.0643001794815063, + "learning_rate": 0.00014159999999999997, + "loss": 3.2402, "step": 478 }, { "epoch": 0.7664, - "grad_norm": 1.4074276685714722, - "learning_rate": 0.0001892, - "loss": 2.5459, + "grad_norm": 1.2950279712677002, + "learning_rate": 0.00014189999999999998, + "loss": 3.1784, "step": 479 }, { "epoch": 0.768, - "grad_norm": 1.1359269618988037, - "learning_rate": 0.00018959999999999997, - "loss": 2.5343, + "grad_norm": 1.4429744482040405, + "learning_rate": 0.0001422, + "loss": 3.1181, "step": 480 }, { "epoch": 0.7696, - "grad_norm": 1.2957308292388916, - "learning_rate": 0.00018999999999999998, - "loss": 2.6206, + "grad_norm": 0.8893711566925049, + "learning_rate": 0.0001425, + "loss": 3.292, "step": 481 }, { "epoch": 0.7712, - "grad_norm": 0.9115980863571167, - "learning_rate": 0.0001904, - "loss": 2.5279, + "grad_norm": 0.8357609510421753, + "learning_rate": 0.00014279999999999997, + "loss": 3.1505, "step": 482 }, { "epoch": 0.7728, - "grad_norm": 1.0715998411178589, - "learning_rate": 0.00019079999999999998, - "loss": 2.9977, + "grad_norm": 0.9251508712768555, + "learning_rate": 0.00014309999999999998, + "loss": 3.4092, "step": 483 }, { "epoch": 0.7744, - "grad_norm": 3.206315517425537, - "learning_rate": 0.00019119999999999999, - "loss": 2.7214, + "grad_norm": 4.293961524963379, + "learning_rate": 0.0001434, + "loss": 3.3033, "step": 484 }, { "epoch": 0.776, - "grad_norm": 1.8996236324310303, - "learning_rate": 0.0001916, - "loss": 2.5993, + "grad_norm": 3.240405559539795, + "learning_rate": 0.00014369999999999997, + "loss": 3.2177, "step": 485 }, { "epoch": 0.7776, - "grad_norm": 1.2291277647018433, - "learning_rate": 0.00019199999999999998, - "loss": 2.6469, + "grad_norm": 1.5192947387695312, + "learning_rate": 0.00014399999999999998, + "loss": 3.2168, "step": 486 }, { "epoch": 0.7792, - "grad_norm": 1.7361717224121094, - "learning_rate": 0.0001924, - "loss": 2.7336, + "grad_norm": 0.9653074145317078, + "learning_rate": 0.00014429999999999998, + "loss": 3.2922, "step": 487 }, { "epoch": 0.7808, - "grad_norm": 1.7760653495788574, - "learning_rate": 0.0001928, - "loss": 2.7031, + "grad_norm": 4.4581475257873535, + "learning_rate": 0.0001446, + "loss": 3.2312, "step": 488 }, { "epoch": 0.7824, - "grad_norm": 2.499051094055176, - "learning_rate": 0.00019319999999999998, - "loss": 2.7771, + "grad_norm": 1.0327415466308594, + "learning_rate": 0.00014489999999999997, + "loss": 3.2242, "step": 489 }, { "epoch": 0.784, - "grad_norm": 1.7659521102905273, - "learning_rate": 0.0001936, - "loss": 2.6913, + "grad_norm": 1.4427021741867065, + "learning_rate": 0.00014519999999999998, + "loss": 3.2571, "step": 490 }, { "epoch": 0.7856, - "grad_norm": 1.1593416929244995, - "learning_rate": 0.00019399999999999997, - "loss": 2.8942, + "grad_norm": 1.1032732725143433, + "learning_rate": 0.00014549999999999999, + "loss": 3.3207, "step": 491 }, { "epoch": 0.7872, - "grad_norm": 1.008558988571167, - "learning_rate": 0.00019439999999999998, - "loss": 2.6859, + "grad_norm": 1.0624717473983765, + "learning_rate": 0.0001458, + "loss": 3.2055, "step": 492 }, { "epoch": 0.7888, - "grad_norm": 1.389395833015442, - "learning_rate": 0.0001948, - "loss": 2.4138, + "grad_norm": 1.9576382637023926, + "learning_rate": 0.00014609999999999997, + "loss": 3.0291, "step": 493 }, { "epoch": 0.7904, - "grad_norm": 1.649500846862793, - "learning_rate": 0.00019519999999999997, - "loss": 2.8378, + "grad_norm": 1.1364521980285645, + "learning_rate": 0.00014639999999999998, + "loss": 3.2193, "step": 494 }, { "epoch": 0.792, - "grad_norm": 1.2002882957458496, - "learning_rate": 0.00019559999999999998, - "loss": 2.5141, + "grad_norm": 1.5906617641448975, + "learning_rate": 0.0001467, + "loss": 3.2005, "step": 495 }, { "epoch": 0.7936, - "grad_norm": 2.0668468475341797, - "learning_rate": 0.00019599999999999997, - "loss": 2.4013, + "grad_norm": 2.4177134037017822, + "learning_rate": 0.000147, + "loss": 3.0561, "step": 496 }, { "epoch": 0.7952, - "grad_norm": 3.268202066421509, - "learning_rate": 0.00019639999999999998, - "loss": 2.664, + "grad_norm": 2.282595634460449, + "learning_rate": 0.00014729999999999998, + "loss": 3.2687, "step": 497 }, { "epoch": 0.7968, - "grad_norm": 1.8152614831924438, - "learning_rate": 0.00019679999999999999, - "loss": 2.6374, + "grad_norm": 3.5763862133026123, + "learning_rate": 0.00014759999999999998, + "loss": 3.1898, "step": 498 }, { "epoch": 0.7984, - "grad_norm": 2.2615838050842285, - "learning_rate": 0.00019719999999999997, - "loss": 2.9934, + "grad_norm": 3.573193311691284, + "learning_rate": 0.0001479, + "loss": 3.4237, "step": 499 }, { "epoch": 0.8, - "grad_norm": 2.3490400314331055, - "learning_rate": 0.00019759999999999998, - "loss": 3.0699, - "step": 500 - }, - { - "epoch": 0.8, - "eval_cer": 0.6001793757723124, - "eval_loss": 2.7479889392852783, - "eval_runtime": 158.7062, - "eval_samples_per_second": 19.76, - "eval_steps_per_second": 1.235, - "eval_wer": 0.8336959900528442, + "grad_norm": 3.274698495864868, + "learning_rate": 0.0001482, + "loss": 3.4449, "step": 500 }, { "epoch": 0.8016, - "grad_norm": 7.252814292907715, - "learning_rate": 0.000198, - "loss": 2.8085, + "grad_norm": 7.6723127365112305, + "learning_rate": 0.00014849999999999998, + "loss": 3.4171, "step": 501 }, { "epoch": 0.8032, - "grad_norm": 4.065989017486572, - "learning_rate": 0.00019839999999999997, - "loss": 2.5431, + "grad_norm": 3.750722646713257, + "learning_rate": 0.00014879999999999998, + "loss": 3.1539, "step": 502 }, { "epoch": 0.8048, - "grad_norm": 5.091382026672363, - "learning_rate": 0.00019879999999999998, - "loss": 2.8578, + "grad_norm": 6.335118770599365, + "learning_rate": 0.0001491, + "loss": 3.4048, "step": 503 }, { "epoch": 0.8064, - "grad_norm": 2.2774922847747803, - "learning_rate": 0.0001992, - "loss": 2.6625, + "grad_norm": 3.3972010612487793, + "learning_rate": 0.0001494, + "loss": 3.2258, "step": 504 }, { "epoch": 0.808, - "grad_norm": 2.239748239517212, - "learning_rate": 0.00019959999999999997, - "loss": 2.6375, + "grad_norm": 1.252905249595642, + "learning_rate": 0.00014969999999999998, + "loss": 3.2009, "step": 505 }, { "epoch": 0.8096, - "grad_norm": 3.0972511768341064, - "learning_rate": 0.00019999999999999998, - "loss": 2.9029, + "grad_norm": 1.956114411354065, + "learning_rate": 0.00015, + "loss": 3.3399, "step": 506 }, { "epoch": 0.8112, - "grad_norm": 5.276496410369873, - "learning_rate": 0.0002004, - "loss": 2.4195, + "grad_norm": 4.416980743408203, + "learning_rate": 0.0001503, + "loss": 3.0746, "step": 507 }, { "epoch": 0.8128, - "grad_norm": 4.866186618804932, - "learning_rate": 0.00020079999999999997, - "loss": 2.4582, + "grad_norm": 4.364480495452881, + "learning_rate": 0.00015059999999999997, + "loss": 3.0995, "step": 508 }, { "epoch": 0.8144, - "grad_norm": 3.0533981323242188, - "learning_rate": 0.00020119999999999998, - "loss": 2.4498, + "grad_norm": 3.2153031826019287, + "learning_rate": 0.00015089999999999998, + "loss": 3.1032, "step": 509 }, { "epoch": 0.816, - "grad_norm": 1.130332350730896, - "learning_rate": 0.0002016, - "loss": 2.4513, + "grad_norm": 1.0816490650177002, + "learning_rate": 0.0001512, + "loss": 3.0827, "step": 510 }, { "epoch": 0.8176, - "grad_norm": 1.124791145324707, - "learning_rate": 0.00020199999999999998, - "loss": 2.1947, + "grad_norm": 1.157222867012024, + "learning_rate": 0.0001515, + "loss": 2.9231, "step": 511 }, { "epoch": 0.8192, - "grad_norm": 3.729512929916382, - "learning_rate": 0.0002024, - "loss": 2.3357, + "grad_norm": 3.4473843574523926, + "learning_rate": 0.00015179999999999998, + "loss": 2.9558, "step": 512 }, { "epoch": 0.8208, - "grad_norm": 3.9868364334106445, - "learning_rate": 0.0002028, - "loss": 2.4641, + "grad_norm": 4.260528087615967, + "learning_rate": 0.00015209999999999998, + "loss": 3.0114, "step": 513 }, { "epoch": 0.8224, - "grad_norm": 3.2634570598602295, - "learning_rate": 0.00020319999999999998, - "loss": 2.2873, + "grad_norm": 3.8227248191833496, + "learning_rate": 0.0001524, + "loss": 2.912, "step": 514 }, { "epoch": 0.824, - "grad_norm": 6.407214641571045, - "learning_rate": 0.00020359999999999996, - "loss": 2.4176, + "grad_norm": 7.646665573120117, + "learning_rate": 0.0001527, + "loss": 3.1023, "step": 515 }, { "epoch": 0.8256, - "grad_norm": 2.0617222785949707, - "learning_rate": 0.000204, - "loss": 2.3114, + "grad_norm": 2.7784230709075928, + "learning_rate": 0.00015299999999999998, + "loss": 2.9334, "step": 516 }, { "epoch": 0.8272, - "grad_norm": 1.7285425662994385, - "learning_rate": 0.00020439999999999998, - "loss": 2.2132, + "grad_norm": 2.957230806350708, + "learning_rate": 0.00015329999999999999, + "loss": 2.9305, "step": 517 }, { "epoch": 0.8288, - "grad_norm": 1.2327693700790405, - "learning_rate": 0.00020479999999999996, - "loss": 1.9775, + "grad_norm": 1.1709569692611694, + "learning_rate": 0.0001536, + "loss": 2.7356, "step": 518 }, { "epoch": 0.8304, - "grad_norm": 1.495557427406311, - "learning_rate": 0.0002052, - "loss": 2.3095, + "grad_norm": 1.495569109916687, + "learning_rate": 0.0001539, + "loss": 2.9082, "step": 519 }, { "epoch": 0.832, - "grad_norm": 1.9202693700790405, - "learning_rate": 0.00020559999999999998, - "loss": 2.3414, + "grad_norm": 2.442965269088745, + "learning_rate": 0.00015419999999999998, + "loss": 2.8912, "step": 520 }, { "epoch": 0.8336, - "grad_norm": 1.3025583028793335, - "learning_rate": 0.00020599999999999997, - "loss": 2.4883, + "grad_norm": 1.7007681131362915, + "learning_rate": 0.0001545, + "loss": 2.9658, "step": 521 }, { "epoch": 0.8352, - "grad_norm": 1.6052091121673584, - "learning_rate": 0.00020639999999999998, - "loss": 2.1605, + "grad_norm": 2.2086222171783447, + "learning_rate": 0.0001548, + "loss": 2.7858, "step": 522 }, { "epoch": 0.8368, - "grad_norm": 1.410342812538147, - "learning_rate": 0.00020679999999999999, - "loss": 2.2074, + "grad_norm": 1.53826904296875, + "learning_rate": 0.0001551, + "loss": 2.7899, "step": 523 }, { "epoch": 0.8384, - "grad_norm": 2.4669413566589355, - "learning_rate": 0.00020719999999999997, - "loss": 2.6925, + "grad_norm": 2.381734848022461, + "learning_rate": 0.00015539999999999998, + "loss": 3.1133, "step": 524 }, { "epoch": 0.84, - "grad_norm": 2.8002283573150635, - "learning_rate": 0.00020759999999999998, - "loss": 2.4361, + "grad_norm": 3.017554998397827, + "learning_rate": 0.0001557, + "loss": 2.9739, "step": 525 }, { "epoch": 0.8416, - "grad_norm": 2.760272264480591, - "learning_rate": 0.000208, - "loss": 2.3288, + "grad_norm": 3.71621036529541, + "learning_rate": 0.000156, + "loss": 2.8974, "step": 526 }, { "epoch": 0.8432, - "grad_norm": 1.630889654159546, - "learning_rate": 0.00020839999999999997, - "loss": 2.2201, + "grad_norm": 1.7555559873580933, + "learning_rate": 0.0001563, + "loss": 2.8068, "step": 527 }, { "epoch": 0.8448, - "grad_norm": 1.1679999828338623, - "learning_rate": 0.00020879999999999998, - "loss": 2.1824, + "grad_norm": 1.1658614873886108, + "learning_rate": 0.00015659999999999998, + "loss": 2.7185, "step": 528 }, { "epoch": 0.8464, - "grad_norm": 1.2356942892074585, - "learning_rate": 0.0002092, - "loss": 2.0637, + "grad_norm": 0.8518783450126648, + "learning_rate": 0.0001569, + "loss": 2.7038, "step": 529 }, { "epoch": 0.848, - "grad_norm": 0.9838883876800537, - "learning_rate": 0.00020959999999999997, - "loss": 2.1315, + "grad_norm": 1.1382691860198975, + "learning_rate": 0.0001572, + "loss": 2.7108, "step": 530 }, { "epoch": 0.8496, - "grad_norm": 2.134653091430664, - "learning_rate": 0.00020999999999999998, - "loss": 2.3373, + "grad_norm": 1.3236608505249023, + "learning_rate": 0.00015749999999999998, + "loss": 2.862, "step": 531 }, { "epoch": 0.8512, - "grad_norm": 1.8091658353805542, - "learning_rate": 0.0002104, - "loss": 2.1348, + "grad_norm": 1.4057608842849731, + "learning_rate": 0.0001578, + "loss": 2.7198, "step": 532 }, { "epoch": 0.8528, - "grad_norm": 1.6032664775848389, - "learning_rate": 0.00021079999999999997, - "loss": 1.9647, + "grad_norm": 1.2580957412719727, + "learning_rate": 0.0001581, + "loss": 2.5294, "step": 533 }, { "epoch": 0.8544, - "grad_norm": 1.6092320680618286, - "learning_rate": 0.00021119999999999996, - "loss": 2.1598, + "grad_norm": 1.328491449356079, + "learning_rate": 0.0001584, + "loss": 2.6866, "step": 534 }, { "epoch": 0.856, - "grad_norm": 1.3249115943908691, - "learning_rate": 0.0002116, - "loss": 2.4795, + "grad_norm": 1.0519988536834717, + "learning_rate": 0.00015869999999999998, + "loss": 2.8501, "step": 535 }, { "epoch": 0.8576, - "grad_norm": 1.057600975036621, - "learning_rate": 0.00021199999999999998, - "loss": 2.5951, + "grad_norm": 1.733718752861023, + "learning_rate": 0.000159, + "loss": 2.9786, "step": 536 }, { "epoch": 0.8592, - "grad_norm": 1.3457427024841309, - "learning_rate": 0.00021239999999999996, - "loss": 1.9831, + "grad_norm": 1.1301422119140625, + "learning_rate": 0.0001593, + "loss": 2.5756, "step": 537 }, { "epoch": 0.8608, - "grad_norm": 1.3885570764541626, - "learning_rate": 0.0002128, - "loss": 1.9622, + "grad_norm": 1.241924524307251, + "learning_rate": 0.0001596, + "loss": 2.5149, "step": 538 }, { "epoch": 0.8624, - "grad_norm": 1.6927804946899414, - "learning_rate": 0.00021319999999999998, - "loss": 2.2762, + "grad_norm": 1.328609824180603, + "learning_rate": 0.00015989999999999998, + "loss": 2.7648, "step": 539 }, { "epoch": 0.864, - "grad_norm": 1.5378133058547974, - "learning_rate": 0.00021359999999999996, - "loss": 2.1121, + "grad_norm": 1.236745834350586, + "learning_rate": 0.0001602, + "loss": 2.5732, "step": 540 }, { "epoch": 0.8656, - "grad_norm": 2.0830371379852295, - "learning_rate": 0.000214, - "loss": 2.0769, + "grad_norm": 1.5136185884475708, + "learning_rate": 0.0001605, + "loss": 2.5341, "step": 541 }, { "epoch": 0.8672, - "grad_norm": 1.5723661184310913, - "learning_rate": 0.00021439999999999998, - "loss": 2.48, + "grad_norm": 1.1722863912582397, + "learning_rate": 0.0001608, + "loss": 2.9393, "step": 542 }, { "epoch": 0.8688, - "grad_norm": 2.520364999771118, - "learning_rate": 0.00021479999999999996, - "loss": 2.293, + "grad_norm": 1.9231904745101929, + "learning_rate": 0.00016109999999999999, + "loss": 2.674, "step": 543 }, { "epoch": 0.8704, - "grad_norm": 1.5732394456863403, - "learning_rate": 0.0002152, - "loss": 1.9714, + "grad_norm": 1.632614254951477, + "learning_rate": 0.0001614, + "loss": 2.5229, "step": 544 }, { "epoch": 0.872, - "grad_norm": 1.117932677268982, - "learning_rate": 0.00021559999999999998, - "loss": 2.267, + "grad_norm": 1.2549848556518555, + "learning_rate": 0.0001617, + "loss": 2.6451, "step": 545 }, { "epoch": 0.8736, "grad_norm": NaN, - "learning_rate": 0.00021559999999999998, - "loss": 2.4488, + "learning_rate": 0.0001617, + "loss": 2.7689, "step": 546 }, { "epoch": 0.8752, - "grad_norm": 2.9612374305725098, - "learning_rate": 0.00021599999999999996, - "loss": 2.674, + "grad_norm": 1.994581937789917, + "learning_rate": 0.000162, + "loss": 2.9942, "step": 547 }, { "epoch": 0.8768, - "grad_norm": 4.073216915130615, - "learning_rate": 0.0002164, - "loss": 2.7163, + "grad_norm": 2.948986291885376, + "learning_rate": 0.0001623, + "loss": 2.9567, "step": 548 }, { "epoch": 0.8784, - "grad_norm": 1.3353551626205444, - "learning_rate": 0.00021679999999999998, - "loss": 2.4723, + "grad_norm": 3.3447182178497314, + "learning_rate": 0.0001626, + "loss": 2.926, "step": 549 }, { "epoch": 0.88, - "grad_norm": 2.927354097366333, - "learning_rate": 0.00021719999999999997, - "loss": 2.6158, + "grad_norm": 1.6422103643417358, + "learning_rate": 0.0001629, + "loss": 2.9978, "step": 550 }, { "epoch": 0.8816, - "grad_norm": 7.480506420135498, - "learning_rate": 0.0002176, - "loss": 2.6206, + "grad_norm": 17.27240753173828, + "learning_rate": 0.0001632, + "loss": 3.4376, "step": 551 }, { "epoch": 0.8832, - "grad_norm": 10.748481750488281, - "learning_rate": 0.00021799999999999999, - "loss": 3.1361, + "grad_norm": 16.129791259765625, + "learning_rate": 0.0001635, + "loss": 3.5866, "step": 552 }, { "epoch": 0.8848, - "grad_norm": 4.278016090393066, - "learning_rate": 0.00021839999999999997, - "loss": 2.3451, + "grad_norm": 6.709097862243652, + "learning_rate": 0.0001638, + "loss": 2.8288, "step": 553 }, { "epoch": 0.8864, - "grad_norm": 11.78780460357666, - "learning_rate": 0.00021879999999999995, - "loss": 3.1178, + "grad_norm": 15.814329147338867, + "learning_rate": 0.0001641, + "loss": 3.5662, "step": 554 }, { "epoch": 0.888, - "grad_norm": 2.092212438583374, - "learning_rate": 0.0002192, - "loss": 2.3741, + "grad_norm": 1.16456937789917, + "learning_rate": 0.0001644, + "loss": 2.7902, "step": 555 }, { "epoch": 0.8896, - "grad_norm": 2.331865072250366, - "learning_rate": 0.00021959999999999997, - "loss": 1.9955, + "grad_norm": 1.7065085172653198, + "learning_rate": 0.0001647, + "loss": 2.4504, "step": 556 }, { "epoch": 0.8912, - "grad_norm": 2.80426025390625, - "learning_rate": 0.00021999999999999995, - "loss": 2.0726, + "grad_norm": 2.260443925857544, + "learning_rate": 0.000165, + "loss": 2.5193, "step": 557 }, { "epoch": 0.8928, - "grad_norm": 2.815202474594116, - "learning_rate": 0.0002204, - "loss": 2.4074, + "grad_norm": 2.0311453342437744, + "learning_rate": 0.0001653, + "loss": 2.7037, "step": 558 }, { "epoch": 0.8944, - "grad_norm": 2.3765721321105957, - "learning_rate": 0.00022079999999999997, - "loss": 2.206, + "grad_norm": 2.1808862686157227, + "learning_rate": 0.0001656, + "loss": 2.5959, "step": 559 }, { "epoch": 0.896, - "grad_norm": 6.6429314613342285, - "learning_rate": 0.00022119999999999996, - "loss": 2.4754, + "grad_norm": 6.377316951751709, + "learning_rate": 0.0001659, + "loss": 2.839, "step": 560 }, { "epoch": 0.8976, - "grad_norm": 5.93609094619751, - "learning_rate": 0.0002216, - "loss": 2.7358, + "grad_norm": 6.089463710784912, + "learning_rate": 0.0001662, + "loss": 3.1665, "step": 561 }, { "epoch": 0.8992, - "grad_norm": 6.85822868347168, - "learning_rate": 0.00022199999999999998, - "loss": 2.459, + "grad_norm": 6.491182804107666, + "learning_rate": 0.0001665, + "loss": 2.885, "step": 562 }, { "epoch": 0.9008, - "grad_norm": 0.9314383864402771, - "learning_rate": 0.00022239999999999996, - "loss": 1.8933, + "grad_norm": 1.3139764070510864, + "learning_rate": 0.0001668, + "loss": 2.3777, "step": 563 }, { "epoch": 0.9024, - "grad_norm": 1.0340198278427124, - "learning_rate": 0.0002228, - "loss": 2.0382, + "grad_norm": 1.7619560956954956, + "learning_rate": 0.0001671, + "loss": 2.5143, "step": 564 }, { "epoch": 0.904, - "grad_norm": 1.108193278312683, - "learning_rate": 0.00022319999999999998, - "loss": 2.0965, + "grad_norm": 1.494608759880066, + "learning_rate": 0.0001674, + "loss": 2.4989, "step": 565 }, { "epoch": 0.9056, - "grad_norm": 1.070430040359497, - "learning_rate": 0.00022359999999999996, - "loss": 1.9219, + "grad_norm": 1.0623366832733154, + "learning_rate": 0.0001677, + "loss": 2.3889, "step": 566 }, { "epoch": 0.9072, - "grad_norm": 1.2231314182281494, - "learning_rate": 0.000224, - "loss": 1.8723, + "grad_norm": 1.6987491846084595, + "learning_rate": 0.000168, + "loss": 2.3293, "step": 567 }, { "epoch": 0.9088, - "grad_norm": 1.2423832416534424, - "learning_rate": 0.00022439999999999998, - "loss": 1.7471, + "grad_norm": 2.076875925064087, + "learning_rate": 0.0001683, + "loss": 2.2478, "step": 568 }, { "epoch": 0.9104, - "grad_norm": 1.246390461921692, - "learning_rate": 0.00022479999999999996, - "loss": 1.8937, + "grad_norm": 2.2903881072998047, + "learning_rate": 0.0001686, + "loss": 2.3652, "step": 569 }, { "epoch": 0.912, - "grad_norm": 1.130929946899414, - "learning_rate": 0.0002252, - "loss": 1.8579, + "grad_norm": 2.357800245285034, + "learning_rate": 0.00016889999999999996, + "loss": 2.298, "step": 570 }, { "epoch": 0.9136, - "grad_norm": 1.304247260093689, - "learning_rate": 0.00022559999999999998, - "loss": 2.0576, + "grad_norm": 0.7803952693939209, + "learning_rate": 0.00016919999999999997, + "loss": 2.4371, "step": 571 }, { "epoch": 0.9152, - "grad_norm": 1.309567928314209, - "learning_rate": 0.00022599999999999996, - "loss": 1.9817, + "grad_norm": 1.7425607442855835, + "learning_rate": 0.00016949999999999997, + "loss": 2.3749, "step": 572 }, { "epoch": 0.9168, - "grad_norm": 1.2422513961791992, - "learning_rate": 0.0002264, - "loss": 2.2693, + "grad_norm": 1.223272681236267, + "learning_rate": 0.00016979999999999998, + "loss": 2.5863, "step": 573 }, { "epoch": 0.9184, - "grad_norm": 1.0724761486053467, - "learning_rate": 0.00022679999999999998, - "loss": 1.9313, + "grad_norm": 1.270267128944397, + "learning_rate": 0.00017009999999999996, + "loss": 2.3319, "step": 574 }, { "epoch": 0.92, - "grad_norm": 1.7489149570465088, - "learning_rate": 0.00022719999999999997, - "loss": 2.1692, + "grad_norm": 1.346740961074829, + "learning_rate": 0.00017039999999999997, + "loss": 2.5017, "step": 575 }, { "epoch": 0.9216, - "grad_norm": 1.168873906135559, - "learning_rate": 0.0002276, - "loss": 2.0218, + "grad_norm": 1.2833489179611206, + "learning_rate": 0.00017069999999999998, + "loss": 2.374, "step": 576 }, { "epoch": 0.9232, - "grad_norm": 0.9614212512969971, - "learning_rate": 0.00022799999999999999, - "loss": 2.3924, + "grad_norm": 1.1349873542785645, + "learning_rate": 0.00017099999999999998, + "loss": 2.7307, "step": 577 }, { "epoch": 0.9248, - "grad_norm": 1.258357048034668, - "learning_rate": 0.00022839999999999997, - "loss": 1.9544, + "grad_norm": 1.443171501159668, + "learning_rate": 0.00017129999999999996, + "loss": 2.3192, "step": 578 }, { "epoch": 0.9264, - "grad_norm": 2.0249435901641846, - "learning_rate": 0.0002288, - "loss": 1.9929, + "grad_norm": 1.5835188627243042, + "learning_rate": 0.00017159999999999997, + "loss": 2.3311, "step": 579 }, { "epoch": 0.928, - "grad_norm": 1.0736762285232544, - "learning_rate": 0.0002292, - "loss": 1.7423, + "grad_norm": 1.2339696884155273, + "learning_rate": 0.00017189999999999998, + "loss": 2.1611, "step": 580 }, { "epoch": 0.9296, - "grad_norm": 1.4606719017028809, - "learning_rate": 0.00022959999999999997, - "loss": 1.8804, + "grad_norm": 1.6790226697921753, + "learning_rate": 0.00017219999999999998, + "loss": 2.2807, "step": 581 }, { "epoch": 0.9312, - "grad_norm": 0.8972280025482178, - "learning_rate": 0.00023, - "loss": 1.8398, + "grad_norm": 1.297339677810669, + "learning_rate": 0.00017249999999999996, + "loss": 2.2204, "step": 582 }, { "epoch": 0.9328, - "grad_norm": 2.3839757442474365, - "learning_rate": 0.0002304, - "loss": 2.1795, + "grad_norm": 2.8306925296783447, + "learning_rate": 0.00017279999999999997, + "loss": 2.4957, "step": 583 }, { "epoch": 0.9344, - "grad_norm": 1.3923808336257935, - "learning_rate": 0.00023079999999999997, - "loss": 1.7968, + "grad_norm": 1.6782737970352173, + "learning_rate": 0.00017309999999999998, + "loss": 2.1674, "step": 584 }, { "epoch": 0.936, - "grad_norm": 0.9030740857124329, - "learning_rate": 0.0002312, - "loss": 1.6572, + "grad_norm": 0.9002209305763245, + "learning_rate": 0.00017339999999999996, + "loss": 2.0584, "step": 585 }, { "epoch": 0.9376, - "grad_norm": 1.322900414466858, - "learning_rate": 0.0002316, - "loss": 1.8275, + "grad_norm": 1.204094409942627, + "learning_rate": 0.00017369999999999997, + "loss": 2.1932, "step": 586 }, { "epoch": 0.9392, - "grad_norm": 1.1124223470687866, - "learning_rate": 0.00023199999999999997, - "loss": 2.0224, + "grad_norm": 1.075980544090271, + "learning_rate": 0.00017399999999999997, + "loss": 2.3717, "step": 587 }, { "epoch": 0.9408, - "grad_norm": 2.4704806804656982, - "learning_rate": 0.00023239999999999996, - "loss": 1.9902, + "grad_norm": 1.9965050220489502, + "learning_rate": 0.00017429999999999998, + "loss": 2.2855, "step": 588 }, { "epoch": 0.9424, - "grad_norm": 2.128262758255005, - "learning_rate": 0.0002328, - "loss": 1.795, + "grad_norm": 2.01662015914917, + "learning_rate": 0.00017459999999999996, + "loss": 2.1505, "step": 589 }, { "epoch": 0.944, - "grad_norm": 1.577215313911438, - "learning_rate": 0.00023319999999999998, - "loss": 2.0014, + "grad_norm": 1.4860492944717407, + "learning_rate": 0.00017489999999999997, + "loss": 2.2968, "step": 590 }, { "epoch": 0.9456, - "grad_norm": 1.1099101305007935, - "learning_rate": 0.00023359999999999996, - "loss": 1.9604, + "grad_norm": 1.2854816913604736, + "learning_rate": 0.00017519999999999998, + "loss": 2.3285, "step": 591 }, { "epoch": 0.9472, - "grad_norm": 1.6668699979782104, - "learning_rate": 0.000234, - "loss": 2.2408, + "grad_norm": 1.5911052227020264, + "learning_rate": 0.00017549999999999998, + "loss": 2.3975, "step": 592 }, { "epoch": 0.9488, - "grad_norm": 1.8698573112487793, - "learning_rate": 0.00023439999999999998, - "loss": 2.0816, + "grad_norm": 1.413689374923706, + "learning_rate": 0.00017579999999999996, + "loss": 2.3824, "step": 593 }, { "epoch": 0.9504, - "grad_norm": 2.9194648265838623, - "learning_rate": 0.00023479999999999996, - "loss": 1.9059, + "grad_norm": 2.4783072471618652, + "learning_rate": 0.00017609999999999997, + "loss": 2.1916, "step": 594 }, { "epoch": 0.952, - "grad_norm": 1.137349009513855, - "learning_rate": 0.0002352, - "loss": 2.0177, + "grad_norm": 1.7629774808883667, + "learning_rate": 0.00017639999999999998, + "loss": 2.33, "step": 595 }, { "epoch": 0.9536, - "grad_norm": 1.3729033470153809, - "learning_rate": 0.00023559999999999998, - "loss": 2.0672, + "grad_norm": 1.8146947622299194, + "learning_rate": 0.00017669999999999999, + "loss": 2.4165, "step": 596 }, { "epoch": 0.9552, - "grad_norm": 1.4698584079742432, - "learning_rate": 0.00023599999999999996, - "loss": 1.8058, + "grad_norm": 1.5814777612686157, + "learning_rate": 0.00017699999999999997, + "loss": 2.1172, "step": 597 }, { "epoch": 0.9568, - "grad_norm": 1.9527052640914917, - "learning_rate": 0.0002364, - "loss": 2.1384, + "grad_norm": 1.7164720296859741, + "learning_rate": 0.00017729999999999997, + "loss": 2.3839, "step": 598 }, { "epoch": 0.9584, - "grad_norm": 1.9304373264312744, - "learning_rate": 0.00023679999999999998, - "loss": 2.0941, + "grad_norm": 1.7895351648330688, + "learning_rate": 0.00017759999999999998, + "loss": 2.4406, "step": 599 }, { "epoch": 0.96, "grad_norm": NaN, - "learning_rate": 0.00023679999999999998, - "loss": 1.7389, - "step": 600 - }, - { - "epoch": 0.96, - "eval_cer": 0.47166660023119544, - "eval_loss": 2.4468393325805664, - "eval_runtime": 158.3967, - "eval_samples_per_second": 19.798, - "eval_steps_per_second": 1.237, - "eval_wer": 0.6782302352087867, + "learning_rate": 0.00017759999999999998, + "loss": 2.1268, "step": 600 }, { "epoch": 0.9616, - "grad_norm": 22.909198760986328, - "learning_rate": 0.00023719999999999997, - "loss": 4.3211, + "grad_norm": 28.912076950073242, + "learning_rate": 0.0001779, + "loss": 4.5621, "step": 601 }, { "epoch": 0.9632, - "grad_norm": 6.32504415512085, - "learning_rate": 0.0002376, - "loss": 2.4554, + "grad_norm": 9.135653495788574, + "learning_rate": 0.00017819999999999997, + "loss": 2.7774, "step": 602 }, { "epoch": 0.9648, - "grad_norm": 2.5991179943084717, - "learning_rate": 0.00023799999999999998, - "loss": 2.2983, + "grad_norm": 4.935175895690918, + "learning_rate": 0.00017849999999999997, + "loss": 2.6115, "step": 603 }, { "epoch": 0.9664, - "grad_norm": 2.1994707584381104, - "learning_rate": 0.00023839999999999997, - "loss": 1.9377, + "grad_norm": 1.8409152030944824, + "learning_rate": 0.00017879999999999998, + "loss": 2.1965, "step": 604 }, { "epoch": 0.968, - "grad_norm": 3.0079381465911865, - "learning_rate": 0.0002388, - "loss": 2.0362, + "grad_norm": 1.7993711233139038, + "learning_rate": 0.0001791, + "loss": 2.2617, "step": 605 }, { "epoch": 0.9696, - "grad_norm": 2.14380145072937, - "learning_rate": 0.0002392, - "loss": 1.761, + "grad_norm": 1.4365873336791992, + "learning_rate": 0.00017939999999999997, + "loss": 2.0682, "step": 606 }, { "epoch": 0.9712, - "grad_norm": 3.1852333545684814, - "learning_rate": 0.00023959999999999997, - "loss": 1.8808, + "grad_norm": 3.2426984310150146, + "learning_rate": 0.00017969999999999998, + "loss": 2.124, "step": 607 }, { "epoch": 0.9728, - "grad_norm": 1.4008501768112183, - "learning_rate": 0.00023999999999999998, - "loss": 1.9098, + "grad_norm": 1.8241300582885742, + "learning_rate": 0.00017999999999999998, + "loss": 2.204, "step": 608 }, { "epoch": 0.9744, - "grad_norm": 1.1618804931640625, - "learning_rate": 0.0002404, - "loss": 1.95, + "grad_norm": 1.8929377794265747, + "learning_rate": 0.00018029999999999996, + "loss": 2.194, "step": 609 }, { "epoch": 0.976, - "grad_norm": 3.1721315383911133, - "learning_rate": 0.00024079999999999997, - "loss": 1.9567, + "grad_norm": 1.9729321002960205, + "learning_rate": 0.00018059999999999997, + "loss": 2.1454, "step": 610 }, { "epoch": 0.9776, - "grad_norm": 3.0783185958862305, - "learning_rate": 0.00024119999999999998, - "loss": 1.7904, + "grad_norm": 2.0208892822265625, + "learning_rate": 0.00018089999999999998, + "loss": 2.0107, "step": 611 }, { "epoch": 0.9792, - "grad_norm": 2.50898814201355, - "learning_rate": 0.0002416, - "loss": 1.8343, + "grad_norm": 2.0599329471588135, + "learning_rate": 0.00018119999999999999, + "loss": 2.1194, "step": 612 }, { "epoch": 0.9808, - "grad_norm": 1.6948851346969604, - "learning_rate": 0.00024199999999999997, - "loss": 1.7023, + "grad_norm": 1.338659405708313, + "learning_rate": 0.00018149999999999997, + "loss": 1.9717, "step": 613 }, { "epoch": 0.9824, - "grad_norm": 1.2017195224761963, - "learning_rate": 0.00024239999999999998, - "loss": 1.8026, + "grad_norm": 1.0782694816589355, + "learning_rate": 0.00018179999999999997, + "loss": 2.0979, "step": 614 }, { "epoch": 0.984, - "grad_norm": 1.4887975454330444, - "learning_rate": 0.0002428, - "loss": 1.6911, + "grad_norm": 1.9497066736221313, + "learning_rate": 0.00018209999999999998, + "loss": 1.9913, "step": 615 }, { "epoch": 0.9856, - "grad_norm": 0.9019091725349426, - "learning_rate": 0.00024319999999999998, - "loss": 1.5794, + "grad_norm": 1.055597186088562, + "learning_rate": 0.0001824, + "loss": 1.8407, "step": 616 }, { "epoch": 0.9872, - "grad_norm": 1.0633786916732788, - "learning_rate": 0.00024359999999999999, - "loss": 1.9869, + "grad_norm": 1.0797630548477173, + "learning_rate": 0.00018269999999999997, + "loss": 2.2538, "step": 617 }, { "epoch": 0.9888, - "grad_norm": 2.547787666320801, - "learning_rate": 0.000244, - "loss": 2.0831, + "grad_norm": 2.643570899963379, + "learning_rate": 0.00018299999999999998, + "loss": 2.3391, "step": 618 }, { "epoch": 0.9904, - "grad_norm": 2.7645835876464844, - "learning_rate": 0.0002444, - "loss": 1.8612, + "grad_norm": 2.8924105167388916, + "learning_rate": 0.00018329999999999998, + "loss": 2.1194, "step": 619 }, { "epoch": 0.992, - "grad_norm": 2.024125814437866, - "learning_rate": 0.0002448, - "loss": 1.683, + "grad_norm": 2.5089101791381836, + "learning_rate": 0.0001836, + "loss": 1.9541, "step": 620 }, { "epoch": 0.9936, - "grad_norm": 1.5566396713256836, - "learning_rate": 0.0002452, - "loss": 2.254, + "grad_norm": 1.9143519401550293, + "learning_rate": 0.00018389999999999997, + "loss": 2.5036, "step": 621 }, { "epoch": 0.9952, - "grad_norm": 1.9360650777816772, - "learning_rate": 0.00024559999999999995, - "loss": 1.9255, + "grad_norm": 2.4039244651794434, + "learning_rate": 0.00018419999999999998, + "loss": 2.2187, "step": 622 }, { "epoch": 0.9968, - "grad_norm": 1.7478992938995361, - "learning_rate": 0.00024599999999999996, - "loss": 2.09, + "grad_norm": 1.4363809823989868, + "learning_rate": 0.00018449999999999999, + "loss": 2.3675, "step": 623 }, { "epoch": 0.9984, - "grad_norm": 1.9008229970932007, - "learning_rate": 0.00024639999999999997, - "loss": 1.9071, + "grad_norm": 1.517674446105957, + "learning_rate": 0.0001848, + "loss": 2.0923, "step": 624 }, { "epoch": 1.0, - "grad_norm": 4.15518856048584, - "learning_rate": 0.0002468, - "loss": 2.1594, + "grad_norm": 3.559779167175293, + "learning_rate": 0.00018509999999999997, + "loss": 2.3816, "step": 625 }, { "epoch": 1.0016, - "grad_norm": 23.750652313232422, - "learning_rate": 0.0002472, - "loss": 3.5965, + "grad_norm": 23.562456130981445, + "learning_rate": 0.00018539999999999998, + "loss": 4.1691, "step": 626 }, { "epoch": 1.0032, - "grad_norm": 11.189056396484375, - "learning_rate": 0.0002476, - "loss": 2.7926, + "grad_norm": 8.592154502868652, + "learning_rate": 0.0001857, + "loss": 2.4914, "step": 627 }, { "epoch": 1.0048, - "grad_norm": 12.417805671691895, - "learning_rate": 0.00024799999999999996, - "loss": 3.3245, + "grad_norm": 6.448266506195068, + "learning_rate": 0.000186, + "loss": 2.2136, "step": 628 }, { "epoch": 1.0064, - "grad_norm": 4.926300525665283, - "learning_rate": 0.00024839999999999997, - "loss": 2.2806, + "grad_norm": 5.093257427215576, + "learning_rate": 0.00018629999999999997, + "loss": 2.831, "step": 629 }, { "epoch": 1.008, - "grad_norm": 1.0334339141845703, - "learning_rate": 0.0002488, - "loss": 2.0561, + "grad_norm": 1.622463345527649, + "learning_rate": 0.00018659999999999998, + "loss": 2.5009, "step": 630 }, { "epoch": 1.0096, - "grad_norm": 1.211844801902771, - "learning_rate": 0.0002492, - "loss": 1.9468, + "grad_norm": 2.0858829021453857, + "learning_rate": 0.0001869, + "loss": 2.3197, "step": 631 }, { "epoch": 1.0112, - "grad_norm": 5.117887496948242, - "learning_rate": 0.00024959999999999994, - "loss": 2.0291, + "grad_norm": 5.908426284790039, + "learning_rate": 0.0001872, + "loss": 2.4522, "step": 632 }, { "epoch": 1.0128, - "grad_norm": 5.944527626037598, - "learning_rate": 0.00025, - "loss": 2.0087, + "grad_norm": 6.208230018615723, + "learning_rate": 0.00018749999999999998, + "loss": 2.3181, "step": 633 }, { "epoch": 1.0144, - "grad_norm": 5.785839557647705, - "learning_rate": 0.00025039999999999996, - "loss": 2.0019, + "grad_norm": 6.017099380493164, + "learning_rate": 0.00018779999999999998, + "loss": 2.4969, "step": 634 }, { "epoch": 1.016, - "grad_norm": 5.347036361694336, - "learning_rate": 0.00025079999999999997, - "loss": 2.0248, + "grad_norm": 5.968253135681152, + "learning_rate": 0.0001881, + "loss": 2.4739, "step": 635 }, { "epoch": 1.0176, - "grad_norm": 2.230567693710327, - "learning_rate": 0.0002512, - "loss": 2.3992, + "grad_norm": 5.338341236114502, + "learning_rate": 0.00018839999999999997, + "loss": 2.1392, "step": 636 }, { "epoch": 1.0192, - "grad_norm": 3.1952781677246094, - "learning_rate": 0.0002516, - "loss": 1.7205, + "grad_norm": 3.1507129669189453, + "learning_rate": 0.00018869999999999998, + "loss": 2.1685, "step": 637 }, { "epoch": 1.0208, - "grad_norm": 1.0791155099868774, - "learning_rate": 0.00025199999999999995, - "loss": 1.8243, + "grad_norm": 1.9297593832015991, + "learning_rate": 0.00018899999999999999, + "loss": 1.8609, "step": 638 }, { "epoch": 1.0224, - "grad_norm": 1.0545350313186646, - "learning_rate": 0.0002524, - "loss": 1.6489, + "grad_norm": 1.8639192581176758, + "learning_rate": 0.0001893, + "loss": 2.1603, "step": 639 }, { "epoch": 1.024, - "grad_norm": 1.7690002918243408, - "learning_rate": 0.00025279999999999996, - "loss": 1.6945, + "grad_norm": 2.1058402061462402, + "learning_rate": 0.00018959999999999997, + "loss": 1.907, "step": 640 }, { "epoch": 1.0256, - "grad_norm": 1.6258310079574585, - "learning_rate": 0.0002532, - "loss": 1.7829, + "grad_norm": 1.7498681545257568, + "learning_rate": 0.00018989999999999998, + "loss": 1.9661, "step": 641 }, { "epoch": 1.0272, - "grad_norm": 1.9372260570526123, - "learning_rate": 0.0002536, - "loss": 1.6907, + "grad_norm": 2.575547218322754, + "learning_rate": 0.0001902, + "loss": 2.134, "step": 642 }, { "epoch": 1.0288, - "grad_norm": 1.9646409749984741, - "learning_rate": 0.000254, - "loss": 1.7792, + "grad_norm": 1.5888687372207642, + "learning_rate": 0.0001905, + "loss": 2.1148, "step": 643 }, { "epoch": 1.0304, - "grad_norm": 1.4375287294387817, - "learning_rate": 0.00025439999999999995, - "loss": 1.5501, + "grad_norm": 0.9597282409667969, + "learning_rate": 0.00019079999999999998, + "loss": 1.9374, "step": 644 }, { "epoch": 1.032, - "grad_norm": 2.31491756439209, - "learning_rate": 0.0002548, - "loss": 1.665, + "grad_norm": 1.114833116531372, + "learning_rate": 0.00019109999999999998, + "loss": 1.9728, "step": 645 }, { "epoch": 1.0336, - "grad_norm": 0.9905027747154236, - "learning_rate": 0.00025519999999999997, - "loss": 1.6621, + "grad_norm": 0.861680269241333, + "learning_rate": 0.0001914, + "loss": 1.9322, "step": 646 }, { "epoch": 1.0352, - "grad_norm": 1.7029163837432861, - "learning_rate": 0.0002556, - "loss": 1.852, + "grad_norm": 0.8410967588424683, + "learning_rate": 0.0001917, + "loss": 1.8089, "step": 647 }, { "epoch": 1.0368, - "grad_norm": 1.4749594926834106, - "learning_rate": 0.000256, - "loss": 1.8307, + "grad_norm": 0.9906688332557678, + "learning_rate": 0.00019199999999999998, + "loss": 1.9472, "step": 648 }, { "epoch": 1.0384, - "grad_norm": 2.049144744873047, - "learning_rate": 0.0002564, - "loss": 1.8095, + "grad_norm": 0.9408037066459656, + "learning_rate": 0.00019229999999999999, + "loss": 1.9723, "step": 649 }, { "epoch": 1.04, - "grad_norm": 1.4582281112670898, - "learning_rate": 0.00025679999999999995, - "loss": 1.8413, + "grad_norm": 0.9392821788787842, + "learning_rate": 0.0001926, + "loss": 1.99, "step": 650 }, { "epoch": 1.0416, - "grad_norm": 1.0833181142807007, - "learning_rate": 0.00025719999999999996, - "loss": 1.7638, + "grad_norm": 1.3041807413101196, + "learning_rate": 0.0001929, + "loss": 1.9668, "step": 651 }, { "epoch": 1.0432, - "grad_norm": 1.0526366233825684, - "learning_rate": 0.0002576, - "loss": 1.8573, + "grad_norm": 1.1237937211990356, + "learning_rate": 0.00019319999999999998, + "loss": 1.9616, "step": 652 }, { "epoch": 1.0448, - "grad_norm": 0.9940699338912964, - "learning_rate": 0.000258, - "loss": 1.6459, + "grad_norm": 0.9516493082046509, + "learning_rate": 0.0001935, + "loss": 2.046, "step": 653 }, { "epoch": 1.0464, - "grad_norm": 0.8226042985916138, - "learning_rate": 0.00025839999999999994, - "loss": 1.6771, + "grad_norm": 0.9648451209068298, + "learning_rate": 0.0001938, + "loss": 2.0423, "step": 654 }, { "epoch": 1.048, - "grad_norm": 1.3881866931915283, - "learning_rate": 0.0002588, - "loss": 1.8241, + "grad_norm": 1.1745597124099731, + "learning_rate": 0.0001941, + "loss": 1.6941, "step": 655 }, { "epoch": 1.0496, - "grad_norm": 1.8918851613998413, - "learning_rate": 0.00025919999999999996, - "loss": 1.8669, + "grad_norm": 1.258222222328186, + "learning_rate": 0.00019439999999999998, + "loss": 1.8956, "step": 656 }, { "epoch": 1.0512, - "grad_norm": 1.265293836593628, - "learning_rate": 0.00025959999999999997, - "loss": 1.6453, + "grad_norm": 1.8220093250274658, + "learning_rate": 0.0001947, + "loss": 1.906, "step": 657 }, { "epoch": 1.0528, - "grad_norm": 1.5398906469345093, - "learning_rate": 0.00026, - "loss": 1.8982, + "grad_norm": 1.4318503141403198, + "learning_rate": 0.000195, + "loss": 1.9818, "step": 658 }, { "epoch": 1.0544, - "grad_norm": 1.0233063697814941, - "learning_rate": 0.0002604, - "loss": 1.6833, + "grad_norm": 1.7209746837615967, + "learning_rate": 0.00019529999999999998, + "loss": 1.7445, "step": 659 }, { "epoch": 1.056, - "grad_norm": 1.4029862880706787, - "learning_rate": 0.00026079999999999994, - "loss": 1.8216, + "grad_norm": 1.0792475938796997, + "learning_rate": 0.00019559999999999998, + "loss": 2.0065, "step": 660 }, { "epoch": 1.0576, - "grad_norm": 1.1457536220550537, - "learning_rate": 0.0002612, - "loss": 1.6201, + "grad_norm": 1.0406444072723389, + "learning_rate": 0.0001959, + "loss": 1.8579, "step": 661 }, { "epoch": 1.0592, - "grad_norm": 1.3890804052352905, - "learning_rate": 0.00026159999999999996, - "loss": 1.6713, + "grad_norm": 1.5529998540878296, + "learning_rate": 0.0001962, + "loss": 1.8712, "step": 662 }, { "epoch": 1.0608, - "grad_norm": 1.0882718563079834, - "learning_rate": 0.00026199999999999997, - "loss": 1.7714, + "grad_norm": 1.0818697214126587, + "learning_rate": 0.00019649999999999998, + "loss": 1.9727, "step": 663 }, { "epoch": 1.0624, - "grad_norm": 1.2843750715255737, - "learning_rate": 0.0002624, - "loss": 1.6399, + "grad_norm": 1.0074880123138428, + "learning_rate": 0.00019679999999999999, + "loss": 1.9209, "step": 664 }, { "epoch": 1.064, - "grad_norm": 1.7036292552947998, - "learning_rate": 0.0002628, - "loss": 2.1274, + "grad_norm": 2.4491970539093018, + "learning_rate": 0.0001971, + "loss": 2.1664, "step": 665 }, { "epoch": 1.0656, - "grad_norm": 2.123671293258667, - "learning_rate": 0.00026319999999999995, - "loss": 2.1099, + "grad_norm": 1.978613018989563, + "learning_rate": 0.0001974, + "loss": 2.3415, "step": 666 }, { "epoch": 1.0672, - "grad_norm": 1.3578087091445923, - "learning_rate": 0.0002636, - "loss": 1.6789, + "grad_norm": 1.2120779752731323, + "learning_rate": 0.00019769999999999998, + "loss": 2.3223, "step": 667 }, { "epoch": 1.0688, - "grad_norm": 2.183074951171875, - "learning_rate": 0.00026399999999999997, - "loss": 2.2433, + "grad_norm": 1.1476809978485107, + "learning_rate": 0.000198, + "loss": 2.0348, "step": 668 }, { "epoch": 1.0704, - "grad_norm": 1.4325931072235107, - "learning_rate": 0.0002644, - "loss": 2.0169, + "grad_norm": 1.0597435235977173, + "learning_rate": 0.0001983, + "loss": 2.0713, "step": 669 }, { "epoch": 1.072, - "grad_norm": 1.475628137588501, - "learning_rate": 0.0002648, - "loss": 2.0555, + "grad_norm": 1.3737848997116089, + "learning_rate": 0.0001986, + "loss": 2.0087, "step": 670 }, { "epoch": 1.0735999999999999, - "grad_norm": 1.7524343729019165, - "learning_rate": 0.0002652, - "loss": 1.9133, + "grad_norm": 1.1806519031524658, + "learning_rate": 0.00019889999999999998, + "loss": 1.9085, "step": 671 }, { "epoch": 1.0752, - "grad_norm": 1.3496627807617188, - "learning_rate": 0.00026559999999999995, - "loss": 1.8063, + "grad_norm": 1.1602704524993896, + "learning_rate": 0.0001992, + "loss": 2.383, "step": 672 }, { "epoch": 1.0768, - "grad_norm": 1.2976840734481812, - "learning_rate": 0.000266, - "loss": 2.256, + "grad_norm": 1.4072346687316895, + "learning_rate": 0.0001995, + "loss": 2.2611, "step": 673 }, { "epoch": 1.0784, - "grad_norm": 1.704746127128601, - "learning_rate": 0.00026639999999999997, - "loss": 1.7376, + "grad_norm": 1.908771276473999, + "learning_rate": 0.0001998, + "loss": 2.3743, "step": 674 }, { "epoch": 1.08, - "grad_norm": 2.0752077102661133, - "learning_rate": 0.0002668, - "loss": 2.7425, + "grad_norm": NaN, + "learning_rate": 0.0001998, + "loss": 2.1281, "step": 675 }, { "epoch": 1.0816, - "grad_norm": 18.08973503112793, - "learning_rate": 0.0002672, - "loss": 3.7616, + "grad_norm": 19.099306106567383, + "learning_rate": 0.00020009999999999998, + "loss": 3.7561, "step": 676 }, { "epoch": 1.0832, - "grad_norm": 4.786888599395752, - "learning_rate": 0.0002676, - "loss": 2.572, + "grad_norm": 11.901508331298828, + "learning_rate": 0.0002004, + "loss": 3.0756, "step": 677 }, { "epoch": 1.0848, - "grad_norm": 4.977888584136963, - "learning_rate": 0.00026799999999999995, - "loss": 2.6192, + "grad_norm": 7.387051105499268, + "learning_rate": 0.0002007, + "loss": 2.6726, "step": 678 }, { "epoch": 1.0864, - "grad_norm": 1.7943665981292725, - "learning_rate": 0.0002684, - "loss": 2.0963, + "grad_norm": 2.313292980194092, + "learning_rate": 0.000201, + "loss": 2.0645, "step": 679 }, { "epoch": 1.088, - "grad_norm": 3.845595359802246, - "learning_rate": 0.0002688, - "loss": 2.101, + "grad_norm": 2.085894823074341, + "learning_rate": 0.0002013, + "loss": 2.043, "step": 680 }, { "epoch": 1.0896, - "grad_norm": 3.880871295928955, - "learning_rate": 0.0002692, - "loss": 2.0843, + "grad_norm": 3.863337278366089, + "learning_rate": 0.0002016, + "loss": 2.1174, "step": 681 }, { "epoch": 1.0912, - "grad_norm": 4.223239898681641, - "learning_rate": 0.00026959999999999994, - "loss": 2.2103, + "grad_norm": 4.193874835968018, + "learning_rate": 0.0002019, + "loss": 1.9003, "step": 682 }, { "epoch": 1.0928, - "grad_norm": 4.312764644622803, - "learning_rate": 0.00027, - "loss": 1.816, + "grad_norm": 4.875527858734131, + "learning_rate": 0.0002022, + "loss": 2.187, "step": 683 }, { "epoch": 1.0944, - "grad_norm": 2.8680918216705322, - "learning_rate": 0.00027039999999999996, - "loss": 1.5292, + "grad_norm": 2.392341375350952, + "learning_rate": 0.0002025, + "loss": 2.2106, "step": 684 }, { "epoch": 1.096, - "grad_norm": 5.548489093780518, - "learning_rate": 0.00027079999999999997, - "loss": 2.2242, + "grad_norm": 1.1724746227264404, + "learning_rate": 0.0002028, + "loss": 1.894, "step": 685 }, { "epoch": 1.0976, - "grad_norm": 0.9976663589477539, - "learning_rate": 0.0002712, - "loss": 1.8811, + "grad_norm": 1.3257207870483398, + "learning_rate": 0.0002031, + "loss": 1.8114, "step": 686 }, { "epoch": 1.0992, - "grad_norm": 1.6501846313476562, - "learning_rate": 0.0002716, - "loss": 1.5119, + "grad_norm": 1.1348336935043335, + "learning_rate": 0.00020339999999999998, + "loss": 1.6732, "step": 687 }, { "epoch": 1.1008, - "grad_norm": 0.8941024541854858, - "learning_rate": 0.00027199999999999994, - "loss": 1.4123, + "grad_norm": 2.1080403327941895, + "learning_rate": 0.0002037, + "loss": 1.9022, "step": 688 }, { "epoch": 1.1024, - "grad_norm": 1.797055721282959, - "learning_rate": 0.0002724, - "loss": 1.7278, + "grad_norm": 1.1208211183547974, + "learning_rate": 0.000204, + "loss": 1.6641, "step": 689 }, { "epoch": 1.104, - "grad_norm": 0.8527103066444397, - "learning_rate": 0.00027279999999999996, - "loss": 1.5845, + "grad_norm": 1.4055548906326294, + "learning_rate": 0.0002043, + "loss": 1.7557, "step": 690 }, { "epoch": 1.1056, - "grad_norm": 1.487897515296936, - "learning_rate": 0.00027319999999999997, - "loss": 1.5444, + "grad_norm": 1.0471781492233276, + "learning_rate": 0.00020459999999999999, + "loss": 1.629, "step": 691 }, { "epoch": 1.1072, - "grad_norm": 1.2457863092422485, - "learning_rate": 0.0002736, - "loss": 1.646, + "grad_norm": 0.8493616580963135, + "learning_rate": 0.0002049, + "loss": 1.8218, "step": 692 }, { "epoch": 1.1088, - "grad_norm": 1.3996880054473877, - "learning_rate": 0.000274, - "loss": 1.5098, + "grad_norm": 1.1957522630691528, + "learning_rate": 0.0002052, + "loss": 2.1286, "step": 693 }, { "epoch": 1.1104, - "grad_norm": 1.7704321146011353, - "learning_rate": 0.00027439999999999995, - "loss": 1.6845, + "grad_norm": 0.6791967153549194, + "learning_rate": 0.0002055, + "loss": 1.8816, "step": 694 }, { "epoch": 1.112, - "grad_norm": 1.5547606945037842, - "learning_rate": 0.0002748, - "loss": 1.7829, + "grad_norm": 0.8928971290588379, + "learning_rate": 0.0002058, + "loss": 1.8552, "step": 695 }, { "epoch": 1.1136, - "grad_norm": 1.3222217559814453, - "learning_rate": 0.00027519999999999997, - "loss": 1.5888, + "grad_norm": 1.1113934516906738, + "learning_rate": 0.0002061, + "loss": 1.8408, "step": 696 }, { "epoch": 1.1152, - "grad_norm": 0.9179243445396423, - "learning_rate": 0.0002756, - "loss": 1.7316, + "grad_norm": 0.948301374912262, + "learning_rate": 0.00020639999999999998, + "loss": 1.918, "step": 697 }, { "epoch": 1.1168, - "grad_norm": 1.142059087753296, - "learning_rate": 0.000276, - "loss": 1.7593, + "grad_norm": 0.7305535078048706, + "learning_rate": 0.00020669999999999996, + "loss": 1.722, "step": 698 }, { "epoch": 1.1184, - "grad_norm": 1.3432698249816895, - "learning_rate": 0.0002764, - "loss": 1.7522, + "grad_norm": 2.527282953262329, + "learning_rate": 0.00020699999999999996, + "loss": 2.1528, "step": 699 }, { "epoch": 1.12, - "grad_norm": 0.8155900239944458, - "learning_rate": 0.00027679999999999995, - "loss": 1.7013, - "step": 700 - }, - { - "epoch": 1.12, - "eval_cer": 0.44886196037788495, - "eval_loss": 2.1150970458984375, - "eval_runtime": 158.4276, - "eval_samples_per_second": 19.795, - "eval_steps_per_second": 1.237, - "eval_wer": 0.6820018650917004, + "grad_norm": 2.323239326477051, + "learning_rate": 0.00020729999999999997, + "loss": 1.8547, "step": 700 }, { "epoch": 1.1216, - "grad_norm": 0.9745352864265442, - "learning_rate": 0.0002772, - "loss": 1.7217, + "grad_norm": 1.690564751625061, + "learning_rate": 0.00020759999999999998, + "loss": 1.7551, "step": 701 }, { "epoch": 1.1232, - "grad_norm": 0.939247190952301, - "learning_rate": 0.00027759999999999997, - "loss": 1.5636, + "grad_norm": 1.1845993995666504, + "learning_rate": 0.00020789999999999996, + "loss": 2.0184, "step": 702 }, { "epoch": 1.1248, - "grad_norm": 1.1377625465393066, - "learning_rate": 0.000278, - "loss": 1.3686, + "grad_norm": 0.9866268038749695, + "learning_rate": 0.00020819999999999996, + "loss": 1.7094, "step": 703 }, { "epoch": 1.1264, - "grad_norm": 1.3641624450683594, - "learning_rate": 0.0002784, - "loss": 1.6273, + "grad_norm": 1.6895757913589478, + "learning_rate": 0.00020849999999999997, + "loss": 1.805, "step": 704 }, { "epoch": 1.1280000000000001, - "grad_norm": 1.7075438499450684, - "learning_rate": 0.0002788, - "loss": 1.6859, + "grad_norm": 1.2320947647094727, + "learning_rate": 0.00020879999999999998, + "loss": 1.6979, "step": 705 }, { "epoch": 1.1296, - "grad_norm": 0.9067268371582031, - "learning_rate": 0.00027919999999999996, - "loss": 1.8782, + "grad_norm": 1.0378570556640625, + "learning_rate": 0.00020909999999999996, + "loss": 1.6646, "step": 706 }, { "epoch": 1.1312, - "grad_norm": 2.020303964614868, - "learning_rate": 0.00027959999999999997, - "loss": 1.8752, + "grad_norm": 1.7210016250610352, + "learning_rate": 0.00020939999999999997, + "loss": 2.0604, "step": 707 }, { "epoch": 1.1328, - "grad_norm": 1.2338893413543701, - "learning_rate": 0.00028, - "loss": 1.4884, + "grad_norm": 1.0748008489608765, + "learning_rate": 0.00020969999999999997, + "loss": 1.7569, "step": 708 }, { "epoch": 1.1344, - "grad_norm": 1.5765209197998047, - "learning_rate": 0.0002804, - "loss": 1.8257, + "grad_norm": 1.0989258289337158, + "learning_rate": 0.00020999999999999998, + "loss": 1.6594, "step": 709 }, { "epoch": 1.1360000000000001, - "grad_norm": 1.7421718835830688, - "learning_rate": 0.0002808, - "loss": 1.993, + "grad_norm": 1.1351910829544067, + "learning_rate": 0.00021029999999999996, + "loss": 1.9019, "step": 710 }, { "epoch": 1.1376, - "grad_norm": 1.4602911472320557, - "learning_rate": 0.0002812, - "loss": 1.2757, + "grad_norm": 1.115064263343811, + "learning_rate": 0.00021059999999999997, + "loss": 1.6088, "step": 711 }, { "epoch": 1.1392, - "grad_norm": 1.140167236328125, - "learning_rate": 0.00028159999999999996, - "loss": 1.7157, + "grad_norm": 1.037343978881836, + "learning_rate": 0.00021089999999999998, + "loss": 1.5529, "step": 712 }, { "epoch": 1.1408, - "grad_norm": 3.379197120666504, - "learning_rate": 0.00028199999999999997, - "loss": 1.906, + "grad_norm": 1.8999748229980469, + "learning_rate": 0.00021119999999999996, + "loss": 1.7772, "step": 713 }, { "epoch": 1.1424, - "grad_norm": 1.0864256620407104, - "learning_rate": 0.0002824, - "loss": 1.6679, + "grad_norm": 1.6710184812545776, + "learning_rate": 0.00021149999999999996, + "loss": 1.7199, "step": 714 }, { "epoch": 1.144, - "grad_norm": 4.162722587585449, - "learning_rate": 0.0002828, - "loss": 1.85, + "grad_norm": 0.9634356498718262, + "learning_rate": 0.00021179999999999997, + "loss": 1.9266, "step": 715 }, { "epoch": 1.1456, - "grad_norm": 1.3800784349441528, - "learning_rate": 0.00028319999999999994, - "loss": 1.596, + "grad_norm": 1.2477611303329468, + "learning_rate": 0.00021209999999999998, + "loss": 1.7878, "step": 716 }, { "epoch": 1.1472, - "grad_norm": 1.3533785343170166, - "learning_rate": 0.0002836, - "loss": 1.3124, + "grad_norm": 1.0142041444778442, + "learning_rate": 0.00021239999999999996, + "loss": 1.4408, "step": 717 }, { "epoch": 1.1488, - "grad_norm": 1.9931690692901611, - "learning_rate": 0.00028399999999999996, - "loss": 2.2037, + "grad_norm": 4.752976417541504, + "learning_rate": 0.00021269999999999997, + "loss": 2.5672, "step": 718 }, { "epoch": 1.1504, - "grad_norm": 1.353873372077942, - "learning_rate": 0.0002844, - "loss": 1.7149, + "grad_norm": 1.2975269556045532, + "learning_rate": 0.00021299999999999997, + "loss": 1.8788, "step": 719 }, { "epoch": 1.152, - "grad_norm": 1.64341139793396, - "learning_rate": 0.0002848, - "loss": 1.9453, + "grad_norm": 1.2199846506118774, + "learning_rate": 0.00021329999999999998, + "loss": 1.8968, "step": 720 }, { "epoch": 1.1536, - "grad_norm": 1.1980323791503906, - "learning_rate": 0.0002852, - "loss": 1.8312, + "grad_norm": 1.3590461015701294, + "learning_rate": 0.00021359999999999996, + "loss": 1.5399, "step": 721 }, { "epoch": 1.1552, - "grad_norm": 2.0850045680999756, - "learning_rate": 0.00028559999999999995, - "loss": 1.5216, + "grad_norm": 3.0514047145843506, + "learning_rate": 0.00021389999999999997, + "loss": 2.1824, "step": 722 }, { "epoch": 1.1568, - "grad_norm": 1.8492074012756348, - "learning_rate": 0.00028599999999999996, - "loss": 2.448, + "grad_norm": 3.324256658554077, + "learning_rate": 0.00021419999999999998, + "loss": 2.248, "step": 723 }, { "epoch": 1.1584, - "grad_norm": 1.954471468925476, - "learning_rate": 0.00028639999999999997, - "loss": 1.6491, + "grad_norm": 1.6645241975784302, + "learning_rate": 0.00021449999999999998, + "loss": 2.0545, "step": 724 }, { "epoch": 1.16, - "grad_norm": 1.8707425594329834, - "learning_rate": 0.0002868, - "loss": 1.9693, + "grad_norm": 3.6503653526306152, + "learning_rate": 0.00021479999999999996, + "loss": 2.4739, "step": 725 }, { "epoch": 1.1616, - "grad_norm": 28.85036277770996, - "learning_rate": 0.0002872, - "loss": 4.7933, + "grad_norm": 14.17703914642334, + "learning_rate": 0.00021509999999999997, + "loss": 3.3052, "step": 726 }, { "epoch": 1.1632, - "grad_norm": 4.971120834350586, - "learning_rate": 0.0002876, - "loss": 2.2725, + "grad_norm": 12.942245483398438, + "learning_rate": 0.00021539999999999998, + "loss": 3.0785, "step": 727 }, { "epoch": 1.1648, - "grad_norm": 11.561688423156738, - "learning_rate": 0.00028799999999999995, - "loss": 3.1344, + "grad_norm": 3.433819055557251, + "learning_rate": 0.00021569999999999998, + "loss": 1.9356, "step": 728 }, { "epoch": 1.1663999999999999, - "grad_norm": 2.439958333969116, - "learning_rate": 0.00028839999999999996, - "loss": 1.7422, + "grad_norm": 3.700389862060547, + "learning_rate": 0.00021599999999999996, + "loss": 2.3045, "step": 729 }, { "epoch": 1.168, - "grad_norm": 2.66070818901062, - "learning_rate": 0.00028879999999999997, - "loss": 2.2646, + "grad_norm": 4.333859443664551, + "learning_rate": 0.00021629999999999997, + "loss": 2.1794, "step": 730 }, { "epoch": 1.1696, - "grad_norm": 1.5238914489746094, - "learning_rate": 0.0002892, - "loss": 1.971, + "grad_norm": 1.7107884883880615, + "learning_rate": 0.00021659999999999998, + "loss": 2.3069, "step": 731 }, { "epoch": 1.1712, - "grad_norm": 5.540120601654053, - "learning_rate": 0.0002896, - "loss": 1.7024, + "grad_norm": 1.8687323331832886, + "learning_rate": 0.0002169, + "loss": 2.7628, "step": 732 }, { "epoch": 1.1728, - "grad_norm": 5.015547275543213, - "learning_rate": 0.00029, - "loss": 1.9131, + "grad_norm": 4.187427043914795, + "learning_rate": 0.00021719999999999997, + "loss": 1.8492, "step": 733 }, { "epoch": 1.1743999999999999, - "grad_norm": 7.225318431854248, - "learning_rate": 0.00029039999999999996, - "loss": 2.0965, + "grad_norm": 6.42722225189209, + "learning_rate": 0.00021749999999999997, + "loss": 1.9605, "step": 734 }, { "epoch": 1.176, - "grad_norm": 5.978356838226318, - "learning_rate": 0.00029079999999999997, - "loss": 1.9004, + "grad_norm": 5.926379203796387, + "learning_rate": 0.00021779999999999998, + "loss": 1.9447, "step": 735 }, { "epoch": 1.1776, - "grad_norm": 5.611774444580078, - "learning_rate": 0.0002912, - "loss": 2.0044, + "grad_norm": 5.0843119621276855, + "learning_rate": 0.00021809999999999996, + "loss": 1.9547, "step": 736 }, { "epoch": 1.1792, - "grad_norm": 3.98929500579834, - "learning_rate": 0.0002916, - "loss": 1.827, + "grad_norm": 3.0548288822174072, + "learning_rate": 0.00021839999999999997, + "loss": 1.9428, "step": 737 }, { "epoch": 1.1808, - "grad_norm": 1.499614953994751, - "learning_rate": 0.000292, - "loss": 1.7716, + "grad_norm": 1.9500654935836792, + "learning_rate": 0.00021869999999999998, + "loss": 1.707, "step": 738 }, { "epoch": 1.1824, - "grad_norm": 0.908250093460083, - "learning_rate": 0.0002924, - "loss": 1.751, + "grad_norm": 1.0856446027755737, + "learning_rate": 0.00021899999999999998, + "loss": 1.8138, "step": 739 }, { "epoch": 1.184, - "grad_norm": 1.4749705791473389, - "learning_rate": 0.00029279999999999996, - "loss": 1.7102, + "grad_norm": 2.099169969558716, + "learning_rate": 0.00021929999999999996, + "loss": 1.9153, "step": 740 }, { "epoch": 1.1856, - "grad_norm": 2.4379866123199463, - "learning_rate": 0.00029319999999999997, - "loss": 1.5624, + "grad_norm": 3.320614814758301, + "learning_rate": 0.00021959999999999997, + "loss": 1.9022, "step": 741 }, { "epoch": 1.1872, - "grad_norm": 1.296765923500061, - "learning_rate": 0.0002936, - "loss": 1.6655, + "grad_norm": 2.1835997104644775, + "learning_rate": 0.00021989999999999998, + "loss": 1.7657, "step": 742 }, { "epoch": 1.1888, - "grad_norm": 1.7663196325302124, - "learning_rate": 0.000294, - "loss": 1.4732, + "grad_norm": 1.8994768857955933, + "learning_rate": 0.00022019999999999999, + "loss": 1.6526, "step": 743 }, { "epoch": 1.1904, - "grad_norm": 1.1070098876953125, - "learning_rate": 0.00029439999999999995, - "loss": 1.6552, + "grad_norm": 1.6875131130218506, + "learning_rate": 0.00022049999999999997, + "loss": 1.7326, "step": 744 }, { "epoch": 1.192, - "grad_norm": 0.7955397367477417, - "learning_rate": 0.00029479999999999996, - "loss": 1.335, + "grad_norm": 1.7935808897018433, + "learning_rate": 0.00022079999999999997, + "loss": 1.6772, "step": 745 }, { "epoch": 1.1936, - "grad_norm": 1.6219990253448486, - "learning_rate": 0.00029519999999999997, - "loss": 1.5489, + "grad_norm": 1.137858510017395, + "learning_rate": 0.00022109999999999998, + "loss": 1.8035, "step": 746 }, { "epoch": 1.1952, - "grad_norm": 0.8686781525611877, - "learning_rate": 0.0002956, - "loss": 1.417, + "grad_norm": 1.6366093158721924, + "learning_rate": 0.0002214, + "loss": 1.6763, "step": 747 }, { "epoch": 1.1968, - "grad_norm": 2.419113874435425, - "learning_rate": 0.000296, - "loss": 1.6413, + "grad_norm": 1.0513057708740234, + "learning_rate": 0.00022169999999999997, + "loss": 2.0465, "step": 748 }, { "epoch": 1.1984, - "grad_norm": 2.114264488220215, - "learning_rate": 0.0002964, - "loss": 1.9603, + "grad_norm": 2.0557422637939453, + "learning_rate": 0.00022199999999999998, + "loss": 1.6699, "step": 749 }, { "epoch": 1.2, - "grad_norm": 1.2210099697113037, - "learning_rate": 0.00029679999999999995, - "loss": 1.8247, + "grad_norm": 2.343322515487671, + "learning_rate": 0.00022229999999999998, + "loss": 1.5154, "step": 750 }, { "epoch": 1.2016, - "grad_norm": 1.4565551280975342, - "learning_rate": 0.00029719999999999996, - "loss": 1.3969, + "grad_norm": 2.144352912902832, + "learning_rate": 0.0002226, + "loss": 1.9019, "step": 751 }, { "epoch": 1.2032, - "grad_norm": 0.8622385263442993, - "learning_rate": 0.00029759999999999997, - "loss": 1.4452, + "grad_norm": 1.0311518907546997, + "learning_rate": 0.00022289999999999997, + "loss": 1.611, "step": 752 }, { "epoch": 1.2048, - "grad_norm": 0.817267656326294, - "learning_rate": 0.000298, - "loss": 1.8705, + "grad_norm": 1.594190001487732, + "learning_rate": 0.00022319999999999998, + "loss": 1.8844, "step": 753 }, { "epoch": 1.2064, - "grad_norm": 1.0307005643844604, - "learning_rate": 0.0002984, - "loss": 1.4181, + "grad_norm": 1.0888090133666992, + "learning_rate": 0.00022349999999999998, + "loss": 1.4243, "step": 754 }, { "epoch": 1.208, - "grad_norm": 1.6527661085128784, - "learning_rate": 0.0002988, - "loss": 1.6909, + "grad_norm": 0.8538291454315186, + "learning_rate": 0.0002238, + "loss": 1.7549, "step": 755 }, { "epoch": 1.2096, - "grad_norm": 1.1635178327560425, - "learning_rate": 0.00029919999999999995, - "loss": 1.6779, + "grad_norm": 0.9313039779663086, + "learning_rate": 0.00022409999999999997, + "loss": 1.5294, "step": 756 }, { "epoch": 1.2112, - "grad_norm": 1.4051568508148193, - "learning_rate": 0.00029959999999999996, - "loss": 1.2773, + "grad_norm": 1.0965137481689453, + "learning_rate": 0.00022439999999999998, + "loss": 1.5921, "step": 757 }, { "epoch": 1.2128, - "grad_norm": 1.8025474548339844, - "learning_rate": 0.0003, - "loss": 1.7951, + "grad_norm": 1.8259750604629517, + "learning_rate": 0.0002247, + "loss": 1.6646, "step": 758 }, { "epoch": 1.2144, - "grad_norm": 1.042132019996643, - "learning_rate": 0.0003004, - "loss": 1.5419, + "grad_norm": 1.3318666219711304, + "learning_rate": 0.000225, + "loss": 1.7923, "step": 759 }, { "epoch": 1.216, - "grad_norm": 1.1578173637390137, - "learning_rate": 0.00030079999999999994, - "loss": 1.4649, + "grad_norm": 1.7984483242034912, + "learning_rate": 0.00022529999999999997, + "loss": 2.1313, "step": 760 }, { "epoch": 1.2176, - "grad_norm": 2.2228667736053467, - "learning_rate": 0.00030119999999999995, - "loss": 2.0639, + "grad_norm": 1.0261139869689941, + "learning_rate": 0.00022559999999999998, + "loss": 1.8728, "step": 761 }, { "epoch": 1.2192, - "grad_norm": 0.8677162528038025, - "learning_rate": 0.0003016, - "loss": 1.4764, + "grad_norm": 1.089312195777893, + "learning_rate": 0.0002259, + "loss": 1.7172, "step": 762 }, { "epoch": 1.2208, - "grad_norm": 1.2380160093307495, - "learning_rate": 0.00030199999999999997, - "loss": 1.6447, + "grad_norm": 0.9745715856552124, + "learning_rate": 0.00022619999999999997, + "loss": 1.8426, "step": 763 }, { "epoch": 1.2224, - "grad_norm": 1.3636889457702637, - "learning_rate": 0.0003024, - "loss": 1.7581, + "grad_norm": 1.1948559284210205, + "learning_rate": 0.00022649999999999998, + "loss": 1.6767, "step": 764 }, { "epoch": 1.224, - "grad_norm": 1.1308799982070923, - "learning_rate": 0.0003028, - "loss": 1.7186, + "grad_norm": 1.2432363033294678, + "learning_rate": 0.00022679999999999998, + "loss": 1.6967, "step": 765 }, { "epoch": 1.2256, - "grad_norm": 1.5796843767166138, - "learning_rate": 0.00030319999999999994, - "loss": 1.7703, + "grad_norm": 1.209937572479248, + "learning_rate": 0.0002271, + "loss": 2.2466, "step": 766 }, { "epoch": 1.2272, - "grad_norm": 1.0818617343902588, - "learning_rate": 0.00030359999999999995, - "loss": 1.418, + "grad_norm": 1.1278433799743652, + "learning_rate": 0.00022739999999999997, + "loss": 1.8898, "step": 767 }, { "epoch": 1.2288000000000001, - "grad_norm": 2.3142552375793457, - "learning_rate": 0.000304, - "loss": 1.935, + "grad_norm": 0.9980478286743164, + "learning_rate": 0.00022769999999999998, + "loss": 1.8697, "step": 768 }, { "epoch": 1.2304, - "grad_norm": 1.6650938987731934, - "learning_rate": 0.00030439999999999997, - "loss": 2.1436, + "grad_norm": 1.7807966470718384, + "learning_rate": 0.00022799999999999999, + "loss": 1.6999, "step": 769 }, { "epoch": 1.232, - "grad_norm": 1.7241698503494263, - "learning_rate": 0.0003048, - "loss": 1.6536, + "grad_norm": 1.3307067155838013, + "learning_rate": 0.0002283, + "loss": 2.0235, "step": 770 }, { "epoch": 1.2336, - "grad_norm": NaN, - "learning_rate": 0.0003048, - "loss": 1.5988, + "grad_norm": 1.336665391921997, + "learning_rate": 0.00022859999999999997, + "loss": 1.9368, "step": 771 }, { "epoch": 1.2352, - "grad_norm": 1.0828837156295776, - "learning_rate": 0.0003052, - "loss": 2.0584, + "grad_norm": 1.5342364311218262, + "learning_rate": 0.00022889999999999998, + "loss": 1.736, "step": 772 }, { "epoch": 1.2368000000000001, - "grad_norm": 1.8787623643875122, - "learning_rate": 0.00030559999999999995, - "loss": 2.1669, + "grad_norm": 1.804161548614502, + "learning_rate": 0.0002292, + "loss": 2.3303, "step": 773 }, { "epoch": 1.2384, - "grad_norm": 1.8352808952331543, - "learning_rate": 0.00030599999999999996, - "loss": 2.4238, + "grad_norm": 1.1865055561065674, + "learning_rate": 0.0002295, + "loss": 1.8258, "step": 774 }, { "epoch": 1.24, - "grad_norm": 2.0836939811706543, - "learning_rate": 0.0003064, - "loss": 2.7034, + "grad_norm": NaN, + "learning_rate": 0.0002295, + "loss": 1.4765, "step": 775 }, { "epoch": 1.2416, - "grad_norm": 7.965085983276367, - "learning_rate": 0.0003068, - "loss": 2.6308, + "grad_norm": 11.694743156433105, + "learning_rate": 0.00022979999999999997, + "loss": 3.1048, "step": 776 }, { "epoch": 1.2432, - "grad_norm": 2.8979556560516357, - "learning_rate": 0.0003072, - "loss": 1.8492, + "grad_norm": 4.708855628967285, + "learning_rate": 0.00023009999999999998, + "loss": 2.2849, "step": 777 }, { "epoch": 1.2448, - "grad_norm": 1.722564697265625, - "learning_rate": 0.0003076, - "loss": 1.5315, + "grad_norm": 1.5170272588729858, + "learning_rate": 0.0002304, + "loss": 2.1169, "step": 778 }, { "epoch": 1.2464, - "grad_norm": 2.6675570011138916, - "learning_rate": 0.00030799999999999995, - "loss": 2.2452, + "grad_norm": 3.2617526054382324, + "learning_rate": 0.0002307, + "loss": 2.6471, "step": 779 }, { "epoch": 1.248, - "grad_norm": 2.365095853805542, - "learning_rate": 0.00030839999999999996, - "loss": 1.8645, + "grad_norm": 2.189706325531006, + "learning_rate": 0.00023099999999999998, + "loss": 2.264, "step": 780 }, { "epoch": 1.2496, - "grad_norm": 2.9345123767852783, - "learning_rate": 0.0003088, - "loss": 1.9889, + "grad_norm": 2.7677061557769775, + "learning_rate": 0.00023129999999999998, + "loss": 2.2306, "step": 781 }, { "epoch": 1.2511999999999999, - "grad_norm": 2.7510879039764404, - "learning_rate": 0.0003092, - "loss": 1.863, + "grad_norm": 5.468600749969482, + "learning_rate": 0.0002316, + "loss": 2.0234, "step": 782 }, { "epoch": 1.2528000000000001, - "grad_norm": 3.8311636447906494, - "learning_rate": 0.0003096, - "loss": 1.7916, + "grad_norm": 6.928030014038086, + "learning_rate": 0.0002319, + "loss": 1.947, "step": 783 }, { "epoch": 1.2544, - "grad_norm": 2.39135479927063, - "learning_rate": 0.00031, - "loss": 1.5686, + "grad_norm": 7.065882205963135, + "learning_rate": 0.00023219999999999998, + "loss": 1.9462, "step": 784 }, { "epoch": 1.256, - "grad_norm": 2.308666944503784, - "learning_rate": 0.00031039999999999996, - "loss": 1.7752, + "grad_norm": 5.484043121337891, + "learning_rate": 0.00023249999999999999, + "loss": 2.0667, "step": 785 }, { "epoch": 1.2576, - "grad_norm": 1.1244654655456543, - "learning_rate": 0.00031079999999999997, - "loss": 1.5987, + "grad_norm": 4.054402828216553, + "learning_rate": 0.0002328, + "loss": 1.8082, "step": 786 }, { "epoch": 1.2591999999999999, - "grad_norm": 2.477769613265991, - "learning_rate": 0.00031120000000000003, - "loss": 1.5174, + "grad_norm": 2.286044120788574, + "learning_rate": 0.00023309999999999997, + "loss": 1.807, "step": 787 }, { "epoch": 1.2608, - "grad_norm": 1.5737215280532837, - "learning_rate": 0.0003116, - "loss": 1.4032, + "grad_norm": 1.4100284576416016, + "learning_rate": 0.00023339999999999998, + "loss": 1.7669, "step": 788 }, { "epoch": 1.2624, - "grad_norm": 1.4628270864486694, - "learning_rate": 0.000312, - "loss": 1.3916, + "grad_norm": 2.072185754776001, + "learning_rate": 0.0002337, + "loss": 1.549, "step": 789 }, { "epoch": 1.264, - "grad_norm": 2.7115325927734375, - "learning_rate": 0.0003124, - "loss": 1.2787, + "grad_norm": 2.230264663696289, + "learning_rate": 0.000234, + "loss": 1.6862, "step": 790 }, { "epoch": 1.2656, - "grad_norm": 2.164787769317627, - "learning_rate": 0.00031279999999999996, - "loss": 1.4398, + "grad_norm": 3.1086905002593994, + "learning_rate": 0.00023429999999999998, + "loss": 1.593, "step": 791 }, { "epoch": 1.2671999999999999, - "grad_norm": 1.2060961723327637, - "learning_rate": 0.00031319999999999997, - "loss": 1.4638, + "grad_norm": 2.1213574409484863, + "learning_rate": 0.00023459999999999998, + "loss": 1.5835, "step": 792 }, { "epoch": 1.2688, - "grad_norm": 1.0704708099365234, - "learning_rate": 0.0003135999999999999, - "loss": 1.4506, + "grad_norm": 4.03560209274292, + "learning_rate": 0.0002349, + "loss": 2.1431, "step": 793 }, { "epoch": 1.2704, - "grad_norm": 1.1875430345535278, - "learning_rate": 0.000314, - "loss": 1.6161, + "grad_norm": 1.5937985181808472, + "learning_rate": 0.0002352, + "loss": 1.8309, "step": 794 }, { "epoch": 1.272, - "grad_norm": 0.8472498655319214, - "learning_rate": 0.0003144, - "loss": 1.4842, + "grad_norm": 1.3385921716690063, + "learning_rate": 0.00023549999999999998, + "loss": 1.6525, "step": 795 }, { "epoch": 1.2736, - "grad_norm": 1.229743242263794, - "learning_rate": 0.00031479999999999995, - "loss": 1.5075, + "grad_norm": 1.2098594903945923, + "learning_rate": 0.00023579999999999999, + "loss": 1.849, "step": 796 }, { "epoch": 1.2752, - "grad_norm": 0.8419369459152222, - "learning_rate": 0.00031519999999999996, - "loss": 1.3735, + "grad_norm": 1.2983063459396362, + "learning_rate": 0.0002361, + "loss": 1.6045, "step": 797 }, { "epoch": 1.2768, - "grad_norm": 1.6676976680755615, - "learning_rate": 0.0003156, - "loss": 1.5081, + "grad_norm": 1.0883206129074097, + "learning_rate": 0.0002364, + "loss": 1.5005, "step": 798 }, { "epoch": 1.2784, - "grad_norm": 1.597878098487854, - "learning_rate": 0.00031599999999999993, - "loss": 1.513, + "grad_norm": 1.3664700984954834, + "learning_rate": 0.00023669999999999998, + "loss": 1.5107, "step": 799 }, { "epoch": 1.28, - "grad_norm": 0.893544614315033, - "learning_rate": 0.00031639999999999994, - "loss": 1.2995, - "step": 800 - }, - { - "epoch": 1.28, - "eval_cer": 0.415968429864073, - "eval_loss": 2.0757076740264893, - "eval_runtime": 158.58, - "eval_samples_per_second": 19.776, - "eval_steps_per_second": 1.236, - "eval_wer": 0.6244948709978241, + "grad_norm": 1.9094866514205933, + "learning_rate": 0.000237, + "loss": 1.8143, "step": 800 }, { "epoch": 1.2816, - "grad_norm": 3.154768466949463, - "learning_rate": 0.0003168, - "loss": 1.7691, + "grad_norm": 1.615760326385498, + "learning_rate": 0.0002373, + "loss": 1.889, "step": 801 }, { "epoch": 1.2832, - "grad_norm": 0.7817122340202332, - "learning_rate": 0.00031719999999999996, - "loss": 1.5512, + "grad_norm": 1.3856728076934814, + "learning_rate": 0.0002376, + "loss": 1.7875, "step": 802 }, { "epoch": 1.2848, - "grad_norm": 1.3338487148284912, - "learning_rate": 0.00031759999999999997, - "loss": 1.6868, + "grad_norm": 1.2767575979232788, + "learning_rate": 0.00023789999999999998, + "loss": 1.7884, "step": 803 }, { "epoch": 1.2864, - "grad_norm": 1.1260954141616821, - "learning_rate": 0.000318, - "loss": 1.6486, + "grad_norm": 0.93840491771698, + "learning_rate": 0.0002382, + "loss": 1.5185, "step": 804 }, { "epoch": 1.288, - "grad_norm": 1.3136061429977417, - "learning_rate": 0.00031839999999999993, - "loss": 1.6089, + "grad_norm": 0.9999681115150452, + "learning_rate": 0.0002385, + "loss": 1.8636, "step": 805 }, { "epoch": 1.2896, - "grad_norm": 1.137098789215088, - "learning_rate": 0.00031879999999999994, - "loss": 1.5119, + "grad_norm": 1.317078709602356, + "learning_rate": 0.0002388, + "loss": 1.3411, "step": 806 }, { "epoch": 1.2912, - "grad_norm": 0.9974707365036011, - "learning_rate": 0.0003192, - "loss": 1.4021, + "grad_norm": 1.2333732843399048, + "learning_rate": 0.00023909999999999998, + "loss": 1.694, "step": 807 }, { "epoch": 1.2928, - "grad_norm": 1.6253764629364014, - "learning_rate": 0.00031959999999999996, - "loss": 1.6299, + "grad_norm": 1.1172211170196533, + "learning_rate": 0.0002394, + "loss": 1.518, "step": 808 }, { "epoch": 1.2944, - "grad_norm": 1.1321923732757568, - "learning_rate": 0.00031999999999999997, - "loss": 1.5327, + "grad_norm": 1.1694917678833008, + "learning_rate": 0.0002397, + "loss": 1.8538, "step": 809 }, { "epoch": 1.296, - "grad_norm": 0.8714154362678528, - "learning_rate": 0.0003204, - "loss": 1.6207, + "grad_norm": 2.515814781188965, + "learning_rate": 0.00023999999999999998, + "loss": 1.8284, "step": 810 }, { "epoch": 1.2976, - "grad_norm": 1.2042042016983032, - "learning_rate": 0.00032079999999999994, - "loss": 1.6473, + "grad_norm": 2.0576000213623047, + "learning_rate": 0.00024029999999999999, + "loss": 1.9621, "step": 811 }, { "epoch": 1.2992, - "grad_norm": 1.5419819355010986, - "learning_rate": 0.00032119999999999995, - "loss": 1.8554, + "grad_norm": 2.3491764068603516, + "learning_rate": 0.0002406, + "loss": 1.9038, "step": 812 }, { "epoch": 1.3008, - "grad_norm": 1.4085866212844849, - "learning_rate": 0.0003216, - "loss": 1.3106, + "grad_norm": 1.76621675491333, + "learning_rate": 0.0002409, + "loss": 1.637, "step": 813 }, { "epoch": 1.3024, - "grad_norm": 1.227091908454895, - "learning_rate": 0.00032199999999999997, - "loss": 1.7718, + "grad_norm": 1.208563208580017, + "learning_rate": 0.00024119999999999998, + "loss": 1.8757, "step": 814 }, { "epoch": 1.304, - "grad_norm": 1.610036015510559, - "learning_rate": 0.0003224, - "loss": 1.5036, + "grad_norm": 1.1317024230957031, + "learning_rate": 0.0002415, + "loss": 1.8826, "step": 815 }, { "epoch": 1.3056, - "grad_norm": 1.597038745880127, - "learning_rate": 0.0003228, - "loss": 2.0035, + "grad_norm": 1.3644945621490479, + "learning_rate": 0.0002418, + "loss": 1.4626, "step": 816 }, { "epoch": 1.3072, - "grad_norm": 1.719772219657898, - "learning_rate": 0.00032319999999999994, - "loss": 1.7124, + "grad_norm": 1.633872628211975, + "learning_rate": 0.0002421, + "loss": 1.8603, "step": 817 }, { "epoch": 1.3088, - "grad_norm": 1.2312437295913696, - "learning_rate": 0.00032359999999999995, - "loss": 1.664, + "grad_norm": 1.3554219007492065, + "learning_rate": 0.00024239999999999998, + "loss": 1.8397, "step": 818 }, { "epoch": 1.3104, - "grad_norm": 1.160129427909851, - "learning_rate": 0.000324, - "loss": 1.4447, + "grad_norm": 1.2498072385787964, + "learning_rate": 0.0002427, + "loss": 1.8116, "step": 819 }, { "epoch": 1.312, - "grad_norm": 1.1790233850479126, - "learning_rate": 0.00032439999999999997, - "loss": 1.7727, + "grad_norm": 1.3917790651321411, + "learning_rate": 0.000243, + "loss": 1.7565, "step": 820 }, { "epoch": 1.3136, - "grad_norm": 1.2099790573120117, - "learning_rate": 0.0003248, - "loss": 1.6751, + "grad_norm": 1.4509116411209106, + "learning_rate": 0.0002433, + "loss": 1.7601, "step": 821 }, { "epoch": 1.3152, - "grad_norm": 1.0591086149215698, - "learning_rate": 0.0003252, - "loss": 1.6114, + "grad_norm": 1.7720223665237427, + "learning_rate": 0.00024359999999999999, + "loss": 1.6885, "step": 822 }, { "epoch": 1.3168, - "grad_norm": 2.331596612930298, - "learning_rate": 0.00032559999999999995, - "loss": 2.0011, + "grad_norm": 1.7985706329345703, + "learning_rate": 0.00024389999999999997, + "loss": 2.0853, "step": 823 }, { "epoch": 1.3184, - "grad_norm": 1.589240312576294, - "learning_rate": 0.00032599999999999996, - "loss": 1.6671, + "grad_norm": 1.8156722784042358, + "learning_rate": 0.00024419999999999997, + "loss": 2.2531, "step": 824 }, { "epoch": 1.32, - "grad_norm": NaN, - "learning_rate": 0.00032599999999999996, - "loss": 1.9173, + "grad_norm": 1.7719063758850098, + "learning_rate": 0.0002445, + "loss": 2.3759, "step": 825 }, { "epoch": 1.3216, - "grad_norm": 6.769994735717773, - "learning_rate": 0.0003264, - "loss": 2.5387, + "grad_norm": 27.36151695251465, + "learning_rate": 0.0002448, + "loss": 4.4413, "step": 826 }, { "epoch": 1.3232, - "grad_norm": 5.106613636016846, - "learning_rate": 0.0003268, - "loss": 1.9806, + "grad_norm": 8.232678413391113, + "learning_rate": 0.00024509999999999994, + "loss": 2.624, "step": 827 }, { "epoch": 1.3248, - "grad_norm": 2.9851677417755127, - "learning_rate": 0.0003272, - "loss": 1.7926, + "grad_norm": 4.0654497146606445, + "learning_rate": 0.00024539999999999995, + "loss": 1.9936, "step": 828 }, { "epoch": 1.3264, - "grad_norm": 1.8340100049972534, - "learning_rate": 0.0003276, - "loss": 1.9116, + "grad_norm": 5.744853973388672, + "learning_rate": 0.00024569999999999995, + "loss": 2.2401, "step": 829 }, { "epoch": 1.328, - "grad_norm": 8.570950508117676, - "learning_rate": 0.00032799999999999995, - "loss": 2.8044, + "grad_norm": 2.1282172203063965, + "learning_rate": 0.00024599999999999996, + "loss": 1.9586, "step": 830 }, { "epoch": 1.3296000000000001, - "grad_norm": 2.8660714626312256, - "learning_rate": 0.00032839999999999996, - "loss": 1.6779, + "grad_norm": 1.5827854871749878, + "learning_rate": 0.00024629999999999997, + "loss": 1.8686, "step": 831 }, { "epoch": 1.3312, - "grad_norm": 1.373335838317871, - "learning_rate": 0.0003288, - "loss": 2.183, + "grad_norm": 3.850166082382202, + "learning_rate": 0.0002466, + "loss": 1.7661, "step": 832 }, { "epoch": 1.3328, - "grad_norm": 2.5283563137054443, - "learning_rate": 0.0003292, - "loss": 1.6865, + "grad_norm": 4.2956719398498535, + "learning_rate": 0.0002469, + "loss": 1.774, "step": 833 }, { "epoch": 1.3344, - "grad_norm": 3.3186419010162354, - "learning_rate": 0.0003296, - "loss": 1.602, + "grad_norm": 3.491035223007202, + "learning_rate": 0.0002472, + "loss": 1.9191, "step": 834 }, { "epoch": 1.336, - "grad_norm": 2.754730463027954, - "learning_rate": 0.00033, - "loss": 1.4721, + "grad_norm": 3.0301005840301514, + "learning_rate": 0.00024749999999999994, + "loss": 1.8073, "step": 835 }, { "epoch": 1.3376000000000001, - "grad_norm": 0.960064172744751, - "learning_rate": 0.00033039999999999995, - "loss": 1.807, + "grad_norm": 1.9792249202728271, + "learning_rate": 0.00024779999999999995, + "loss": 1.7315, "step": 836 }, { "epoch": 1.3392, - "grad_norm": 1.207796573638916, - "learning_rate": 0.00033079999999999996, - "loss": 1.5925, + "grad_norm": 1.0492668151855469, + "learning_rate": 0.00024809999999999996, + "loss": 1.6863, "step": 837 }, { "epoch": 1.3408, - "grad_norm": 2.0469510555267334, - "learning_rate": 0.0003312, - "loss": 1.6148, + "grad_norm": 1.034518837928772, + "learning_rate": 0.00024839999999999997, + "loss": 1.5301, "step": 838 }, { "epoch": 1.3424, - "grad_norm": 1.0148212909698486, - "learning_rate": 0.00033159999999999993, - "loss": 1.4489, + "grad_norm": 0.9527673125267029, + "learning_rate": 0.0002487, + "loss": 1.509, "step": 839 }, { "epoch": 1.3439999999999999, - "grad_norm": 0.936536431312561, - "learning_rate": 0.000332, - "loss": 1.1779, + "grad_norm": 2.35052227973938, + "learning_rate": 0.000249, + "loss": 1.6575, "step": 840 }, { "epoch": 1.3456000000000001, - "grad_norm": 0.9345409274101257, - "learning_rate": 0.0003324, - "loss": 1.4338, + "grad_norm": 1.5617680549621582, + "learning_rate": 0.0002493, + "loss": 1.3983, "step": 841 }, { "epoch": 1.3472, - "grad_norm": 0.8234386444091797, - "learning_rate": 0.00033279999999999996, - "loss": 1.4326, + "grad_norm": 3.1023738384246826, + "learning_rate": 0.00024959999999999994, + "loss": 1.7373, "step": 842 }, { "epoch": 1.3488, - "grad_norm": 1.0428308248519897, - "learning_rate": 0.00033319999999999997, - "loss": 1.6166, + "grad_norm": 1.3363111019134521, + "learning_rate": 0.00024989999999999995, + "loss": 1.6505, "step": 843 }, { "epoch": 1.3504, - "grad_norm": 0.8084268569946289, - "learning_rate": 0.0003336, - "loss": 1.6855, + "grad_norm": 5.273314476013184, + "learning_rate": 0.00025019999999999996, + "loss": 2.1203, "step": 844 }, { "epoch": 1.3519999999999999, - "grad_norm": 1.372387409210205, - "learning_rate": 0.00033399999999999993, - "loss": 1.5671, + "grad_norm": 1.881166934967041, + "learning_rate": 0.00025049999999999996, + "loss": 1.5656, "step": 845 }, { "epoch": 1.3536000000000001, - "grad_norm": 1.301967978477478, - "learning_rate": 0.0003344, - "loss": 1.4755, + "grad_norm": 1.683435082435608, + "learning_rate": 0.00025079999999999997, + "loss": 1.6089, "step": 846 }, { "epoch": 1.3552, - "grad_norm": 0.9071243405342102, - "learning_rate": 0.0003348, - "loss": 1.3501, + "grad_norm": 1.2385189533233643, + "learning_rate": 0.0002511, + "loss": 1.1731, "step": 847 }, { "epoch": 1.3568, - "grad_norm": 1.210505723953247, - "learning_rate": 0.00033519999999999996, - "loss": 1.5101, + "grad_norm": 0.8636699318885803, + "learning_rate": 0.0002514, + "loss": 1.5439, "step": 848 }, { "epoch": 1.3584, - "grad_norm": 2.225215435028076, - "learning_rate": 0.00033559999999999997, - "loss": 1.6298, + "grad_norm": 0.9964292049407959, + "learning_rate": 0.0002517, + "loss": 1.7566, "step": 849 }, { "epoch": 1.3599999999999999, - "grad_norm": 1.9273438453674316, - "learning_rate": 0.000336, - "loss": 1.5265, + "grad_norm": 1.041764497756958, + "learning_rate": 0.00025199999999999995, + "loss": 1.5745, "step": 850 }, { "epoch": 1.3616, - "grad_norm": 0.9094347953796387, - "learning_rate": 0.00033639999999999994, - "loss": 1.2325, + "grad_norm": 0.9211375117301941, + "learning_rate": 0.00025229999999999995, + "loss": 1.4535, "step": 851 }, { "epoch": 1.3632, - "grad_norm": 1.0150439739227295, - "learning_rate": 0.0003368, - "loss": 1.4602, + "grad_norm": 0.7638111710548401, + "learning_rate": 0.00025259999999999996, + "loss": 1.4994, "step": 852 }, { "epoch": 1.3648, - "grad_norm": 0.9719175696372986, - "learning_rate": 0.0003372, - "loss": 1.7831, + "grad_norm": 1.4103494882583618, + "learning_rate": 0.00025289999999999997, + "loss": 1.3608, "step": 853 }, { "epoch": 1.3664, - "grad_norm": 1.3457897901535034, - "learning_rate": 0.00033759999999999997, - "loss": 1.4172, + "grad_norm": 0.9239462614059448, + "learning_rate": 0.0002532, + "loss": 1.65, "step": 854 }, { "epoch": 1.3679999999999999, - "grad_norm": 2.2474122047424316, - "learning_rate": 0.000338, - "loss": 1.7245, + "grad_norm": 1.2657464742660522, + "learning_rate": 0.0002535, + "loss": 1.4439, "step": 855 }, { "epoch": 1.3696, - "grad_norm": 1.9882428646087646, - "learning_rate": 0.00033839999999999993, - "loss": 1.5253, + "grad_norm": 1.411150336265564, + "learning_rate": 0.0002538, + "loss": 1.5659, "step": 856 }, { "epoch": 1.3712, - "grad_norm": 1.4229366779327393, - "learning_rate": 0.00033879999999999994, - "loss": 1.6048, + "grad_norm": 1.1029832363128662, + "learning_rate": 0.0002541, + "loss": 1.5794, "step": 857 }, { "epoch": 1.3728, - "grad_norm": 0.8712606430053711, - "learning_rate": 0.0003392, - "loss": 1.774, + "grad_norm": 1.2146772146224976, + "learning_rate": 0.00025439999999999995, + "loss": 1.8554, "step": 858 }, { "epoch": 1.3744, - "grad_norm": 0.8492836952209473, - "learning_rate": 0.00033959999999999996, - "loss": 1.5786, + "grad_norm": 1.0912986993789673, + "learning_rate": 0.00025469999999999996, + "loss": 1.9646, "step": 859 }, { "epoch": 1.376, - "grad_norm": 1.6412835121154785, - "learning_rate": 0.00033999999999999997, - "loss": 1.3234, + "grad_norm": 1.115552306175232, + "learning_rate": 0.00025499999999999996, + "loss": 1.4906, "step": 860 }, { "epoch": 1.3776, - "grad_norm": 1.2218254804611206, - "learning_rate": 0.0003404, - "loss": 1.8886, + "grad_norm": 1.5120686292648315, + "learning_rate": 0.00025529999999999997, + "loss": 1.5043, "step": 861 }, { "epoch": 1.3792, - "grad_norm": 1.142381191253662, - "learning_rate": 0.00034079999999999994, - "loss": 1.5134, + "grad_norm": 2.1887874603271484, + "learning_rate": 0.0002556, + "loss": 2.1509, "step": 862 }, { "epoch": 1.3808, - "grad_norm": 1.4286154508590698, - "learning_rate": 0.00034119999999999995, - "loss": 1.3333, + "grad_norm": 1.7975490093231201, + "learning_rate": 0.0002559, + "loss": 1.5343, "step": 863 }, { "epoch": 1.3824, - "grad_norm": 0.9015567302703857, - "learning_rate": 0.0003416, - "loss": 1.5008, + "grad_norm": 0.8767451047897339, + "learning_rate": 0.0002562, + "loss": 1.8846, "step": 864 }, { "epoch": 1.384, - "grad_norm": 0.8696253895759583, - "learning_rate": 0.00034199999999999996, - "loss": 1.8428, + "grad_norm": 1.3470810651779175, + "learning_rate": 0.00025649999999999995, + "loss": 1.8476, "step": 865 }, { "epoch": 1.3856, - "grad_norm": 1.8368676900863647, - "learning_rate": 0.0003424, - "loss": 1.4036, + "grad_norm": 1.2314963340759277, + "learning_rate": 0.00025679999999999995, + "loss": 1.616, "step": 866 }, { "epoch": 1.3872, - "grad_norm": 2.2873785495758057, - "learning_rate": 0.0003428, - "loss": 1.913, + "grad_norm": 1.2231523990631104, + "learning_rate": 0.00025709999999999996, + "loss": 1.7002, "step": 867 }, { "epoch": 1.3888, - "grad_norm": 2.610448122024536, - "learning_rate": 0.00034319999999999994, - "loss": 2.3383, + "grad_norm": 1.706369161605835, + "learning_rate": 0.00025739999999999997, + "loss": 1.882, "step": 868 }, { "epoch": 1.3904, - "grad_norm": 1.2571147680282593, - "learning_rate": 0.00034359999999999995, - "loss": 1.462, + "grad_norm": 1.3242183923721313, + "learning_rate": 0.0002577, + "loss": 1.9856, "step": 869 }, { "epoch": 1.392, - "grad_norm": 1.162726640701294, - "learning_rate": 0.000344, - "loss": 1.4693, + "grad_norm": 1.4956287145614624, + "learning_rate": 0.000258, + "loss": 1.8133, "step": 870 }, { "epoch": 1.3936, - "grad_norm": 1.0974887609481812, - "learning_rate": 0.00034439999999999997, - "loss": 1.7837, + "grad_norm": 1.1745412349700928, + "learning_rate": 0.0002583, + "loss": 2.0496, "step": 871 }, { "epoch": 1.3952, - "grad_norm": 6.15885591506958, - "learning_rate": 0.0003448, - "loss": 2.0816, + "grad_norm": 1.716749906539917, + "learning_rate": 0.0002586, + "loss": 1.8053, "step": 872 }, { "epoch": 1.3968, - "grad_norm": 2.189544200897217, - "learning_rate": 0.0003452, - "loss": 1.6858, + "grad_norm": 1.0582464933395386, + "learning_rate": 0.00025889999999999995, + "loss": 1.8637, "step": 873 }, { "epoch": 1.3984, - "grad_norm": 1.991902470588684, - "learning_rate": 0.00034559999999999994, - "loss": 1.9152, + "grad_norm": 1.3968075513839722, + "learning_rate": 0.00025919999999999996, + "loss": 2.2903, "step": 874 }, { "epoch": 1.4, - "grad_norm": 3.02516770362854, - "learning_rate": 0.00034599999999999995, - "loss": 2.3831, + "grad_norm": 1.8292760848999023, + "learning_rate": 0.00025949999999999997, + "loss": 2.5341, "step": 875 }, { "epoch": 1.4016, - "grad_norm": 10.343817710876465, - "learning_rate": 0.00034639999999999996, - "loss": 2.8638, + "grad_norm": 7.413393497467041, + "learning_rate": 0.00025979999999999997, + "loss": 2.3879, "step": 876 }, { "epoch": 1.4032, - "grad_norm": 7.452947616577148, - "learning_rate": 0.0003467999999999999, - "loss": 2.2708, + "grad_norm": 6.438076496124268, + "learning_rate": 0.0002601, + "loss": 2.4021, "step": 877 }, { "epoch": 1.4048, - "grad_norm": 2.4687275886535645, - "learning_rate": 0.0003472, - "loss": 1.8979, + "grad_norm": 1.5802280902862549, + "learning_rate": 0.0002604, + "loss": 1.7735, "step": 878 }, { "epoch": 1.4064, - "grad_norm": 1.3352081775665283, - "learning_rate": 0.0003476, - "loss": 1.7263, + "grad_norm": 3.181586265563965, + "learning_rate": 0.0002607, + "loss": 2.2812, "step": 879 }, { "epoch": 1.408, - "grad_norm": 1.345573902130127, - "learning_rate": 0.00034799999999999995, - "loss": 2.1026, + "grad_norm": 2.4558494091033936, + "learning_rate": 0.000261, + "loss": 2.1456, "step": 880 }, { "epoch": 1.4096, - "grad_norm": 2.9327187538146973, - "learning_rate": 0.00034839999999999996, - "loss": 2.5618, + "grad_norm": 4.251657485961914, + "learning_rate": 0.00026129999999999995, + "loss": 1.7878, "step": 881 }, { "epoch": 1.4112, - "grad_norm": 4.26807975769043, - "learning_rate": 0.00034879999999999997, - "loss": 1.9103, + "grad_norm": 2.2614433765411377, + "learning_rate": 0.00026159999999999996, + "loss": 2.2472, "step": 882 }, { "epoch": 1.4128, - "grad_norm": 5.132489204406738, - "learning_rate": 0.0003491999999999999, - "loss": 2.1589, + "grad_norm": 4.5385966300964355, + "learning_rate": 0.00026189999999999997, + "loss": 2.0106, "step": 883 }, { "epoch": 1.4144, - "grad_norm": 4.745561122894287, - "learning_rate": 0.0003496, - "loss": 1.7655, + "grad_norm": 4.714118957519531, + "learning_rate": 0.0002622, + "loss": 2.1035, "step": 884 }, { "epoch": 1.416, - "grad_norm": 3.5287861824035645, - "learning_rate": 0.00035, - "loss": 1.661, + "grad_norm": 5.232901573181152, + "learning_rate": 0.0002625, + "loss": 2.0967, "step": 885 }, { "epoch": 1.4176, - "grad_norm": 3.1767563819885254, - "learning_rate": 0.00035039999999999995, - "loss": 1.6753, + "grad_norm": 3.031771421432495, + "learning_rate": 0.0002628, + "loss": 1.8065, "step": 886 }, { "epoch": 1.4192, - "grad_norm": 1.0548657178878784, - "learning_rate": 0.00035079999999999996, - "loss": 1.736, + "grad_norm": 2.2014353275299072, + "learning_rate": 0.0002631, + "loss": 1.7145, "step": 887 }, { "epoch": 1.4208, - "grad_norm": 1.0605950355529785, - "learning_rate": 0.00035119999999999997, - "loss": 1.7661, + "grad_norm": 1.0654090642929077, + "learning_rate": 0.00026339999999999995, + "loss": 1.5715, "step": 888 }, { "epoch": 1.4224, - "grad_norm": 2.0522172451019287, - "learning_rate": 0.0003515999999999999, - "loss": 1.2061, + "grad_norm": 1.569884181022644, + "learning_rate": 0.00026369999999999996, + "loss": 1.4741, "step": 889 }, { "epoch": 1.424, - "grad_norm": 3.3747687339782715, - "learning_rate": 0.000352, - "loss": 1.5711, + "grad_norm": 2.636420249938965, + "learning_rate": 0.00026399999999999997, + "loss": 1.48, "step": 890 }, { "epoch": 1.4256, - "grad_norm": 3.890195608139038, - "learning_rate": 0.0003524, - "loss": 1.9462, + "grad_norm": 3.305544376373291, + "learning_rate": 0.0002643, + "loss": 1.487, "step": 891 }, { "epoch": 1.4272, - "grad_norm": 3.8554182052612305, - "learning_rate": 0.00035279999999999996, - "loss": 1.9326, + "grad_norm": 3.3960185050964355, + "learning_rate": 0.0002646, + "loss": 1.43, "step": 892 }, { "epoch": 1.4288, - "grad_norm": 3.2069602012634277, - "learning_rate": 0.00035319999999999997, - "loss": 1.7907, + "grad_norm": 2.4552576541900635, + "learning_rate": 0.0002649, + "loss": 1.4312, "step": 893 }, { "epoch": 1.4304000000000001, - "grad_norm": 3.5373735427856445, - "learning_rate": 0.0003536, - "loss": 1.6313, + "grad_norm": 2.176464557647705, + "learning_rate": 0.0002652, + "loss": 1.6457, "step": 894 }, { "epoch": 1.432, - "grad_norm": 2.3303987979888916, - "learning_rate": 0.00035399999999999993, - "loss": 1.4145, + "grad_norm": 1.3099313974380493, + "learning_rate": 0.0002655, + "loss": 1.3961, "step": 895 }, { "epoch": 1.4336, - "grad_norm": 2.315999984741211, - "learning_rate": 0.0003544, - "loss": 1.7224, + "grad_norm": 1.3967254161834717, + "learning_rate": 0.00026579999999999996, + "loss": 1.8732, "step": 896 }, { "epoch": 1.4352, - "grad_norm": 1.1353230476379395, - "learning_rate": 0.0003548, - "loss": 1.6141, + "grad_norm": 2.6144227981567383, + "learning_rate": 0.00026609999999999996, + "loss": 1.9415, "step": 897 }, { "epoch": 1.4368, - "grad_norm": 1.321402907371521, - "learning_rate": 0.00035519999999999996, - "loss": 1.5741, + "grad_norm": 2.4597859382629395, + "learning_rate": 0.00026639999999999997, + "loss": 1.6611, "step": 898 }, { "epoch": 1.4384000000000001, - "grad_norm": 1.3665454387664795, - "learning_rate": 0.00035559999999999997, - "loss": 1.3824, + "grad_norm": 3.06152081489563, + "learning_rate": 0.0002667, + "loss": 1.7697, "step": 899 }, { "epoch": 1.44, - "grad_norm": 1.6859432458877563, - "learning_rate": 0.000356, - "loss": 1.6852, - "step": 900 - }, - { - "epoch": 1.44, - "eval_cer": 0.4112289233467533, - "eval_loss": 1.986970067024231, - "eval_runtime": 157.7785, - "eval_samples_per_second": 19.876, - "eval_steps_per_second": 1.242, - "eval_wer": 0.6153973681483784, + "grad_norm": 3.207355260848999, + "learning_rate": 0.000267, + "loss": 1.699, "step": 900 }, { "epoch": 1.4416, - "grad_norm": 1.5820800065994263, - "learning_rate": 0.00035639999999999994, - "loss": 1.4903, + "grad_norm": 2.3851301670074463, + "learning_rate": 0.0002673, + "loss": 2.0775, "step": 901 }, { "epoch": 1.4432, - "grad_norm": 1.1886115074157715, - "learning_rate": 0.0003568, - "loss": 1.6061, + "grad_norm": 1.7599847316741943, + "learning_rate": 0.0002676, + "loss": 1.8226, "step": 902 }, { "epoch": 1.4447999999999999, - "grad_norm": 1.2537235021591187, - "learning_rate": 0.0003572, - "loss": 1.4464, + "grad_norm": 0.9662289023399353, + "learning_rate": 0.0002679, + "loss": 1.7798, "step": 903 }, { "epoch": 1.4464000000000001, - "grad_norm": 1.027907133102417, - "learning_rate": 0.00035759999999999996, - "loss": 1.2771, + "grad_norm": 0.8772083520889282, + "learning_rate": 0.00026819999999999996, + "loss": 1.4852, "step": 904 }, { "epoch": 1.448, - "grad_norm": 1.5466809272766113, - "learning_rate": 0.000358, - "loss": 1.3082, + "grad_norm": 2.0723578929901123, + "learning_rate": 0.00026849999999999997, + "loss": 1.5108, "step": 905 }, { "epoch": 1.4496, - "grad_norm": 1.8753767013549805, - "learning_rate": 0.0003584, - "loss": 1.2227, + "grad_norm": 2.259298086166382, + "learning_rate": 0.0002688, + "loss": 1.6322, "step": 906 }, { "epoch": 1.4512, - "grad_norm": 2.8376686573028564, - "learning_rate": 0.00035879999999999994, - "loss": 1.8457, + "grad_norm": 1.930001974105835, + "learning_rate": 0.0002691, + "loss": 1.834, "step": 907 }, { "epoch": 1.4527999999999999, - "grad_norm": 2.1180849075317383, - "learning_rate": 0.0003592, - "loss": 1.5696, + "grad_norm": 1.9969251155853271, + "learning_rate": 0.0002694, + "loss": 1.4405, "step": 908 }, { "epoch": 1.4544000000000001, - "grad_norm": 1.1068503856658936, - "learning_rate": 0.0003596, - "loss": 1.484, + "grad_norm": 2.078216552734375, + "learning_rate": 0.0002697, + "loss": 2.1698, "step": 909 }, { "epoch": 1.456, - "grad_norm": 1.0974130630493164, - "learning_rate": 0.00035999999999999997, - "loss": 1.6244, + "grad_norm": 1.2375624179840088, + "learning_rate": 0.00027, + "loss": 1.7028, "step": 910 }, { "epoch": 1.4576, - "grad_norm": 1.0330309867858887, - "learning_rate": 0.0003604, - "loss": 1.8489, + "grad_norm": 1.521498203277588, + "learning_rate": 0.00027029999999999996, + "loss": 1.7956, "step": 911 }, { "epoch": 1.4592, - "grad_norm": 1.4640865325927734, - "learning_rate": 0.0003608, - "loss": 1.8396, + "grad_norm": 1.4378412961959839, + "learning_rate": 0.00027059999999999996, + "loss": 1.7424, "step": 912 }, { "epoch": 1.4607999999999999, - "grad_norm": 1.0381420850753784, - "learning_rate": 0.00036119999999999994, - "loss": 1.7392, + "grad_norm": 1.0808361768722534, + "learning_rate": 0.00027089999999999997, + "loss": 1.4528, "step": 913 }, { "epoch": 1.4624, - "grad_norm": 0.9180180430412292, - "learning_rate": 0.00036159999999999995, - "loss": 1.5514, + "grad_norm": 1.5603634119033813, + "learning_rate": 0.0002712, + "loss": 2.0128, "step": 914 }, { "epoch": 1.464, - "grad_norm": 1.0205705165863037, - "learning_rate": 0.000362, - "loss": 1.5544, + "grad_norm": 1.2612849473953247, + "learning_rate": 0.0002715, + "loss": 1.7135, "step": 915 }, { "epoch": 1.4656, - "grad_norm": 0.8514875769615173, - "learning_rate": 0.00036239999999999997, - "loss": 1.6248, + "grad_norm": 1.4489110708236694, + "learning_rate": 0.0002718, + "loss": 1.5092, "step": 916 }, { "epoch": 1.4672, - "grad_norm": 1.1364916563034058, - "learning_rate": 0.0003628, - "loss": 1.5353, + "grad_norm": 1.5542961359024048, + "learning_rate": 0.0002721, + "loss": 1.5612, "step": 917 }, { "epoch": 1.4687999999999999, - "grad_norm": 1.7738701105117798, - "learning_rate": 0.00036319999999999994, - "loss": 1.6905, + "grad_norm": 1.411827802658081, + "learning_rate": 0.0002724, + "loss": 1.5166, "step": 918 }, { "epoch": 1.4704, - "grad_norm": 3.32802414894104, - "learning_rate": 0.00036359999999999995, - "loss": 2.08, + "grad_norm": 1.3303709030151367, + "learning_rate": 0.00027269999999999996, + "loss": 1.3164, "step": 919 }, { "epoch": 1.472, - "grad_norm": 1.756226658821106, - "learning_rate": 0.00036399999999999996, - "loss": 1.6969, + "grad_norm": 1.1644420623779297, + "learning_rate": 0.00027299999999999997, + "loss": 1.8777, "step": 920 }, { "epoch": 1.4736, - "grad_norm": 1.2360084056854248, - "learning_rate": 0.0003643999999999999, - "loss": 1.4355, + "grad_norm": 1.7243609428405762, + "learning_rate": 0.0002733, + "loss": 2.0992, "step": 921 }, { "epoch": 1.4752, - "grad_norm": 1.6236251592636108, - "learning_rate": 0.0003648, - "loss": 2.381, + "grad_norm": 1.2297067642211914, + "learning_rate": 0.0002736, + "loss": 1.5551, "step": 922 }, { "epoch": 1.4768, - "grad_norm": 1.5009859800338745, - "learning_rate": 0.0003652, - "loss": 1.8531, + "grad_norm": 1.3536406755447388, + "learning_rate": 0.0002739, + "loss": 2.1109, "step": 923 }, { "epoch": 1.4784, - "grad_norm": 1.9892377853393555, - "learning_rate": 0.00036559999999999994, - "loss": 2.021, + "grad_norm": 1.1590725183486938, + "learning_rate": 0.0002742, + "loss": 1.965, "step": 924 }, { "epoch": 1.48, - "grad_norm": NaN, - "learning_rate": 0.00036559999999999994, - "loss": 1.6185, + "grad_norm": 2.9230568408966064, + "learning_rate": 0.0002745, + "loss": 2.3223, "step": 925 }, { "epoch": 1.4816, - "grad_norm": 8.870469093322754, - "learning_rate": 0.00036599999999999995, - "loss": 2.6913, + "grad_norm": 5.955326557159424, + "learning_rate": 0.0002748, + "loss": 2.0155, "step": 926 }, { "epoch": 1.4832, - "grad_norm": 6.322633743286133, - "learning_rate": 0.00036639999999999996, - "loss": 2.6344, + "grad_norm": 5.281221866607666, + "learning_rate": 0.00027509999999999996, + "loss": 2.0306, "step": 927 }, { "epoch": 1.4848, - "grad_norm": 3.020659923553467, - "learning_rate": 0.0003667999999999999, - "loss": 1.8173, + "grad_norm": 1.9012714624404907, + "learning_rate": 0.00027539999999999997, + "loss": 1.6593, "step": 928 }, { "epoch": 1.4864, - "grad_norm": 2.220780372619629, - "learning_rate": 0.0003672, - "loss": 1.9342, + "grad_norm": 1.502999186515808, + "learning_rate": 0.0002757, + "loss": 2.0885, "step": 929 }, { "epoch": 1.488, - "grad_norm": 1.8951246738433838, - "learning_rate": 0.0003676, - "loss": 1.6929, + "grad_norm": 2.206328868865967, + "learning_rate": 0.000276, + "loss": 1.5398, "step": 930 }, { "epoch": 1.4896, - "grad_norm": 1.6866941452026367, - "learning_rate": 0.00036799999999999995, - "loss": 2.0583, + "grad_norm": 2.663071393966675, + "learning_rate": 0.0002763, + "loss": 1.6801, "step": 931 }, { "epoch": 1.4912, - "grad_norm": 2.816363573074341, - "learning_rate": 0.00036839999999999996, - "loss": 1.6559, + "grad_norm": 2.390557050704956, + "learning_rate": 0.0002766, + "loss": 1.6475, "step": 932 }, { "epoch": 1.4928, - "grad_norm": 1.5674623250961304, - "learning_rate": 0.00036879999999999997, - "loss": 2.0277, + "grad_norm": 1.131935954093933, + "learning_rate": 0.0002769, + "loss": 1.7341, "step": 933 }, { "epoch": 1.4944, - "grad_norm": 3.0781469345092773, - "learning_rate": 0.0003691999999999999, - "loss": 1.769, + "grad_norm": 1.287434458732605, + "learning_rate": 0.0002772, + "loss": 1.7195, "step": 934 }, { "epoch": 1.496, - "grad_norm": 1.0538510084152222, - "learning_rate": 0.0003696, - "loss": 1.4605, + "grad_norm": 0.8551505208015442, + "learning_rate": 0.00027749999999999997, + "loss": 1.5404, "step": 935 }, { "epoch": 1.4976, - "grad_norm": 0.8226996660232544, - "learning_rate": 0.00037, - "loss": 1.7031, + "grad_norm": 1.8731048107147217, + "learning_rate": 0.0002778, + "loss": 1.495, "step": 936 }, { "epoch": 1.4992, - "grad_norm": 3.8168036937713623, - "learning_rate": 0.00037039999999999995, + "grad_norm": 3.6576366424560547, + "learning_rate": 0.0002781, "loss": 1.7541, "step": 937 }, { "epoch": 1.5008, - "grad_norm": 2.1976897716522217, - "learning_rate": 0.00037079999999999996, - "loss": 1.5996, + "grad_norm": 1.2283856868743896, + "learning_rate": 0.0002784, + "loss": 1.4094, "step": 938 }, { "epoch": 1.5024, - "grad_norm": 1.8852084875106812, - "learning_rate": 0.00037119999999999997, - "loss": 1.8051, + "grad_norm": 2.0950043201446533, + "learning_rate": 0.0002787, + "loss": 1.3984, "step": 939 }, { "epoch": 1.504, - "grad_norm": 1.8353817462921143, - "learning_rate": 0.0003715999999999999, - "loss": 1.2877, + "grad_norm": 1.1806004047393799, + "learning_rate": 0.000279, + "loss": 1.4307, "step": 940 }, { "epoch": 1.5056, - "grad_norm": 1.211934208869934, - "learning_rate": 0.000372, - "loss": 1.3873, + "grad_norm": 0.9991218447685242, + "learning_rate": 0.0002793, + "loss": 1.2988, "step": 941 }, { "epoch": 1.5072, - "grad_norm": 0.9736316204071045, - "learning_rate": 0.0003724, - "loss": 1.4598, + "grad_norm": 1.1656785011291504, + "learning_rate": 0.00027959999999999997, + "loss": 1.4822, "step": 942 }, { "epoch": 1.5088, - "grad_norm": 0.7427512407302856, - "learning_rate": 0.00037279999999999995, - "loss": 1.3927, + "grad_norm": 1.2371271848678589, + "learning_rate": 0.0002799, + "loss": 1.6206, "step": 943 }, { "epoch": 1.5104, - "grad_norm": 1.527969479560852, - "learning_rate": 0.00037319999999999996, - "loss": 1.384, + "grad_norm": 1.1998454332351685, + "learning_rate": 0.0002802, + "loss": 1.4094, "step": 944 }, { "epoch": 1.512, - "grad_norm": 1.2148116827011108, - "learning_rate": 0.0003736, - "loss": 1.4868, + "grad_norm": 1.6202266216278076, + "learning_rate": 0.0002805, + "loss": 1.6976, "step": 945 }, { "epoch": 1.5135999999999998, - "grad_norm": 0.6790880560874939, - "learning_rate": 0.00037399999999999993, - "loss": 1.3247, + "grad_norm": 0.9059401750564575, + "learning_rate": 0.0002808, + "loss": 1.4302, "step": 946 }, { "epoch": 1.5152, - "grad_norm": 0.8779526948928833, - "learning_rate": 0.0003744, - "loss": 1.5266, + "grad_norm": 1.1870366334915161, + "learning_rate": 0.0002811, + "loss": 1.3084, "step": 947 }, { "epoch": 1.5168, - "grad_norm": 1.2756925821304321, - "learning_rate": 0.0003748, - "loss": 1.3128, + "grad_norm": 0.9611499309539795, + "learning_rate": 0.00028139999999999996, + "loss": 1.2804, "step": 948 }, { "epoch": 1.5184, - "grad_norm": 4.33424186706543, - "learning_rate": 0.00037519999999999996, - "loss": 1.7496, + "grad_norm": 1.8231072425842285, + "learning_rate": 0.00028169999999999996, + "loss": 1.9321, "step": 949 }, { "epoch": 1.52, - "grad_norm": 0.9938877820968628, - "learning_rate": 0.00037559999999999997, - "loss": 1.7513, + "grad_norm": 1.3906331062316895, + "learning_rate": 0.00028199999999999997, + "loss": 1.5093, "step": 950 }, { "epoch": 1.5215999999999998, - "grad_norm": 1.0021151304244995, - "learning_rate": 0.000376, - "loss": 1.4579, + "grad_norm": 0.8848568201065063, + "learning_rate": 0.0002823, + "loss": 1.4336, "step": 951 }, { "epoch": 1.5232, - "grad_norm": 1.2110190391540527, - "learning_rate": 0.00037639999999999993, - "loss": 1.4992, + "grad_norm": 0.9558936953544617, + "learning_rate": 0.0002826, + "loss": 1.6246, "step": 952 }, { "epoch": 1.5248, - "grad_norm": 1.012303113937378, - "learning_rate": 0.00037679999999999994, - "loss": 1.4858, + "grad_norm": 0.9805288314819336, + "learning_rate": 0.00028289999999999994, + "loss": 1.3846, "step": 953 }, { "epoch": 1.5264, - "grad_norm": 0.963055431842804, - "learning_rate": 0.0003772, - "loss": 1.2367, + "grad_norm": 0.8371070623397827, + "learning_rate": 0.00028319999999999994, + "loss": 1.7919, "step": 954 }, { "epoch": 1.528, - "grad_norm": 1.213270664215088, - "learning_rate": 0.00037759999999999996, - "loss": 1.4012, + "grad_norm": 0.9453789591789246, + "learning_rate": 0.00028349999999999995, + "loss": 1.3124, "step": 955 }, { "epoch": 1.5295999999999998, - "grad_norm": 1.1823625564575195, - "learning_rate": 0.00037799999999999997, - "loss": 1.5314, + "grad_norm": 1.1171185970306396, + "learning_rate": 0.00028379999999999996, + "loss": 1.5918, "step": 956 }, { "epoch": 1.5312000000000001, - "grad_norm": 1.8237582445144653, - "learning_rate": 0.0003784, - "loss": 1.5552, + "grad_norm": 1.1895886659622192, + "learning_rate": 0.00028409999999999997, + "loss": 1.3716, "step": 957 }, { "epoch": 1.5328, - "grad_norm": 1.1007258892059326, - "learning_rate": 0.00037879999999999994, - "loss": 1.3313, + "grad_norm": 0.9379684329032898, + "learning_rate": 0.0002844, + "loss": 1.453, "step": 958 }, { "epoch": 1.5344, - "grad_norm": 1.2367926836013794, - "learning_rate": 0.00037919999999999995, - "loss": 1.8743, + "grad_norm": 1.0332472324371338, + "learning_rate": 0.0002847, + "loss": 1.5237, "step": 959 }, { "epoch": 1.536, - "grad_norm": 0.8619275689125061, - "learning_rate": 0.0003796, - "loss": 1.6997, + "grad_norm": 0.8858595490455627, + "learning_rate": 0.000285, + "loss": 1.6488, "step": 960 }, { "epoch": 1.5375999999999999, - "grad_norm": 1.0113097429275513, - "learning_rate": 0.00037999999999999997, - "loss": 1.4948, + "grad_norm": 1.681465983390808, + "learning_rate": 0.00028529999999999994, + "loss": 1.6078, "step": 961 }, { "epoch": 1.5392000000000001, - "grad_norm": 1.1451879739761353, - "learning_rate": 0.0003804, - "loss": 1.7052, + "grad_norm": 1.2469699382781982, + "learning_rate": 0.00028559999999999995, + "loss": 1.7671, "step": 962 }, { "epoch": 1.5408, - "grad_norm": 0.9698196053504944, - "learning_rate": 0.0003808, - "loss": 1.1601, + "grad_norm": 1.0610872507095337, + "learning_rate": 0.00028589999999999996, + "loss": 1.3722, "step": 963 }, { "epoch": 1.5424, - "grad_norm": 0.9092131853103638, - "learning_rate": 0.00038119999999999994, - "loss": 1.7966, + "grad_norm": 1.0731545686721802, + "learning_rate": 0.00028619999999999996, + "loss": 1.7164, "step": 964 }, { "epoch": 1.544, - "grad_norm": 1.1889277696609497, - "learning_rate": 0.00038159999999999995, - "loss": 1.8823, + "grad_norm": 1.1132243871688843, + "learning_rate": 0.00028649999999999997, + "loss": 2.0572, "step": 965 }, { "epoch": 1.5455999999999999, - "grad_norm": 1.0519229173660278, - "learning_rate": 0.000382, - "loss": 1.9142, + "grad_norm": 1.1051868200302124, + "learning_rate": 0.0002868, + "loss": 1.6496, "step": 966 }, { "epoch": 1.5472000000000001, - "grad_norm": 1.3340773582458496, - "learning_rate": 0.00038239999999999997, - "loss": 1.9151, + "grad_norm": 1.20839262008667, + "learning_rate": 0.0002871, + "loss": 1.6104, "step": 967 }, { "epoch": 1.5488, - "grad_norm": 2.726638078689575, - "learning_rate": 0.0003828, - "loss": 1.9075, + "grad_norm": 1.0541168451309204, + "learning_rate": 0.00028739999999999994, + "loss": 1.5231, "step": 968 }, { "epoch": 1.5504, - "grad_norm": 2.050051212310791, - "learning_rate": 0.0003832, - "loss": 1.7483, + "grad_norm": 2.385976791381836, + "learning_rate": 0.00028769999999999995, + "loss": 1.5045, "step": 969 }, { "epoch": 1.552, - "grad_norm": 1.1615999937057495, - "learning_rate": 0.00038359999999999995, - "loss": 1.6727, + "grad_norm": 1.1696312427520752, + "learning_rate": 0.00028799999999999995, + "loss": 1.7261, "step": 970 }, { "epoch": 1.5535999999999999, - "grad_norm": 1.1875585317611694, - "learning_rate": 0.00038399999999999996, - "loss": 1.5908, + "grad_norm": 2.113093614578247, + "learning_rate": 0.00028829999999999996, + "loss": 1.9908, "step": 971 }, { "epoch": 1.5552000000000001, - "grad_norm": 1.4524774551391602, - "learning_rate": 0.0003844, - "loss": 2.023, + "grad_norm": 1.855879783630371, + "learning_rate": 0.00028859999999999997, + "loss": 2.2391, "step": 972 }, { "epoch": 1.5568, - "grad_norm": 1.8066257238388062, - "learning_rate": 0.0003848, - "loss": 2.118, + "grad_norm": 2.2188405990600586, + "learning_rate": 0.0002889, + "loss": 2.032, "step": 973 }, { "epoch": 1.5584, - "grad_norm": 1.7197728157043457, - "learning_rate": 0.0003852, - "loss": 2.0651, + "grad_norm": 1.8611500263214111, + "learning_rate": 0.0002892, + "loss": 2.2839, "step": 974 }, { "epoch": 1.56, - "grad_norm": 1.7070485353469849, - "learning_rate": 0.0003856, - "loss": 2.582, + "grad_norm": 2.133455991744995, + "learning_rate": 0.0002895, + "loss": 2.6324, "step": 975 }, { "epoch": 1.5615999999999999, - "grad_norm": 7.113646030426025, - "learning_rate": 0.00038599999999999995, - "loss": 2.124, + "grad_norm": 4.41377067565918, + "learning_rate": 0.00028979999999999994, + "loss": 2.2812, "step": 976 }, { "epoch": 1.5632000000000001, - "grad_norm": 9.63016128540039, - "learning_rate": 0.00038639999999999996, - "loss": 3.0331, + "grad_norm": 1.5284154415130615, + "learning_rate": 0.00029009999999999995, + "loss": 1.8738, "step": 977 }, { "epoch": 1.5648, - "grad_norm": 3.375418186187744, - "learning_rate": 0.0003868, - "loss": 1.7608, + "grad_norm": 1.4382925033569336, + "learning_rate": 0.00029039999999999996, + "loss": 1.7074, "step": 978 }, { "epoch": 1.5664, - "grad_norm": 3.5243453979492188, - "learning_rate": 0.0003872, - "loss": 1.9515, + "grad_norm": 2.024181604385376, + "learning_rate": 0.00029069999999999996, + "loss": 2.068, "step": 979 }, { "epoch": 1.568, - "grad_norm": 3.2788093090057373, - "learning_rate": 0.0003876, - "loss": 1.9329, + "grad_norm": 1.7730194330215454, + "learning_rate": 0.00029099999999999997, + "loss": 1.8904, "step": 980 }, { "epoch": 1.5695999999999999, - "grad_norm": 1.3431663513183594, - "learning_rate": 0.00038799999999999994, - "loss": 1.849, + "grad_norm": 2.1946470737457275, + "learning_rate": 0.0002913, + "loss": 1.8493, "step": 981 }, { "epoch": 1.5712000000000002, - "grad_norm": 3.8363196849823, - "learning_rate": 0.00038839999999999995, - "loss": 1.7101, + "grad_norm": 2.3339195251464844, + "learning_rate": 0.0002916, + "loss": 1.5104, "step": 982 }, { "epoch": 1.5728, - "grad_norm": 4.8841705322265625, - "learning_rate": 0.00038879999999999996, - "loss": 1.9458, + "grad_norm": 1.3478624820709229, + "learning_rate": 0.0002919, + "loss": 1.4885, "step": 983 }, { "epoch": 1.5744, - "grad_norm": 4.90728759765625, - "learning_rate": 0.0003891999999999999, - "loss": 2.0958, + "grad_norm": 1.4026468992233276, + "learning_rate": 0.00029219999999999995, + "loss": 1.6617, "step": 984 }, { "epoch": 1.576, - "grad_norm": 4.307504177093506, - "learning_rate": 0.0003896, - "loss": 1.8693, + "grad_norm": 0.9361331462860107, + "learning_rate": 0.00029249999999999995, + "loss": 1.5275, "step": 985 }, { "epoch": 1.5776, - "grad_norm": 2.8895435333251953, - "learning_rate": 0.00039, - "loss": 1.7851, + "grad_norm": 1.9895044565200806, + "learning_rate": 0.00029279999999999996, + "loss": 1.5585, "step": 986 }, { "epoch": 1.5792000000000002, - "grad_norm": 1.0250154733657837, - "learning_rate": 0.00039039999999999995, - "loss": 1.7486, + "grad_norm": 6.917726516723633, + "learning_rate": 0.00029309999999999997, + "loss": 2.5852, "step": 987 }, { "epoch": 1.5808, - "grad_norm": 0.7546022534370422, - "learning_rate": 0.00039079999999999996, - "loss": 1.2908, + "grad_norm": 1.4938806295394897, + "learning_rate": 0.0002934, + "loss": 1.2511, "step": 988 }, { "epoch": 1.5824, - "grad_norm": 1.4536306858062744, - "learning_rate": 0.00039119999999999997, - "loss": 1.3772, + "grad_norm": 1.2143341302871704, + "learning_rate": 0.0002937, + "loss": 1.4005, "step": 989 }, { "epoch": 1.584, - "grad_norm": 3.1315858364105225, - "learning_rate": 0.0003915999999999999, - "loss": 1.7422, + "grad_norm": 1.1358860731124878, + "learning_rate": 0.000294, + "loss": 1.6835, "step": 990 }, { "epoch": 1.5856, - "grad_norm": 3.7343127727508545, - "learning_rate": 0.00039199999999999993, - "loss": 1.5513, + "grad_norm": 1.1960505247116089, + "learning_rate": 0.00029429999999999994, + "loss": 1.1956, "step": 991 }, { "epoch": 1.5872000000000002, - "grad_norm": 2.6926112174987793, - "learning_rate": 0.0003924, - "loss": 1.538, + "grad_norm": 0.7678542137145996, + "learning_rate": 0.00029459999999999995, + "loss": 1.3352, "step": 992 }, { "epoch": 1.5888, - "grad_norm": 2.2124907970428467, - "learning_rate": 0.00039279999999999995, - "loss": 1.3235, + "grad_norm": 0.9173956513404846, + "learning_rate": 0.00029489999999999996, + "loss": 1.23, "step": 993 }, { "epoch": 1.5904, - "grad_norm": 1.3270829916000366, - "learning_rate": 0.00039319999999999996, - "loss": 1.405, + "grad_norm": 0.8378310799598694, + "learning_rate": 0.00029519999999999997, + "loss": 1.6007, "step": 994 }, { "epoch": 1.592, - "grad_norm": 1.0704325437545776, - "learning_rate": 0.00039359999999999997, - "loss": 1.5179, + "grad_norm": 1.1078053712844849, + "learning_rate": 0.00029549999999999997, + "loss": 1.4643, "step": 995 }, { "epoch": 1.5936, - "grad_norm": 0.7627701759338379, - "learning_rate": 0.00039399999999999993, - "loss": 1.4045, + "grad_norm": 1.037882924079895, + "learning_rate": 0.0002958, + "loss": 1.3761, "step": 996 }, { "epoch": 1.5952, - "grad_norm": 1.1814078092575073, - "learning_rate": 0.00039439999999999994, - "loss": 1.205, + "grad_norm": 0.9665654301643372, + "learning_rate": 0.0002961, + "loss": 1.5703, "step": 997 }, { "epoch": 1.5968, - "grad_norm": 1.1949654817581177, - "learning_rate": 0.0003948, - "loss": 1.5852, + "grad_norm": 0.8781237006187439, + "learning_rate": 0.0002964, + "loss": 1.3703, "step": 998 }, { "epoch": 1.5984, - "grad_norm": 1.570246696472168, - "learning_rate": 0.00039519999999999996, - "loss": 1.4206, + "grad_norm": 1.6409205198287964, + "learning_rate": 0.00029669999999999995, + "loss": 1.3117, "step": 999 }, { "epoch": 1.6, - "grad_norm": 1.0637398958206177, - "learning_rate": 0.00039559999999999997, - "loss": 1.3997, + "grad_norm": 1.333957552909851, + "learning_rate": 0.00029699999999999996, + "loss": 1.5851, "step": 1000 }, { "epoch": 1.6, - "eval_cer": 0.39621716426834613, - "eval_loss": 2.00065016746521, - "eval_runtime": 159.2575, - "eval_samples_per_second": 19.691, - "eval_steps_per_second": 1.231, - "eval_wer": 0.6068801160501502, + "eval_cer": 0.4133216406903974, + "eval_loss": 1.9947007894515991, + "eval_runtime": 158.5726, + "eval_samples_per_second": 19.776, + "eval_steps_per_second": 1.236, + "eval_wer": 0.6194798466480157, "step": 1000 }, { "epoch": 1.6016, - "grad_norm": 1.0269728899002075, - "learning_rate": 0.000396, - "loss": 1.5882, + "grad_norm": 0.975665807723999, + "learning_rate": 0.00029729999999999996, + "loss": 1.3631, "step": 1001 }, { "epoch": 1.6032, - "grad_norm": 1.0317885875701904, - "learning_rate": 0.00039639999999999993, - "loss": 1.7007, + "grad_norm": 1.125833511352539, + "learning_rate": 0.00029759999999999997, + "loss": 1.8541, "step": 1002 }, { "epoch": 1.6048, - "grad_norm": 1.605910062789917, - "learning_rate": 0.00039679999999999994, - "loss": 1.6842, + "grad_norm": 0.9652669429779053, + "learning_rate": 0.0002979, + "loss": 1.4736, "step": 1003 }, { "epoch": 1.6064, - "grad_norm": 1.5912996530532837, - "learning_rate": 0.0003972, - "loss": 1.3565, + "grad_norm": 1.7514461278915405, + "learning_rate": 0.0002982, + "loss": 1.3087, "step": 1004 }, { "epoch": 1.608, - "grad_norm": 0.761491596698761, - "learning_rate": 0.00039759999999999996, - "loss": 1.4429, + "grad_norm": 1.1740005016326904, + "learning_rate": 0.0002985, + "loss": 1.683, "step": 1005 }, { "epoch": 1.6096, - "grad_norm": 0.8185489177703857, - "learning_rate": 0.00039799999999999997, - "loss": 1.319, + "grad_norm": 1.2667077779769897, + "learning_rate": 0.0002988, + "loss": 1.3041, "step": 1006 }, { "epoch": 1.6112, - "grad_norm": 1.1462265253067017, - "learning_rate": 0.0003984, - "loss": 1.4704, + "grad_norm": 0.9430153965950012, + "learning_rate": 0.00029909999999999995, + "loss": 1.62, "step": 1007 }, { "epoch": 1.6128, - "grad_norm": 0.7877523899078369, - "learning_rate": 0.00039879999999999994, - "loss": 1.4737, + "grad_norm": 1.3095754384994507, + "learning_rate": 0.00029939999999999996, + "loss": 2.0639, "step": 1008 }, { "epoch": 1.6143999999999998, - "grad_norm": 1.2089191675186157, - "learning_rate": 0.00039919999999999995, - "loss": 1.8243, + "grad_norm": 1.265551209449768, + "learning_rate": 0.00029969999999999997, + "loss": 1.5279, "step": 1009 }, { "epoch": 1.616, - "grad_norm": 1.1152609586715698, - "learning_rate": 0.0003996, - "loss": 1.5541, + "grad_norm": 1.1508311033248901, + "learning_rate": 0.0003, + "loss": 1.6194, "step": 1010 }, { "epoch": 1.6176, - "grad_norm": 1.2637883424758911, - "learning_rate": 0.00039999999999999996, - "loss": 1.4111, + "grad_norm": 0.9043980836868286, + "learning_rate": 0.0003002999999999999, + "loss": 1.5519, "step": 1011 }, { "epoch": 1.6192, - "grad_norm": 1.272303819656372, - "learning_rate": 0.0004004, - "loss": 1.642, + "grad_norm": 1.0414581298828125, + "learning_rate": 0.0003006, + "loss": 1.4572, "step": 1012 }, { "epoch": 1.6208, - "grad_norm": 0.77174973487854, - "learning_rate": 0.0004008, - "loss": 1.6019, + "grad_norm": 1.113398790359497, + "learning_rate": 0.00030089999999999994, + "loss": 1.5507, "step": 1013 }, { "epoch": 1.6223999999999998, - "grad_norm": 1.0357544422149658, - "learning_rate": 0.00040119999999999994, - "loss": 1.7662, + "grad_norm": 1.0045056343078613, + "learning_rate": 0.00030119999999999995, + "loss": 1.6429, "step": 1014 }, { "epoch": 1.624, - "grad_norm": 1.2624030113220215, - "learning_rate": 0.00040159999999999995, - "loss": 1.44, + "grad_norm": 1.2001416683197021, + "learning_rate": 0.00030149999999999996, + "loss": 1.6436, "step": 1015 }, { "epoch": 1.6256, - "grad_norm": 2.911835193634033, - "learning_rate": 0.000402, - "loss": 1.9629, + "grad_norm": 1.195292353630066, + "learning_rate": 0.00030179999999999996, + "loss": 1.8505, "step": 1016 }, { "epoch": 1.6272, - "grad_norm": 1.212441325187683, - "learning_rate": 0.00040239999999999997, - "loss": 1.368, + "grad_norm": 1.6948364973068237, + "learning_rate": 0.0003020999999999999, + "loss": 1.4089, "step": 1017 }, { "epoch": 1.6288, - "grad_norm": 1.2743821144104004, - "learning_rate": 0.0004028, - "loss": 1.4819, + "grad_norm": 2.20973539352417, + "learning_rate": 0.0003024, + "loss": 2.0513, "step": 1018 }, { "epoch": 1.6303999999999998, - "grad_norm": 1.1178932189941406, - "learning_rate": 0.0004032, - "loss": 1.8235, + "grad_norm": 1.357258677482605, + "learning_rate": 0.00030269999999999993, + "loss": 1.5901, "step": 1019 }, { "epoch": 1.6320000000000001, - "grad_norm": 1.2212958335876465, - "learning_rate": 0.00040359999999999994, - "loss": 1.6722, + "grad_norm": 1.2716312408447266, + "learning_rate": 0.000303, + "loss": 1.8127, "step": 1020 }, { "epoch": 1.6336, - "grad_norm": 1.7435489892959595, - "learning_rate": 0.00040399999999999995, - "loss": 1.5568, + "grad_norm": 1.8802478313446045, + "learning_rate": 0.00030329999999999995, + "loss": 1.9268, "step": 1021 }, { "epoch": 1.6352, - "grad_norm": 1.944061040878296, - "learning_rate": 0.0004044, - "loss": 1.452, + "grad_norm": 1.2685855627059937, + "learning_rate": 0.00030359999999999995, + "loss": 1.8009, "step": 1022 }, { "epoch": 1.6368, - "grad_norm": 1.9267961978912354, - "learning_rate": 0.0004048, - "loss": 1.7871, + "grad_norm": 1.5773471593856812, + "learning_rate": 0.00030389999999999996, + "loss": 1.7688, "step": 1023 }, { "epoch": 1.6383999999999999, - "grad_norm": 2.1189751625061035, - "learning_rate": 0.0004052, - "loss": 2.1738, + "grad_norm": 1.9859917163848877, + "learning_rate": 0.00030419999999999997, + "loss": 2.0243, "step": 1024 }, { "epoch": 1.6400000000000001, - "grad_norm": 1.8044579029083252, - "learning_rate": 0.0004056, - "loss": 2.0679, + "grad_norm": 1.7131433486938477, + "learning_rate": 0.0003044999999999999, + "loss": 1.8381, "step": 1025 }, { "epoch": 1.6416, - "grad_norm": 9.363924980163574, - "learning_rate": 0.00040599999999999995, - "loss": 2.8019, + "grad_norm": 10.376376152038574, + "learning_rate": 0.0003048, + "loss": 3.4879, "step": 1026 }, { "epoch": 1.6432, - "grad_norm": 4.761209964752197, - "learning_rate": 0.00040639999999999996, - "loss": 2.2336, + "grad_norm": 5.2158284187316895, + "learning_rate": 0.00030509999999999994, + "loss": 2.5737, "step": 1027 }, { "epoch": 1.6448, - "grad_norm": 7.563479900360107, - "learning_rate": 0.00040679999999999997, - "loss": 2.7444, + "grad_norm": 7.62057638168335, + "learning_rate": 0.0003054, + "loss": 3.0633, "step": 1028 }, { "epoch": 1.6463999999999999, - "grad_norm": 3.261016845703125, - "learning_rate": 0.0004071999999999999, - "loss": 1.9803, + "grad_norm": 2.5151166915893555, + "learning_rate": 0.00030569999999999995, + "loss": 1.9631, "step": 1029 }, { "epoch": 1.6480000000000001, - "grad_norm": 2.0877084732055664, - "learning_rate": 0.0004076, - "loss": 1.787, + "grad_norm": 2.6367363929748535, + "learning_rate": 0.00030599999999999996, + "loss": 2.0063, "step": 1030 }, { "epoch": 1.6496, - "grad_norm": 1.2679075002670288, - "learning_rate": 0.000408, - "loss": 1.6983, + "grad_norm": 2.630662679672241, + "learning_rate": 0.00030629999999999996, + "loss": 1.661, "step": 1031 }, { "epoch": 1.6512, - "grad_norm": 2.241767168045044, - "learning_rate": 0.00040839999999999995, - "loss": 1.8019, + "grad_norm": 3.1744046211242676, + "learning_rate": 0.00030659999999999997, + "loss": 1.9693, "step": 1032 }, { "epoch": 1.6528, - "grad_norm": 2.6402766704559326, - "learning_rate": 0.00040879999999999996, - "loss": 1.6751, + "grad_norm": 3.2799153327941895, + "learning_rate": 0.0003068999999999999, + "loss": 1.8917, "step": 1033 }, { "epoch": 1.6543999999999999, - "grad_norm": 0.9462832808494568, - "learning_rate": 0.00040919999999999997, - "loss": 1.9119, + "grad_norm": 2.914391279220581, + "learning_rate": 0.0003072, + "loss": 1.8649, "step": 1034 }, { "epoch": 1.6560000000000001, - "grad_norm": 1.57070791721344, - "learning_rate": 0.00040959999999999993, - "loss": 1.4415, + "grad_norm": 1.2659504413604736, + "learning_rate": 0.00030749999999999994, + "loss": 1.7872, "step": 1035 }, { "epoch": 1.6576, - "grad_norm": 0.9740553498268127, - "learning_rate": 0.00041, - "loss": 1.5275, + "grad_norm": 1.2379803657531738, + "learning_rate": 0.0003078, + "loss": 1.4999, "step": 1036 }, { "epoch": 1.6592, - "grad_norm": 2.7589497566223145, - "learning_rate": 0.0004104, - "loss": 1.6575, + "grad_norm": 2.3075740337371826, + "learning_rate": 0.00030809999999999995, + "loss": 1.6165, "step": 1037 }, { "epoch": 1.6608, - "grad_norm": 2.4938724040985107, - "learning_rate": 0.00041079999999999996, - "loss": 1.8596, + "grad_norm": 2.4933927059173584, + "learning_rate": 0.00030839999999999996, + "loss": 1.5173, "step": 1038 }, { "epoch": 1.6623999999999999, - "grad_norm": 1.9689472913742065, - "learning_rate": 0.00041119999999999997, - "loss": 1.3647, + "grad_norm": 1.777960181236267, + "learning_rate": 0.00030869999999999997, + "loss": 1.3944, "step": 1039 }, { "epoch": 1.6640000000000001, - "grad_norm": 2.6272342205047607, - "learning_rate": 0.0004116, - "loss": 1.4989, + "grad_norm": 2.229851484298706, + "learning_rate": 0.000309, + "loss": 1.6344, "step": 1040 }, { "epoch": 1.6656, - "grad_norm": 1.4329277276992798, - "learning_rate": 0.00041199999999999993, - "loss": 1.2195, + "grad_norm": 1.5905592441558838, + "learning_rate": 0.00030929999999999993, + "loss": 1.4025, "step": 1041 }, { "epoch": 1.6672, - "grad_norm": 1.3667677640914917, - "learning_rate": 0.0004124, - "loss": 1.3946, + "grad_norm": 0.9465227723121643, + "learning_rate": 0.0003096, + "loss": 1.5164, "step": 1042 }, { "epoch": 1.6688, - "grad_norm": 0.6925958395004272, - "learning_rate": 0.00041279999999999995, - "loss": 1.3634, + "grad_norm": 1.2524248361587524, + "learning_rate": 0.00030989999999999994, + "loss": 1.5344, "step": 1043 }, { "epoch": 1.6703999999999999, - "grad_norm": 0.9826893210411072, - "learning_rate": 0.00041319999999999996, - "loss": 1.4918, + "grad_norm": 0.9570756554603577, + "learning_rate": 0.0003102, + "loss": 1.4809, "step": 1044 }, { "epoch": 1.6720000000000002, - "grad_norm": 1.4587767124176025, - "learning_rate": 0.00041359999999999997, - "loss": 1.6036, + "grad_norm": 1.3065333366394043, + "learning_rate": 0.00031049999999999996, + "loss": 1.3097, "step": 1045 }, { "epoch": 1.6736, - "grad_norm": 0.8268725872039795, - "learning_rate": 0.0004139999999999999, - "loss": 1.5278, + "grad_norm": 0.7914066314697266, + "learning_rate": 0.00031079999999999997, + "loss": 1.5068, "step": 1046 }, { "epoch": 1.6752, - "grad_norm": 0.8046411871910095, - "learning_rate": 0.00041439999999999994, - "loss": 1.1556, + "grad_norm": 1.5971769094467163, + "learning_rate": 0.00031109999999999997, + "loss": 1.7266, "step": 1047 }, { "epoch": 1.6768, - "grad_norm": 0.6971915364265442, - "learning_rate": 0.0004148, - "loss": 1.6188, + "grad_norm": 2.6839516162872314, + "learning_rate": 0.0003114, + "loss": 1.7296, "step": 1048 }, { "epoch": 1.6784, - "grad_norm": 2.461832284927368, - "learning_rate": 0.00041519999999999995, - "loss": 1.4803, + "grad_norm": 1.134955883026123, + "learning_rate": 0.00031169999999999993, + "loss": 1.1195, "step": 1049 }, { "epoch": 1.6800000000000002, - "grad_norm": 1.7602179050445557, - "learning_rate": 0.00041559999999999996, - "loss": 1.4928, + "grad_norm": 0.9906765222549438, + "learning_rate": 0.000312, + "loss": 1.4243, "step": 1050 }, { "epoch": 1.6816, - "grad_norm": 0.8309889435768127, - "learning_rate": 0.000416, - "loss": 1.4609, + "grad_norm": 1.2784078121185303, + "learning_rate": 0.00031229999999999995, + "loss": 1.5288, "step": 1051 }, { "epoch": 1.6832, - "grad_norm": 1.2826169729232788, - "learning_rate": 0.00041639999999999993, - "loss": 1.5435, + "grad_norm": 0.8589702844619751, + "learning_rate": 0.0003126, + "loss": 1.5027, "step": 1052 }, { "epoch": 1.6848, - "grad_norm": 0.8392765522003174, - "learning_rate": 0.00041679999999999994, - "loss": 1.326, + "grad_norm": 0.9791033864021301, + "learning_rate": 0.00031289999999999996, + "loss": 1.3079, "step": 1053 }, { "epoch": 1.6864, - "grad_norm": 0.830362856388092, - "learning_rate": 0.0004172, - "loss": 1.5522, + "grad_norm": 1.6374212503433228, + "learning_rate": 0.00031319999999999997, + "loss": 1.7103, "step": 1054 }, { "epoch": 1.688, - "grad_norm": 1.0867887735366821, - "learning_rate": 0.00041759999999999996, - "loss": 1.5438, + "grad_norm": 0.799659252166748, + "learning_rate": 0.0003135, + "loss": 1.6644, "step": 1055 }, { "epoch": 1.6896, - "grad_norm": 1.0517538785934448, - "learning_rate": 0.00041799999999999997, - "loss": 1.3964, + "grad_norm": 1.1091631650924683, + "learning_rate": 0.0003138, + "loss": 1.3437, "step": 1056 }, { "epoch": 1.6912, - "grad_norm": 1.4779033660888672, - "learning_rate": 0.0004184, - "loss": 1.5742, + "grad_norm": 1.3691285848617554, + "learning_rate": 0.00031409999999999994, + "loss": 1.3581, "step": 1057 }, { "epoch": 1.6928, - "grad_norm": 0.9230872392654419, - "learning_rate": 0.00041879999999999993, - "loss": 1.3136, + "grad_norm": 1.286253571510315, + "learning_rate": 0.0003144, + "loss": 1.3635, "step": 1058 }, { "epoch": 1.6944, - "grad_norm": 1.0470373630523682, - "learning_rate": 0.00041919999999999994, - "loss": 1.7794, + "grad_norm": 0.8725206851959229, + "learning_rate": 0.00031469999999999995, + "loss": 1.7202, "step": 1059 }, { "epoch": 1.696, - "grad_norm": 1.1661121845245361, - "learning_rate": 0.0004196, - "loss": 1.6266, + "grad_norm": 1.2710314989089966, + "learning_rate": 0.00031499999999999996, + "loss": 1.4148, "step": 1060 }, { "epoch": 1.6976, - "grad_norm": 0.8189252614974976, - "learning_rate": 0.00041999999999999996, - "loss": 1.2236, + "grad_norm": 1.11667799949646, + "learning_rate": 0.00031529999999999997, + "loss": 1.226, "step": 1061 }, { "epoch": 1.6992, - "grad_norm": 1.0342382192611694, - "learning_rate": 0.0004204, - "loss": 1.4281, + "grad_norm": 1.9654427766799927, + "learning_rate": 0.0003156, + "loss": 1.8421, "step": 1062 }, { "epoch": 1.7008, - "grad_norm": 0.9032308459281921, - "learning_rate": 0.0004208, - "loss": 1.5472, + "grad_norm": 1.2792798280715942, + "learning_rate": 0.0003158999999999999, + "loss": 1.6172, "step": 1063 }, { "epoch": 1.7024, - "grad_norm": 0.8276682496070862, - "learning_rate": 0.00042119999999999994, - "loss": 1.397, + "grad_norm": 1.032118558883667, + "learning_rate": 0.0003162, + "loss": 1.5739, "step": 1064 }, { "epoch": 1.704, - "grad_norm": 1.0572770833969116, - "learning_rate": 0.00042159999999999995, - "loss": 1.6688, + "grad_norm": 1.32432222366333, + "learning_rate": 0.00031649999999999994, + "loss": 1.6586, "step": 1065 }, { "epoch": 1.7056, - "grad_norm": 1.0404248237609863, - "learning_rate": 0.00042199999999999996, - "loss": 1.5673, + "grad_norm": 1.1895533800125122, + "learning_rate": 0.0003168, + "loss": 1.6227, "step": 1066 }, { "epoch": 1.7072, - "grad_norm": 1.2805649042129517, - "learning_rate": 0.0004223999999999999, - "loss": 1.6057, + "grad_norm": 1.4331198930740356, + "learning_rate": 0.00031709999999999996, + "loss": 1.5119, "step": 1067 }, { "epoch": 1.7088, - "grad_norm": 1.1005247831344604, - "learning_rate": 0.0004228, - "loss": 1.0563, + "grad_norm": 1.058091163635254, + "learning_rate": 0.00031739999999999996, + "loss": 1.7509, "step": 1068 }, { "epoch": 1.7104, - "grad_norm": 1.6212459802627563, - "learning_rate": 0.0004232, - "loss": 1.4625, + "grad_norm": 0.9952759742736816, + "learning_rate": 0.00031769999999999997, + "loss": 2.0645, "step": 1069 }, { "epoch": 1.712, - "grad_norm": 1.2400896549224854, - "learning_rate": 0.00042359999999999994, - "loss": 1.9359, + "grad_norm": 1.0876610279083252, + "learning_rate": 0.000318, + "loss": 1.5768, "step": 1070 }, { "epoch": 1.7136, - "grad_norm": 1.3954520225524902, - "learning_rate": 0.00042399999999999995, - "loss": 1.6781, + "grad_norm": 1.2695331573486328, + "learning_rate": 0.00031829999999999993, + "loss": 1.549, "step": 1071 }, { "epoch": 1.7151999999999998, - "grad_norm": 1.5502781867980957, - "learning_rate": 0.00042439999999999996, - "loss": 1.6238, + "grad_norm": 1.5271435976028442, + "learning_rate": 0.0003186, + "loss": 1.9173, "step": 1072 }, { "epoch": 1.7168, - "grad_norm": 1.2833088636398315, - "learning_rate": 0.0004247999999999999, - "loss": 1.9013, + "grad_norm": 1.755866527557373, + "learning_rate": 0.00031889999999999995, + "loss": 2.0375, "step": 1073 }, { "epoch": 1.7184, - "grad_norm": 1.4674335718154907, - "learning_rate": 0.0004252, - "loss": 1.531, + "grad_norm": 1.5030609369277954, + "learning_rate": 0.0003192, + "loss": 2.4663, "step": 1074 }, { "epoch": 1.72, "grad_norm": NaN, - "learning_rate": 0.0004252, - "loss": 1.2603, + "learning_rate": 0.0003192, + "loss": 1.5195, "step": 1075 }, { "epoch": 1.7216, - "grad_norm": 9.69878101348877, - "learning_rate": 0.0004256, - "loss": 3.4215, + "grad_norm": 6.813086032867432, + "learning_rate": 0.00031949999999999996, + "loss": 2.6142, "step": 1076 }, { "epoch": 1.7231999999999998, - "grad_norm": 4.316009998321533, - "learning_rate": 0.00042599999999999995, - "loss": 2.3031, + "grad_norm": 6.0582990646362305, + "learning_rate": 0.00031979999999999997, + "loss": 3.1514, "step": 1077 }, { "epoch": 1.7248, - "grad_norm": 2.6098554134368896, - "learning_rate": 0.00042639999999999996, - "loss": 2.5445, + "grad_norm": 1.800521969795227, + "learning_rate": 0.0003201, + "loss": 2.0795, "step": 1078 }, { "epoch": 1.7264, - "grad_norm": 2.034123182296753, - "learning_rate": 0.00042679999999999997, - "loss": 1.8094, + "grad_norm": 1.409582495689392, + "learning_rate": 0.0003204, + "loss": 1.9426, "step": 1079 }, { "epoch": 1.728, - "grad_norm": 2.0914783477783203, - "learning_rate": 0.0004271999999999999, - "loss": 1.9322, + "grad_norm": 2.2173216342926025, + "learning_rate": 0.00032069999999999993, + "loss": 1.8343, "step": 1080 }, { "epoch": 1.7296, - "grad_norm": 2.9455251693725586, - "learning_rate": 0.0004276, - "loss": 2.1734, + "grad_norm": 1.2972121238708496, + "learning_rate": 0.000321, + "loss": 2.2537, "step": 1081 }, { "epoch": 1.7311999999999999, - "grad_norm": 2.6501622200012207, - "learning_rate": 0.000428, - "loss": 2.0604, + "grad_norm": 2.211015224456787, + "learning_rate": 0.00032129999999999995, + "loss": 1.8755, "step": 1082 }, { "epoch": 1.7328000000000001, - "grad_norm": 1.1894841194152832, - "learning_rate": 0.00042839999999999995, - "loss": 2.3638, + "grad_norm": 1.617491364479065, + "learning_rate": 0.0003216, + "loss": 1.6271, "step": 1083 }, { "epoch": 1.7344, - "grad_norm": 1.127158284187317, - "learning_rate": 0.00042879999999999996, - "loss": 1.8291, + "grad_norm": 1.3888314962387085, + "learning_rate": 0.00032189999999999996, + "loss": 1.6329, "step": 1084 }, { "epoch": 1.736, - "grad_norm": 1.0093775987625122, - "learning_rate": 0.00042919999999999997, - "loss": 1.6005, + "grad_norm": 2.2377328872680664, + "learning_rate": 0.00032219999999999997, + "loss": 1.8106, "step": 1085 }, { "epoch": 1.7376, - "grad_norm": 1.7011162042617798, - "learning_rate": 0.0004295999999999999, - "loss": 1.5392, + "grad_norm": 1.4609044790267944, + "learning_rate": 0.0003225, + "loss": 1.2795, "step": 1086 }, { "epoch": 1.7391999999999999, - "grad_norm": 5.270832538604736, - "learning_rate": 0.00043, - "loss": 2.0137, + "grad_norm": 1.2239532470703125, + "learning_rate": 0.0003228, + "loss": 1.7129, "step": 1087 }, { "epoch": 1.7408000000000001, - "grad_norm": 4.517563343048096, - "learning_rate": 0.0004304, - "loss": 1.8987, + "grad_norm": 1.6309138536453247, + "learning_rate": 0.00032309999999999994, + "loss": 1.3214, "step": 1088 }, { "epoch": 1.7424, - "grad_norm": 2.386322498321533, - "learning_rate": 0.00043079999999999995, - "loss": 1.659, + "grad_norm": 2.0995709896087646, + "learning_rate": 0.0003234, + "loss": 1.5447, "step": 1089 }, { "epoch": 1.744, - "grad_norm": 2.8278114795684814, - "learning_rate": 0.00043119999999999996, - "loss": 1.7082, + "grad_norm": 1.7949155569076538, + "learning_rate": 0.00032369999999999995, + "loss": 1.4792, "step": 1090 }, { "epoch": 1.7456, - "grad_norm": 2.6224570274353027, - "learning_rate": 0.0004316, - "loss": 1.844, + "grad_norm": 0.9978499412536621, + "learning_rate": 0.000324, + "loss": 1.2458, "step": 1091 }, { "epoch": 1.7471999999999999, - "grad_norm": 0.962620198726654, - "learning_rate": 0.00043199999999999993, - "loss": 1.74, + "grad_norm": 0.9504018425941467, + "learning_rate": 0.00032429999999999997, + "loss": 1.4711, "step": 1092 }, { "epoch": 1.7488000000000001, - "grad_norm": 1.115366816520691, - "learning_rate": 0.0004324, - "loss": 1.781, + "grad_norm": 1.0632179975509644, + "learning_rate": 0.0003246, + "loss": 1.5718, "step": 1093 }, { "epoch": 1.7504, - "grad_norm": 1.3309029340744019, - "learning_rate": 0.0004328, - "loss": 1.7973, + "grad_norm": 1.2612885236740112, + "learning_rate": 0.0003249, + "loss": 1.542, "step": 1094 }, { "epoch": 1.752, - "grad_norm": 1.3224897384643555, - "learning_rate": 0.00043319999999999996, - "loss": 1.6193, + "grad_norm": 1.4393160343170166, + "learning_rate": 0.0003252, + "loss": 1.4533, "step": 1095 }, { "epoch": 1.7536, - "grad_norm": 0.8370805382728577, - "learning_rate": 0.00043359999999999997, - "loss": 1.541, + "grad_norm": 1.2894624471664429, + "learning_rate": 0.00032549999999999994, + "loss": 1.2829, "step": 1096 }, { "epoch": 1.7551999999999999, - "grad_norm": 1.5298939943313599, - "learning_rate": 0.000434, - "loss": 1.5882, + "grad_norm": 0.9237915873527527, + "learning_rate": 0.0003258, + "loss": 1.5936, "step": 1097 }, { "epoch": 1.7568000000000001, - "grad_norm": 2.6840977668762207, - "learning_rate": 0.00043439999999999993, - "loss": 1.9575, + "grad_norm": 0.7497216463088989, + "learning_rate": 0.00032609999999999996, + "loss": 1.6193, "step": 1098 }, { "epoch": 1.7584, - "grad_norm": 1.4588136672973633, - "learning_rate": 0.0004348, - "loss": 2.0164, + "grad_norm": 2.3699493408203125, + "learning_rate": 0.0003264, + "loss": 1.4169, "step": 1099 }, { "epoch": 1.76, - "grad_norm": 1.8328728675842285, - "learning_rate": 0.0004352, - "loss": 1.768, - "step": 1100 - }, - { - "epoch": 1.76, - "eval_cer": 0.41231713636544826, - "eval_loss": 2.0711567401885986, - "eval_runtime": 157.9834, - "eval_samples_per_second": 19.85, - "eval_steps_per_second": 1.241, - "eval_wer": 0.6448243705315512, + "grad_norm": 1.4830172061920166, + "learning_rate": 0.00032669999999999997, + "loss": 1.3362, "step": 1100 }, { "epoch": 1.7616, - "grad_norm": 1.0604596138000488, - "learning_rate": 0.00043559999999999996, - "loss": 1.4263, + "grad_norm": 1.660394310951233, + "learning_rate": 0.000327, + "loss": 1.5181, "step": 1101 }, { "epoch": 1.7631999999999999, - "grad_norm": 0.9676076769828796, - "learning_rate": 0.00043599999999999997, - "loss": 1.6473, + "grad_norm": 1.3307558298110962, + "learning_rate": 0.0003273, + "loss": 1.5403, "step": 1102 }, { "epoch": 1.7648000000000001, - "grad_norm": 0.9821343421936035, - "learning_rate": 0.0004364, - "loss": 1.8823, + "grad_norm": 1.7247034311294556, + "learning_rate": 0.0003276, + "loss": 1.5718, "step": 1103 }, { "epoch": 1.7664, - "grad_norm": 0.9323663711547852, - "learning_rate": 0.00043679999999999994, - "loss": 1.5884, + "grad_norm": 1.1111986637115479, + "learning_rate": 0.00032789999999999995, + "loss": 1.3949, "step": 1104 }, { "epoch": 1.768, - "grad_norm": 0.6213338971138, - "learning_rate": 0.00043719999999999995, - "loss": 1.5625, + "grad_norm": 1.0019989013671875, + "learning_rate": 0.0003282, + "loss": 1.486, "step": 1105 }, { "epoch": 1.7696, - "grad_norm": 0.767331063747406, - "learning_rate": 0.0004375999999999999, - "loss": 1.5178, + "grad_norm": 0.9997567534446716, + "learning_rate": 0.00032849999999999996, + "loss": 1.4654, "step": 1106 }, { "epoch": 1.7711999999999999, - "grad_norm": 1.7353647947311401, - "learning_rate": 0.00043799999999999997, - "loss": 1.7371, + "grad_norm": 0.9817771315574646, + "learning_rate": 0.0003288, + "loss": 1.4524, "step": 1107 }, { "epoch": 1.7728000000000002, - "grad_norm": 1.3577700853347778, - "learning_rate": 0.0004384, - "loss": 1.5416, + "grad_norm": 1.1444945335388184, + "learning_rate": 0.0003291, + "loss": 1.525, "step": 1108 }, { "epoch": 1.7744, - "grad_norm": 1.7404487133026123, - "learning_rate": 0.00043879999999999993, - "loss": 1.4647, + "grad_norm": 1.1713188886642456, + "learning_rate": 0.0003294, + "loss": 1.4727, "step": 1109 }, { "epoch": 1.776, - "grad_norm": 2.192164659500122, - "learning_rate": 0.00043919999999999994, - "loss": 1.7362, + "grad_norm": 1.6740111112594604, + "learning_rate": 0.0003297, + "loss": 1.6013, "step": 1110 }, { "epoch": 1.7776, - "grad_norm": 0.7440006732940674, - "learning_rate": 0.00043959999999999995, - "loss": 1.8019, + "grad_norm": 1.3131376504898071, + "learning_rate": 0.00033, + "loss": 1.5525, "step": 1111 }, { "epoch": 1.7792, - "grad_norm": 0.8111011385917664, - "learning_rate": 0.0004399999999999999, - "loss": 1.6845, + "grad_norm": 1.5951952934265137, + "learning_rate": 0.00033029999999999995, + "loss": 1.5049, "step": 1112 }, { "epoch": 1.7808000000000002, - "grad_norm": 0.8545581102371216, - "learning_rate": 0.00044039999999999997, - "loss": 1.8391, + "grad_norm": 1.0123345851898193, + "learning_rate": 0.0003306, + "loss": 1.655, "step": 1113 }, { "epoch": 1.7824, - "grad_norm": 1.5863672494888306, - "learning_rate": 0.0004408, - "loss": 1.8954, + "grad_norm": 0.9980636239051819, + "learning_rate": 0.00033089999999999997, + "loss": 1.413, "step": 1114 }, { "epoch": 1.784, - "grad_norm": 0.9695616960525513, - "learning_rate": 0.00044119999999999994, - "loss": 1.4524, + "grad_norm": 1.0149414539337158, + "learning_rate": 0.0003312, + "loss": 1.3944, "step": 1115 }, { "epoch": 1.7856, - "grad_norm": 0.8583353757858276, - "learning_rate": 0.00044159999999999995, - "loss": 1.91, + "grad_norm": 1.0100555419921875, + "learning_rate": 0.0003315, + "loss": 1.6389, "step": 1116 }, { "epoch": 1.7872, - "grad_norm": 1.3024649620056152, - "learning_rate": 0.00044199999999999996, - "loss": 2.0295, + "grad_norm": 1.0911948680877686, + "learning_rate": 0.0003318, + "loss": 1.6691, "step": 1117 }, { "epoch": 1.7888, - "grad_norm": 1.960022211074829, - "learning_rate": 0.0004423999999999999, - "loss": 1.8059, + "grad_norm": 0.9347399473190308, + "learning_rate": 0.00033209999999999994, + "loss": 2.1185, "step": 1118 }, { "epoch": 1.7904, - "grad_norm": 0.7005279660224915, - "learning_rate": 0.0004428, - "loss": 1.7367, + "grad_norm": 0.9940705895423889, + "learning_rate": 0.0003324, + "loss": 1.8248, "step": 1119 }, { "epoch": 1.792, - "grad_norm": 1.53810453414917, - "learning_rate": 0.0004432, - "loss": 1.683, + "grad_norm": 1.3178499937057495, + "learning_rate": 0.00033269999999999996, + "loss": 1.3904, "step": 1120 }, { "epoch": 1.7936, - "grad_norm": 1.2621592283248901, - "learning_rate": 0.00044359999999999994, - "loss": 1.8247, + "grad_norm": 1.338726282119751, + "learning_rate": 0.000333, + "loss": 1.6328, "step": 1121 }, { "epoch": 1.7952, - "grad_norm": 1.0741353034973145, - "learning_rate": 0.00044399999999999995, - "loss": 1.6832, + "grad_norm": 1.6837692260742188, + "learning_rate": 0.00033329999999999997, + "loss": 1.8429, "step": 1122 }, { "epoch": 1.7968, - "grad_norm": 1.36100172996521, - "learning_rate": 0.00044439999999999996, - "loss": 1.4735, + "grad_norm": 1.9074715375900269, + "learning_rate": 0.0003336, + "loss": 1.9857, "step": 1123 }, { "epoch": 1.7984, - "grad_norm": 1.0110926628112793, - "learning_rate": 0.0004447999999999999, - "loss": 1.9363, + "grad_norm": 1.2642357349395752, + "learning_rate": 0.0003339, + "loss": 2.0672, "step": 1124 }, { "epoch": 1.8, - "grad_norm": 1.4565409421920776, - "learning_rate": 0.0004452, - "loss": 2.7498, + "grad_norm": 1.70827054977417, + "learning_rate": 0.0003342, + "loss": 2.2459, "step": 1125 }, { "epoch": 1.8016, - "grad_norm": 13.088458061218262, - "learning_rate": 0.0004456, - "loss": 3.6198, + "grad_norm": 8.30029582977295, + "learning_rate": 0.00033449999999999994, + "loss": 3.0681, "step": 1126 }, { "epoch": 1.8032, - "grad_norm": 2.855563163757324, - "learning_rate": 0.00044599999999999994, - "loss": 2.1456, + "grad_norm": 3.6573855876922607, + "learning_rate": 0.0003348, + "loss": 2.4, "step": 1127 }, { "epoch": 1.8048, - "grad_norm": 1.2719941139221191, - "learning_rate": 0.00044639999999999995, - "loss": 1.972, + "grad_norm": 1.3085684776306152, + "learning_rate": 0.00033509999999999996, + "loss": 2.0479, "step": 1128 }, { "epoch": 1.8064, - "grad_norm": 2.133577585220337, - "learning_rate": 0.00044679999999999996, - "loss": 1.8491, + "grad_norm": 1.3093235492706299, + "learning_rate": 0.0003354, + "loss": 1.6513, "step": 1129 }, { "epoch": 1.808, - "grad_norm": 2.3331825733184814, - "learning_rate": 0.0004471999999999999, - "loss": 2.1332, + "grad_norm": 2.186389207839966, + "learning_rate": 0.0003357, + "loss": 1.64, "step": 1130 }, { "epoch": 1.8096, - "grad_norm": 2.8282387256622314, - "learning_rate": 0.0004476, - "loss": 2.1503, + "grad_norm": 2.4307668209075928, + "learning_rate": 0.000336, + "loss": 1.7372, "step": 1131 }, { "epoch": 1.8112, - "grad_norm": 1.7458895444869995, - "learning_rate": 0.000448, - "loss": 2.0523, + "grad_norm": 1.5897157192230225, + "learning_rate": 0.0003363, + "loss": 1.6175, "step": 1132 }, { "epoch": 1.8128, - "grad_norm": 1.0502371788024902, - "learning_rate": 0.00044839999999999995, - "loss": 1.8634, + "grad_norm": 1.1186329126358032, + "learning_rate": 0.0003366, + "loss": 1.5705, "step": 1133 }, { "epoch": 1.8144, - "grad_norm": 1.3230615854263306, - "learning_rate": 0.00044879999999999996, - "loss": 2.218, + "grad_norm": 1.862901210784912, + "learning_rate": 0.00033689999999999995, + "loss": 1.4379, "step": 1134 }, { "epoch": 1.8159999999999998, - "grad_norm": 2.9722142219543457, - "learning_rate": 0.00044919999999999997, - "loss": 1.9498, + "grad_norm": 3.026568651199341, + "learning_rate": 0.0003372, + "loss": 1.7887, "step": 1135 }, { "epoch": 1.8176, - "grad_norm": 5.391232967376709, - "learning_rate": 0.0004495999999999999, - "loss": 2.1481, + "grad_norm": 3.224273204803467, + "learning_rate": 0.00033749999999999996, + "loss": 1.5298, "step": 1136 }, { "epoch": 1.8192, - "grad_norm": 8.026174545288086, - "learning_rate": 0.00045, - "loss": 2.5871, + "grad_norm": 1.6268669366836548, + "learning_rate": 0.0003377999999999999, + "loss": 1.2549, "step": 1137 }, { "epoch": 1.8208, - "grad_norm": 5.394509315490723, - "learning_rate": 0.0004504, - "loss": 2.076, + "grad_norm": 1.8013404607772827, + "learning_rate": 0.0003381, + "loss": 1.3867, "step": 1138 }, { "epoch": 1.8224, - "grad_norm": 4.2326579093933105, - "learning_rate": 0.00045079999999999995, - "loss": 2.1723, + "grad_norm": 1.5397872924804688, + "learning_rate": 0.00033839999999999993, + "loss": 1.7614, "step": 1139 }, { "epoch": 1.8239999999999998, - "grad_norm": 2.653384208679199, - "learning_rate": 0.00045119999999999996, - "loss": 1.9721, + "grad_norm": 0.8708004355430603, + "learning_rate": 0.0003387, + "loss": 1.1885, "step": 1140 }, { "epoch": 1.8256000000000001, - "grad_norm": 2.5420026779174805, - "learning_rate": 0.00045159999999999997, - "loss": 1.8613, + "grad_norm": 0.7449163198471069, + "learning_rate": 0.00033899999999999995, + "loss": 1.4271, "step": 1141 }, { "epoch": 1.8272, - "grad_norm": 1.363802433013916, - "learning_rate": 0.00045199999999999993, - "loss": 2.0218, + "grad_norm": 0.8833471536636353, + "learning_rate": 0.00033929999999999995, + "loss": 1.8404, "step": 1142 }, { "epoch": 1.8288, - "grad_norm": 1.3699275255203247, - "learning_rate": 0.00045239999999999994, - "loss": 1.9391, + "grad_norm": 1.6660796403884888, + "learning_rate": 0.00033959999999999996, + "loss": 1.3491, "step": 1143 }, { "epoch": 1.8304, - "grad_norm": 1.7976224422454834, - "learning_rate": 0.0004528, - "loss": 1.9632, + "grad_norm": 0.85002601146698, + "learning_rate": 0.00033989999999999997, + "loss": 1.2806, "step": 1144 }, { "epoch": 1.8319999999999999, - "grad_norm": 1.7157306671142578, - "learning_rate": 0.00045319999999999996, - "loss": 2.0015, + "grad_norm": 0.9421733617782593, + "learning_rate": 0.0003401999999999999, + "loss": 1.1968, "step": 1145 }, { "epoch": 1.8336000000000001, - "grad_norm": 1.1712660789489746, - "learning_rate": 0.00045359999999999997, - "loss": 1.7752, + "grad_norm": 0.7952862977981567, + "learning_rate": 0.0003405, + "loss": 1.4257, "step": 1146 }, { "epoch": 1.8352, - "grad_norm": 0.9054010510444641, - "learning_rate": 0.000454, - "loss": 2.0706, + "grad_norm": 1.2081334590911865, + "learning_rate": 0.00034079999999999994, + "loss": 1.3157, "step": 1147 }, { "epoch": 1.8368, - "grad_norm": 0.8177084922790527, - "learning_rate": 0.00045439999999999993, - "loss": 1.7316, + "grad_norm": 1.6554081439971924, + "learning_rate": 0.0003411, + "loss": 1.4017, "step": 1148 }, { "epoch": 1.8384, - "grad_norm": 1.8948848247528076, - "learning_rate": 0.00045479999999999994, - "loss": 1.9209, + "grad_norm": 1.1112024784088135, + "learning_rate": 0.00034139999999999995, + "loss": 1.3912, "step": 1149 }, { "epoch": 1.8399999999999999, - "grad_norm": 2.157670021057129, - "learning_rate": 0.0004552, - "loss": 1.9837, + "grad_norm": 1.1887140274047852, + "learning_rate": 0.00034169999999999996, + "loss": 1.4092, "step": 1150 }, { "epoch": 1.8416000000000001, - "grad_norm": 2.8265748023986816, - "learning_rate": 0.00045559999999999996, - "loss": 2.0989, + "grad_norm": 0.9006046056747437, + "learning_rate": 0.00034199999999999996, + "loss": 1.4782, "step": 1151 }, { "epoch": 1.8432, - "grad_norm": 2.864417552947998, - "learning_rate": 0.00045599999999999997, - "loss": 1.8949, + "grad_norm": 0.8354558944702148, + "learning_rate": 0.00034229999999999997, + "loss": 1.3304, "step": 1152 }, { "epoch": 1.8448, - "grad_norm": 2.235643148422241, - "learning_rate": 0.0004564, - "loss": 1.7172, + "grad_norm": 1.1101363897323608, + "learning_rate": 0.0003425999999999999, + "loss": 1.4787, "step": 1153 }, { "epoch": 1.8464, - "grad_norm": 1.4401240348815918, - "learning_rate": 0.00045679999999999994, - "loss": 1.9149, + "grad_norm": 0.8940262198448181, + "learning_rate": 0.0003429, + "loss": 1.4119, "step": 1154 }, { "epoch": 1.8479999999999999, - "grad_norm": 1.7679356336593628, - "learning_rate": 0.00045719999999999995, - "loss": 1.9761, + "grad_norm": 0.973086953163147, + "learning_rate": 0.00034319999999999994, + "loss": 1.207, "step": 1155 }, { "epoch": 1.8496000000000001, - "grad_norm": 0.6671059131622314, - "learning_rate": 0.0004576, - "loss": 1.7264, + "grad_norm": 0.8686078786849976, + "learning_rate": 0.0003435, + "loss": 1.41, "step": 1156 }, { "epoch": 1.8512, - "grad_norm": 2.028003215789795, - "learning_rate": 0.00045799999999999997, - "loss": 2.0366, + "grad_norm": 1.9869338274002075, + "learning_rate": 0.00034379999999999995, + "loss": 1.6477, "step": 1157 }, { "epoch": 1.8528, - "grad_norm": 1.4847235679626465, - "learning_rate": 0.0004584, - "loss": 1.8503, + "grad_norm": 1.961246371269226, + "learning_rate": 0.00034409999999999996, + "loss": 1.8708, "step": 1158 }, { "epoch": 1.8544, - "grad_norm": 1.177909016609192, - "learning_rate": 0.0004588, - "loss": 1.9129, + "grad_norm": 1.5795491933822632, + "learning_rate": 0.00034439999999999997, + "loss": 1.3292, "step": 1159 }, { "epoch": 1.8559999999999999, - "grad_norm": 0.9721006751060486, - "learning_rate": 0.00045919999999999994, - "loss": 1.8643, + "grad_norm": 1.0717681646347046, + "learning_rate": 0.0003447, + "loss": 1.127, "step": 1160 }, { "epoch": 1.8576000000000001, - "grad_norm": 2.1391115188598633, - "learning_rate": 0.00045959999999999995, - "loss": 2.0204, + "grad_norm": 1.2065831422805786, + "learning_rate": 0.00034499999999999993, + "loss": 1.3418, "step": 1161 }, { "epoch": 1.8592, - "grad_norm": 1.4983505010604858, - "learning_rate": 0.00046, - "loss": 1.7141, + "grad_norm": 1.1401423215866089, + "learning_rate": 0.0003453, + "loss": 1.596, "step": 1162 }, { "epoch": 1.8608, - "grad_norm": 1.2664161920547485, - "learning_rate": 0.00046039999999999997, - "loss": 1.7594, + "grad_norm": 1.329084873199463, + "learning_rate": 0.00034559999999999994, + "loss": 1.5699, "step": 1163 }, { "epoch": 1.8624, - "grad_norm": 0.9460088610649109, - "learning_rate": 0.0004608, - "loss": 1.6094, + "grad_norm": 1.0124589204788208, + "learning_rate": 0.00034589999999999995, + "loss": 1.4628, "step": 1164 }, { "epoch": 1.8639999999999999, - "grad_norm": 0.9295836091041565, - "learning_rate": 0.0004612, - "loss": 1.848, + "grad_norm": 1.0017642974853516, + "learning_rate": 0.00034619999999999996, + "loss": 1.3672, "step": 1165 }, { "epoch": 1.8656000000000001, - "grad_norm": 1.0059525966644287, - "learning_rate": 0.00046159999999999994, - "loss": 1.9431, + "grad_norm": 1.0151982307434082, + "learning_rate": 0.00034649999999999997, + "loss": 1.7449, "step": 1166 }, { "epoch": 1.8672, - "grad_norm": 0.6941124796867371, - "learning_rate": 0.00046199999999999995, - "loss": 1.5408, + "grad_norm": 0.9948499202728271, + "learning_rate": 0.0003467999999999999, + "loss": 1.9511, "step": 1167 }, { "epoch": 1.8688, - "grad_norm": 2.3069934844970703, - "learning_rate": 0.0004624, - "loss": 1.9049, + "grad_norm": 1.5604969263076782, + "learning_rate": 0.0003471, + "loss": 1.8143, "step": 1168 }, { "epoch": 1.8704, - "grad_norm": 1.7218568325042725, - "learning_rate": 0.0004628, - "loss": 1.634, + "grad_norm": NaN, + "learning_rate": 0.0003471, + "loss": 0.9545, "step": 1169 }, { "epoch": 1.8719999999999999, - "grad_norm": 0.6982250213623047, - "learning_rate": 0.0004632, - "loss": 2.0402, + "grad_norm": 1.3670181035995483, + "learning_rate": 0.00034739999999999993, + "loss": 2.0108, "step": 1170 }, { "epoch": 1.8736000000000002, - "grad_norm": 0.6958193778991699, - "learning_rate": 0.00046359999999999994, - "loss": 1.8532, + "grad_norm": 1.0110036134719849, + "learning_rate": 0.0003477, + "loss": 1.3934, "step": 1171 }, { "epoch": 1.8752, - "grad_norm": 3.401237964630127, - "learning_rate": 0.00046399999999999995, - "loss": 2.1863, + "grad_norm": 1.3402187824249268, + "learning_rate": 0.00034799999999999995, + "loss": 2.1356, "step": 1172 }, { "epoch": 1.8768, - "grad_norm": 1.5297185182571411, - "learning_rate": 0.00046439999999999996, - "loss": 2.0749, + "grad_norm": 1.2022168636322021, + "learning_rate": 0.00034829999999999996, + "loss": 2.0219, "step": 1173 }, { "epoch": 1.8784, - "grad_norm": 0.9487005472183228, - "learning_rate": 0.0004647999999999999, - "loss": 2.2366, + "grad_norm": 1.476257562637329, + "learning_rate": 0.00034859999999999996, + "loss": 1.9049, "step": 1174 }, { "epoch": 1.88, - "grad_norm": 1.712845802307129, - "learning_rate": 0.0004651999999999999, - "loss": 2.9485, + "grad_norm": NaN, + "learning_rate": 0.00034859999999999996, + "loss": 1.8173, "step": 1175 }, { "epoch": 1.8816000000000002, - "grad_norm": 8.302671432495117, - "learning_rate": 0.0004656, - "loss": 3.1518, + "grad_norm": 9.76489543914795, + "learning_rate": 0.00034889999999999997, + "loss": 2.8974, "step": 1176 }, { "epoch": 1.8832, - "grad_norm": 1.6076334714889526, - "learning_rate": 0.00046599999999999994, - "loss": 2.0009, + "grad_norm": 3.76676082611084, + "learning_rate": 0.0003491999999999999, + "loss": 1.9594, "step": 1177 }, { "epoch": 1.8848, - "grad_norm": 2.757471799850464, - "learning_rate": 0.00046639999999999995, - "loss": 1.8295, + "grad_norm": 3.224771022796631, + "learning_rate": 0.0003495, + "loss": 1.8728, "step": 1178 }, { "epoch": 1.8864, - "grad_norm": 2.844334602355957, - "learning_rate": 0.00046679999999999996, - "loss": 1.9377, + "grad_norm": 3.7804224491119385, + "learning_rate": 0.00034979999999999994, + "loss": 2.5339, "step": 1179 }, { "epoch": 1.888, - "grad_norm": 1.4853931665420532, - "learning_rate": 0.0004671999999999999, - "loss": 2.4697, + "grad_norm": 1.3428702354431152, + "learning_rate": 0.0003501, + "loss": 1.7693, "step": 1180 }, { "epoch": 1.8896, - "grad_norm": 5.221473693847656, - "learning_rate": 0.00046759999999999993, - "loss": 2.337, + "grad_norm": 1.6462411880493164, + "learning_rate": 0.00035039999999999995, + "loss": 1.9127, "step": 1181 }, { "epoch": 1.8912, - "grad_norm": 5.037121772766113, - "learning_rate": 0.000468, - "loss": 2.47, + "grad_norm": 3.412652015686035, + "learning_rate": 0.00035069999999999996, + "loss": 1.7551, "step": 1182 }, { "epoch": 1.8928, - "grad_norm": 4.587244510650635, - "learning_rate": 0.00046839999999999995, - "loss": 2.3116, + "grad_norm": 3.8241982460021973, + "learning_rate": 0.00035099999999999997, + "loss": 1.9243, "step": 1183 }, { "epoch": 1.8944, - "grad_norm": 3.243819236755371, - "learning_rate": 0.00046879999999999996, - "loss": 2.4598, + "grad_norm": 0.9238096475601196, + "learning_rate": 0.0003513, + "loss": 2.2319, "step": 1184 }, { "epoch": 1.896, - "grad_norm": 2.0243616104125977, - "learning_rate": 0.00046919999999999997, - "loss": 2.3673, + "grad_norm": 2.0135838985443115, + "learning_rate": 0.0003515999999999999, + "loss": 1.9178, "step": 1185 }, { "epoch": 1.8976, - "grad_norm": 0.9840572476387024, - "learning_rate": 0.0004695999999999999, - "loss": 2.2649, + "grad_norm": 2.6889097690582275, + "learning_rate": 0.0003519, + "loss": 1.8285, "step": 1186 }, { "epoch": 1.8992, - "grad_norm": 4.131119728088379, - "learning_rate": 0.00046999999999999993, - "loss": 2.5321, + "grad_norm": 1.5762139558792114, + "learning_rate": 0.00035219999999999994, + "loss": 1.9024, "step": 1187 }, { "epoch": 1.9008, - "grad_norm": 6.485975742340088, - "learning_rate": 0.0004704, - "loss": 2.8195, + "grad_norm": 0.8470468521118164, + "learning_rate": 0.0003525, + "loss": 1.7886, "step": 1188 }, { "epoch": 1.9024, - "grad_norm": 7.827768802642822, - "learning_rate": 0.00047079999999999995, - "loss": 2.8283, + "grad_norm": 1.188670039176941, + "learning_rate": 0.00035279999999999996, + "loss": 1.8531, "step": 1189 }, { "epoch": 1.904, - "grad_norm": 10.367870330810547, - "learning_rate": 0.00047119999999999996, - "loss": 3.0633, + "grad_norm": 2.7455601692199707, + "learning_rate": 0.00035309999999999996, + "loss": 1.9635, "step": 1190 }, { "epoch": 1.9056, - "grad_norm": 7.246518135070801, - "learning_rate": 0.00047159999999999997, - "loss": 2.6929, + "grad_norm": 3.5593600273132324, + "learning_rate": 0.00035339999999999997, + "loss": 1.9976, "step": 1191 }, { "epoch": 1.9072, - "grad_norm": 8.31074333190918, - "learning_rate": 0.0004719999999999999, - "loss": 2.9722, + "grad_norm": 4.159093379974365, + "learning_rate": 0.0003537, + "loss": 1.9367, "step": 1192 }, { "epoch": 1.9088, - "grad_norm": 13.296765327453613, - "learning_rate": 0.00047239999999999994, - "loss": 3.661, + "grad_norm": 4.765223026275635, + "learning_rate": 0.00035399999999999993, + "loss": 2.1654, "step": 1193 }, { "epoch": 1.9104, - "grad_norm": 5.0115580558776855, - "learning_rate": 0.0004728, - "loss": 2.7741, + "grad_norm": 6.879053115844727, + "learning_rate": 0.0003543, + "loss": 2.4834, "step": 1194 }, { "epoch": 1.912, - "grad_norm": 2.283982038497925, - "learning_rate": 0.00047319999999999996, - "loss": 2.4196, + "grad_norm": 5.095763683319092, + "learning_rate": 0.00035459999999999995, + "loss": 2.262, "step": 1195 }, { "epoch": 1.9136, - "grad_norm": 1.2423416376113892, - "learning_rate": 0.00047359999999999997, - "loss": 2.4451, + "grad_norm": 4.1628899574279785, + "learning_rate": 0.0003549, + "loss": 2.3063, "step": 1196 }, { "epoch": 1.9152, - "grad_norm": 1.4220050573349, - "learning_rate": 0.000474, - "loss": 2.5677, + "grad_norm": 2.7070200443267822, + "learning_rate": 0.00035519999999999996, + "loss": 2.1153, "step": 1197 }, { "epoch": 1.9167999999999998, - "grad_norm": 1.648946762084961, - "learning_rate": 0.00047439999999999993, - "loss": 2.5203, + "grad_norm": 2.1983084678649902, + "learning_rate": 0.00035549999999999997, + "loss": 1.8997, "step": 1198 }, { "epoch": 1.9184, - "grad_norm": 1.3786659240722656, - "learning_rate": 0.00047479999999999994, - "loss": 2.3582, + "grad_norm": 0.9442070126533508, + "learning_rate": 0.0003558, + "loss": 2.1675, "step": 1199 }, { "epoch": 1.92, - "grad_norm": 1.37655508518219, - "learning_rate": 0.0004752, - "loss": 2.5192, - "step": 1200 - }, - { - "epoch": 1.92, - "eval_cer": 0.6883644915693387, - "eval_loss": 2.5728604793548584, - "eval_runtime": 158.5376, - "eval_samples_per_second": 19.781, - "eval_steps_per_second": 1.236, - "eval_wer": 0.9177909024971506, + "grad_norm": 0.8357645869255066, + "learning_rate": 0.0003561, + "loss": 1.8516, "step": 1200 }, { "epoch": 1.9216, - "grad_norm": 1.187514066696167, - "learning_rate": 0.00047559999999999996, - "loss": 2.5463, + "grad_norm": 1.744762897491455, + "learning_rate": 0.00035639999999999994, + "loss": 2.0289, "step": 1201 }, { "epoch": 1.9232, - "grad_norm": 1.9947316646575928, - "learning_rate": 0.00047599999999999997, - "loss": 2.4633, + "grad_norm": 2.0870983600616455, + "learning_rate": 0.0003567, + "loss": 2.0565, "step": 1202 }, { "epoch": 1.9247999999999998, - "grad_norm": 3.0653204917907715, - "learning_rate": 0.0004764, - "loss": 2.6337, + "grad_norm": 1.1655802726745605, + "learning_rate": 0.00035699999999999995, + "loss": 2.0301, "step": 1203 }, { "epoch": 1.9264000000000001, - "grad_norm": 4.112390995025635, - "learning_rate": 0.00047679999999999993, - "loss": 2.4977, + "grad_norm": 1.394233226776123, + "learning_rate": 0.0003573, + "loss": 2.0413, "step": 1204 }, { "epoch": 1.928, - "grad_norm": 3.0475566387176514, - "learning_rate": 0.00047719999999999994, - "loss": 2.3692, + "grad_norm": 0.8284745216369629, + "learning_rate": 0.00035759999999999996, + "loss": 2.0316, "step": 1205 }, { "epoch": 1.9296, - "grad_norm": 3.443176507949829, - "learning_rate": 0.0004776, - "loss": 2.5523, + "grad_norm": 1.0152744054794312, + "learning_rate": 0.00035789999999999997, + "loss": 2.0509, "step": 1206 }, { "epoch": 1.9312, - "grad_norm": 2.9816811084747314, - "learning_rate": 0.00047799999999999996, - "loss": 2.5454, + "grad_norm": 1.3899942636489868, + "learning_rate": 0.0003582, + "loss": 1.8778, "step": 1207 }, { "epoch": 1.9327999999999999, - "grad_norm": 2.6588003635406494, - "learning_rate": 0.0004784, - "loss": 2.3668, + "grad_norm": 2.3678336143493652, + "learning_rate": 0.0003585, + "loss": 2.128, "step": 1208 }, { "epoch": 1.9344000000000001, - "grad_norm": 0.9880726933479309, - "learning_rate": 0.0004788, - "loss": 2.5404, + "grad_norm": 2.9605188369750977, + "learning_rate": 0.00035879999999999994, + "loss": 1.902, "step": 1209 }, { "epoch": 1.936, - "grad_norm": 1.2105108499526978, - "learning_rate": 0.00047919999999999994, - "loss": 2.5146, + "grad_norm": 1.4265320301055908, + "learning_rate": 0.0003591, + "loss": 2.0587, "step": 1210 }, { "epoch": 1.9376, - "grad_norm": 0.7539094090461731, - "learning_rate": 0.00047959999999999995, - "loss": 2.5409, + "grad_norm": 2.7985453605651855, + "learning_rate": 0.00035939999999999995, + "loss": 2.1007, "step": 1211 }, { "epoch": 1.9392, - "grad_norm": 1.3209741115570068, - "learning_rate": 0.00047999999999999996, - "loss": 2.2839, + "grad_norm": 1.6142802238464355, + "learning_rate": 0.00035969999999999996, + "loss": 1.7844, "step": 1212 }, { "epoch": 1.9407999999999999, - "grad_norm": 0.5982227325439453, - "learning_rate": 0.0004803999999999999, - "loss": 2.3925, + "grad_norm": 0.7082346081733704, + "learning_rate": 0.00035999999999999997, + "loss": 1.9045, "step": 1213 }, { "epoch": 1.9424000000000001, - "grad_norm": 1.774712085723877, - "learning_rate": 0.0004808, - "loss": 2.0152, + "grad_norm": 0.6736663579940796, + "learning_rate": 0.0003603, + "loss": 1.9604, "step": 1214 }, { "epoch": 1.944, - "grad_norm": 1.231215238571167, - "learning_rate": 0.0004812, - "loss": 2.271, + "grad_norm": 0.8059405088424683, + "learning_rate": 0.00036059999999999993, + "loss": 1.8783, "step": 1215 }, { "epoch": 1.9456, - "grad_norm": 1.5429651737213135, - "learning_rate": 0.00048159999999999994, - "loss": 2.4412, + "grad_norm": 0.795032262802124, + "learning_rate": 0.0003609, + "loss": 1.6925, "step": 1216 }, { "epoch": 1.9472, - "grad_norm": 1.844327449798584, - "learning_rate": 0.00048199999999999995, - "loss": 2.5497, + "grad_norm": 0.7345133423805237, + "learning_rate": 0.00036119999999999994, + "loss": 1.6741, "step": 1217 }, { "epoch": 1.9487999999999999, - "grad_norm": 3.4878883361816406, - "learning_rate": 0.00048239999999999996, - "loss": 2.2781, + "grad_norm": 1.4538378715515137, + "learning_rate": 0.0003615, + "loss": 1.6525, "step": 1218 }, { "epoch": 1.9504000000000001, - "grad_norm": 0.906635046005249, - "learning_rate": 0.0004827999999999999, - "loss": 2.3062, + "grad_norm": 2.7808940410614014, + "learning_rate": 0.00036179999999999996, + "loss": 1.73, "step": 1219 }, { "epoch": 1.952, - "grad_norm": 1.1018857955932617, - "learning_rate": 0.0004832, - "loss": 2.2925, + "grad_norm": 1.7963067293167114, + "learning_rate": 0.00036209999999999997, + "loss": 1.8387, "step": 1220 }, { "epoch": 1.9536, - "grad_norm": 0.7406502366065979, - "learning_rate": 0.0004836, - "loss": 2.0683, + "grad_norm": 1.8159714937210083, + "learning_rate": 0.00036239999999999997, + "loss": 2.1202, "step": 1221 }, { "epoch": 1.9552, - "grad_norm": 1.1070985794067383, - "learning_rate": 0.00048399999999999995, - "loss": 2.2204, + "grad_norm": 1.3005350828170776, + "learning_rate": 0.0003627, + "loss": 2.1657, "step": 1222 }, { "epoch": 1.9567999999999999, - "grad_norm": 1.2866196632385254, - "learning_rate": 0.00048439999999999996, - "loss": 2.5341, + "grad_norm": 0.9724456071853638, + "learning_rate": 0.00036299999999999993, + "loss": 2.1556, "step": 1223 }, { "epoch": 1.9584000000000001, - "grad_norm": 2.646087408065796, - "learning_rate": 0.00048479999999999997, - "loss": 2.4654, + "grad_norm": 0.9316180944442749, + "learning_rate": 0.0003633, + "loss": 1.5908, "step": 1224 }, { "epoch": 1.96, - "grad_norm": 2.4793953895568848, - "learning_rate": 0.0004851999999999999, - "loss": 2.4107, + "grad_norm": 1.7360824346542358, + "learning_rate": 0.00036359999999999995, + "loss": 2.3979, "step": 1225 }, { "epoch": 1.9616, - "grad_norm": 9.734627723693848, - "learning_rate": 0.0004856, - "loss": 2.7599, + "grad_norm": 17.20070457458496, + "learning_rate": 0.0003639, + "loss": 3.8859, "step": 1226 }, { "epoch": 1.9632, - "grad_norm": 14.538433074951172, - "learning_rate": 0.000486, - "loss": 3.1676, + "grad_norm": 7.804291248321533, + "learning_rate": 0.00036419999999999996, + "loss": 2.8236, "step": 1227 }, { "epoch": 1.9647999999999999, - "grad_norm": 1.0963908433914185, - "learning_rate": 0.00048639999999999995, - "loss": 2.0805, + "grad_norm": 4.541631698608398, + "learning_rate": 0.00036449999999999997, + "loss": 2.2704, "step": 1228 }, { "epoch": 1.9664000000000001, - "grad_norm": 2.8844449520111084, - "learning_rate": 0.00048679999999999996, - "loss": 2.1634, + "grad_norm": 5.615588665008545, + "learning_rate": 0.0003648, + "loss": 2.4331, "step": 1229 }, { "epoch": 1.968, - "grad_norm": 1.3918949365615845, - "learning_rate": 0.00048719999999999997, - "loss": 2.1095, + "grad_norm": 2.0212814807891846, + "learning_rate": 0.0003651, + "loss": 2.0981, "step": 1230 }, { "epoch": 1.9696, - "grad_norm": 1.6348989009857178, - "learning_rate": 0.0004875999999999999, - "loss": 1.9739, + "grad_norm": 2.8328936100006104, + "learning_rate": 0.00036539999999999994, + "loss": 1.9869, "step": 1231 }, { "epoch": 1.9712, - "grad_norm": 3.712069034576416, - "learning_rate": 0.000488, - "loss": 1.8339, + "grad_norm": 3.4782793521881104, + "learning_rate": 0.0003657, + "loss": 2.05, "step": 1232 }, { "epoch": 1.9727999999999999, - "grad_norm": 3.0852272510528564, - "learning_rate": 0.0004883999999999999, - "loss": 1.7957, + "grad_norm": 2.7648117542266846, + "learning_rate": 0.00036599999999999995, + "loss": 2.0339, "step": 1233 }, { "epoch": 1.9744000000000002, - "grad_norm": 2.9243290424346924, - "learning_rate": 0.0004888, - "loss": 1.9496, + "grad_norm": 3.1286776065826416, + "learning_rate": 0.0003663, + "loss": 2.0052, "step": 1234 }, { "epoch": 1.976, - "grad_norm": 2.6891889572143555, - "learning_rate": 0.0004892, - "loss": 1.704, + "grad_norm": 1.4266185760498047, + "learning_rate": 0.00036659999999999997, + "loss": 2.1165, "step": 1235 }, { "epoch": 1.9776, - "grad_norm": 1.1795345544815063, - "learning_rate": 0.0004896, - "loss": 2.0643, + "grad_norm": 1.3429757356643677, + "learning_rate": 0.0003669, + "loss": 2.033, "step": 1236 }, { "epoch": 1.9792, - "grad_norm": 2.66219425201416, - "learning_rate": 0.00049, - "loss": 2.0074, + "grad_norm": 0.6780145168304443, + "learning_rate": 0.0003672, + "loss": 2.0324, "step": 1237 }, { "epoch": 1.9808, - "grad_norm": 3.1762375831604004, - "learning_rate": 0.0004904, - "loss": 1.9707, + "grad_norm": 2.1947433948516846, + "learning_rate": 0.0003675, + "loss": 2.0592, "step": 1238 }, { "epoch": 1.9824000000000002, - "grad_norm": 6.450320243835449, - "learning_rate": 0.0004907999999999999, - "loss": 2.1798, + "grad_norm": 4.260788440704346, + "learning_rate": 0.00036779999999999994, + "loss": 2.0549, "step": 1239 }, { "epoch": 1.984, - "grad_norm": 2.975370168685913, - "learning_rate": 0.0004911999999999999, - "loss": 2.0042, + "grad_norm": 5.300332069396973, + "learning_rate": 0.0003681, + "loss": 2.2587, "step": 1240 }, { "epoch": 1.9856, - "grad_norm": 4.335202217102051, - "learning_rate": 0.0004916, - "loss": 2.0929, + "grad_norm": 6.130112171173096, + "learning_rate": 0.00036839999999999996, + "loss": 2.3789, "step": 1241 }, { "epoch": 1.9872, - "grad_norm": 3.7740931510925293, - "learning_rate": 0.0004919999999999999, - "loss": 2.2352, + "grad_norm": 7.301811218261719, + "learning_rate": 0.0003687, + "loss": 2.543, "step": 1242 }, { "epoch": 1.9888, - "grad_norm": 1.6590664386749268, - "learning_rate": 0.0004923999999999999, - "loss": 2.1172, + "grad_norm": 5.791845321655273, + "learning_rate": 0.00036899999999999997, + "loss": 2.2248, "step": 1243 }, { "epoch": 1.9904, - "grad_norm": 1.345360279083252, - "learning_rate": 0.0004927999999999999, - "loss": 1.9999, + "grad_norm": 5.044650077819824, + "learning_rate": 0.0003693, + "loss": 2.3361, "step": 1244 }, { "epoch": 1.992, - "grad_norm": 0.9986281991004944, - "learning_rate": 0.0004932, - "loss": 1.9315, + "grad_norm": 4.885112285614014, + "learning_rate": 0.0003696, + "loss": 2.3957, "step": 1245 }, { "epoch": 1.9936, - "grad_norm": 1.795646071434021, - "learning_rate": 0.0004936, - "loss": 2.275, + "grad_norm": 2.838794469833374, + "learning_rate": 0.0003699, + "loss": 2.2255, "step": 1246 }, { "epoch": 1.9952, - "grad_norm": 1.1249878406524658, - "learning_rate": 0.000494, - "loss": 2.2124, + "grad_norm": 2.508756637573242, + "learning_rate": 0.00037019999999999995, + "loss": 2.1271, "step": 1247 }, { "epoch": 1.9968, - "grad_norm": 0.8439085483551025, - "learning_rate": 0.0004944, - "loss": 2.0854, + "grad_norm": 1.2168052196502686, + "learning_rate": 0.0003705, + "loss": 2.1874, "step": 1248 }, { "epoch": 1.9984, - "grad_norm": 1.0127602815628052, - "learning_rate": 0.0004948, - "loss": 2.3189, + "grad_norm": 2.0647408962249756, + "learning_rate": 0.00037079999999999996, + "loss": 2.3272, "step": 1249 }, { "epoch": 2.0, - "grad_norm": 1.2832350730895996, - "learning_rate": 0.0004952, - "loss": 2.3423, + "grad_norm": 2.734804630279541, + "learning_rate": 0.0003711, + "loss": 2.3636, "step": 1250 }, { "epoch": 2.0016, - "grad_norm": 16.80681800842285, - "learning_rate": 0.0004955999999999999, - "loss": 3.3692, + "grad_norm": 1.4602904319763184, + "learning_rate": 0.0003714, + "loss": 2.7633, "step": 1251 }, { "epoch": 2.0032, - "grad_norm": 11.572160720825195, - "learning_rate": 0.0004959999999999999, - "loss": 2.8517, + "grad_norm": 0.7962429523468018, + "learning_rate": 0.0003717, + "loss": 2.4137, "step": 1252 }, { "epoch": 2.0048, - "grad_norm": 8.817554473876953, - "learning_rate": 0.0004963999999999999, - "loss": 2.5597, + "grad_norm": 2.490060806274414, + "learning_rate": 0.000372, + "loss": 2.5798, "step": 1253 }, { "epoch": 2.0064, - "grad_norm": 5.953640937805176, - "learning_rate": 0.0004967999999999999, - "loss": 2.3292, + "grad_norm": 3.1522207260131836, + "learning_rate": 0.0003723, + "loss": 2.3581, "step": 1254 }, { "epoch": 2.008, - "grad_norm": 6.86954927444458, - "learning_rate": 0.0004971999999999999, - "loss": 2.6954, + "grad_norm": 1.5471434593200684, + "learning_rate": 0.00037259999999999995, + "loss": 2.7235, "step": 1255 }, { "epoch": 2.0096, - "grad_norm": 1.5113718509674072, - "learning_rate": 0.0004976, - "loss": 2.0988, + "grad_norm": 4.790720462799072, + "learning_rate": 0.0003729, + "loss": 2.424, "step": 1256 }, { "epoch": 2.0112, - "grad_norm": 1.745615005493164, - "learning_rate": 0.000498, - "loss": 1.985, + "grad_norm": 3.654318332672119, + "learning_rate": 0.00037319999999999996, + "loss": 2.4581, "step": 1257 }, { "epoch": 2.0128, - "grad_norm": 2.888578414916992, - "learning_rate": 0.0004984, - "loss": 1.9931, + "grad_norm": 4.856836318969727, + "learning_rate": 0.0003735, + "loss": 2.4449, "step": 1258 }, { "epoch": 2.0144, - "grad_norm": 2.6909291744232178, - "learning_rate": 0.0004988, - "loss": 2.3251, + "grad_norm": 3.019021987915039, + "learning_rate": 0.0003738, + "loss": 2.56, "step": 1259 }, { "epoch": 2.016, - "grad_norm": 4.514300346374512, - "learning_rate": 0.0004991999999999999, - "loss": 2.1243, + "grad_norm": 2.4978408813476562, + "learning_rate": 0.0003741, + "loss": 2.2926, "step": 1260 }, { "epoch": 2.0176, - "grad_norm": 2.5229086875915527, - "learning_rate": 0.0004996, - "loss": 2.2339, + "grad_norm": 1.7391501665115356, + "learning_rate": 0.0003744, + "loss": 2.5892, "step": 1261 }, { "epoch": 2.0192, - "grad_norm": 5.69707727432251, - "learning_rate": 0.0005, - "loss": 2.167, + "grad_norm": 1.032046914100647, + "learning_rate": 0.0003747, + "loss": 2.2562, "step": 1262 }, { "epoch": 2.0208, - "grad_norm": 4.998229503631592, - "learning_rate": 0.0005003999999999999, - "loss": 2.3208, + "grad_norm": 1.8993191719055176, + "learning_rate": 0.00037499999999999995, + "loss": 2.2962, "step": 1263 }, { "epoch": 2.0224, - "grad_norm": 5.518265247344971, - "learning_rate": 0.0005007999999999999, - "loss": 2.1296, + "grad_norm": 5.481810569763184, + "learning_rate": 0.00037529999999999996, + "loss": 2.4923, "step": 1264 }, { "epoch": 2.024, - "grad_norm": 5.083476543426514, - "learning_rate": 0.0005011999999999999, - "loss": 2.3207, + "grad_norm": 6.432821273803711, + "learning_rate": 0.00037559999999999997, + "loss": 2.4889, "step": 1265 }, { "epoch": 2.0256, - "grad_norm": 4.121245861053467, - "learning_rate": 0.0005015999999999999, - "loss": 2.0558, + "grad_norm": 7.250077724456787, + "learning_rate": 0.0003758999999999999, + "loss": 2.5924, "step": 1266 }, { "epoch": 2.0272, - "grad_norm": 2.552358865737915, - "learning_rate": 0.000502, - "loss": 2.1283, + "grad_norm": 7.486959457397461, + "learning_rate": 0.0003762, + "loss": 2.5582, "step": 1267 }, { "epoch": 2.0288, - "grad_norm": 1.2170063257217407, - "learning_rate": 0.0005024, - "loss": 2.1844, + "grad_norm": 8.361715316772461, + "learning_rate": 0.00037649999999999994, + "loss": 2.6335, "step": 1268 }, { "epoch": 2.0304, - "grad_norm": 1.5925713777542114, - "learning_rate": 0.0005028, - "loss": 2.1698, + "grad_norm": 6.474916934967041, + "learning_rate": 0.00037679999999999994, + "loss": 2.4855, "step": 1269 }, { "epoch": 2.032, - "grad_norm": 4.9720377922058105, - "learning_rate": 0.0005032, - "loss": 2.2092, + "grad_norm": 7.403282165527344, + "learning_rate": 0.00037709999999999995, + "loss": 2.4993, "step": 1270 }, { "epoch": 2.0336, - "grad_norm": 5.816512107849121, - "learning_rate": 0.0005036, - "loss": 2.5182, + "grad_norm": 5.777534008026123, + "learning_rate": 0.00037739999999999996, + "loss": 2.4794, "step": 1271 }, { "epoch": 2.0352, - "grad_norm": 4.63362455368042, - "learning_rate": 0.0005039999999999999, - "loss": 2.127, + "grad_norm": 4.98473596572876, + "learning_rate": 0.0003776999999999999, + "loss": 2.3016, "step": 1272 }, { "epoch": 2.0368, - "grad_norm": 4.630200386047363, - "learning_rate": 0.0005044, - "loss": 2.296, + "grad_norm": 5.620299339294434, + "learning_rate": 0.00037799999999999997, + "loss": 2.4113, "step": 1273 }, { "epoch": 2.0384, - "grad_norm": 5.835329532623291, - "learning_rate": 0.0005048, - "loss": 2.2678, + "grad_norm": 3.0978798866271973, + "learning_rate": 0.0003782999999999999, + "loss": 2.3554, "step": 1274 }, { "epoch": 2.04, - "grad_norm": 2.8462319374084473, - "learning_rate": 0.0005051999999999999, - "loss": 2.1091, + "grad_norm": 1.7908412218093872, + "learning_rate": 0.0003786, + "loss": 2.2717, "step": 1275 }, { "epoch": 2.0416, - "grad_norm": 2.3021416664123535, - "learning_rate": 0.0005055999999999999, - "loss": 2.212, + "grad_norm": 1.7431128025054932, + "learning_rate": 0.00037889999999999994, + "loss": 2.114, "step": 1276 }, { "epoch": 2.0432, - "grad_norm": 1.7580206394195557, - "learning_rate": 0.0005059999999999999, - "loss": 2.2206, + "grad_norm": 1.9171562194824219, + "learning_rate": 0.00037919999999999995, + "loss": 2.3205, "step": 1277 }, { "epoch": 2.0448, - "grad_norm": 0.9904365539550781, - "learning_rate": 0.0005064, - "loss": 2.2598, + "grad_norm": 1.3162221908569336, + "learning_rate": 0.00037949999999999995, + "loss": 2.1851, "step": 1278 }, { "epoch": 2.0464, - "grad_norm": 1.5216723680496216, - "learning_rate": 0.0005068, - "loss": 2.0426, + "grad_norm": 1.1935993432998657, + "learning_rate": 0.00037979999999999996, + "loss": 2.1238, "step": 1279 }, { "epoch": 2.048, - "grad_norm": 0.9763423204421997, - "learning_rate": 0.0005072, - "loss": 2.2838, + "grad_norm": 1.006394624710083, + "learning_rate": 0.0003800999999999999, + "loss": 2.1991, "step": 1280 }, { "epoch": 2.0496, - "grad_norm": 0.9675254821777344, - "learning_rate": 0.0005076, - "loss": 2.2737, + "grad_norm": 0.6535553336143494, + "learning_rate": 0.0003804, + "loss": 2.2614, "step": 1281 }, { "epoch": 2.0512, - "grad_norm": 0.874781608581543, - "learning_rate": 0.000508, - "loss": 2.0927, + "grad_norm": 2.4608588218688965, + "learning_rate": 0.00038069999999999993, + "loss": 1.9734, "step": 1282 }, { "epoch": 2.0528, - "grad_norm": 1.92698335647583, - "learning_rate": 0.0005084, - "loss": 2.3211, + "grad_norm": 2.3777921199798584, + "learning_rate": 0.000381, + "loss": 1.9419, "step": 1283 }, { "epoch": 2.0544, - "grad_norm": 1.462214708328247, - "learning_rate": 0.0005087999999999999, - "loss": 2.0673, + "grad_norm": 3.6456470489501953, + "learning_rate": 0.00038129999999999994, + "loss": 2.1727, "step": 1284 }, { "epoch": 2.056, - "grad_norm": 1.4558961391448975, - "learning_rate": 0.0005092, - "loss": 2.4106, + "grad_norm": 3.8108816146850586, + "learning_rate": 0.00038159999999999995, + "loss": 2.244, "step": 1285 }, { "epoch": 2.0576, - "grad_norm": 1.2148739099502563, - "learning_rate": 0.0005096, - "loss": 2.3889, + "grad_norm": 3.471174478530884, + "learning_rate": 0.00038189999999999996, + "loss": 2.2731, "step": 1286 }, { "epoch": 2.0592, - "grad_norm": 1.752792239189148, - "learning_rate": 0.0005099999999999999, - "loss": 2.3579, + "grad_norm": 1.8833621740341187, + "learning_rate": 0.00038219999999999997, + "loss": 2.3379, "step": 1287 }, { "epoch": 2.0608, - "grad_norm": 0.8984630107879639, - "learning_rate": 0.0005103999999999999, - "loss": 2.3451, + "grad_norm": 2.832181453704834, + "learning_rate": 0.0003824999999999999, + "loss": 2.064, "step": 1288 }, { "epoch": 2.0624, - "grad_norm": 0.6344359517097473, - "learning_rate": 0.0005108, - "loss": 2.2167, + "grad_norm": 1.3994871377944946, + "learning_rate": 0.0003828, + "loss": 2.0979, "step": 1289 }, { "epoch": 2.064, - "grad_norm": 0.4872526526451111, - "learning_rate": 0.0005112, - "loss": 2.435, + "grad_norm": 0.7377781271934509, + "learning_rate": 0.00038309999999999993, + "loss": 2.127, "step": 1290 }, { "epoch": 2.0656, - "grad_norm": 1.0829209089279175, - "learning_rate": 0.0005116, - "loss": 2.2084, + "grad_norm": 1.0238137245178223, + "learning_rate": 0.0003834, + "loss": 2.0696, "step": 1291 }, { "epoch": 2.0672, - "grad_norm": 1.2053056955337524, - "learning_rate": 0.000512, - "loss": 2.273, + "grad_norm": 1.6712844371795654, + "learning_rate": 0.00038369999999999995, + "loss": 2.2395, "step": 1292 }, { "epoch": 2.0688, - "grad_norm": 2.491065263748169, - "learning_rate": 0.0005124, - "loss": 2.5486, + "grad_norm": 0.8299332857131958, + "learning_rate": 0.00038399999999999996, + "loss": 2.2809, "step": 1293 }, { "epoch": 2.0704, - "grad_norm": 0.7429988384246826, - "learning_rate": 0.0005128, - "loss": 2.4695, + "grad_norm": 1.4740967750549316, + "learning_rate": 0.00038429999999999996, + "loss": 2.2029, "step": 1294 }, { "epoch": 2.072, - "grad_norm": 1.1446125507354736, - "learning_rate": 0.0005131999999999999, - "loss": 2.3198, + "grad_norm": 1.6210805177688599, + "learning_rate": 0.00038459999999999997, + "loss": 2.1513, "step": 1295 }, { "epoch": 2.0736, - "grad_norm": 0.8218686580657959, - "learning_rate": 0.0005135999999999999, - "loss": 2.3743, + "grad_norm": 0.7904883027076721, + "learning_rate": 0.0003848999999999999, + "loss": 2.0126, "step": 1296 }, { "epoch": 2.0752, - "grad_norm": 1.084084153175354, - "learning_rate": 0.0005139999999999999, - "loss": 2.4759, + "grad_norm": 1.0255601406097412, + "learning_rate": 0.0003852, + "loss": 2.0117, "step": 1297 }, { "epoch": 2.0768, - "grad_norm": 1.4052973985671997, - "learning_rate": 0.0005143999999999999, - "loss": 2.3302, + "grad_norm": 2.568159818649292, + "learning_rate": 0.00038549999999999994, + "loss": 1.9919, "step": 1298 }, { "epoch": 2.0784, - "grad_norm": 5.035649299621582, - "learning_rate": 0.0005147999999999999, - "loss": 2.3812, + "grad_norm": 0.7457689046859741, + "learning_rate": 0.0003858, + "loss": 1.8096, "step": 1299 }, { "epoch": 2.08, - "grad_norm": 1.4146735668182373, - "learning_rate": 0.0005152, - "loss": 2.6077, - "step": 1300 - }, - { - "epoch": 2.08, - "eval_cer": 0.4815960457607526, - "eval_loss": 2.4077656269073486, - "eval_runtime": 158.706, - "eval_samples_per_second": 19.76, - "eval_steps_per_second": 1.235, - "eval_wer": 0.8066314371567713, + "grad_norm": 2.0637600421905518, + "learning_rate": 0.00038609999999999995, + "loss": 2.628, "step": 1300 }, { "epoch": 2.0816, - "grad_norm": 15.680081367492676, - "learning_rate": 0.0005156, - "loss": 3.1347, + "grad_norm": 7.795231819152832, + "learning_rate": 0.00038639999999999996, + "loss": 2.7336, "step": 1301 }, { "epoch": 2.0832, - "grad_norm": 7.167609214782715, - "learning_rate": 0.000516, - "loss": 2.594, + "grad_norm": 2.4228038787841797, + "learning_rate": 0.00038669999999999997, + "loss": 2.4832, "step": 1302 }, { "epoch": 2.0848, - "grad_norm": 2.3701157569885254, - "learning_rate": 0.0005164, - "loss": 2.4183, + "grad_norm": 7.511104583740234, + "learning_rate": 0.000387, + "loss": 3.1255, "step": 1303 }, { "epoch": 2.0864, - "grad_norm": 7.440915107727051, - "learning_rate": 0.0005167999999999999, - "loss": 2.8736, + "grad_norm": 4.0060882568359375, + "learning_rate": 0.00038729999999999993, + "loss": 2.4608, "step": 1304 }, { "epoch": 2.088, - "grad_norm": 2.976489543914795, - "learning_rate": 0.0005172, - "loss": 2.5311, + "grad_norm": 6.4439496994018555, + "learning_rate": 0.0003876, + "loss": 2.6808, "step": 1305 }, { "epoch": 2.0896, - "grad_norm": 4.291921615600586, - "learning_rate": 0.0005176, - "loss": 2.2646, + "grad_norm": 7.643751621246338, + "learning_rate": 0.00038789999999999994, + "loss": 2.8287, "step": 1306 }, { "epoch": 2.0912, - "grad_norm": 8.423929214477539, - "learning_rate": 0.0005179999999999999, - "loss": 2.3763, + "grad_norm": 7.657688140869141, + "learning_rate": 0.0003882, + "loss": 2.7325, "step": 1307 }, { "epoch": 2.0928, - "grad_norm": 8.711053848266602, - "learning_rate": 0.0005183999999999999, - "loss": 2.5501, + "grad_norm": 8.446870803833008, + "learning_rate": 0.00038849999999999996, + "loss": 3.022, "step": 1308 }, { "epoch": 2.0944, - "grad_norm": 8.829360008239746, - "learning_rate": 0.0005187999999999999, - "loss": 2.3476, + "grad_norm": 8.167088508605957, + "learning_rate": 0.00038879999999999996, + "loss": 3.0802, "step": 1309 }, { "epoch": 2.096, - "grad_norm": 8.021748542785645, - "learning_rate": 0.0005191999999999999, - "loss": 2.4302, + "grad_norm": 7.480300426483154, + "learning_rate": 0.00038909999999999997, + "loss": 2.8137, "step": 1310 }, { "epoch": 2.0976, - "grad_norm": 3.774869918823242, - "learning_rate": 0.0005195999999999999, - "loss": 2.4614, + "grad_norm": 6.842907905578613, + "learning_rate": 0.0003894, + "loss": 2.7077, "step": 1311 }, { "epoch": 2.0992, - "grad_norm": 6.809609413146973, - "learning_rate": 0.00052, - "loss": 2.2534, + "grad_norm": 5.104220867156982, + "learning_rate": 0.00038969999999999993, + "loss": 2.4495, "step": 1312 }, { "epoch": 2.1008, - "grad_norm": 4.694296836853027, - "learning_rate": 0.0005204, - "loss": 2.1121, + "grad_norm": 3.758356809616089, + "learning_rate": 0.00039, + "loss": 2.239, "step": 1313 }, { "epoch": 2.1024, - "grad_norm": 4.298503398895264, - "learning_rate": 0.0005208, - "loss": 2.1276, + "grad_norm": 2.0488903522491455, + "learning_rate": 0.00039029999999999995, + "loss": 2.2548, "step": 1314 }, { "epoch": 2.104, - "grad_norm": 1.7098180055618286, - "learning_rate": 0.0005212, - "loss": 2.061, + "grad_norm": 1.8532054424285889, + "learning_rate": 0.00039059999999999995, + "loss": 1.9928, "step": 1315 }, { "epoch": 2.1056, - "grad_norm": 1.733177900314331, - "learning_rate": 0.0005215999999999999, - "loss": 2.0207, + "grad_norm": 4.387415885925293, + "learning_rate": 0.00039089999999999996, + "loss": 2.2367, "step": 1316 }, { "epoch": 2.1072, - "grad_norm": 2.170844078063965, - "learning_rate": 0.000522, - "loss": 1.9781, + "grad_norm": 13.355759620666504, + "learning_rate": 0.00039119999999999997, + "loss": 3.1097, "step": 1317 }, { "epoch": 2.1088, - "grad_norm": 4.409342288970947, - "learning_rate": 0.0005224, - "loss": 2.2473, + "grad_norm": 6.663236141204834, + "learning_rate": 0.0003914999999999999, + "loss": 2.3162, "step": 1318 }, { "epoch": 2.1104, - "grad_norm": 3.9420828819274902, - "learning_rate": 0.0005227999999999999, - "loss": 2.2477, + "grad_norm": 8.847925186157227, + "learning_rate": 0.0003918, + "loss": 2.3232, "step": 1319 }, { "epoch": 2.112, - "grad_norm": 4.4460649490356445, - "learning_rate": 0.0005231999999999999, - "loss": 2.0891, + "grad_norm": 8.594993591308594, + "learning_rate": 0.00039209999999999994, + "loss": 2.3261, "step": 1320 }, { "epoch": 2.1136, - "grad_norm": 3.511218547821045, - "learning_rate": 0.0005235999999999999, - "loss": 2.1115, + "grad_norm": 7.956197261810303, + "learning_rate": 0.0003924, + "loss": 2.4478, "step": 1321 }, { "epoch": 2.1152, - "grad_norm": 4.6625142097473145, - "learning_rate": 0.0005239999999999999, - "loss": 2.0631, + "grad_norm": 7.233311176300049, + "learning_rate": 0.00039269999999999995, + "loss": 2.2092, "step": 1322 }, { "epoch": 2.1168, - "grad_norm": 2.5629169940948486, - "learning_rate": 0.0005244, - "loss": 2.1697, + "grad_norm": 4.360085487365723, + "learning_rate": 0.00039299999999999996, + "loss": 2.3606, "step": 1323 }, { "epoch": 2.1184, - "grad_norm": 0.9935573935508728, - "learning_rate": 0.0005248, - "loss": 2.055, + "grad_norm": 4.5778608322143555, + "learning_rate": 0.00039329999999999996, + "loss": 2.2137, "step": 1324 }, { "epoch": 2.12, - "grad_norm": 0.8370351195335388, - "learning_rate": 0.0005252, - "loss": 1.9884, + "grad_norm": 3.4425010681152344, + "learning_rate": 0.00039359999999999997, + "loss": 2.0536, "step": 1325 }, { "epoch": 2.1216, - "grad_norm": 1.188152551651001, - "learning_rate": 0.0005256, - "loss": 2.0258, + "grad_norm": 1.9287109375, + "learning_rate": 0.0003938999999999999, + "loss": 2.0203, "step": 1326 }, { "epoch": 2.1232, - "grad_norm": 1.5342425107955933, - "learning_rate": 0.000526, - "loss": 2.0623, + "grad_norm": 1.093762755393982, + "learning_rate": 0.0003942, + "loss": 2.1598, "step": 1327 }, { "epoch": 2.1248, - "grad_norm": 1.1506816148757935, - "learning_rate": 0.0005263999999999999, - "loss": 2.1881, + "grad_norm": 1.6666079759597778, + "learning_rate": 0.00039449999999999994, + "loss": 2.2012, "step": 1328 }, { "epoch": 2.1264, - "grad_norm": 1.0641168355941772, - "learning_rate": 0.0005267999999999999, - "loss": 2.0839, + "grad_norm": 1.5958610773086548, + "learning_rate": 0.0003948, + "loss": 1.9354, "step": 1329 }, { "epoch": 2.128, - "grad_norm": 0.6758621335029602, - "learning_rate": 0.0005272, - "loss": 2.1336, + "grad_norm": 1.274950385093689, + "learning_rate": 0.00039509999999999995, + "loss": 1.9601, "step": 1330 }, { "epoch": 2.1296, - "grad_norm": 2.4014053344726562, - "learning_rate": 0.0005275999999999999, - "loss": 2.1034, + "grad_norm": 0.9257663488388062, + "learning_rate": 0.00039539999999999996, + "loss": 2.1997, "step": 1331 }, { "epoch": 2.1312, - "grad_norm": 4.051090717315674, - "learning_rate": 0.0005279999999999999, - "loss": 2.1265, + "grad_norm": 0.844883918762207, + "learning_rate": 0.00039569999999999997, + "loss": 1.9768, "step": 1332 }, { "epoch": 2.1328, - "grad_norm": 4.70905876159668, - "learning_rate": 0.0005283999999999999, - "loss": 2.321, + "grad_norm": 3.2150275707244873, + "learning_rate": 0.000396, + "loss": 2.0972, "step": 1333 }, { "epoch": 2.1344, - "grad_norm": 2.109508752822876, - "learning_rate": 0.0005288, - "loss": 2.0784, + "grad_norm": 3.564847469329834, + "learning_rate": 0.00039629999999999993, + "loss": 2.1646, "step": 1334 }, { "epoch": 2.136, - "grad_norm": 3.391434907913208, - "learning_rate": 0.0005292, - "loss": 2.1631, + "grad_norm": 3.8981540203094482, + "learning_rate": 0.0003966, + "loss": 1.9788, "step": 1335 }, { "epoch": 2.1376, - "grad_norm": 2.689666271209717, - "learning_rate": 0.0005296, - "loss": 2.1379, + "grad_norm": 4.616899013519287, + "learning_rate": 0.00039689999999999994, + "loss": 2.2425, "step": 1336 }, { "epoch": 2.1391999999999998, - "grad_norm": 1.4555532932281494, - "learning_rate": 0.00053, - "loss": 2.1257, + "grad_norm": 4.975461483001709, + "learning_rate": 0.0003972, + "loss": 2.1961, "step": 1337 }, { "epoch": 2.1408, - "grad_norm": 1.207741379737854, - "learning_rate": 0.0005304, - "loss": 2.1133, + "grad_norm": 3.547156572341919, + "learning_rate": 0.00039749999999999996, + "loss": 2.1786, "step": 1338 }, { "epoch": 2.1424, - "grad_norm": 1.0733281373977661, - "learning_rate": 0.0005308, - "loss": 2.1144, + "grad_norm": 2.7288384437561035, + "learning_rate": 0.00039779999999999997, + "loss": 2.2044, "step": 1339 }, { "epoch": 2.144, - "grad_norm": 1.5815534591674805, - "learning_rate": 0.0005311999999999999, - "loss": 2.2798, + "grad_norm": 2.037722110748291, + "learning_rate": 0.0003981, + "loss": 1.9798, "step": 1340 }, { "epoch": 2.1456, - "grad_norm": 0.9271948933601379, - "learning_rate": 0.0005315999999999999, - "loss": 2.4806, + "grad_norm": 0.9988428354263306, + "learning_rate": 0.0003984, + "loss": 1.9068, "step": 1341 }, { "epoch": 2.1471999999999998, - "grad_norm": 1.6899449825286865, - "learning_rate": 0.000532, - "loss": 2.6045, + "grad_norm": 0.8600280284881592, + "learning_rate": 0.00039869999999999993, + "loss": 2.2989, "step": 1342 }, { "epoch": 2.1488, - "grad_norm": 3.4768803119659424, - "learning_rate": 0.0005323999999999999, - "loss": 2.4404, + "grad_norm": 0.8826497197151184, + "learning_rate": 0.000399, + "loss": 2.1012, "step": 1343 }, { "epoch": 2.1504, - "grad_norm": 3.074777603149414, - "learning_rate": 0.0005327999999999999, - "loss": 2.2315, + "grad_norm": 3.0037686824798584, + "learning_rate": 0.00039929999999999995, + "loss": 2.1131, "step": 1344 }, { "epoch": 2.152, - "grad_norm": 1.114790439605713, - "learning_rate": 0.0005332, - "loss": 2.2487, + "grad_norm": 1.0067633390426636, + "learning_rate": 0.0003996, + "loss": 2.0187, "step": 1345 }, { "epoch": 2.1536, - "grad_norm": 1.1238393783569336, - "learning_rate": 0.0005336, - "loss": 2.0376, + "grad_norm": 1.6712292432785034, + "learning_rate": 0.00039989999999999996, + "loss": 2.3152, "step": 1346 }, { "epoch": 2.1552, - "grad_norm": 0.6773790121078491, - "learning_rate": 0.000534, - "loss": 2.1915, + "grad_norm": 4.251307964324951, + "learning_rate": 0.00040019999999999997, + "loss": 1.97, "step": 1347 }, { "epoch": 2.1568, - "grad_norm": 0.7934916019439697, - "learning_rate": 0.0005344, - "loss": 2.471, + "grad_norm": 1.497650146484375, + "learning_rate": 0.0004005, + "loss": 2.4916, "step": 1348 }, { "epoch": 2.1584, - "grad_norm": 1.175371527671814, - "learning_rate": 0.0005348, - "loss": 2.4804, + "grad_norm": 1.7032194137573242, + "learning_rate": 0.0004008, + "loss": 2.2926, "step": 1349 }, { "epoch": 2.16, - "grad_norm": NaN, - "learning_rate": 0.0005348, - "loss": 2.1641, + "grad_norm": 1.2858754396438599, + "learning_rate": 0.00040109999999999994, + "loss": 2.5882, "step": 1350 }, { "epoch": 2.1616, - "grad_norm": 21.404155731201172, - "learning_rate": 0.0005352, - "loss": 3.6373, + "grad_norm": 4.7999773025512695, + "learning_rate": 0.0004014, + "loss": 2.2346, "step": 1351 }, { "epoch": 2.1632, - "grad_norm": 3.474867105484009, - "learning_rate": 0.0005356, - "loss": 2.4769, + "grad_norm": 3.09006404876709, + "learning_rate": 0.00040169999999999995, + "loss": 2.2034, "step": 1352 }, { "epoch": 2.1648, - "grad_norm": 10.239166259765625, - "learning_rate": 0.0005359999999999999, - "loss": 2.9065, + "grad_norm": 1.7907410860061646, + "learning_rate": 0.000402, + "loss": 2.2089, "step": 1353 }, { "epoch": 2.1664, - "grad_norm": 1.5855854749679565, - "learning_rate": 0.0005363999999999999, - "loss": 2.3575, + "grad_norm": 3.0609290599823, + "learning_rate": 0.00040229999999999997, + "loss": 2.2091, "step": 1354 }, { "epoch": 2.168, - "grad_norm": 1.557480812072754, - "learning_rate": 0.0005368, - "loss": 2.4068, + "grad_norm": 3.085463285446167, + "learning_rate": 0.0004026, + "loss": 2.7211, "step": 1355 }, { "epoch": 2.1696, - "grad_norm": 3.996270179748535, - "learning_rate": 0.0005371999999999999, - "loss": 2.2568, + "grad_norm": 6.075860023498535, + "learning_rate": 0.0004029, + "loss": 2.3363, "step": 1356 }, { "epoch": 2.1712, - "grad_norm": 2.9636332988739014, - "learning_rate": 0.0005376, - "loss": 2.3238, + "grad_norm": 7.942235469818115, + "learning_rate": 0.0004032, + "loss": 2.4267, "step": 1357 }, { "epoch": 2.1728, - "grad_norm": 5.762381553649902, - "learning_rate": 0.000538, - "loss": 2.4183, + "grad_norm": 9.660760879516602, + "learning_rate": 0.00040349999999999994, + "loss": 2.7312, "step": 1358 }, { "epoch": 2.1744, - "grad_norm": 5.73525333404541, - "learning_rate": 0.0005384, - "loss": 2.32, + "grad_norm": 10.228667259216309, + "learning_rate": 0.0004038, + "loss": 2.676, "step": 1359 }, { "epoch": 2.176, - "grad_norm": 5.973430156707764, - "learning_rate": 0.0005388, - "loss": 2.2027, + "grad_norm": 10.17164134979248, + "learning_rate": 0.00040409999999999996, + "loss": 3.0616, "step": 1360 }, { "epoch": 2.1776, - "grad_norm": 4.767578125, - "learning_rate": 0.0005391999999999999, - "loss": 2.1613, + "grad_norm": 10.906268119812012, + "learning_rate": 0.0004044, + "loss": 3.132, "step": 1361 }, { "epoch": 2.1792, - "grad_norm": 4.585302352905273, - "learning_rate": 0.0005396, - "loss": 2.095, + "grad_norm": 10.258222579956055, + "learning_rate": 0.00040469999999999997, + "loss": 2.9752, "step": 1362 }, { "epoch": 2.1808, - "grad_norm": 2.8744828701019287, - "learning_rate": 0.00054, - "loss": 2.1879, + "grad_norm": 10.155888557434082, + "learning_rate": 0.000405, + "loss": 2.8489, "step": 1363 }, { "epoch": 2.1824, - "grad_norm": 2.7364563941955566, - "learning_rate": 0.0005403999999999999, - "loss": 1.8963, + "grad_norm": 9.648914337158203, + "learning_rate": 0.00040529999999999993, + "loss": 2.7908, "step": 1364 }, { "epoch": 2.184, - "grad_norm": 1.149092435836792, - "learning_rate": 0.0005407999999999999, - "loss": 1.7546, + "grad_norm": 8.77664566040039, + "learning_rate": 0.0004056, + "loss": 2.5796, "step": 1365 }, { "epoch": 2.1856, - "grad_norm": 1.0585405826568604, - "learning_rate": 0.0005411999999999999, - "loss": 1.8459, + "grad_norm": 7.69537878036499, + "learning_rate": 0.00040589999999999995, + "loss": 2.5244, "step": 1366 }, { "epoch": 2.1872, - "grad_norm": 1.0302655696868896, - "learning_rate": 0.0005415999999999999, - "loss": 1.9024, + "grad_norm": 6.346115589141846, + "learning_rate": 0.0004062, + "loss": 2.409, "step": 1367 }, { "epoch": 2.1888, - "grad_norm": 4.1598801612854, - "learning_rate": 0.000542, - "loss": 2.2494, + "grad_norm": 4.901920795440674, + "learning_rate": 0.00040649999999999996, + "loss": 2.3376, "step": 1368 }, { "epoch": 2.1904, - "grad_norm": 3.350557565689087, - "learning_rate": 0.0005424, - "loss": 1.8213, + "grad_norm": 2.19638991355896, + "learning_rate": 0.00040679999999999997, + "loss": 2.1, "step": 1369 }, { "epoch": 2.192, - "grad_norm": 5.341938495635986, - "learning_rate": 0.0005428, - "loss": 2.008, + "grad_norm": 1.7705507278442383, + "learning_rate": 0.0004071, + "loss": 2.0965, "step": 1370 }, { "epoch": 2.1936, - "grad_norm": 3.1162655353546143, - "learning_rate": 0.0005432, - "loss": 2.006, + "grad_norm": 5.908679485321045, + "learning_rate": 0.0004074, + "loss": 2.3494, "step": 1371 }, { "epoch": 2.1952, - "grad_norm": 3.5952706336975098, - "learning_rate": 0.0005436, - "loss": 2.0343, + "grad_norm": 4.028108596801758, + "learning_rate": 0.00040769999999999994, + "loss": 2.3946, "step": 1372 }, { "epoch": 2.1968, - "grad_norm": 1.513656497001648, - "learning_rate": 0.0005439999999999999, - "loss": 2.2091, + "grad_norm": 6.116652488708496, + "learning_rate": 0.000408, + "loss": 2.223, "step": 1373 }, { "epoch": 2.1984, - "grad_norm": 1.1791319847106934, - "learning_rate": 0.0005443999999999999, - "loss": 2.1118, + "grad_norm": 7.863028526306152, + "learning_rate": 0.00040829999999999995, + "loss": 2.6158, "step": 1374 }, { "epoch": 2.2, - "grad_norm": 1.423629879951477, - "learning_rate": 0.0005448, - "loss": 1.9229, + "grad_norm": 7.65760612487793, + "learning_rate": 0.0004086, + "loss": 2.2629, "step": 1375 }, { "epoch": 2.2016, - "grad_norm": 1.9064466953277588, - "learning_rate": 0.0005451999999999999, - "loss": 1.8431, + "grad_norm": 6.600236415863037, + "learning_rate": 0.00040889999999999996, + "loss": 2.1404, "step": 1376 }, { "epoch": 2.2032, - "grad_norm": 1.9968523979187012, - "learning_rate": 0.0005455999999999999, - "loss": 1.8884, + "grad_norm": 7.769944190979004, + "learning_rate": 0.00040919999999999997, + "loss": 2.6362, "step": 1377 }, { "epoch": 2.2048, - "grad_norm": 2.7741806507110596, - "learning_rate": 0.0005459999999999999, - "loss": 2.0129, + "grad_norm": 6.450319766998291, + "learning_rate": 0.0004095, + "loss": 2.3437, "step": 1378 }, { "epoch": 2.2064, - "grad_norm": 2.164896011352539, - "learning_rate": 0.0005463999999999999, - "loss": 1.8725, + "grad_norm": 6.125224590301514, + "learning_rate": 0.0004098, + "loss": 2.2856, "step": 1379 }, { "epoch": 2.208, - "grad_norm": 2.3117516040802, - "learning_rate": 0.0005468, - "loss": 2.1264, + "grad_norm": 3.7598607540130615, + "learning_rate": 0.00041009999999999994, + "loss": 2.4335, "step": 1380 }, { "epoch": 2.2096, - "grad_norm": 1.8635528087615967, - "learning_rate": 0.0005472, - "loss": 1.8341, + "grad_norm": 2.4773128032684326, + "learning_rate": 0.0004104, + "loss": 2.1278, "step": 1381 }, { "epoch": 2.2112, - "grad_norm": 0.991112470626831, - "learning_rate": 0.0005476, - "loss": 2.1591, + "grad_norm": 1.4773629903793335, + "learning_rate": 0.00041069999999999995, + "loss": 2.2617, "step": 1382 }, { "epoch": 2.2128, - "grad_norm": 1.4743876457214355, - "learning_rate": 0.000548, - "loss": 2.1582, + "grad_norm": 1.111156702041626, + "learning_rate": 0.000411, + "loss": 2.2365, "step": 1383 }, { "epoch": 2.2144, - "grad_norm": 0.8621456027030945, - "learning_rate": 0.0005484, - "loss": 1.7388, + "grad_norm": 1.0154348611831665, + "learning_rate": 0.00041129999999999997, + "loss": 2.2948, "step": 1384 }, { "epoch": 2.216, - "grad_norm": 0.8471877574920654, - "learning_rate": 0.0005487999999999999, - "loss": 1.9254, + "grad_norm": 0.8806530833244324, + "learning_rate": 0.0004116, + "loss": 1.9224, "step": 1385 }, { "epoch": 2.2176, - "grad_norm": 1.153436303138733, - "learning_rate": 0.0005491999999999999, - "loss": 1.9162, + "grad_norm": 1.5271722078323364, + "learning_rate": 0.0004119, + "loss": 2.0459, "step": 1386 }, { "epoch": 2.2192, - "grad_norm": 1.1189401149749756, - "learning_rate": 0.0005496, - "loss": 2.1507, + "grad_norm": 1.2786515951156616, + "learning_rate": 0.0004122, + "loss": 1.9195, "step": 1387 }, { "epoch": 2.2208, - "grad_norm": 0.7863136529922485, - "learning_rate": 0.0005499999999999999, - "loss": 1.917, + "grad_norm": 2.0746285915374756, + "learning_rate": 0.00041249999999999994, + "loss": 2.1174, "step": 1388 }, { "epoch": 2.2224, - "grad_norm": 0.6574200391769409, - "learning_rate": 0.0005503999999999999, - "loss": 2.1708, + "grad_norm": 1.9179648160934448, + "learning_rate": 0.00041279999999999995, + "loss": 2.0782, "step": 1389 }, { "epoch": 2.224, - "grad_norm": 1.5995211601257324, - "learning_rate": 0.0005507999999999999, - "loss": 1.8246, + "grad_norm": 3.8180932998657227, + "learning_rate": 0.00041309999999999996, + "loss": 2.2007, "step": 1390 }, { "epoch": 2.2256, - "grad_norm": 1.0908114910125732, - "learning_rate": 0.0005512, - "loss": 2.2587, + "grad_norm": 2.905001163482666, + "learning_rate": 0.0004133999999999999, + "loss": 2.4089, "step": 1391 }, { "epoch": 2.2272, - "grad_norm": 0.8126534223556519, - "learning_rate": 0.0005516, - "loss": 2.077, + "grad_norm": 1.348793864250183, + "learning_rate": 0.00041369999999999997, + "loss": 2.3518, "step": 1392 }, { "epoch": 2.2288, - "grad_norm": 0.8407979607582092, - "learning_rate": 0.000552, - "loss": 2.0135, + "grad_norm": 0.6599010229110718, + "learning_rate": 0.0004139999999999999, + "loss": 2.0014, "step": 1393 }, { "epoch": 2.2304, - "grad_norm": 8.15050983428955, - "learning_rate": 0.0005524, - "loss": 2.8744, + "grad_norm": 0.8205379247665405, + "learning_rate": 0.0004143, + "loss": 2.0897, "step": 1394 }, { "epoch": 2.232, - "grad_norm": 1.2660044431686401, - "learning_rate": 0.0005528, - "loss": 2.0023, + "grad_norm": 1.9124035835266113, + "learning_rate": 0.00041459999999999994, + "loss": 2.2603, "step": 1395 }, { "epoch": 2.2336, - "grad_norm": 0.7754627466201782, - "learning_rate": 0.0005532, - "loss": 2.2246, + "grad_norm": 0.7764268517494202, + "learning_rate": 0.00041489999999999995, + "loss": 2.1528, "step": 1396 }, { "epoch": 2.2352, - "grad_norm": 1.0184060335159302, - "learning_rate": 0.0005535999999999999, - "loss": 2.4245, + "grad_norm": 0.5789710879325867, + "learning_rate": 0.00041519999999999995, + "loss": 2.1246, "step": 1397 }, { "epoch": 2.2368, - "grad_norm": 0.7693429589271545, - "learning_rate": 0.0005539999999999999, - "loss": 2.5069, + "grad_norm": 1.1585594415664673, + "learning_rate": 0.00041549999999999996, + "loss": 2.224, "step": 1398 }, { "epoch": 2.2384, - "grad_norm": 1.50884211063385, - "learning_rate": 0.0005544, - "loss": 2.4099, + "grad_norm": 0.8179572820663452, + "learning_rate": 0.0004157999999999999, + "loss": 2.5131, "step": 1399 }, { "epoch": 2.24, - "grad_norm": 1.125415563583374, - "learning_rate": 0.0005547999999999999, - "loss": 2.6928, - "step": 1400 - }, - { - "epoch": 2.24, - "eval_cer": 0.49043727827161476, - "eval_loss": 2.3595921993255615, - "eval_runtime": 159.1952, - "eval_samples_per_second": 19.699, - "eval_steps_per_second": 1.231, - "eval_wer": 0.7915449176251166, + "grad_norm": 1.2048473358154297, + "learning_rate": 0.0004161, + "loss": 2.7738, "step": 1400 }, { "epoch": 2.2416, - "grad_norm": 9.09476375579834, - "learning_rate": 0.0005551999999999999, - "loss": 2.6805, + "grad_norm": 8.999752044677734, + "learning_rate": 0.00041639999999999993, + "loss": 2.6463, "step": 1401 }, { "epoch": 2.2432, - "grad_norm": 5.154038906097412, - "learning_rate": 0.0005556, - "loss": 2.4796, + "grad_norm": 2.3112637996673584, + "learning_rate": 0.0004167, + "loss": 2.2288, "step": 1402 }, { "epoch": 2.2448, - "grad_norm": 11.566636085510254, - "learning_rate": 0.000556, - "loss": 2.8707, + "grad_norm": 4.989300727844238, + "learning_rate": 0.00041699999999999994, + "loss": 2.4311, "step": 1403 }, { "epoch": 2.2464, - "grad_norm": 6.700075626373291, - "learning_rate": 0.0005564, - "loss": 2.5268, + "grad_norm": 1.8407671451568604, + "learning_rate": 0.00041729999999999995, + "loss": 2.3069, "step": 1404 }, { "epoch": 2.248, - "grad_norm": 3.7919178009033203, - "learning_rate": 0.0005568, - "loss": 2.3305, + "grad_norm": 1.7011024951934814, + "learning_rate": 0.00041759999999999996, + "loss": 2.4687, "step": 1405 }, { "epoch": 2.2496, - "grad_norm": 0.8292596936225891, - "learning_rate": 0.0005571999999999999, - "loss": 2.1945, + "grad_norm": 1.005672574043274, + "learning_rate": 0.00041789999999999997, + "loss": 2.416, "step": 1406 }, { "epoch": 2.2512, - "grad_norm": 0.7672077417373657, - "learning_rate": 0.0005576, - "loss": 2.2364, + "grad_norm": 4.635334014892578, + "learning_rate": 0.0004181999999999999, + "loss": 2.2739, "step": 1407 }, { "epoch": 2.2528, - "grad_norm": 2.8418593406677246, - "learning_rate": 0.000558, - "loss": 2.1488, + "grad_norm": 5.972519397735596, + "learning_rate": 0.0004185, + "loss": 2.2631, "step": 1408 }, { "epoch": 2.2544, - "grad_norm": 1.9393724203109741, - "learning_rate": 0.0005583999999999999, - "loss": 2.3457, + "grad_norm": 6.264317512512207, + "learning_rate": 0.00041879999999999993, + "loss": 2.177, "step": 1409 }, { "epoch": 2.2560000000000002, - "grad_norm": 3.298280715942383, - "learning_rate": 0.0005587999999999999, - "loss": 2.2249, + "grad_norm": 6.628238677978516, + "learning_rate": 0.0004191, + "loss": 2.289, "step": 1410 }, { "epoch": 2.2576, - "grad_norm": 4.188957214355469, - "learning_rate": 0.0005591999999999999, - "loss": 2.1751, + "grad_norm": 5.494356155395508, + "learning_rate": 0.00041939999999999995, + "loss": 2.3197, "step": 1411 }, { "epoch": 2.2592, - "grad_norm": 1.6427080631256104, - "learning_rate": 0.0005595999999999999, - "loss": 2.2901, + "grad_norm": 2.957184314727783, + "learning_rate": 0.00041969999999999996, + "loss": 2.1555, "step": 1412 }, { "epoch": 2.2608, - "grad_norm": 1.5279046297073364, - "learning_rate": 0.00056, - "loss": 2.0703, + "grad_norm": 3.3862264156341553, + "learning_rate": 0.00041999999999999996, + "loss": 2.1691, "step": 1413 }, { "epoch": 2.2624, - "grad_norm": 1.4491573572158813, - "learning_rate": 0.0005604, - "loss": 2.1491, + "grad_norm": 2.009979486465454, + "learning_rate": 0.00042029999999999997, + "loss": 1.9896, "step": 1414 }, { "epoch": 2.2640000000000002, - "grad_norm": 0.9883561730384827, - "learning_rate": 0.0005608, - "loss": 2.1274, + "grad_norm": 0.8138348460197449, + "learning_rate": 0.0004205999999999999, + "loss": 2.0016, "step": 1415 }, { "epoch": 2.2656, - "grad_norm": 5.221831321716309, - "learning_rate": 0.0005612, - "loss": 2.2393, + "grad_norm": 1.9805331230163574, + "learning_rate": 0.0004209, + "loss": 2.0405, "step": 1416 }, { "epoch": 2.2672, - "grad_norm": 3.9041683673858643, - "learning_rate": 0.0005616, - "loss": 2.2982, + "grad_norm": 3.7756824493408203, + "learning_rate": 0.00042119999999999994, + "loss": 2.2341, "step": 1417 }, { "epoch": 2.2688, - "grad_norm": 3.2574350833892822, - "learning_rate": 0.0005619999999999999, - "loss": 2.3462, + "grad_norm": 4.360873222351074, + "learning_rate": 0.00042149999999999995, + "loss": 1.9849, "step": 1418 }, { "epoch": 2.2704, - "grad_norm": 2.5626466274261475, - "learning_rate": 0.0005624, - "loss": 2.3557, + "grad_norm": 4.901808738708496, + "learning_rate": 0.00042179999999999995, + "loss": 2.1964, "step": 1419 }, { "epoch": 2.2720000000000002, - "grad_norm": 6.10444974899292, - "learning_rate": 0.0005627999999999999, - "loss": 2.3624, + "grad_norm": 5.945872783660889, + "learning_rate": 0.00042209999999999996, + "loss": 2.2691, "step": 1420 }, { "epoch": 2.2736, - "grad_norm": 7.629668712615967, - "learning_rate": 0.0005631999999999999, - "loss": 2.5705, + "grad_norm": 6.475261211395264, + "learning_rate": 0.0004223999999999999, + "loss": 2.3456, "step": 1421 }, { "epoch": 2.2752, - "grad_norm": 0.9113142490386963, - "learning_rate": 0.0005635999999999999, - "loss": 2.2048, + "grad_norm": 5.20826530456543, + "learning_rate": 0.0004227, + "loss": 2.3354, "step": 1422 }, { "epoch": 2.2768, - "grad_norm": 2.0822792053222656, - "learning_rate": 0.0005639999999999999, - "loss": 2.2642, + "grad_norm": 5.351243019104004, + "learning_rate": 0.00042299999999999993, + "loss": 2.4577, "step": 1423 }, { "epoch": 2.2784, - "grad_norm": 3.2671005725860596, - "learning_rate": 0.0005644, - "loss": 2.3544, + "grad_norm": 3.6445577144622803, + "learning_rate": 0.0004233, + "loss": 2.2532, "step": 1424 }, { "epoch": 2.2800000000000002, - "grad_norm": 0.9781720042228699, - "learning_rate": 0.0005648, - "loss": 2.5447, + "grad_norm": 2.2167351245880127, + "learning_rate": 0.00042359999999999994, + "loss": 2.2576, "step": 1425 }, { "epoch": 2.2816, - "grad_norm": 3.4515535831451416, - "learning_rate": 0.0005652, - "loss": 2.4399, + "grad_norm": 1.7759617567062378, + "learning_rate": 0.00042389999999999995, + "loss": 2.3795, "step": 1426 }, { "epoch": 2.2832, - "grad_norm": 6.058647155761719, - "learning_rate": 0.0005656, - "loss": 2.3394, + "grad_norm": 1.0135295391082764, + "learning_rate": 0.00042419999999999996, + "loss": 2.3208, "step": 1427 }, { "epoch": 2.2848, - "grad_norm": 5.424431324005127, - "learning_rate": 0.000566, - "loss": 2.3398, + "grad_norm": 2.575662851333618, + "learning_rate": 0.00042449999999999996, + "loss": 2.1992, "step": 1428 }, { "epoch": 2.2864, - "grad_norm": 5.348411560058594, - "learning_rate": 0.0005663999999999999, - "loss": 2.4424, + "grad_norm": 0.6912661194801331, + "learning_rate": 0.0004247999999999999, + "loss": 2.2395, "step": 1429 }, { "epoch": 2.288, - "grad_norm": 2.7252535820007324, - "learning_rate": 0.0005667999999999999, - "loss": 2.1842, + "grad_norm": 0.6977083086967468, + "learning_rate": 0.0004251, + "loss": 2.079, "step": 1430 }, { "epoch": 2.2896, - "grad_norm": 1.38560950756073, - "learning_rate": 0.0005672, - "loss": 2.2131, + "grad_norm": 1.251266598701477, + "learning_rate": 0.00042539999999999993, + "loss": 2.2202, "step": 1431 }, { "epoch": 2.2912, - "grad_norm": 0.8164823055267334, - "learning_rate": 0.0005675999999999999, - "loss": 2.2275, + "grad_norm": 8.140886306762695, + "learning_rate": 0.0004257, + "loss": 2.5216, "step": 1432 }, { "epoch": 2.2928, - "grad_norm": 1.8317232131958008, - "learning_rate": 0.0005679999999999999, - "loss": 2.0737, + "grad_norm": 2.424851894378662, + "learning_rate": 0.00042599999999999995, + "loss": 2.1489, "step": 1433 }, { "epoch": 2.2944, - "grad_norm": 4.063267707824707, - "learning_rate": 0.0005683999999999999, - "loss": 2.4934, + "grad_norm": 1.4449694156646729, + "learning_rate": 0.00042629999999999995, + "loss": 2.1204, "step": 1434 }, { "epoch": 2.296, - "grad_norm": 2.9908406734466553, - "learning_rate": 0.0005688, - "loss": 2.3033, + "grad_norm": 0.626359224319458, + "learning_rate": 0.00042659999999999996, + "loss": 2.1644, "step": 1435 }, { "epoch": 2.2976, - "grad_norm": 4.536923885345459, - "learning_rate": 0.0005692, - "loss": 2.4368, + "grad_norm": 0.8110465407371521, + "learning_rate": 0.00042689999999999997, + "loss": 2.2867, "step": 1436 }, { "epoch": 2.2992, - "grad_norm": 3.1720566749572754, - "learning_rate": 0.0005696, - "loss": 2.246, + "grad_norm": 0.7100591659545898, + "learning_rate": 0.0004271999999999999, + "loss": 2.057, "step": 1437 }, { "epoch": 2.3008, - "grad_norm": 3.758767604827881, - "learning_rate": 0.00057, - "loss": 2.5514, + "grad_norm": 1.4392229318618774, + "learning_rate": 0.0004275, + "loss": 2.1222, "step": 1438 }, { "epoch": 2.3024, - "grad_norm": 0.8983809947967529, - "learning_rate": 0.0005704, - "loss": 2.3606, + "grad_norm": 4.126809120178223, + "learning_rate": 0.00042779999999999994, + "loss": 2.4204, "step": 1439 }, { "epoch": 2.304, - "grad_norm": 1.1368706226348877, - "learning_rate": 0.0005708, - "loss": 2.276, + "grad_norm": 2.7679941654205322, + "learning_rate": 0.0004281, + "loss": 2.413, "step": 1440 }, { "epoch": 2.3056, - "grad_norm": 0.8187576532363892, - "learning_rate": 0.0005711999999999999, - "loss": 2.3264, + "grad_norm": 0.9221087694168091, + "learning_rate": 0.00042839999999999995, + "loss": 2.3055, "step": 1441 }, { "epoch": 2.3072, - "grad_norm": 1.600205421447754, - "learning_rate": 0.0005715999999999999, - "loss": 2.4888, + "grad_norm": 1.424880027770996, + "learning_rate": 0.00042869999999999996, + "loss": 2.2364, "step": 1442 }, { "epoch": 2.3088, - "grad_norm": 1.1413285732269287, - "learning_rate": 0.0005719999999999999, - "loss": 2.2784, + "grad_norm": 1.1211133003234863, + "learning_rate": 0.00042899999999999997, + "loss": 2.0543, "step": 1443 }, { "epoch": 2.3104, - "grad_norm": 0.8480069637298584, - "learning_rate": 0.0005723999999999999, - "loss": 2.4388, + "grad_norm": 1.0176845788955688, + "learning_rate": 0.00042929999999999997, + "loss": 1.9312, "step": 1444 }, { "epoch": 2.312, - "grad_norm": 2.6615993976593018, - "learning_rate": 0.0005727999999999999, - "loss": 2.2917, + "grad_norm": 1.0180336236953735, + "learning_rate": 0.0004295999999999999, + "loss": 2.1439, "step": 1445 }, { "epoch": 2.3136, - "grad_norm": 2.1041688919067383, - "learning_rate": 0.0005732, - "loss": 2.432, + "grad_norm": 2.1806435585021973, + "learning_rate": 0.0004299, + "loss": 2.311, "step": 1446 }, { "epoch": 2.3152, - "grad_norm": 3.780215263366699, - "learning_rate": 0.0005736, - "loss": 2.5987, + "grad_norm": 1.58607816696167, + "learning_rate": 0.00043019999999999994, + "loss": 2.2012, "step": 1447 }, { "epoch": 2.3168, - "grad_norm": 1.8719847202301025, - "learning_rate": 0.000574, - "loss": 2.6563, + "grad_norm": 3.318199634552002, + "learning_rate": 0.0004305, + "loss": 2.4078, "step": 1448 }, { "epoch": 2.3184, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 2.4196, + "grad_norm": 2.1914138793945312, + "learning_rate": 0.00043079999999999995, + "loss": 2.4591, "step": 1449 }, { "epoch": 2.32, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 2.709254264831543, + "learning_rate": 0.00043109999999999996, + "loss": 2.4848, "step": 1450 }, { "epoch": 2.3216, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 4.738913536071777, + "learning_rate": 0.00043139999999999997, + "loss": 2.6373, "step": 1451 }, { "epoch": 2.3232, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 3.7292051315307617, + "learning_rate": 0.0004317, + "loss": 2.5686, "step": 1452 }, { "epoch": 2.3247999999999998, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 1.3130608797073364, + "learning_rate": 0.00043199999999999993, + "loss": 2.4265, "step": 1453 }, { "epoch": 2.3264, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 1.3804, + "grad_norm": 6.308412551879883, + "learning_rate": 0.0004323, + "loss": 2.4777, "step": 1454 }, { "epoch": 2.328, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 5.950598239898682, + "learning_rate": 0.00043259999999999994, + "loss": 2.3441, "step": 1455 }, { "epoch": 2.3296, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 7.419227600097656, + "learning_rate": 0.0004329, + "loss": 2.3924, "step": 1456 }, { "epoch": 2.3312, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 8.05959415435791, + "learning_rate": 0.00043319999999999996, + "loss": 2.4859, "step": 1457 }, { "epoch": 2.3327999999999998, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 7.334360122680664, + "learning_rate": 0.00043349999999999997, + "loss": 2.233, "step": 1458 }, { "epoch": 2.3344, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 6.437933444976807, + "learning_rate": 0.0004338, + "loss": 2.2507, "step": 1459 }, { "epoch": 2.336, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 6.478224754333496, + "learning_rate": 0.0004341, + "loss": 2.2895, "step": 1460 }, { "epoch": 2.3376, - "grad_norm": NaN, - "learning_rate": 0.000574, - "loss": 0.0, + "grad_norm": 5.727662563323975, + "learning_rate": 0.00043439999999999993, + "loss": 2.0447, "step": 1461 }, { "epoch": 2.3392, - "grad_norm": 1.6323294639587402, - "learning_rate": 0.0005744, - "loss": 2.3085, + "grad_norm": 3.6567492485046387, + "learning_rate": 0.0004347, + "loss": 2.0184, "step": 1462 }, { "epoch": 2.3407999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.0759, + "grad_norm": 1.113254427909851, + "learning_rate": 0.00043499999999999995, + "loss": 1.9926, "step": 1463 }, { "epoch": 2.3424, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 0.7796720862388611, + "learning_rate": 0.00043529999999999996, + "loss": 2.1872, "step": 1464 }, { "epoch": 2.344, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.1082, + "grad_norm": 0.9296607375144958, + "learning_rate": 0.00043559999999999996, + "loss": 2.1299, "step": 1465 }, { "epoch": 2.3456, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.0827059745788574, + "learning_rate": 0.00043589999999999997, + "loss": 1.88, "step": 1466 }, { "epoch": 2.3472, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.31209659576416, + "learning_rate": 0.0004361999999999999, + "loss": 1.9073, "step": 1467 }, { "epoch": 2.3487999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 4.975152492523193, + "learning_rate": 0.0004365, + "loss": 2.2116, "step": 1468 }, { "epoch": 2.3504, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 6.011470794677734, + "learning_rate": 0.00043679999999999994, + "loss": 1.8669, "step": 1469 }, { "epoch": 2.352, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.388676404953003, + "learning_rate": 0.0004371, + "loss": 2.0661, "step": 1470 }, { "epoch": 2.3536, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.6890642642974854, + "learning_rate": 0.00043739999999999995, + "loss": 1.9085, "step": 1471 }, { "epoch": 2.3552, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.4474709033966064, + "learning_rate": 0.00043769999999999996, + "loss": 1.9495, "step": 1472 }, { "epoch": 2.3568, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.906247854232788, + "learning_rate": 0.00043799999999999997, + "loss": 2.3595, "step": 1473 }, { "epoch": 2.3584, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.7890042066574097, + "learning_rate": 0.0004383, + "loss": 2.0283, "step": 1474 }, { "epoch": 2.36, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.4251508712768555, + "learning_rate": 0.00043859999999999993, + "loss": 2.3857, "step": 1475 }, { "epoch": 2.3616, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.0929760932922363, + "learning_rate": 0.0004389, + "loss": 1.8558, "step": 1476 }, { "epoch": 2.3632, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.920934796333313, + "learning_rate": 0.00043919999999999994, + "loss": 2.1394, "step": 1477 }, { "epoch": 2.3648, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.799728274345398, + "learning_rate": 0.0004395, + "loss": 2.033, "step": 1478 }, { "epoch": 2.3664, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.0893006324768066, + "learning_rate": 0.00043979999999999996, + "loss": 2.0484, "step": 1479 }, { "epoch": 2.368, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.1866, + "grad_norm": 0.7367232441902161, + "learning_rate": 0.00044009999999999996, + "loss": 2.1303, "step": 1480 }, { "epoch": 2.3696, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.518019437789917, + "learning_rate": 0.00044039999999999997, + "loss": 2.1944, "step": 1481 }, { "epoch": 2.3712, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 4.497959136962891, + "learning_rate": 0.0004407, + "loss": 2.1217, "step": 1482 }, { "epoch": 2.3728, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.2496092319488525, + "learning_rate": 0.00044099999999999993, + "loss": 2.0976, "step": 1483 }, { "epoch": 2.3744, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.0006887912750244, + "learning_rate": 0.0004413, + "loss": 2.062, "step": 1484 }, { "epoch": 2.376, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.2168, + "grad_norm": 2.9096486568450928, + "learning_rate": 0.00044159999999999995, + "loss": 2.2014, "step": 1485 }, { "epoch": 2.3776, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.12, + "grad_norm": 3.6192102432250977, + "learning_rate": 0.0004419, + "loss": 2.0639, "step": 1486 }, { "epoch": 2.3792, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.4094703197479248, + "learning_rate": 0.00044219999999999996, + "loss": 2.1488, "step": 1487 }, { "epoch": 2.3808, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.171118140220642, + "learning_rate": 0.00044249999999999997, + "loss": 2.0222, "step": 1488 }, { "epoch": 2.3824, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.1949, + "grad_norm": 1.4760013818740845, + "learning_rate": 0.0004428, + "loss": 2.0528, "step": 1489 }, { "epoch": 2.384, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.7218807935714722, + "learning_rate": 0.0004431, + "loss": 2.0762, "step": 1490 }, { "epoch": 2.3856, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 2.5412, + "grad_norm": 0.8926534056663513, + "learning_rate": 0.00044339999999999994, + "loss": 2.3282, "step": 1491 }, { "epoch": 2.3872, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.467978596687317, + "learning_rate": 0.0004437, + "loss": 2.1324, "step": 1492 }, { "epoch": 2.3888, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.302968978881836, + "learning_rate": 0.00044399999999999995, + "loss": 2.2981, "step": 1493 }, { "epoch": 2.3904, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.0114901065826416, + "learning_rate": 0.0004443, + "loss": 2.2011, "step": 1494 }, { "epoch": 2.392, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 4.253175258636475, + "learning_rate": 0.00044459999999999996, + "loss": 2.1162, "step": 1495 }, { "epoch": 2.3936, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.1896846294403076, + "learning_rate": 0.00044489999999999997, + "loss": 2.2125, "step": 1496 }, { "epoch": 2.3952, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 2.2957, + "grad_norm": 1.5464539527893066, + "learning_rate": 0.0004452, + "loss": 2.602, "step": 1497 }, { "epoch": 2.3968, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 2.4599, + "grad_norm": 1.3925230503082275, + "learning_rate": 0.0004455, + "loss": 2.6023, "step": 1498 }, { "epoch": 2.3984, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.246, + "grad_norm": 2.2040951251983643, + "learning_rate": 0.00044579999999999994, + "loss": 2.4529, "step": 1499 }, { "epoch": 2.4, "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, - "step": 1500 - }, - { - "epoch": 2.4, - "eval_cer": 0.6018615219037748, - "eval_loss": 2.4470601081848145, - "eval_runtime": 158.9611, - "eval_samples_per_second": 19.728, - "eval_steps_per_second": 1.233, - "eval_wer": 0.8782302352087866, + "learning_rate": 0.00044579999999999994, + "loss": 1.9781, "step": 1500 }, { "epoch": 2.4016, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.5158798694610596, + "learning_rate": 0.0004461, + "loss": 2.7577, "step": 1501 }, { "epoch": 2.4032, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 0.7932011485099792, + "learning_rate": 0.00044639999999999995, + "loss": 2.4814, "step": 1502 }, { "epoch": 2.4048, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.7924387454986572, + "learning_rate": 0.0004467, + "loss": 2.3667, "step": 1503 }, { "epoch": 2.4064, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 4.5760626792907715, + "learning_rate": 0.00044699999999999997, + "loss": 2.3983, "step": 1504 }, { "epoch": 2.408, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.0172371864318848, + "learning_rate": 0.0004473, + "loss": 2.4891, "step": 1505 }, { "epoch": 2.4096, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 5.048511028289795, + "learning_rate": 0.0004476, + "loss": 2.5052, "step": 1506 }, { "epoch": 2.4112, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 4.192890167236328, + "learning_rate": 0.0004479, + "loss": 2.3123, "step": 1507 }, { "epoch": 2.4128, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.8736287355422974, + "learning_rate": 0.00044819999999999994, + "loss": 2.4851, "step": 1508 }, { "epoch": 2.4144, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.5078847408294678, + "learning_rate": 0.0004485, + "loss": 2.3538, "step": 1509 }, { "epoch": 2.416, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 2.4169, + "grad_norm": 2.2128732204437256, + "learning_rate": 0.00044879999999999996, + "loss": 2.4414, "step": 1510 }, { "epoch": 2.4176, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 0.8890330195426941, + "learning_rate": 0.0004491, + "loss": 2.0152, "step": 1511 }, { "epoch": 2.4192, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.7872178554534912, + "learning_rate": 0.0004494, + "loss": 2.2456, "step": 1512 }, { "epoch": 2.4208, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 5.388312816619873, + "learning_rate": 0.0004497, + "loss": 2.2783, "step": 1513 }, { "epoch": 2.4224, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.207627534866333, + "learning_rate": 0.00045, + "loss": 2.2925, "step": 1514 }, { "epoch": 2.424, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.7061299085617065, + "learning_rate": 0.00045029999999999994, + "loss": 2.0897, "step": 1515 }, { "epoch": 2.4256, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.343909978866577, + "learning_rate": 0.00045059999999999995, + "loss": 2.2123, "step": 1516 }, { "epoch": 2.4272, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.4407389163970947, + "learning_rate": 0.0004508999999999999, + "loss": 2.0337, "step": 1517 }, { "epoch": 2.4288, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.6448605060577393, + "learning_rate": 0.00045119999999999996, + "loss": 2.0988, "step": 1518 }, { "epoch": 2.4304, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 2.45, + "grad_norm": 2.379830837249756, + "learning_rate": 0.0004514999999999999, + "loss": 1.9966, "step": 1519 }, { "epoch": 2.432, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 2.3262, + "grad_norm": 0.9263246059417725, + "learning_rate": 0.0004518, + "loss": 2.1314, "step": 1520 }, { "epoch": 2.4336, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.748016595840454, + "learning_rate": 0.00045209999999999993, + "loss": 2.1768, "step": 1521 }, { "epoch": 2.4352, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.4713168144226074, + "learning_rate": 0.00045239999999999994, + "loss": 2.1775, "step": 1522 }, { "epoch": 2.4368, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.4357311725616455, + "learning_rate": 0.00045269999999999994, + "loss": 2.2528, "step": 1523 }, { "epoch": 2.4384, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.8041242361068726, + "learning_rate": 0.00045299999999999995, + "loss": 2.082, "step": 1524 }, { "epoch": 2.44, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.257370948791504, + "learning_rate": 0.0004532999999999999, + "loss": 2.0602, "step": 1525 }, { "epoch": 2.4416, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 0.9275694489479065, + "learning_rate": 0.00045359999999999997, + "loss": 2.0142, "step": 1526 }, { "epoch": 2.4432, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 0.8489426970481873, + "learning_rate": 0.0004538999999999999, + "loss": 2.1113, "step": 1527 }, { "epoch": 2.4448, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.2062907218933105, + "learning_rate": 0.0004542, + "loss": 2.1288, "step": 1528 }, { "epoch": 2.4464, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 2.1949, + "grad_norm": 0.4519451856613159, + "learning_rate": 0.00045449999999999993, + "loss": 2.1187, "step": 1529 }, { "epoch": 2.448, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 0.757920503616333, + "learning_rate": 0.00045479999999999994, + "loss": 2.1579, "step": 1530 }, { "epoch": 2.4496, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.3224443197250366, + "learning_rate": 0.00045509999999999995, + "loss": 1.9504, "step": 1531 }, { "epoch": 2.4512, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.6802444458007812, + "learning_rate": 0.00045539999999999996, + "loss": 2.1485, "step": 1532 }, { "epoch": 2.4528, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.8263375759124756, + "learning_rate": 0.0004556999999999999, + "loss": 2.3139, "step": 1533 }, { "epoch": 2.4544, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.8222333192825317, + "learning_rate": 0.00045599999999999997, + "loss": 2.1328, "step": 1534 }, { "epoch": 2.456, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 1.3476057052612305, + "learning_rate": 0.0004562999999999999, + "loss": 2.0043, "step": 1535 }, { "epoch": 2.4576000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.153061866760254, + "learning_rate": 0.0004566, + "loss": 2.2033, "step": 1536 }, { "epoch": 2.4592, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 1.2771, + "grad_norm": 1.0744353532791138, + "learning_rate": 0.00045689999999999994, + "loss": 2.115, "step": 1537 }, { "epoch": 2.4608, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.4500582218170166, + "learning_rate": 0.00045719999999999995, + "loss": 2.4472, "step": 1538 }, { "epoch": 2.4624, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 2.2745625972747803, + "learning_rate": 0.00045749999999999995, + "loss": 2.2543, "step": 1539 }, { "epoch": 2.464, - "grad_norm": NaN, - "learning_rate": 0.0005744, - "loss": 0.0, + "grad_norm": 3.1885974407196045, + "learning_rate": 0.00045779999999999996, + "loss": 2.269, "step": 1540 }, { "epoch": 2.4656000000000002, - "grad_norm": 0.0, - "learning_rate": 0.0005747999999999999, - "loss": 2.7204, + "grad_norm": 3.2963569164276123, + "learning_rate": 0.0004580999999999999, + "loss": 2.2571, "step": 1541 }, { "epoch": 2.4672, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.700000762939453, + "learning_rate": 0.0004584, + "loss": 2.4741, "step": 1542 }, { "epoch": 2.4688, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.1224586963653564, + "learning_rate": 0.00045869999999999993, + "loss": 2.5694, "step": 1543 }, { "epoch": 2.4704, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 1.2716, + "grad_norm": 1.3228379487991333, + "learning_rate": 0.000459, + "loss": 2.4327, "step": 1544 }, { "epoch": 2.472, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.0606297254562378, + "learning_rate": 0.00045929999999999994, + "loss": 2.5588, "step": 1545 }, { "epoch": 2.4736000000000002, "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "learning_rate": 0.00045929999999999994, + "loss": 2.2503, "step": 1546 }, { "epoch": 2.4752, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.2446088790893555, + "learning_rate": 0.00045959999999999995, + "loss": 2.4258, "step": 1547 }, { "epoch": 2.4768, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.7627031803131104, + "learning_rate": 0.00045989999999999996, + "loss": 3.0056, "step": 1548 }, { "epoch": 2.4784, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 2.4006, + "grad_norm": 1.893491506576538, + "learning_rate": 0.00046019999999999996, + "loss": 2.4505, "step": 1549 }, { "epoch": 2.48, "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "learning_rate": 0.00046019999999999996, + "loss": 2.2567, "step": 1550 }, { "epoch": 2.4816, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 8.74563217163086, + "learning_rate": 0.0004604999999999999, + "loss": 2.9547, "step": 1551 }, { "epoch": 2.4832, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.7902604341506958, + "learning_rate": 0.0004608, + "loss": 2.4997, "step": 1552 }, { "epoch": 2.4848, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.6337521076202393, + "learning_rate": 0.00046109999999999993, + "loss": 2.6721, "step": 1553 }, { "epoch": 2.4864, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 2.6069, + "grad_norm": 2.686281681060791, + "learning_rate": 0.0004614, + "loss": 2.4642, "step": 1554 }, { "epoch": 2.488, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.8505804538726807, + "learning_rate": 0.00046169999999999995, + "loss": 2.6323, "step": 1555 }, { "epoch": 2.4896, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 4.638845443725586, + "learning_rate": 0.00046199999999999995, + "loss": 2.4094, "step": 1556 }, { "epoch": 2.4912, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 5.030619144439697, + "learning_rate": 0.00046229999999999996, + "loss": 2.4617, "step": 1557 }, { "epoch": 2.4928, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.355173349380493, + "learning_rate": 0.00046259999999999997, + "loss": 2.7822, "step": 1558 }, { "epoch": 2.4944, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 3.260460138320923, + "learning_rate": 0.0004628999999999999, + "loss": 2.393, "step": 1559 }, { "epoch": 2.496, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.24070143699646, + "learning_rate": 0.0004632, + "loss": 2.3995, "step": 1560 }, { "epoch": 2.4976, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.825697422027588, + "learning_rate": 0.00046349999999999994, + "loss": 2.1871, "step": 1561 }, { "epoch": 2.4992, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.4646861553192139, + "learning_rate": 0.0004638, + "loss": 2.5, "step": 1562 }, { "epoch": 2.5008, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.9098280668258667, + "learning_rate": 0.00046409999999999995, + "loss": 2.3278, "step": 1563 }, { "epoch": 2.5023999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.2409300804138184, + "learning_rate": 0.00046439999999999996, + "loss": 2.2171, "step": 1564 }, { "epoch": 2.504, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.769909381866455, + "learning_rate": 0.00046469999999999997, + "loss": 2.1868, "step": 1565 }, { "epoch": 2.5056000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.7229076623916626, + "learning_rate": 0.00046499999999999997, + "loss": 2.4986, "step": 1566 }, { "epoch": 2.5072, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 3.2638115882873535, + "learning_rate": 0.0004652999999999999, + "loss": 2.4535, "step": 1567 }, { "epoch": 2.5088, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.0078232288360596, + "learning_rate": 0.0004656, + "loss": 2.3516, "step": 1568 }, { "epoch": 2.5103999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.9068163633346558, + "learning_rate": 0.00046589999999999994, + "loss": 2.2269, "step": 1569 }, { "epoch": 2.512, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.7834655046463013, + "learning_rate": 0.00046619999999999995, + "loss": 2.1135, "step": 1570 }, { "epoch": 2.5136, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.9387535452842712, + "learning_rate": 0.00046649999999999996, + "loss": 2.3821, "step": 1571 }, { "epoch": 2.5152, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.49048078060150146, + "learning_rate": 0.00046679999999999996, + "loss": 2.2074, "step": 1572 }, { "epoch": 2.5168, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.7723761200904846, + "learning_rate": 0.0004670999999999999, + "loss": 2.2854, "step": 1573 }, { "epoch": 2.5183999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.7409223318099976, + "learning_rate": 0.0004674, + "loss": 2.3194, "step": 1574 }, { "epoch": 2.52, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.611025333404541, + "learning_rate": 0.00046769999999999993, + "loss": 2.1594, "step": 1575 }, { "epoch": 2.5216, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.4206929206848145, + "learning_rate": 0.000468, + "loss": 2.2845, "step": 1576 }, { "epoch": 2.5232, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.7586792707443237, + "learning_rate": 0.00046829999999999994, + "loss": 2.241, "step": 1577 }, { "epoch": 2.5248, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.6851300001144409, + "learning_rate": 0.00046859999999999995, + "loss": 2.1471, "step": 1578 }, { "epoch": 2.5263999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.724346935749054, + "learning_rate": 0.00046889999999999996, + "loss": 2.3002, "step": 1579 }, { "epoch": 2.528, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.9342684745788574, + "learning_rate": 0.00046919999999999997, + "loss": 2.3658, "step": 1580 }, { "epoch": 2.5296, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.6981943249702454, + "learning_rate": 0.0004694999999999999, + "loss": 2.197, "step": 1581 }, { "epoch": 2.5312, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 2.4863, + "grad_norm": 1.980246901512146, + "learning_rate": 0.0004698, + "loss": 2.1856, "step": 1582 }, { "epoch": 2.5328, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.1717616319656372, + "learning_rate": 0.00047009999999999993, + "loss": 2.1176, "step": 1583 }, { "epoch": 2.5343999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.5251626968383789, + "learning_rate": 0.0004704, + "loss": 2.2425, "step": 1584 }, { "epoch": 2.536, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 2.7471, + "grad_norm": 0.5263369679450989, + "learning_rate": 0.00047069999999999995, + "loss": 2.4217, "step": 1585 }, { "epoch": 2.5376, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 2.442, + "grad_norm": 1.7123208045959473, + "learning_rate": 0.00047099999999999996, + "loss": 2.3168, "step": 1586 }, { "epoch": 2.5392, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.5260848999023438, + "learning_rate": 0.00047129999999999996, + "loss": 2.3844, "step": 1587 }, { "epoch": 2.5408, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.0098893642425537, + "learning_rate": 0.00047159999999999997, + "loss": 2.2402, "step": 1588 }, { "epoch": 2.5423999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.457606554031372, + "learning_rate": 0.0004718999999999999, + "loss": 2.1904, "step": 1589 }, { "epoch": 2.544, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.0802197456359863, + "learning_rate": 0.0004722, + "loss": 2.2182, "step": 1590 }, { "epoch": 2.5456, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.5586844682693481, + "learning_rate": 0.00047249999999999994, + "loss": 2.2742, "step": 1591 }, { "epoch": 2.5472, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.6091138124465942, + "learning_rate": 0.0004728, + "loss": 2.1756, "step": 1592 }, { "epoch": 2.5488, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.9087393283843994, + "learning_rate": 0.00047309999999999995, + "loss": 2.1531, "step": 1593 }, { "epoch": 2.5504, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 1.294, + "grad_norm": 1.046461582183838, + "learning_rate": 0.00047339999999999996, + "loss": 2.3781, "step": 1594 }, { "epoch": 2.552, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 2.0787, + "grad_norm": 1.6911176443099976, + "learning_rate": 0.00047369999999999997, + "loss": 2.2942, "step": 1595 }, { "epoch": 2.5536, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 1.1777, + "grad_norm": 2.0928030014038086, + "learning_rate": 0.000474, + "loss": 2.6092, "step": 1596 }, { "epoch": 2.5552, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 1.2794, + "grad_norm": 3.662987232208252, + "learning_rate": 0.00047429999999999993, + "loss": 2.9461, "step": 1597 }, { "epoch": 2.5568, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.8414223194122314, + "learning_rate": 0.0004746, + "loss": 2.5515, "step": 1598 }, { "epoch": 2.5584, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 2.5247, + "grad_norm": 2.544562816619873, + "learning_rate": 0.00047489999999999994, + "loss": 2.6383, "step": 1599 }, { "epoch": 2.56, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, - "step": 1600 - }, - { - "epoch": 2.56, - "eval_cer": 0.6112169649619325, - "eval_loss": 2.448988676071167, - "eval_runtime": 158.8119, - "eval_samples_per_second": 19.747, - "eval_steps_per_second": 1.234, - "eval_wer": 0.8888198114185059, + "grad_norm": 1.1964010000228882, + "learning_rate": 0.0004752, + "loss": 2.6473, "step": 1600 }, { "epoch": 2.5616, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.2365444898605347, + "learning_rate": 0.00047549999999999996, + "loss": 2.6351, "step": 1601 }, { "epoch": 2.5632, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.1067428588867188, + "learning_rate": 0.00047579999999999996, + "loss": 2.521, "step": 1602 }, { "epoch": 2.5648, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 4.032146453857422, + "learning_rate": 0.00047609999999999997, + "loss": 2.7045, "step": 1603 }, { "epoch": 2.5664, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.8868406414985657, + "learning_rate": 0.0004764, + "loss": 2.5629, "step": 1604 }, { "epoch": 2.568, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.6565248966217041, + "learning_rate": 0.00047669999999999993, + "loss": 2.4577, "step": 1605 }, { "epoch": 2.5696, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.5226905345916748, + "learning_rate": 0.000477, + "loss": 2.3847, "step": 1606 }, { "epoch": 2.5712, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.1585739850997925, + "learning_rate": 0.00047729999999999995, + "loss": 2.5167, "step": 1607 }, { "epoch": 2.5728, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 6.165229320526123, + "learning_rate": 0.0004776, + "loss": 2.4221, "step": 1608 }, { "epoch": 2.5744, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 5.301511764526367, + "learning_rate": 0.00047789999999999996, + "loss": 2.2772, "step": 1609 }, { "epoch": 2.576, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 5.172726631164551, + "learning_rate": 0.00047819999999999997, + "loss": 2.234, "step": 1610 }, { "epoch": 2.5776, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 5.577144145965576, + "learning_rate": 0.0004785, + "loss": 2.3354, "step": 1611 }, { "epoch": 2.5792, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 3.4328131675720215, + "learning_rate": 0.0004788, + "loss": 2.3838, "step": 1612 }, { "epoch": 2.5808, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 3.080483913421631, + "learning_rate": 0.00047909999999999994, + "loss": 2.3324, "step": 1613 }, { "epoch": 2.5824, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.9550327062606812, + "learning_rate": 0.0004794, + "loss": 2.0317, "step": 1614 }, { "epoch": 2.584, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 1.1777, + "grad_norm": 1.0461270809173584, + "learning_rate": 0.00047969999999999995, + "loss": 2.0947, "step": 1615 }, { "epoch": 2.5856, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.072354793548584, + "learning_rate": 0.00047999999999999996, + "loss": 2.0531, "step": 1616 }, { "epoch": 2.5872, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.404462456703186, + "learning_rate": 0.00048029999999999997, + "loss": 2.0438, "step": 1617 }, { "epoch": 2.5888, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.9748609066009521, + "learning_rate": 0.00048059999999999997, + "loss": 2.2188, "step": 1618 }, { "epoch": 2.5904, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.9678696393966675, + "learning_rate": 0.0004808999999999999, + "loss": 2.0989, "step": 1619 }, { "epoch": 2.592, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.535593032836914, + "learning_rate": 0.0004812, + "loss": 2.2185, "step": 1620 }, { "epoch": 2.5936, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.916507363319397, + "learning_rate": 0.00048149999999999994, + "loss": 2.0964, "step": 1621 }, { "epoch": 2.5952, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.4805487394332886, + "learning_rate": 0.0004818, + "loss": 2.1418, "step": 1622 }, { "epoch": 2.5968, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.451712727546692, + "learning_rate": 0.00048209999999999995, + "loss": 1.9859, "step": 1623 }, { "epoch": 2.5984, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.8536416292190552, + "learning_rate": 0.00048239999999999996, + "loss": 1.9668, "step": 1624 }, { "epoch": 2.6, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.7038165926933289, + "learning_rate": 0.00048269999999999997, + "loss": 2.1383, "step": 1625 }, { "epoch": 2.6016, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 1.0919, + "grad_norm": 2.2116916179656982, + "learning_rate": 0.000483, + "loss": 2.05, "step": 1626 }, { "epoch": 2.6032, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.7246533632278442, + "learning_rate": 0.00048329999999999993, + "loss": 2.0097, "step": 1627 }, { "epoch": 2.6048, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.4604125022888184, + "learning_rate": 0.0004836, + "loss": 2.3906, "step": 1628 }, { "epoch": 2.6064, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.2371268272399902, + "learning_rate": 0.00048389999999999994, + "loss": 2.003, "step": 1629 }, { "epoch": 2.608, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.6980746984481812, + "learning_rate": 0.0004842, + "loss": 1.8967, "step": 1630 }, { "epoch": 2.6096, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.7364569902420044, + "learning_rate": 0.00048449999999999996, + "loss": 1.9932, "step": 1631 }, { "epoch": 2.6112, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 0.5320027470588684, + "learning_rate": 0.00048479999999999997, + "loss": 2.0484, "step": 1632 }, { "epoch": 2.6128, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.290400981903076, + "learning_rate": 0.0004851, + "loss": 2.3368, "step": 1633 }, { "epoch": 2.6144, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.922303318977356, + "learning_rate": 0.0004854, + "loss": 2.059, "step": 1634 }, { "epoch": 2.616, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.3151150941848755, + "learning_rate": 0.00048569999999999993, + "loss": 2.2918, "step": 1635 }, { "epoch": 2.6176, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.5266954898834229, + "learning_rate": 0.000486, + "loss": 2.1886, "step": 1636 }, { "epoch": 2.6192, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.0405694246292114, + "learning_rate": 0.00048629999999999995, + "loss": 2.0374, "step": 1637 }, { "epoch": 2.6208, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 2.313121795654297, + "learning_rate": 0.0004866, + "loss": 2.2178, "step": 1638 }, { "epoch": 2.6224, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.2103170156478882, + "learning_rate": 0.00048689999999999996, + "loss": 2.0749, "step": 1639 }, { "epoch": 2.624, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.970706820487976, + "learning_rate": 0.00048719999999999997, + "loss": 2.0443, "step": 1640 }, { "epoch": 2.6256, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.3431305885314941, + "learning_rate": 0.0004875, + "loss": 1.8063, "step": 1641 }, { "epoch": 2.6272, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.1013216972351074, + "learning_rate": 0.00048779999999999993, + "loss": 2.0287, "step": 1642 }, { "epoch": 2.6288, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.6037465333938599, + "learning_rate": 0.00048809999999999994, + "loss": 1.9267, "step": 1643 }, { "epoch": 2.6304, - "grad_norm": NaN, - "learning_rate": 0.0005747999999999999, - "loss": 0.0, + "grad_norm": 1.6325836181640625, + "learning_rate": 0.0004883999999999999, + "loss": 1.9847, "step": 1644 }, { "epoch": 2.632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 2.4299, + "grad_norm": 0.6441347599029541, + "learning_rate": 0.0004887, + "loss": 1.8627, "step": 1645 }, { "epoch": 2.6336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.052411675453186, + "learning_rate": 0.000489, + "loss": 2.2007, "step": 1646 }, { "epoch": 2.6352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7423901557922363, + "learning_rate": 0.0004892999999999999, + "loss": 1.9574, "step": 1647 }, { "epoch": 2.6368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6799460053443909, + "learning_rate": 0.0004896, + "loss": 2.0761, "step": 1648 }, { "epoch": 2.6384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.829913854598999, + "learning_rate": 0.0004898999999999999, + "loss": 2.0512, "step": 1649 }, { "epoch": 2.64, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.362149715423584, + "learning_rate": 0.0004901999999999999, + "loss": 2.6416, "step": 1650 }, { "epoch": 2.6416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 31.19998550415039, + "learning_rate": 0.0004904999999999999, + "loss": 3.999, "step": 1651 }, { "epoch": 2.6432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.805262565612793, + "learning_rate": 0.0004907999999999999, + "loss": 2.3079, "step": 1652 }, { "epoch": 2.6448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1498773097991943, + "learning_rate": 0.0004911, + "loss": 2.5233, "step": 1653 }, { "epoch": 2.6464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1593884229660034, + "learning_rate": 0.0004913999999999999, + "loss": 2.505, "step": 1654 }, { "epoch": 2.648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.466228723526001, + "learning_rate": 0.0004917, + "loss": 2.3373, "step": 1655 }, { "epoch": 2.6496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7555994987487793, + "learning_rate": 0.0004919999999999999, + "loss": 2.07, "step": 1656 }, { "epoch": 2.6512000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5982134342193604, + "learning_rate": 0.0004923, + "loss": 2.0622, "step": 1657 }, { "epoch": 2.6528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.938797473907471, + "learning_rate": 0.0004925999999999999, + "loss": 2.2787, "step": 1658 }, { "epoch": 2.6544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7524287700653076, + "learning_rate": 0.0004929, + "loss": 2.1268, "step": 1659 }, { "epoch": 2.656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6103334426879883, + "learning_rate": 0.0004932, + "loss": 1.9649, "step": 1660 }, { "epoch": 2.6576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3654429912567139, + "learning_rate": 0.0004935, + "loss": 2.1234, "step": 1661 }, { "epoch": 2.6592000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2061095237731934, + "learning_rate": 0.0004938, + "loss": 2.073, "step": 1662 }, { "epoch": 2.6608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7076228857040405, + "learning_rate": 0.0004940999999999999, + "loss": 1.8841, "step": 1663 }, { "epoch": 2.6624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.660804033279419, + "learning_rate": 0.0004944, + "loss": 1.971, "step": 1664 }, { "epoch": 2.664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5702351331710815, + "learning_rate": 0.0004946999999999999, + "loss": 1.9213, "step": 1665 }, { "epoch": 2.6656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6872142553329468, + "learning_rate": 0.0004949999999999999, + "loss": 1.9347, "step": 1666 }, { "epoch": 2.6672000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4251344203948975, + "learning_rate": 0.0004953, + "loss": 2.3131, "step": 1667 }, { "epoch": 2.6688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.768301784992218, + "learning_rate": 0.0004955999999999999, + "loss": 2.0859, "step": 1668 }, { "epoch": 2.6704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.746718168258667, + "learning_rate": 0.0004959, + "loss": 2.052, "step": 1669 }, { "epoch": 2.672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.39943528175354, + "learning_rate": 0.0004961999999999999, + "loss": 1.8502, "step": 1670 }, { "epoch": 2.6736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1550344228744507, + "learning_rate": 0.0004965, + "loss": 2.1201, "step": 1671 }, { "epoch": 2.6752000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6754303574562073, + "learning_rate": 0.0004967999999999999, + "loss": 2.0508, "step": 1672 }, { "epoch": 2.6768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0880494117736816, + "learning_rate": 0.0004971, + "loss": 1.9002, "step": 1673 }, { "epoch": 2.6784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8594996333122253, + "learning_rate": 0.0004974, + "loss": 2.1085, "step": 1674 }, { "epoch": 2.68, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8045554757118225, + "learning_rate": 0.0004977, + "loss": 2.0949, "step": 1675 }, { "epoch": 2.6816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7858266830444336, + "learning_rate": 0.000498, + "loss": 1.7859, "step": 1676 }, { "epoch": 2.6832000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4103102684020996, + "learning_rate": 0.0004982999999999999, + "loss": 1.8545, "step": 1677 }, { "epoch": 2.6848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5115968585014343, + "learning_rate": 0.0004986, + "loss": 2.1158, "step": 1678 }, { "epoch": 2.6864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2175918817520142, + "learning_rate": 0.0004988999999999999, + "loss": 1.9361, "step": 1679 }, { "epoch": 2.6879999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6044435501098633, + "learning_rate": 0.0004991999999999999, + "loss": 1.8655, "step": 1680 }, { "epoch": 2.6896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8397135138511658, + "learning_rate": 0.0004994999999999999, + "loss": 1.9778, "step": 1681 }, { "epoch": 2.6912000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6573118567466736, + "learning_rate": 0.0004997999999999999, + "loss": 1.9161, "step": 1682 }, { "epoch": 2.6928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.799583911895752, + "learning_rate": 0.0005001, + "loss": 2.1068, "step": 1683 }, { "epoch": 2.6944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9398294687271118, + "learning_rate": 0.0005003999999999999, + "loss": 2.1357, "step": 1684 }, { "epoch": 2.6959999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.780285656452179, + "learning_rate": 0.0005007, + "loss": 2.1124, "step": 1685 }, { "epoch": 2.6976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7242156863212585, + "learning_rate": 0.0005009999999999999, + "loss": 2.1581, "step": 1686 }, { "epoch": 2.6992000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8511390089988708, + "learning_rate": 0.0005013, + "loss": 1.9856, "step": 1687 }, { "epoch": 2.7008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6015489101409912, + "learning_rate": 0.0005015999999999999, + "loss": 2.1612, "step": 1688 }, { "epoch": 2.7024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.49614596366882324, + "learning_rate": 0.0005019, + "loss": 1.9588, "step": 1689 }, { "epoch": 2.7039999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5268350839614868, + "learning_rate": 0.0005022, + "loss": 1.739, "step": 1690 }, { "epoch": 2.7056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3183175325393677, + "learning_rate": 0.0005025, + "loss": 2.3823, "step": 1691 }, { "epoch": 2.7072000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8698546290397644, + "learning_rate": 0.0005028, + "loss": 1.8146, "step": 1692 }, { "epoch": 2.7088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7403873205184937, + "learning_rate": 0.0005030999999999999, + "loss": 2.0503, "step": 1693 }, { "epoch": 2.7104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6730201840400696, + "learning_rate": 0.0005034, + "loss": 2.1787, "step": 1694 }, { "epoch": 2.7119999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9731574654579163, + "learning_rate": 0.0005036999999999999, + "loss": 1.7114, "step": 1695 }, { "epoch": 2.7136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7726271748542786, + "learning_rate": 0.0005039999999999999, + "loss": 1.8783, "step": 1696 }, { "epoch": 2.7152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9202337265014648, + "learning_rate": 0.0005043, + "loss": 2.3164, "step": 1697 }, { "epoch": 2.7168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9275617599487305, + "learning_rate": 0.0005045999999999999, + "loss": 2.359, "step": 1698 }, { "epoch": 2.7184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6160383224487305, + "learning_rate": 0.0005049, + "loss": 2.0426, "step": 1699 }, { "epoch": 2.7199999999999998, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 1700 - }, - { - "epoch": 2.7199999999999998, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 162.8269, - "eval_samples_per_second": 19.26, - "eval_steps_per_second": 1.204, - "eval_wer": 1.0, + "learning_rate": 0.0005049, + "loss": 2.2236, "step": 1700 }, { "epoch": 2.7216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.13432502746582, + "learning_rate": 0.0005051999999999999, + "loss": 3.2363, "step": 1701 }, { "epoch": 2.7232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4809963703155518, + "learning_rate": 0.0005055, + "loss": 2.4094, "step": 1702 }, { "epoch": 2.7248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4243474006652832, + "learning_rate": 0.0005057999999999999, + "loss": 2.2277, "step": 1703 }, { "epoch": 2.7264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8862740993499756, + "learning_rate": 0.0005061, + "loss": 2.4024, "step": 1704 }, { "epoch": 2.7279999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.541082382202148, + "learning_rate": 0.0005064, + "loss": 2.1635, "step": 1705 }, { "epoch": 2.7296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9146323204040527, + "learning_rate": 0.0005067, + "loss": 2.1451, "step": 1706 }, { "epoch": 2.7312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.000507, + "loss": 2.1887, "step": 1707 }, { "epoch": 2.7328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.83717155456543, + "learning_rate": 0.0005073, + "loss": 2.6369, "step": 1708 }, { "epoch": 2.7344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.303669810295105, + "learning_rate": 0.0005076, + "loss": 2.0226, "step": 1709 }, { "epoch": 2.7359999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5467097163200378, + "learning_rate": 0.0005078999999999999, + "loss": 1.9454, "step": 1710 }, { "epoch": 2.7376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.554548740386963, + "learning_rate": 0.0005082, + "loss": 2.0932, "step": 1711 }, { "epoch": 2.7392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5141626596450806, + "learning_rate": 0.0005085, + "loss": 2.1044, "step": 1712 }, { "epoch": 2.7408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.388733148574829, + "learning_rate": 0.0005087999999999999, + "loss": 2.3155, "step": 1713 }, { "epoch": 2.7424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.090745210647583, + "learning_rate": 0.0005091, + "loss": 2.0168, "step": 1714 }, { "epoch": 2.7439999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5126121044158936, + "learning_rate": 0.0005093999999999999, + "loss": 1.988, "step": 1715 }, { "epoch": 2.7456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.547797918319702, + "learning_rate": 0.0005097, + "loss": 1.9305, "step": 1716 }, { "epoch": 2.7472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9627645611763, + "learning_rate": 0.0005099999999999999, + "loss": 1.9474, "step": 1717 }, { "epoch": 2.7488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.361758232116699, + "learning_rate": 0.0005103, + "loss": 2.0709, "step": 1718 }, { "epoch": 2.7504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1550334692001343, + "learning_rate": 0.0005105999999999999, + "loss": 1.8985, "step": 1719 }, { "epoch": 2.752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.120604157447815, + "learning_rate": 0.0005109, + "loss": 1.8786, "step": 1720 }, { "epoch": 2.7536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5634665489196777, + "learning_rate": 0.0005112, + "loss": 1.9776, "step": 1721 }, { "epoch": 2.7552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.949796676635742, + "learning_rate": 0.0005115, + "loss": 1.9238, "step": 1722 }, { "epoch": 2.7568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8489679098129272, + "learning_rate": 0.0005118, + "loss": 1.695, "step": 1723 }, { "epoch": 2.7584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8145275115966797, + "learning_rate": 0.0005120999999999999, + "loss": 2.0701, "step": 1724 }, { "epoch": 2.76, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7843981981277466, + "learning_rate": 0.0005124, + "loss": 1.7572, "step": 1725 }, { "epoch": 2.7616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4644101560115814, + "learning_rate": 0.0005126999999999999, + "loss": 1.7349, "step": 1726 }, { "epoch": 2.7632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9464445114135742, + "learning_rate": 0.0005129999999999999, + "loss": 1.9303, "step": 1727 }, { "epoch": 2.7648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.003891110420227, + "learning_rate": 0.0005133, + "loss": 1.7827, "step": 1728 }, { "epoch": 2.7664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8749507665634155, + "learning_rate": 0.0005135999999999999, + "loss": 1.941, "step": 1729 }, { "epoch": 2.768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8504045009613037, + "learning_rate": 0.0005139, + "loss": 2.2031, "step": 1730 }, { "epoch": 2.7696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7272230982780457, + "learning_rate": 0.0005141999999999999, + "loss": 2.2033, "step": 1731 }, { "epoch": 2.7712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5356547832489014, + "learning_rate": 0.0005145, + "loss": 1.7236, "step": 1732 }, { "epoch": 2.7728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2203516960144043, + "learning_rate": 0.0005147999999999999, + "loss": 2.0358, "step": 1733 }, { "epoch": 2.7744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8269984126091003, + "learning_rate": 0.0005151, + "loss": 2.1528, "step": 1734 }, { "epoch": 2.776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1500989198684692, + "learning_rate": 0.0005154, + "loss": 1.9726, "step": 1735 }, { "epoch": 2.7776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8158721923828125, + "learning_rate": 0.0005157, + "loss": 2.0221, "step": 1736 }, { "epoch": 2.7792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5364477634429932, + "learning_rate": 0.000516, + "loss": 1.9459, "step": 1737 }, { "epoch": 2.7808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5089154839515686, + "learning_rate": 0.0005163, + "loss": 1.842, "step": 1738 }, { "epoch": 2.7824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3278019428253174, + "learning_rate": 0.0005166, + "loss": 2.0213, "step": 1739 }, { "epoch": 2.784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6192806959152222, + "learning_rate": 0.0005168999999999999, + "loss": 1.7591, "step": 1740 }, { "epoch": 2.7856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6816684007644653, + "learning_rate": 0.0005172, + "loss": 1.9409, "step": 1741 }, { "epoch": 2.7872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6508601307868958, + "learning_rate": 0.0005175, + "loss": 1.8534, "step": 1742 }, { "epoch": 2.7888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8660383224487305, + "learning_rate": 0.0005177999999999999, + "loss": 1.9621, "step": 1743 }, { "epoch": 2.7904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5654451251029968, + "learning_rate": 0.0005181, + "loss": 1.8793, "step": 1744 }, { "epoch": 2.792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.010545015335083, + "learning_rate": 0.0005183999999999999, + "loss": 1.8519, "step": 1745 }, { "epoch": 2.7936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6233418583869934, + "learning_rate": 0.0005187, + "loss": 2.305, "step": 1746 }, { "epoch": 2.7952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8341806530952454, + "learning_rate": 0.0005189999999999999, + "loss": 2.2414, "step": 1747 }, { "epoch": 2.7968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7348121404647827, + "learning_rate": 0.0005193, + "loss": 1.8707, "step": 1748 }, { "epoch": 2.7984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8103607892990112, + "learning_rate": 0.0005195999999999999, + "loss": 2.0222, "step": 1749 }, { "epoch": 2.8, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6983300447463989, + "learning_rate": 0.0005199, + "loss": 2.1295, "step": 1750 }, { "epoch": 2.8016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.782421588897705, + "learning_rate": 0.0005202, + "loss": 2.5126, "step": 1751 }, { "epoch": 2.8032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.138143539428711, + "learning_rate": 0.0005205, + "loss": 2.1409, "step": 1752 }, { "epoch": 2.8048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9656838178634644, + "learning_rate": 0.0005208, + "loss": 2.3417, "step": 1753 }, { "epoch": 2.8064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.654718816280365, + "learning_rate": 0.0005211, + "loss": 2.114, "step": 1754 }, { "epoch": 2.808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.243199348449707, + "learning_rate": 0.0005214, + "loss": 2.5875, "step": 1755 }, { "epoch": 2.8096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9750616550445557, + "learning_rate": 0.0005216999999999999, + "loss": 2.0969, "step": 1756 }, { "epoch": 2.8112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2397475242614746, + "learning_rate": 0.000522, + "loss": 2.0965, "step": 1757 }, { "epoch": 2.8128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9542648792266846, + "learning_rate": 0.0005223, + "loss": 2.0542, "step": 1758 }, { "epoch": 2.8144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8388017416000366, + "learning_rate": 0.0005225999999999999, + "loss": 2.2752, "step": 1759 }, { "epoch": 2.816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9828288555145264, + "learning_rate": 0.0005229, + "loss": 1.901, "step": 1760 }, { "epoch": 2.8176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.787229061126709, + "learning_rate": 0.0005231999999999999, + "loss": 1.7862, "step": 1761 }, { "epoch": 2.8192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.40390992164611816, + "learning_rate": 0.0005235, + "loss": 1.9124, "step": 1762 }, { "epoch": 2.8208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3556129932403564, + "learning_rate": 0.0005237999999999999, + "loss": 2.0211, "step": 1763 }, { "epoch": 2.8224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4864728450775146, + "learning_rate": 0.0005241, + "loss": 1.775, "step": 1764 }, { "epoch": 2.824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8218860626220703, + "learning_rate": 0.0005244, + "loss": 1.8258, "step": 1765 }, { "epoch": 2.8256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.097517967224121, + "learning_rate": 0.0005247, + "loss": 1.8297, "step": 1766 }, { "epoch": 2.8272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.49936580657959, + "learning_rate": 0.000525, + "loss": 2.0836, "step": 1767 }, { "epoch": 2.8288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7731903791427612, + "learning_rate": 0.0005252999999999999, + "loss": 1.7411, "step": 1768 }, { "epoch": 2.8304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8479838967323303, + "learning_rate": 0.0005256, + "loss": 1.6936, "step": 1769 }, { "epoch": 2.832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.757756233215332, + "learning_rate": 0.0005258999999999999, + "loss": 1.9003, "step": 1770 }, { "epoch": 2.8336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8400256633758545, + "learning_rate": 0.0005262, + "loss": 1.7133, "step": 1771 }, { "epoch": 2.8352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8254936933517456, + "learning_rate": 0.0005265, + "loss": 1.8496, "step": 1772 }, { "epoch": 2.8368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5556833148002625, + "learning_rate": 0.0005267999999999999, + "loss": 1.8549, "step": 1773 }, { "epoch": 2.8384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6342326402664185, + "learning_rate": 0.0005271, + "loss": 2.008, "step": 1774 }, { "epoch": 2.84, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6860460638999939, + "learning_rate": 0.0005273999999999999, + "loss": 1.7957, "step": 1775 }, { "epoch": 2.8416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0764575004577637, + "learning_rate": 0.0005276999999999999, + "loss": 2.0447, "step": 1776 }, { "epoch": 2.8432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4549970030784607, + "learning_rate": 0.0005279999999999999, + "loss": 2.0276, "step": 1777 }, { "epoch": 2.8448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5434354543685913, + "learning_rate": 0.0005282999999999999, + "loss": 1.9484, "step": 1778 }, { "epoch": 2.8464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8522821664810181, + "learning_rate": 0.0005286, + "loss": 1.7575, "step": 1779 }, { "epoch": 2.848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.750393271446228, + "learning_rate": 0.0005288999999999999, + "loss": 1.9477, "step": 1780 }, { "epoch": 2.8496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7382807731628418, + "learning_rate": 0.0005292, + "loss": 1.8868, "step": 1781 }, { "epoch": 2.8512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1927576065063477, + "learning_rate": 0.0005294999999999999, + "loss": 2.0142, "step": 1782 }, { "epoch": 2.8528000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0110162496566772, + "learning_rate": 0.0005298, + "loss": 2.0582, "step": 1783 }, { "epoch": 2.8544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6020126342773438, + "learning_rate": 0.0005300999999999999, + "loss": 1.7602, "step": 1784 }, { "epoch": 2.856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7490088939666748, + "learning_rate": 0.0005304, + "loss": 1.8641, "step": 1785 }, { "epoch": 2.8576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5195388793945312, + "learning_rate": 0.0005306999999999999, + "loss": 1.7468, "step": 1786 }, { "epoch": 2.8592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7462009191513062, + "learning_rate": 0.000531, + "loss": 1.9464, "step": 1787 }, { "epoch": 2.8608000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5070817470550537, + "learning_rate": 0.0005313, + "loss": 1.949, "step": 1788 }, { "epoch": 2.8624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6855885982513428, + "learning_rate": 0.0005315999999999999, + "loss": 2.1353, "step": 1789 }, { "epoch": 2.864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7518872618675232, + "learning_rate": 0.0005319, + "loss": 2.0692, "step": 1790 }, { "epoch": 2.8656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1828621625900269, + "learning_rate": 0.0005321999999999999, + "loss": 1.9593, "step": 1791 }, { "epoch": 2.8672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2454311847686768, + "learning_rate": 0.0005324999999999999, + "loss": 1.8795, "step": 1792 }, { "epoch": 2.8688000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3150829076766968, + "learning_rate": 0.0005327999999999999, + "loss": 2.0185, "step": 1793 }, { "epoch": 2.8704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9219515323638916, + "learning_rate": 0.0005330999999999999, + "loss": 2.183, "step": 1794 }, { "epoch": 2.872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1829967498779297, + "learning_rate": 0.0005334, + "loss": 2.3794, "step": 1795 }, { "epoch": 2.8736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.068989872932434, + "learning_rate": 0.0005336999999999999, + "loss": 1.9122, "step": 1796 }, { "epoch": 2.8752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4533793926239014, + "learning_rate": 0.000534, + "loss": 1.7805, "step": 1797 }, { "epoch": 2.8768000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4632128477096558, + "learning_rate": 0.0005342999999999999, + "loss": 2.3252, "step": 1798 }, { "epoch": 2.8784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.011090636253357, + "learning_rate": 0.0005346, + "loss": 2.1687, "step": 1799 }, { "epoch": 2.88, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 1800 - }, - { - "epoch": 2.88, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 158.7253, - "eval_samples_per_second": 19.757, - "eval_steps_per_second": 1.235, - "eval_wer": 1.0, + "grad_norm": 1.1097999811172485, + "learning_rate": 0.0005348999999999999, + "loss": 2.2309, "step": 1800 }, { "epoch": 2.8816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.508502006530762, + "learning_rate": 0.0005352, + "loss": 2.6601, "step": 1801 }, { "epoch": 2.8832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.70348834991455, + "learning_rate": 0.0005355, + "loss": 2.6031, "step": 1802 }, { "epoch": 2.8848000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.40385365486145, + "learning_rate": 0.0005358, + "loss": 2.0591, "step": 1803 }, { "epoch": 2.8864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.063434600830078, + "learning_rate": 0.0005361, + "loss": 2.304, "step": 1804 }, { "epoch": 2.888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9008119106292725, + "learning_rate": 0.0005363999999999999, + "loss": 2.3875, "step": 1805 }, { "epoch": 2.8895999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3916290998458862, + "learning_rate": 0.0005367, + "loss": 1.9926, "step": 1806 }, { "epoch": 2.8912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7277209758758545, + "learning_rate": 0.0005369999999999999, + "loss": 1.9297, "step": 1807 }, { "epoch": 2.8928000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7167696952819824, + "learning_rate": 0.0005372999999999999, + "loss": 1.8832, "step": 1808 }, { "epoch": 2.8944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2191977500915527, + "learning_rate": 0.0005376, + "loss": 2.1223, "step": 1809 }, { "epoch": 2.896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.684467315673828, + "learning_rate": 0.0005378999999999999, + "loss": 2.4019, "step": 1810 }, { "epoch": 2.8975999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.767059326171875, + "learning_rate": 0.0005382, + "loss": 1.9166, "step": 1811 }, { "epoch": 2.8992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2353407144546509, + "learning_rate": 0.0005384999999999999, + "loss": 2.0115, "step": 1812 }, { "epoch": 2.9008000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.391242742538452, + "learning_rate": 0.0005388, + "loss": 1.7517, "step": 1813 }, { "epoch": 2.9024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6257333755493164, + "learning_rate": 0.0005390999999999999, + "loss": 1.7542, "step": 1814 }, { "epoch": 2.904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6803297996520996, + "learning_rate": 0.0005394, + "loss": 1.7067, "step": 1815 }, { "epoch": 2.9055999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7414671182632446, + "learning_rate": 0.0005396999999999999, + "loss": 1.7507, "step": 1816 }, { "epoch": 2.9072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1055442094802856, + "learning_rate": 0.00054, + "loss": 1.9703, "step": 1817 }, { "epoch": 2.9088000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4091672897338867, + "learning_rate": 0.0005403, + "loss": 1.9673, "step": 1818 }, { "epoch": 2.9104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8116196393966675, + "learning_rate": 0.0005405999999999999, + "loss": 1.7938, "step": 1819 }, { "epoch": 2.912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4093546867370605, + "learning_rate": 0.0005409, + "loss": 1.9355, "step": 1820 }, { "epoch": 2.9135999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8408976197242737, + "learning_rate": 0.0005411999999999999, + "loss": 2.0643, "step": 1821 }, { "epoch": 2.9152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.2744598388671875, + "learning_rate": 0.0005414999999999999, + "loss": 2.2347, "step": 1822 }, { "epoch": 2.9168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5432764291763306, + "learning_rate": 0.0005417999999999999, + "loss": 1.9725, "step": 1823 }, { "epoch": 2.9184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5486776232719421, + "learning_rate": 0.0005420999999999999, + "loss": 1.5897, "step": 1824 }, { "epoch": 2.92, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.742293119430542, + "learning_rate": 0.0005424, + "loss": 2.0866, "step": 1825 }, { "epoch": 2.9215999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.380513310432434, + "learning_rate": 0.0005426999999999999, + "loss": 1.7298, "step": 1826 }, { "epoch": 2.9232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8607488870620728, + "learning_rate": 0.000543, + "loss": 1.8805, "step": 1827 }, { "epoch": 2.9248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.006989598274231, + "learning_rate": 0.0005432999999999999, + "loss": 2.0631, "step": 1828 }, { "epoch": 2.9264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6280760765075684, + "learning_rate": 0.0005436, + "loss": 1.8232, "step": 1829 }, { "epoch": 2.928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5118163228034973, + "learning_rate": 0.0005438999999999999, + "loss": 2.0281, "step": 1830 }, { "epoch": 2.9295999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9178122282028198, + "learning_rate": 0.0005442, + "loss": 1.7694, "step": 1831 }, { "epoch": 2.9312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.245267629623413, + "learning_rate": 0.0005445, + "loss": 2.2356, "step": 1832 }, { "epoch": 2.9328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5646647214889526, + "learning_rate": 0.0005448, + "loss": 2.111, "step": 1833 }, { "epoch": 2.9344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.696097493171692, + "learning_rate": 0.0005451, + "loss": 2.0895, "step": 1834 }, { "epoch": 2.936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9454824328422546, + "learning_rate": 0.0005453999999999999, + "loss": 1.9672, "step": 1835 }, { "epoch": 2.9375999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8050206899642944, + "learning_rate": 0.0005457, + "loss": 1.8222, "step": 1836 }, { "epoch": 2.9392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.688675880432129, + "learning_rate": 0.0005459999999999999, + "loss": 1.9278, "step": 1837 }, { "epoch": 2.9408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3390727043151855, + "learning_rate": 0.0005462999999999999, + "loss": 2.0807, "step": 1838 }, { "epoch": 2.9424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0786921977996826, + "learning_rate": 0.0005466, + "loss": 2.1307, "step": 1839 }, { "epoch": 2.944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0735629796981812, + "learning_rate": 0.0005468999999999999, + "loss": 1.7635, "step": 1840 }, { "epoch": 2.9455999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2507575750350952, + "learning_rate": 0.0005472, + "loss": 1.812, "step": 1841 }, { "epoch": 2.9472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5095469355583191, + "learning_rate": 0.0005474999999999999, + "loss": 1.9677, "step": 1842 }, { "epoch": 2.9488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0531517267227173, + "learning_rate": 0.0005478, + "loss": 1.8112, "step": 1843 }, { "epoch": 2.9504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.995157241821289, + "learning_rate": 0.0005480999999999999, + "loss": 2.3632, "step": 1844 }, { "epoch": 2.952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5749807953834534, + "learning_rate": 0.0005484, + "loss": 2.0844, "step": 1845 }, { "epoch": 2.9536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7241598963737488, + "learning_rate": 0.0005487, + "loss": 2.0575, "step": 1846 }, { "epoch": 2.9552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.396909475326538, + "learning_rate": 0.000549, + "loss": 2.2126, "step": 1847 }, { "epoch": 2.9568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5103518962860107, + "learning_rate": 0.0005493, + "loss": 2.276, "step": 1848 }, { "epoch": 2.9584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.392578601837158, + "learning_rate": 0.0005496, + "loss": 2.3884, "step": 1849 }, { "epoch": 2.96, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7753760814666748, + "learning_rate": 0.0005499, + "loss": 2.3444, "step": 1850 }, { "epoch": 2.9616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.469567060470581, + "learning_rate": 0.0005501999999999999, + "loss": 2.3639, "step": 1851 }, { "epoch": 2.9632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1051902770996094, + "learning_rate": 0.0005505, + "loss": 2.4397, "step": 1852 }, { "epoch": 2.9648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8053222894668579, + "learning_rate": 0.0005507999999999999, + "loss": 2.4935, "step": 1853 }, { "epoch": 2.9664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.687619924545288, + "learning_rate": 0.0005510999999999999, + "loss": 1.8974, "step": 1854 }, { "epoch": 2.968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.211838722229004, + "learning_rate": 0.0005514, + "loss": 1.8626, "step": 1855 }, { "epoch": 2.9696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.72635817527771, + "learning_rate": 0.0005516999999999999, + "loss": 1.9059, "step": 1856 }, { "epoch": 2.9712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.077843427658081, + "learning_rate": 0.000552, + "loss": 1.9353, "step": 1857 }, { "epoch": 2.9728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.494789123535156, + "learning_rate": 0.0005522999999999999, + "loss": 2.1245, "step": 1858 }, { "epoch": 2.9744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.749758243560791, + "learning_rate": 0.0005526, + "loss": 1.8147, "step": 1859 }, { "epoch": 2.976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0033164024353027, + "learning_rate": 0.0005528999999999999, + "loss": 1.8362, "step": 1860 }, { "epoch": 2.9776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6518697738647461, + "learning_rate": 0.0005532, + "loss": 2.2349, "step": 1861 }, { "epoch": 2.9792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.695722222328186, + "learning_rate": 0.0005535, + "loss": 1.7574, "step": 1862 }, { "epoch": 2.9808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5895578861236572, + "learning_rate": 0.0005538, + "loss": 2.0763, "step": 1863 }, { "epoch": 2.9824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0296926498413086, + "learning_rate": 0.0005541, + "loss": 1.9809, "step": 1864 }, { "epoch": 2.984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6186490654945374, + "learning_rate": 0.0005544, + "loss": 1.8883, "step": 1865 }, { "epoch": 2.9856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3189398050308228, + "learning_rate": 0.0005547, + "loss": 1.8607, "step": 1866 }, { "epoch": 2.9872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6981890201568604, + "learning_rate": 0.0005549999999999999, + "loss": 1.9978, "step": 1867 }, { "epoch": 2.9888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.823566734790802, + "learning_rate": 0.0005552999999999999, + "loss": 1.744, "step": 1868 }, { "epoch": 2.9904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6368497610092163, + "learning_rate": 0.0005556, + "loss": 2.0121, "step": 1869 }, { "epoch": 2.992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7579899430274963, + "learning_rate": 0.0005558999999999999, + "loss": 1.778, "step": 1870 }, { "epoch": 2.9936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4312794208526611, + "learning_rate": 0.0005562, + "loss": 2.0611, "step": 1871 }, { "epoch": 2.9952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6638422608375549, + "learning_rate": 0.0005564999999999999, + "loss": 1.9862, "step": 1872 }, { "epoch": 2.9968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5944936275482178, + "learning_rate": 0.0005568, + "loss": 2.1282, "step": 1873 }, { "epoch": 2.9984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.530720829963684, + "learning_rate": 0.0005570999999999999, + "loss": 1.972, "step": 1874 }, { "epoch": 3.0, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1473948955535889, + "learning_rate": 0.0005574, + "loss": 2.3039, "step": 1875 }, { "epoch": 3.0016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 15.021516799926758, + "learning_rate": 0.0005577, + "loss": 3.0943, "step": 1876 }, { "epoch": 3.0032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.774639129638672, + "learning_rate": 0.000558, + "loss": 2.3365, "step": 1877 }, { "epoch": 3.0048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.604449272155762, + "learning_rate": 0.0005583, + "loss": 2.5506, "step": 1878 }, { "epoch": 3.0064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6512893438339233, + "learning_rate": 0.0005586, + "loss": 2.2356, "step": 1879 }, { "epoch": 3.008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2958693504333496, + "learning_rate": 0.0005589, + "loss": 2.1657, "step": 1880 }, { "epoch": 3.0096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4474709033966064, + "learning_rate": 0.0005591999999999999, + "loss": 2.0949, "step": 1881 }, { "epoch": 3.0112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.083594560623169, + "learning_rate": 0.0005595, + "loss": 2.3073, "step": 1882 }, { "epoch": 3.0128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5751953125, + "learning_rate": 0.0005598, + "loss": 2.0699, "step": 1883 }, { "epoch": 3.0144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7937655448913574, + "learning_rate": 0.0005600999999999999, + "loss": 2.0185, "step": 1884 }, { "epoch": 3.016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5389955043792725, + "learning_rate": 0.0005604, + "loss": 1.7487, "step": 1885 }, { "epoch": 3.0176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.738939642906189, + "learning_rate": 0.0005606999999999999, + "loss": 1.9005, "step": 1886 }, { "epoch": 3.0192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3595213890075684, + "learning_rate": 0.000561, + "loss": 2.011, "step": 1887 }, { "epoch": 3.0208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.682566225528717, + "learning_rate": 0.0005612999999999999, + "loss": 1.8104, "step": 1888 }, { "epoch": 3.0224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8090096712112427, + "learning_rate": 0.0005616, + "loss": 1.7363, "step": 1889 }, { "epoch": 3.024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5234781503677368, + "learning_rate": 0.0005618999999999999, + "loss": 1.8555, "step": 1890 }, { "epoch": 3.0256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7015899419784546, + "learning_rate": 0.0005622, + "loss": 1.7636, "step": 1891 }, { "epoch": 3.0272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.202921390533447, + "learning_rate": 0.0005625, + "loss": 2.6288, "step": 1892 }, { "epoch": 3.0288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.232006549835205, + "learning_rate": 0.0005627999999999999, + "loss": 1.7678, "step": 1893 }, { "epoch": 3.0304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.543165683746338, + "learning_rate": 0.0005631, + "loss": 2.1546, "step": 1894 }, { "epoch": 3.032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.719556212425232, + "learning_rate": 0.0005633999999999999, + "loss": 1.7731, "step": 1895 }, { "epoch": 3.0336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8694783449172974, + "learning_rate": 0.0005637, + "loss": 1.7355, "step": 1896 }, { "epoch": 3.0352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0235390663146973, + "learning_rate": 0.0005639999999999999, + "loss": 1.9301, "step": 1897 }, { "epoch": 3.0368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5643758177757263, + "learning_rate": 0.0005643, + "loss": 2.0847, "step": 1898 }, { "epoch": 3.0384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.277967929840088, + "learning_rate": 0.0005646, + "loss": 2.1718, "step": 1899 }, { "epoch": 3.04, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 1900 - }, - { - "epoch": 3.04, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 160.4562, - "eval_samples_per_second": 19.544, - "eval_steps_per_second": 1.222, - "eval_wer": 1.0, + "grad_norm": 1.1477729082107544, + "learning_rate": 0.0005648999999999999, + "loss": 2.0403, "step": 1900 }, { "epoch": 3.0416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6605058908462524, + "learning_rate": 0.0005652, + "loss": 2.0047, "step": 1901 }, { "epoch": 3.0432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6357007622718811, + "learning_rate": 0.0005654999999999999, + "loss": 2.0975, "step": 1902 }, { "epoch": 3.0448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7702534198760986, + "learning_rate": 0.0005657999999999999, + "loss": 1.9427, "step": 1903 }, { "epoch": 3.0464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5849918127059937, + "learning_rate": 0.0005660999999999999, + "loss": 1.6914, "step": 1904 }, { "epoch": 3.048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6325346827507019, + "learning_rate": 0.0005663999999999999, + "loss": 1.7743, "step": 1905 }, { "epoch": 3.0496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7176706790924072, + "learning_rate": 0.0005667, + "loss": 1.9213, "step": 1906 }, { "epoch": 3.0512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0038143396377563, + "learning_rate": 0.0005669999999999999, + "loss": 1.9311, "step": 1907 }, { "epoch": 3.0528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6072099804878235, + "learning_rate": 0.0005673, + "loss": 2.0835, "step": 1908 }, { "epoch": 3.0544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0105469226837158, + "learning_rate": 0.0005675999999999999, + "loss": 1.8864, "step": 1909 }, { "epoch": 3.056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8080129027366638, + "learning_rate": 0.0005679, + "loss": 1.9892, "step": 1910 }, { "epoch": 3.0576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.45754092931747437, + "learning_rate": 0.0005681999999999999, + "loss": 1.9756, "step": 1911 }, { "epoch": 3.0592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.233243465423584, + "learning_rate": 0.0005685, + "loss": 1.8072, "step": 1912 }, { "epoch": 3.0608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.556039810180664, + "learning_rate": 0.0005688, + "loss": 1.7558, "step": 1913 }, { "epoch": 3.0624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.975345253944397, + "learning_rate": 0.0005691, + "loss": 1.6875, "step": 1914 }, { "epoch": 3.064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.621182918548584, + "learning_rate": 0.0005694, + "loss": 1.9418, "step": 1915 }, { "epoch": 3.0656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2835545539855957, + "learning_rate": 0.0005696999999999999, + "loss": 1.9206, "step": 1916 }, { "epoch": 3.0672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9312288761138916, + "learning_rate": 0.00057, + "loss": 2.0601, "step": 1917 }, { "epoch": 3.0688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1318893432617188, + "learning_rate": 0.0005702999999999999, + "loss": 2.1345, "step": 1918 }, { "epoch": 3.0704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6130589246749878, + "learning_rate": 0.0005705999999999999, + "loss": 2.1618, "step": 1919 }, { "epoch": 3.072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.576610803604126, + "learning_rate": 0.0005708999999999999, + "loss": 2.1284, "step": 1920 }, { "epoch": 3.0736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5201882719993591, + "learning_rate": 0.0005711999999999999, + "loss": 1.6506, "step": 1921 }, { "epoch": 3.0752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6475142240524292, + "learning_rate": 0.0005715, + "loss": 2.5413, "step": 1922 }, { "epoch": 3.0768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4755803346633911, + "learning_rate": 0.0005717999999999999, + "loss": 2.3249, "step": 1923 }, { "epoch": 3.0784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.319411516189575, + "learning_rate": 0.0005721, + "loss": 1.914, "step": 1924 }, { "epoch": 3.08, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6974052786827087, + "learning_rate": 0.0005723999999999999, + "loss": 2.448, "step": 1925 }, { "epoch": 3.0816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.256194114685059, + "learning_rate": 0.0005727, + "loss": 2.5141, "step": 1926 }, { "epoch": 3.0832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4125750064849854, + "learning_rate": 0.0005729999999999999, + "loss": 2.3816, "step": 1927 }, { "epoch": 3.0848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0927053689956665, + "learning_rate": 0.0005733, + "loss": 2.0794, "step": 1928 }, { "epoch": 3.0864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9806353449821472, + "learning_rate": 0.0005736, + "loss": 2.0871, "step": 1929 }, { "epoch": 3.088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8955163955688477, + "learning_rate": 0.0005738999999999999, + "loss": 2.462, "step": 1930 }, { "epoch": 3.0896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9962303638458252, + "learning_rate": 0.0005742, + "loss": 2.1367, "step": 1931 }, { "epoch": 3.0912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.6139843463897705, + "learning_rate": 0.0005744999999999999, + "loss": 2.0657, "step": 1932 }, { "epoch": 3.0928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9958500862121582, + "learning_rate": 0.0005747999999999999, + "loss": 2.1125, "step": 1933 }, { "epoch": 3.0944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8471503257751465, + "learning_rate": 0.0005750999999999999, + "loss": 1.7954, "step": 1934 }, { "epoch": 3.096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0833854675292969, + "learning_rate": 0.0005753999999999999, + "loss": 2.1097, "step": 1935 }, { "epoch": 3.0976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8012189865112305, + "learning_rate": 0.0005757, + "loss": 1.9832, "step": 1936 }, { "epoch": 3.0992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5345687866210938, + "learning_rate": 0.0005759999999999999, + "loss": 1.7323, "step": 1937 }, { "epoch": 3.1008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6692140102386475, + "learning_rate": 0.0005763, + "loss": 2.145, "step": 1938 }, { "epoch": 3.1024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1121718883514404, + "learning_rate": 0.0005765999999999999, + "loss": 1.8593, "step": 1939 }, { "epoch": 3.104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7572036981582642, + "learning_rate": 0.0005769, + "loss": 1.7605, "step": 1940 }, { "epoch": 3.1056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4940950274467468, + "learning_rate": 0.0005771999999999999, + "loss": 1.7811, "step": 1941 }, { "epoch": 3.1072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4116694927215576, + "learning_rate": 0.0005775, + "loss": 1.7468, "step": 1942 }, { "epoch": 3.1088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1187795400619507, + "learning_rate": 0.0005778, + "loss": 2.1021, "step": 1943 }, { "epoch": 3.1104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.035007953643799, + "learning_rate": 0.0005781, + "loss": 2.0287, "step": 1944 }, { "epoch": 3.112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9366320371627808, + "learning_rate": 0.0005784, + "loss": 1.8002, "step": 1945 }, { "epoch": 3.1136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9380377531051636, + "learning_rate": 0.0005786999999999999, + "loss": 1.9192, "step": 1946 }, { "epoch": 3.1152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6677077412605286, + "learning_rate": 0.000579, + "loss": 1.8212, "step": 1947 }, { "epoch": 3.1168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8764153718948364, + "learning_rate": 0.0005792999999999999, + "loss": 1.9796, "step": 1948 }, { "epoch": 3.1184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.44954901933670044, + "learning_rate": 0.0005795999999999999, + "loss": 1.7954, "step": 1949 }, { "epoch": 3.12, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.038944125175476, + "learning_rate": 0.0005799, + "loss": 2.1065, "step": 1950 }, { "epoch": 3.1216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.420203924179077, + "learning_rate": 0.0005801999999999999, + "loss": 1.8525, "step": 1951 }, { "epoch": 3.1232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3937735557556152, + "learning_rate": 0.0005805, + "loss": 1.8255, "step": 1952 }, { "epoch": 3.1248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7212362289428711, + "learning_rate": 0.0005807999999999999, + "loss": 1.8263, "step": 1953 }, { "epoch": 3.1264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8853600025177002, + "learning_rate": 0.0005811, + "loss": 1.8761, "step": 1954 }, { "epoch": 3.128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4678289890289307, + "learning_rate": 0.0005813999999999999, + "loss": 2.0627, "step": 1955 }, { "epoch": 3.1296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1042805910110474, + "learning_rate": 0.0005817, + "loss": 1.8495, "step": 1956 }, { "epoch": 3.1312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6540318131446838, + "learning_rate": 0.0005819999999999999, + "loss": 1.8383, "step": 1957 }, { "epoch": 3.1328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8567636013031006, + "learning_rate": 0.0005823, + "loss": 1.8997, "step": 1958 }, { "epoch": 3.1344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.660646677017212, + "learning_rate": 0.0005826, + "loss": 1.8058, "step": 1959 }, { "epoch": 3.136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4510436058044434, + "learning_rate": 0.0005829, + "loss": 1.9501, "step": 1960 }, { "epoch": 3.1376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.678549289703369, + "learning_rate": 0.0005832, + "loss": 2.2273, "step": 1961 }, { "epoch": 3.1391999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9886896014213562, + "learning_rate": 0.0005834999999999999, + "loss": 1.8041, "step": 1962 }, { "epoch": 3.1408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7258390188217163, + "learning_rate": 0.0005838, + "loss": 1.9527, "step": 1963 }, { "epoch": 3.1424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1479647159576416, + "learning_rate": 0.0005840999999999999, + "loss": 2.2193, "step": 1964 }, { "epoch": 3.144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9991073608398438, + "learning_rate": 0.0005843999999999999, + "loss": 2.1787, "step": 1965 }, { "epoch": 3.1456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5090608596801758, + "learning_rate": 0.0005847, + "loss": 1.9899, "step": 1966 }, { "epoch": 3.1471999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6410496830940247, + "learning_rate": 0.0005849999999999999, + "loss": 1.7924, "step": 1967 }, { "epoch": 3.1488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2429163455963135, + "learning_rate": 0.0005853, + "loss": 2.1826, "step": 1968 }, { "epoch": 3.1504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9956017136573792, + "learning_rate": 0.0005855999999999999, + "loss": 1.994, "step": 1969 }, { "epoch": 3.152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.732032299041748, + "learning_rate": 0.0005859, + "loss": 2.0436, "step": 1970 }, { "epoch": 3.1536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.820035219192505, + "learning_rate": 0.0005861999999999999, + "loss": 1.8376, "step": 1971 }, { "epoch": 3.1552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5163549184799194, + "learning_rate": 0.0005865, + "loss": 2.1958, "step": 1972 }, { "epoch": 3.1568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5381282567977905, + "learning_rate": 0.0005868, + "loss": 2.3952, "step": 1973 }, { "epoch": 3.1584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8239006400108337, + "learning_rate": 0.0005871, + "loss": 2.3353, "step": 1974 }, { "epoch": 3.16, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8696528673171997, + "learning_rate": 0.0005874, + "loss": 2.0183, "step": 1975 }, { "epoch": 3.1616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.004875183105469, + "learning_rate": 0.0005876999999999999, + "loss": 2.9254, "step": 1976 }, { "epoch": 3.1632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3852179050445557, + "learning_rate": 0.000588, + "loss": 2.2966, "step": 1977 }, { "epoch": 3.1648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2804005146026611, + "learning_rate": 0.0005882999999999999, + "loss": 2.3582, "step": 1978 }, { "epoch": 3.1664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.041597366333008, + "learning_rate": 0.0005885999999999999, + "loss": 2.702, "step": 1979 }, { "epoch": 3.168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.890049934387207, + "learning_rate": 0.0005889, + "loss": 2.0844, "step": 1980 }, { "epoch": 3.1696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.977160692214966, + "learning_rate": 0.0005891999999999999, + "loss": 2.2058, "step": 1981 }, { "epoch": 3.1712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.654447078704834, + "learning_rate": 0.0005895, + "loss": 2.1782, "step": 1982 }, { "epoch": 3.1728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9558517932891846, + "learning_rate": 0.0005897999999999999, + "loss": 2.2067, "step": 1983 }, { "epoch": 3.1744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9267162084579468, + "learning_rate": 0.0005901, + "loss": 1.9776, "step": 1984 }, { "epoch": 3.176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7199661731719971, + "learning_rate": 0.0005903999999999999, + "loss": 2.1208, "step": 1985 }, { "epoch": 3.1776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.00618052482605, + "learning_rate": 0.0005907, + "loss": 1.991, "step": 1986 }, { "epoch": 3.1792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5676202774047852, + "learning_rate": 0.0005909999999999999, + "loss": 2.0481, "step": 1987 }, { "epoch": 3.1808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.522994041442871, + "learning_rate": 0.0005913, + "loss": 2.1883, "step": 1988 }, { "epoch": 3.1824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1395773887634277, + "learning_rate": 0.0005916, + "loss": 1.7289, "step": 1989 }, { "epoch": 3.184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1741175651550293, + "learning_rate": 0.0005919, + "loss": 1.8146, "step": 1990 }, { "epoch": 3.1856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9648697972297668, + "learning_rate": 0.0005922, + "loss": 1.9346, "step": 1991 }, { "epoch": 3.1872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.453245759010315, + "learning_rate": 0.0005924999999999999, + "loss": 1.887, "step": 1992 }, { "epoch": 3.1888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3214921951293945, + "learning_rate": 0.0005928, + "loss": 1.9685, "step": 1993 }, { "epoch": 3.1904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6422308683395386, + "learning_rate": 0.0005930999999999999, + "loss": 1.7544, "step": 1994 }, { "epoch": 3.192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8797899484634399, + "learning_rate": 0.0005933999999999999, + "loss": 2.1596, "step": 1995 }, { "epoch": 3.1936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5090241432189941, + "learning_rate": 0.0005937, + "loss": 1.697, "step": 1996 }, { "epoch": 3.1952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.829024612903595, + "learning_rate": 0.0005939999999999999, + "loss": 2.0842, "step": 1997 }, { "epoch": 3.1968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7225773930549622, + "learning_rate": 0.0005943, + "loss": 1.8067, "step": 1998 }, { "epoch": 3.1984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5545061230659485, + "learning_rate": 0.0005945999999999999, + "loss": 1.874, "step": 1999 }, { "epoch": 3.2, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.47109493613243103, + "learning_rate": 0.0005949, + "loss": 1.8352, "step": 2000 }, { "epoch": 3.2, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 159.6533, - "eval_samples_per_second": 19.643, - "eval_steps_per_second": 1.228, - "eval_wer": 1.0, + "eval_cer": 0.472372144935624, + "eval_loss": 2.1490743160247803, + "eval_runtime": 158.1752, + "eval_samples_per_second": 19.826, + "eval_steps_per_second": 1.239, + "eval_wer": 0.7894725935136255, "step": 2000 }, { "epoch": 3.2016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.672227382659912, + "learning_rate": 0.0005951999999999999, + "loss": 1.8896, "step": 2001 }, { "epoch": 3.2032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8217174410820007, + "learning_rate": 0.0005955, + "loss": 1.7263, "step": 2002 }, { "epoch": 3.2048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7968158721923828, + "learning_rate": 0.0005958, + "loss": 2.1543, "step": 2003 }, { "epoch": 3.2064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.697176456451416, + "learning_rate": 0.0005961, + "loss": 1.9787, "step": 2004 }, { "epoch": 3.208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1341164112091064, + "learning_rate": 0.0005964, + "loss": 1.9606, "step": 2005 }, { "epoch": 3.2096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5551044940948486, + "learning_rate": 0.0005967, + "loss": 1.7492, "step": 2006 }, { "epoch": 3.2112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8107208013534546, + "learning_rate": 0.000597, + "loss": 1.936, "step": 2007 }, { "epoch": 3.2128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5374244451522827, + "learning_rate": 0.0005972999999999999, + "loss": 2.0281, "step": 2008 }, { "epoch": 3.2144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6073170900344849, + "learning_rate": 0.0005976, + "loss": 1.9526, "step": 2009 }, { "epoch": 3.216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5787049531936646, + "learning_rate": 0.0005979, + "loss": 1.7531, "step": 2010 }, { "epoch": 3.2176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9718129634857178, + "learning_rate": 0.0005981999999999999, + "loss": 1.9827, "step": 2011 }, { "epoch": 3.2192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6538378596305847, + "learning_rate": 0.0005985, + "loss": 1.9704, "step": 2012 }, { "epoch": 3.2208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7849467992782593, + "learning_rate": 0.0005987999999999999, + "loss": 1.9564, "step": 2013 }, { "epoch": 3.2224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7062731981277466, + "learning_rate": 0.0005991, + "loss": 1.8622, "step": 2014 }, { "epoch": 3.224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.715021014213562, + "learning_rate": 0.0005993999999999999, + "loss": 2.0217, "step": 2015 }, { "epoch": 3.2256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1211111545562744, + "learning_rate": 0.0005997, + "loss": 2.2437, "step": 2016 }, { "epoch": 3.2272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8204267621040344, + "learning_rate": 0.0006, + "loss": 1.9696, "step": 2017 }, { "epoch": 3.2288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9571385383605957, + "learning_rate": 0.00059985, + "loss": 1.8388, "step": 2018 }, { "epoch": 3.2304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6814630627632141, + "learning_rate": 0.0005997, + "loss": 1.8718, "step": 2019 }, { "epoch": 3.232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.779271125793457, + "learning_rate": 0.0005995499999999999, + "loss": 2.0305, "step": 2020 }, { "epoch": 3.2336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7469867467880249, + "learning_rate": 0.0005993999999999999, + "loss": 1.7589, "step": 2021 }, { "epoch": 3.2352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8041962385177612, + "learning_rate": 0.00059925, + "loss": 1.7746, "step": 2022 }, { "epoch": 3.2368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6314986944198608, + "learning_rate": 0.0005991, + "loss": 2.4096, "step": 2023 }, { "epoch": 3.2384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6365277767181396, + "learning_rate": 0.0005989499999999999, + "loss": 2.1849, "step": 2024 }, { "epoch": 3.24, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8172062039375305, + "learning_rate": 0.0005987999999999999, + "loss": 2.0949, "step": 2025 }, { "epoch": 3.2416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4743893146514893, + "learning_rate": 0.0005986499999999999, + "loss": 2.3107, "step": 2026 }, { "epoch": 3.2432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.591100215911865, + "learning_rate": 0.0005985, + "loss": 2.4814, "step": 2027 }, { "epoch": 3.2448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.602311134338379, + "learning_rate": 0.0005983499999999999, + "loss": 2.8672, "step": 2028 }, { "epoch": 3.2464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2622711658477783, + "learning_rate": 0.0005981999999999999, + "loss": 2.4354, "step": 2029 }, { "epoch": 3.248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.489107131958008, + "learning_rate": 0.0005980499999999999, + "loss": 2.2468, "step": 2030 }, { "epoch": 3.2496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.640115737915039, + "learning_rate": 0.0005979, + "loss": 2.2934, "step": 2031 }, { "epoch": 3.2512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.885188579559326, + "learning_rate": 0.00059775, + "loss": 2.5731, "step": 2032 }, { "epoch": 3.2528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.099841117858887, + "learning_rate": 0.0005976, + "loss": 2.323, "step": 2033 }, { "epoch": 3.2544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.89034366607666, + "learning_rate": 0.0005974499999999999, + "loss": 2.3686, "step": 2034 }, { "epoch": 3.2560000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.048368453979492, + "learning_rate": 0.0005972999999999999, + "loss": 2.0215, "step": 2035 }, { "epoch": 3.2576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.314151287078857, + "learning_rate": 0.00059715, + "loss": 2.0318, "step": 2036 }, { "epoch": 3.2592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.657911777496338, + "learning_rate": 0.000597, + "loss": 2.1174, "step": 2037 }, { "epoch": 3.2608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7400920391082764, + "learning_rate": 0.00059685, + "loss": 2.2229, "step": 2038 }, { "epoch": 3.2624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.803786039352417, + "learning_rate": 0.0005967, + "loss": 1.8763, "step": 2039 }, { "epoch": 3.2640000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4826065301895142, + "learning_rate": 0.0005965499999999999, + "loss": 1.9995, "step": 2040 }, { "epoch": 3.2656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1381956338882446, + "learning_rate": 0.0005964, + "loss": 1.7465, "step": 2041 }, { "epoch": 3.2672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0622131824493408, + "learning_rate": 0.00059625, + "loss": 2.011, "step": 2042 }, { "epoch": 3.2688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2198865413665771, + "learning_rate": 0.0005961, + "loss": 1.9016, "step": 2043 }, { "epoch": 3.2704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8227574825286865, + "learning_rate": 0.0005959499999999999, + "loss": 1.9384, "step": 2044 }, { "epoch": 3.2720000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4436991214752197, + "learning_rate": 0.0005958, + "loss": 1.8939, "step": 2045 }, { "epoch": 3.2736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6856980323791504, + "learning_rate": 0.00059565, + "loss": 1.9194, "step": 2046 }, { "epoch": 3.2752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3934940099716187, + "learning_rate": 0.0005955, + "loss": 1.8451, "step": 2047 }, { "epoch": 3.2768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9621049165725708, + "learning_rate": 0.0005953499999999999, + "loss": 2.1891, "step": 2048 }, { "epoch": 3.2784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1755985021591187, + "learning_rate": 0.0005951999999999999, + "loss": 1.9826, "step": 2049 }, { "epoch": 3.2800000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0933473110198975, + "learning_rate": 0.00059505, + "loss": 1.7576, "step": 2050 }, { "epoch": 3.2816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2009758949279785, + "learning_rate": 0.0005949, + "loss": 1.9181, "step": 2051 }, { "epoch": 3.2832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6885703206062317, + "learning_rate": 0.0005947499999999999, + "loss": 1.6412, "step": 2052 }, { "epoch": 3.2848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3572481870651245, + "learning_rate": 0.0005945999999999999, + "loss": 2.0448, "step": 2053 }, { "epoch": 3.2864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.438791036605835, + "learning_rate": 0.00059445, + "loss": 1.9378, "step": 2054 }, { "epoch": 3.288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.896614670753479, + "learning_rate": 0.0005943, + "loss": 1.8265, "step": 2055 }, { "epoch": 3.2896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.369891405105591, + "learning_rate": 0.0005941499999999999, + "loss": 1.9555, "step": 2056 }, { "epoch": 3.2912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7984668016433716, + "learning_rate": 0.0005939999999999999, + "loss": 1.8772, "step": 2057 }, { "epoch": 3.2928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.576426088809967, + "learning_rate": 0.0005938499999999999, + "loss": 1.7301, "step": 2058 }, { "epoch": 3.2944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9307464361190796, + "learning_rate": 0.0005937, + "loss": 1.9711, "step": 2059 }, { "epoch": 3.296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5098490715026855, + "learning_rate": 0.0005935499999999999, + "loss": 1.9865, "step": 2060 }, { "epoch": 3.2976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9491452574729919, + "learning_rate": 0.0005933999999999999, + "loss": 2.0115, "step": 2061 }, { "epoch": 3.2992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.528455913066864, + "learning_rate": 0.0005932499999999999, + "loss": 1.8317, "step": 2062 }, { "epoch": 3.3008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3873507976531982, + "learning_rate": 0.0005930999999999999, + "loss": 2.2621, "step": 2063 }, { "epoch": 3.3024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1286377906799316, + "learning_rate": 0.00059295, + "loss": 2.0241, "step": 2064 }, { "epoch": 3.304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4034842252731323, + "learning_rate": 0.0005928, + "loss": 1.9359, "step": 2065 }, { "epoch": 3.3056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8125107884407043, + "learning_rate": 0.0005926499999999999, + "loss": 2.0162, "step": 2066 }, { "epoch": 3.3072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2552961111068726, + "learning_rate": 0.0005924999999999999, + "loss": 2.1944, "step": 2067 }, { "epoch": 3.3088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9320792555809021, + "learning_rate": 0.00059235, + "loss": 2.109, "step": 2068 }, { "epoch": 3.3104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9960517883300781, + "learning_rate": 0.0005922, + "loss": 2.0276, "step": 2069 }, { "epoch": 3.312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9736140370368958, + "learning_rate": 0.00059205, + "loss": 2.1911, "step": 2070 }, { "epoch": 3.3136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5229682922363281, + "learning_rate": 0.0005919, + "loss": 2.1335, "step": 2071 }, { "epoch": 3.3152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.261260747909546, + "learning_rate": 0.0005917499999999999, + "loss": 2.0071, "step": 2072 }, { "epoch": 3.3168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7125620245933533, + "learning_rate": 0.0005916, + "loss": 2.1277, "step": 2073 }, { "epoch": 3.3184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.672423839569092, + "learning_rate": 0.00059145, + "loss": 2.2381, "step": 2074 }, { "epoch": 3.32, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.00059145, + "loss": 2.606, "step": 2075 }, { "epoch": 3.3216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 12.852534294128418, + "learning_rate": 0.0005913, + "loss": 2.6536, "step": 2076 }, { "epoch": 3.3232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6311789751052856, + "learning_rate": 0.0005911499999999999, + "loss": 2.2505, "step": 2077 }, { "epoch": 3.3247999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5713582038879395, + "learning_rate": 0.0005909999999999999, + "loss": 2.1638, "step": 2078 }, { "epoch": 3.3264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8965359926223755, + "learning_rate": 0.00059085, + "loss": 2.3546, "step": 2079 }, { "epoch": 3.328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8224315643310547, + "learning_rate": 0.0005907, + "loss": 2.1339, "step": 2080 }, { "epoch": 3.3296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.236239433288574, + "learning_rate": 0.0005905499999999999, + "loss": 2.1391, "step": 2081 }, { "epoch": 3.3312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.328113555908203, + "learning_rate": 0.0005903999999999999, + "loss": 1.9743, "step": 2082 }, { "epoch": 3.3327999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0247113704681396, + "learning_rate": 0.00059025, + "loss": 2.1961, "step": 2083 }, { "epoch": 3.3344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5589113235473633, + "learning_rate": 0.0005901, + "loss": 1.904, "step": 2084 }, { "epoch": 3.336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7790500521659851, + "learning_rate": 0.0005899499999999999, + "loss": 2.1242, "step": 2085 }, { "epoch": 3.3376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.423574924468994, + "learning_rate": 0.0005897999999999999, + "loss": 2.1614, "step": 2086 }, { "epoch": 3.3392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7823060750961304, + "learning_rate": 0.0005896499999999999, + "loss": 2.0141, "step": 2087 }, { "epoch": 3.3407999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7111868858337402, + "learning_rate": 0.0005895, + "loss": 2.097, "step": 2088 }, { "epoch": 3.3424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6286764144897461, + "learning_rate": 0.0005893499999999999, + "loss": 1.8148, "step": 2089 }, { "epoch": 3.344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0036760568618774, + "learning_rate": 0.0005891999999999999, + "loss": 1.7824, "step": 2090 }, { "epoch": 3.3456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.625751256942749, + "learning_rate": 0.0005890499999999999, + "loss": 2.0269, "step": 2091 }, { "epoch": 3.3472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7607942223548889, + "learning_rate": 0.0005889, + "loss": 1.7751, "step": 2092 }, { "epoch": 3.3487999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3163102865219116, + "learning_rate": 0.00058875, + "loss": 1.9094, "step": 2093 }, { "epoch": 3.3504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.412693738937378, + "learning_rate": 0.0005885999999999999, + "loss": 1.7847, "step": 2094 }, { "epoch": 3.352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.028609275817871, + "learning_rate": 0.0005884499999999999, + "loss": 1.9335, "step": 2095 }, { "epoch": 3.3536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.106916904449463, + "learning_rate": 0.0005882999999999999, + "loss": 1.9819, "step": 2096 }, { "epoch": 3.3552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9769391417503357, + "learning_rate": 0.00058815, + "loss": 2.178, "step": 2097 }, { "epoch": 3.3568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7147427797317505, + "learning_rate": 0.000588, + "loss": 1.7971, "step": 2098 }, { "epoch": 3.3584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.773476779460907, + "learning_rate": 0.00058785, + "loss": 1.7747, "step": 2099 }, { "epoch": 3.36, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2100 - }, - { - "epoch": 3.36, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 161.4742, - "eval_samples_per_second": 19.421, - "eval_steps_per_second": 1.214, - "eval_wer": 1.0, + "grad_norm": 0.7806805372238159, + "learning_rate": 0.0005876999999999999, + "loss": 1.9335, "step": 2100 }, { "epoch": 3.3616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0920946598052979, + "learning_rate": 0.0005875499999999999, + "loss": 1.8085, "step": 2101 }, { "epoch": 3.3632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8656213283538818, + "learning_rate": 0.0005874, + "loss": 2.0736, "step": 2102 }, { "epoch": 3.3648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.247507095336914, + "learning_rate": 0.00058725, + "loss": 2.0338, "step": 2103 }, { "epoch": 3.3664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6680485010147095, + "learning_rate": 0.0005871, + "loss": 1.9589, "step": 2104 }, { "epoch": 3.368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4975533485412598, + "learning_rate": 0.0005869499999999999, + "loss": 1.9832, "step": 2105 }, { "epoch": 3.3696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8459756374359131, + "learning_rate": 0.0005868, + "loss": 1.8083, "step": 2106 }, { "epoch": 3.3712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.097440242767334, + "learning_rate": 0.00058665, + "loss": 1.661, "step": 2107 }, { "epoch": 3.3728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0254461765289307, + "learning_rate": 0.0005865, + "loss": 1.9348, "step": 2108 }, { "epoch": 3.3744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9976134896278381, + "learning_rate": 0.0005863499999999999, + "loss": 1.7156, "step": 2109 }, { "epoch": 3.376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9642464518547058, + "learning_rate": 0.0005861999999999999, + "loss": 1.8166, "step": 2110 }, { "epoch": 3.3776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2344017028808594, + "learning_rate": 0.00058605, + "loss": 2.4248, "step": 2111 }, { "epoch": 3.3792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6256465315818787, + "learning_rate": 0.0005859, + "loss": 2.1543, "step": 2112 }, { "epoch": 3.3808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9947925806045532, + "learning_rate": 0.0005857499999999999, + "loss": 1.9461, "step": 2113 }, { "epoch": 3.3824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0438300371170044, + "learning_rate": 0.0005855999999999999, + "loss": 2.2066, "step": 2114 }, { "epoch": 3.384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5882972478866577, + "learning_rate": 0.00058545, + "loss": 2.1843, "step": 2115 }, { "epoch": 3.3856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0569475889205933, + "learning_rate": 0.0005853, + "loss": 1.9829, "step": 2116 }, { "epoch": 3.3872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5413305163383484, + "learning_rate": 0.0005851499999999999, + "loss": 2.0784, "step": 2117 }, { "epoch": 3.3888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.626608669757843, + "learning_rate": 0.0005849999999999999, + "loss": 1.7897, "step": 2118 }, { "epoch": 3.3904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6285375356674194, + "learning_rate": 0.0005848499999999999, + "loss": 1.9263, "step": 2119 }, { "epoch": 3.392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4336178302764893, + "learning_rate": 0.0005847, + "loss": 1.9986, "step": 2120 }, { "epoch": 3.3936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0149215459823608, + "learning_rate": 0.0005845499999999999, + "loss": 1.8295, "step": 2121 }, { "epoch": 3.3952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.226423740386963, + "learning_rate": 0.0005843999999999999, + "loss": 2.2674, "step": 2122 }, { "epoch": 3.3968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9696506261825562, + "learning_rate": 0.0005842499999999999, + "loss": 1.9007, "step": 2123 }, { "epoch": 3.3984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1912128925323486, + "learning_rate": 0.0005840999999999999, + "loss": 2.5396, "step": 2124 }, { "epoch": 3.4, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.123094081878662, + "learning_rate": 0.00058395, + "loss": 2.2945, "step": 2125 }, { "epoch": 3.4016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.7035903930664062, + "learning_rate": 0.0005838, + "loss": 2.487, "step": 2126 }, { "epoch": 3.4032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1892151832580566, + "learning_rate": 0.0005836499999999999, + "loss": 2.3695, "step": 2127 }, { "epoch": 3.4048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2689785957336426, + "learning_rate": 0.0005834999999999999, + "loss": 2.2589, "step": 2128 }, { "epoch": 3.4064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1017853021621704, + "learning_rate": 0.00058335, + "loss": 2.3216, "step": 2129 }, { "epoch": 3.408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3482823371887207, + "learning_rate": 0.0005832, + "loss": 2.0859, "step": 2130 }, { "epoch": 3.4096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.300066947937012, + "learning_rate": 0.00058305, + "loss": 2.5814, "step": 2131 }, { "epoch": 3.4112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3666080236434937, + "learning_rate": 0.0005829, + "loss": 2.4822, "step": 2132 }, { "epoch": 3.4128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.999903678894043, + "learning_rate": 0.0005827499999999999, + "loss": 1.9601, "step": 2133 }, { "epoch": 3.4144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.403241157531738, + "learning_rate": 0.0005826, + "loss": 2.1051, "step": 2134 }, { "epoch": 3.416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.511754989624023, + "learning_rate": 0.00058245, + "loss": 2.1249, "step": 2135 }, { "epoch": 3.4176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8616156578063965, + "learning_rate": 0.0005823, + "loss": 2.02, "step": 2136 }, { "epoch": 3.4192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.203505516052246, + "learning_rate": 0.0005821499999999999, + "loss": 2.0634, "step": 2137 }, { "epoch": 3.4208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.6515045166015625, + "learning_rate": 0.0005819999999999999, + "loss": 1.9337, "step": 2138 }, { "epoch": 3.4224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.657661199569702, + "learning_rate": 0.00058185, + "loss": 1.8973, "step": 2139 }, { "epoch": 3.424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9265851974487305, + "learning_rate": 0.0005817, + "loss": 2.1529, "step": 2140 }, { "epoch": 3.4256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6995819211006165, + "learning_rate": 0.0005815499999999999, + "loss": 1.9286, "step": 2141 }, { "epoch": 3.4272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5046209096908569, + "learning_rate": 0.0005813999999999999, + "loss": 1.8917, "step": 2142 }, { "epoch": 3.4288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5500861406326294, + "learning_rate": 0.00058125, + "loss": 1.771, "step": 2143 }, { "epoch": 3.4304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7403568029403687, + "learning_rate": 0.0005811, + "loss": 1.9863, "step": 2144 }, { "epoch": 3.432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9957071542739868, + "learning_rate": 0.00058095, + "loss": 2.0033, "step": 2145 }, { "epoch": 3.4336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2035460472106934, + "learning_rate": 0.0005807999999999999, + "loss": 1.708, "step": 2146 }, { "epoch": 3.4352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9599239826202393, + "learning_rate": 0.0005806499999999999, + "loss": 2.2345, "step": 2147 }, { "epoch": 3.4368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.531463384628296, + "learning_rate": 0.0005805, + "loss": 2.0405, "step": 2148 }, { "epoch": 3.4384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0532867908477783, + "learning_rate": 0.00058035, + "loss": 1.824, "step": 2149 }, { "epoch": 3.44, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9876016974449158, + "learning_rate": 0.0005801999999999999, + "loss": 1.8832, "step": 2150 }, { "epoch": 3.4416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5361031293869019, + "learning_rate": 0.0005800499999999999, + "loss": 1.9323, "step": 2151 }, { "epoch": 3.4432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4722892045974731, + "learning_rate": 0.0005799, + "loss": 1.72, "step": 2152 }, { "epoch": 3.4448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8053250312805176, + "learning_rate": 0.00057975, + "loss": 2.1203, "step": 2153 }, { "epoch": 3.4464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2486746311187744, + "learning_rate": 0.0005795999999999999, + "loss": 1.6535, "step": 2154 }, { "epoch": 3.448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.140870451927185, + "learning_rate": 0.0005794499999999999, + "loss": 2.0749, "step": 2155 }, { "epoch": 3.4496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8704653978347778, + "learning_rate": 0.0005792999999999999, + "loss": 2.0133, "step": 2156 }, { "epoch": 3.4512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.039218544960022, + "learning_rate": 0.00057915, + "loss": 1.8325, "step": 2157 }, { "epoch": 3.4528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5828589200973511, + "learning_rate": 0.000579, + "loss": 1.8943, "step": 2158 }, { "epoch": 3.4544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9117224216461182, + "learning_rate": 0.0005788499999999999, + "loss": 2.1518, "step": 2159 }, { "epoch": 3.456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7245050072669983, + "learning_rate": 0.0005786999999999999, + "loss": 1.9638, "step": 2160 }, { "epoch": 3.4576000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.124849319458008, + "learning_rate": 0.0005785499999999999, + "loss": 1.8997, "step": 2161 }, { "epoch": 3.4592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2298882007598877, + "learning_rate": 0.0005784, + "loss": 2.3108, "step": 2162 }, { "epoch": 3.4608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4195408821105957, + "learning_rate": 0.00057825, + "loss": 1.9164, "step": 2163 }, { "epoch": 3.4624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2068179845809937, + "learning_rate": 0.0005781, + "loss": 2.1875, "step": 2164 }, { "epoch": 3.464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8306836485862732, + "learning_rate": 0.0005779499999999999, + "loss": 2.1143, "step": 2165 }, { "epoch": 3.4656000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5825016498565674, + "learning_rate": 0.0005778, + "loss": 2.0437, "step": 2166 }, { "epoch": 3.4672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7154268622398376, + "learning_rate": 0.00057765, + "loss": 1.8824, "step": 2167 }, { "epoch": 3.4688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1212105751037598, + "learning_rate": 0.0005775, + "loss": 2.2052, "step": 2168 }, { "epoch": 3.4704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1738407611846924, + "learning_rate": 0.00057735, + "loss": 2.0513, "step": 2169 }, { "epoch": 3.472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4919712543487549, + "learning_rate": 0.0005771999999999999, + "loss": 1.8867, "step": 2170 }, { "epoch": 3.4736000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7555265426635742, + "learning_rate": 0.00057705, + "loss": 1.8786, "step": 2171 }, { "epoch": 3.4752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0224101543426514, + "learning_rate": 0.0005769, + "loss": 2.0857, "step": 2172 }, { "epoch": 3.4768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5990490913391113, + "learning_rate": 0.00057675, + "loss": 2.1757, "step": 2173 }, { "epoch": 3.4784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5588961839675903, + "learning_rate": 0.0005765999999999999, + "loss": 2.0964, "step": 2174 }, { "epoch": 3.48, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.0005765999999999999, + "loss": 2.2852, "step": 2175 }, { "epoch": 3.4816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 18.246580123901367, + "learning_rate": 0.0005764499999999999, + "loss": 3.2206, "step": 2176 }, { "epoch": 3.4832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.455694198608398, + "learning_rate": 0.0005763, + "loss": 2.588, "step": 2177 }, { "epoch": 3.4848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2639204263687134, + "learning_rate": 0.00057615, + "loss": 2.1951, "step": 2178 }, { "epoch": 3.4864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.898868441581726, + "learning_rate": 0.0005759999999999999, + "loss": 2.1804, "step": 2179 }, { "epoch": 3.488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.800933301448822, + "learning_rate": 0.0005758499999999999, + "loss": 2.2862, "step": 2180 }, { "epoch": 3.4896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.313568592071533, + "learning_rate": 0.0005757, + "loss": 2.2126, "step": 2181 }, { "epoch": 3.4912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0708627700805664, + "learning_rate": 0.00057555, + "loss": 2.1175, "step": 2182 }, { "epoch": 3.4928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.972640037536621, + "learning_rate": 0.0005753999999999999, + "loss": 2.0464, "step": 2183 }, { "epoch": 3.4944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3649954795837402, + "learning_rate": 0.0005752499999999999, + "loss": 2.0686, "step": 2184 }, { "epoch": 3.496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0719057321548462, + "learning_rate": 0.0005750999999999999, + "loss": 2.0874, "step": 2185 }, { "epoch": 3.4976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8816778659820557, + "learning_rate": 0.00057495, + "loss": 2.1214, "step": 2186 }, { "epoch": 3.4992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8715779781341553, + "learning_rate": 0.0005747999999999999, + "loss": 1.887, "step": 2187 }, { "epoch": 3.5008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8262019753456116, + "learning_rate": 0.0005746499999999999, + "loss": 1.938, "step": 2188 }, { "epoch": 3.5023999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9271842241287231, + "learning_rate": 0.0005744999999999999, + "loss": 1.7774, "step": 2189 }, { "epoch": 3.504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.82608962059021, + "learning_rate": 0.00057435, + "loss": 1.8269, "step": 2190 }, { "epoch": 3.5056000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2712914943695068, + "learning_rate": 0.0005742, + "loss": 1.9094, "step": 2191 }, { "epoch": 3.5072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9681758880615234, + "learning_rate": 0.00057405, + "loss": 1.8966, "step": 2192 }, { "epoch": 3.5088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9789705276489258, + "learning_rate": 0.0005738999999999999, + "loss": 2.0215, "step": 2193 }, { "epoch": 3.5103999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8557264804840088, + "learning_rate": 0.0005737499999999999, + "loss": 2.0703, "step": 2194 }, { "epoch": 3.512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9306953549385071, + "learning_rate": 0.0005736, + "loss": 1.687, "step": 2195 }, { "epoch": 3.5136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0400307178497314, + "learning_rate": 0.00057345, + "loss": 1.9039, "step": 2196 }, { "epoch": 3.5152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0845005512237549, + "learning_rate": 0.0005733, + "loss": 1.8057, "step": 2197 }, { "epoch": 3.5168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5625959634780884, + "learning_rate": 0.00057315, + "loss": 1.7292, "step": 2198 }, { "epoch": 3.5183999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7412774562835693, + "learning_rate": 0.0005729999999999999, + "loss": 1.858, "step": 2199 }, { "epoch": 3.52, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2200 - }, - { - "epoch": 3.52, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 157.7226, - "eval_samples_per_second": 19.883, - "eval_steps_per_second": 1.243, - "eval_wer": 1.0, + "grad_norm": 0.7688709497451782, + "learning_rate": 0.00057285, + "loss": 1.9147, "step": 2200 }, { "epoch": 3.5216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7633388638496399, + "learning_rate": 0.0005727, + "loss": 1.9017, "step": 2201 }, { "epoch": 3.5232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4895182251930237, + "learning_rate": 0.00057255, + "loss": 1.8924, "step": 2202 }, { "epoch": 3.5248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5399225950241089, + "learning_rate": 0.0005723999999999999, + "loss": 1.6966, "step": 2203 }, { "epoch": 3.5263999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.073164701461792, + "learning_rate": 0.00057225, + "loss": 1.7686, "step": 2204 }, { "epoch": 3.528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9257833957672119, + "learning_rate": 0.0005721, + "loss": 1.9916, "step": 2205 }, { "epoch": 3.5296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.169154405593872, + "learning_rate": 0.00057195, + "loss": 2.0111, "step": 2206 }, { "epoch": 3.5312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2770828008651733, + "learning_rate": 0.0005717999999999999, + "loss": 1.6243, "step": 2207 }, { "epoch": 3.5328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6825202703475952, + "learning_rate": 0.0005716499999999999, + "loss": 1.8726, "step": 2208 }, { "epoch": 3.5343999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7958777546882629, + "learning_rate": 0.0005715, + "loss": 1.7756, "step": 2209 }, { "epoch": 3.536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5408828258514404, + "learning_rate": 0.00057135, + "loss": 1.9524, "step": 2210 }, { "epoch": 3.5376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8456842303276062, + "learning_rate": 0.0005711999999999999, + "loss": 2.0548, "step": 2211 }, { "epoch": 3.5392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.612095057964325, + "learning_rate": 0.0005710499999999999, + "loss": 2.2309, "step": 2212 }, { "epoch": 3.5408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2765395641326904, + "learning_rate": 0.0005708999999999999, + "loss": 2.3198, "step": 2213 }, { "epoch": 3.5423999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.418147325515747, + "learning_rate": 0.00057075, + "loss": 1.9075, "step": 2214 }, { "epoch": 3.544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.177223205566406, + "learning_rate": 0.0005705999999999999, + "loss": 2.0796, "step": 2215 }, { "epoch": 3.5456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.717703938484192, + "learning_rate": 0.0005704499999999999, + "loss": 2.3406, "step": 2216 }, { "epoch": 3.5472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.981576681137085, + "learning_rate": 0.0005702999999999999, + "loss": 2.2258, "step": 2217 }, { "epoch": 3.5488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4039616584777832, + "learning_rate": 0.00057015, + "loss": 1.7069, "step": 2218 }, { "epoch": 3.5504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3874764442443848, + "learning_rate": 0.00057, + "loss": 1.9812, "step": 2219 }, { "epoch": 3.552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5490448474884033, + "learning_rate": 0.0005698499999999999, + "loss": 1.9941, "step": 2220 }, { "epoch": 3.5536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8983708620071411, + "learning_rate": 0.0005696999999999999, + "loss": 2.2199, "step": 2221 }, { "epoch": 3.5552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5205702781677246, + "learning_rate": 0.0005695499999999999, + "loss": 2.207, "step": 2222 }, { "epoch": 3.5568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4460971355438232, + "learning_rate": 0.0005694, + "loss": 2.485, "step": 2223 }, { "epoch": 3.5584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5268499851226807, + "learning_rate": 0.00056925, + "loss": 2.3391, "step": 2224 }, { "epoch": 3.56, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8089544773101807, + "learning_rate": 0.0005691, + "loss": 2.5278, "step": 2225 }, { "epoch": 3.5616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 14.736753463745117, + "learning_rate": 0.0005689499999999999, + "loss": 2.7849, "step": 2226 }, { "epoch": 3.5632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.6178412437438965, + "learning_rate": 0.0005688, + "loss": 2.3745, "step": 2227 }, { "epoch": 3.5648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.448739051818848, + "learning_rate": 0.00056865, + "loss": 2.56, "step": 2228 }, { "epoch": 3.5664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.70924711227417, + "learning_rate": 0.0005685, + "loss": 2.4006, "step": 2229 }, { "epoch": 3.568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.747269153594971, + "learning_rate": 0.00056835, + "loss": 2.1758, "step": 2230 }, { "epoch": 3.5696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0769402980804443, + "learning_rate": 0.0005681999999999999, + "loss": 2.0656, "step": 2231 }, { "epoch": 3.5712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1517651081085205, + "learning_rate": 0.00056805, + "loss": 2.0493, "step": 2232 }, { "epoch": 3.5728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1098272800445557, + "learning_rate": 0.0005679, + "loss": 1.889, "step": 2233 }, { "epoch": 3.5744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1274043321609497, + "learning_rate": 0.00056775, + "loss": 1.9614, "step": 2234 }, { "epoch": 3.576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.16263484954834, + "learning_rate": 0.0005675999999999999, + "loss": 1.9668, "step": 2235 }, { "epoch": 3.5776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4705198407173157, + "learning_rate": 0.0005674499999999999, + "loss": 1.9278, "step": 2236 }, { "epoch": 3.5792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4408669471740723, + "learning_rate": 0.0005673, + "loss": 1.8853, "step": 2237 }, { "epoch": 3.5808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9206117987632751, + "learning_rate": 0.00056715, + "loss": 1.9511, "step": 2238 }, { "epoch": 3.5824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8084490299224854, + "learning_rate": 0.0005669999999999999, + "loss": 1.7235, "step": 2239 }, { "epoch": 3.584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9082566499710083, + "learning_rate": 0.0005668499999999999, + "loss": 1.6991, "step": 2240 }, { "epoch": 3.5856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1258739233016968, + "learning_rate": 0.0005667, + "loss": 1.8765, "step": 2241 }, { "epoch": 3.5872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7276796102523804, + "learning_rate": 0.00056655, + "loss": 1.6729, "step": 2242 }, { "epoch": 3.5888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.931858539581299, + "learning_rate": 0.0005663999999999999, + "loss": 1.746, "step": 2243 }, { "epoch": 3.5904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8679087162017822, + "learning_rate": 0.0005662499999999999, + "loss": 2.0839, "step": 2244 }, { "epoch": 3.592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.481023073196411, + "learning_rate": 0.0005660999999999999, + "loss": 1.8341, "step": 2245 }, { "epoch": 3.5936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.399448871612549, + "learning_rate": 0.00056595, + "loss": 1.9251, "step": 2246 }, { "epoch": 3.5952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5826406478881836, + "learning_rate": 0.0005657999999999999, + "loss": 1.6982, "step": 2247 }, { "epoch": 3.5968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7085354328155518, + "learning_rate": 0.0005656499999999999, + "loss": 1.8364, "step": 2248 }, { "epoch": 3.5984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.819995641708374, + "learning_rate": 0.0005654999999999999, + "loss": 2.0573, "step": 2249 }, { "epoch": 3.6, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9627421498298645, + "learning_rate": 0.0005653499999999999, + "loss": 1.8237, "step": 2250 }, { "epoch": 3.6016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1660212278366089, + "learning_rate": 0.0005652, + "loss": 2.1266, "step": 2251 }, { "epoch": 3.6032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0879263877868652, + "learning_rate": 0.0005650499999999999, + "loss": 1.9085, "step": 2252 }, { "epoch": 3.6048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3968796730041504, + "learning_rate": 0.0005648999999999999, + "loss": 1.7709, "step": 2253 }, { "epoch": 3.6064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.195981025695801, + "learning_rate": 0.0005647499999999999, + "loss": 1.8685, "step": 2254 }, { "epoch": 3.608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6781668663024902, + "learning_rate": 0.0005646, + "loss": 1.8469, "step": 2255 }, { "epoch": 3.6096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7052363157272339, + "learning_rate": 0.00056445, + "loss": 1.9456, "step": 2256 }, { "epoch": 3.6112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.590246558189392, + "learning_rate": 0.0005643, + "loss": 2.0146, "step": 2257 }, { "epoch": 3.6128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8339739441871643, + "learning_rate": 0.0005641499999999999, + "loss": 1.8619, "step": 2258 }, { "epoch": 3.6144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.960278272628784, + "learning_rate": 0.0005639999999999999, + "loss": 1.8805, "step": 2259 }, { "epoch": 3.616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9200167655944824, + "learning_rate": 0.00056385, + "loss": 1.7948, "step": 2260 }, { "epoch": 3.6176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0074909925460815, + "learning_rate": 0.0005637, + "loss": 1.8153, "step": 2261 }, { "epoch": 3.6192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8568616509437561, + "learning_rate": 0.00056355, + "loss": 2.0004, "step": 2262 }, { "epoch": 3.6208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.956713080406189, + "learning_rate": 0.0005633999999999999, + "loss": 2.0911, "step": 2263 }, { "epoch": 3.6224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9284929037094116, + "learning_rate": 0.00056325, + "loss": 1.8288, "step": 2264 }, { "epoch": 3.624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.894782543182373, + "learning_rate": 0.0005631, + "loss": 1.9543, "step": 2265 }, { "epoch": 3.6256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8398752212524414, + "learning_rate": 0.00056295, + "loss": 2.1485, "step": 2266 }, { "epoch": 3.6272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.454357147216797, + "learning_rate": 0.0005627999999999999, + "loss": 2.0567, "step": 2267 }, { "epoch": 3.6288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.641742467880249, + "learning_rate": 0.0005626499999999999, + "loss": 2.2341, "step": 2268 }, { "epoch": 3.6304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7430512309074402, + "learning_rate": 0.0005625, + "loss": 2.0746, "step": 2269 }, { "epoch": 3.632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.865148663520813, + "learning_rate": 0.00056235, + "loss": 2.1452, "step": 2270 }, { "epoch": 3.6336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5117383003234863, + "learning_rate": 0.0005622, + "loss": 1.9833, "step": 2271 }, { "epoch": 3.6352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9610617756843567, + "learning_rate": 0.0005620499999999999, + "loss": 2.2757, "step": 2272 }, { "epoch": 3.6368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3695871829986572, + "learning_rate": 0.0005618999999999999, + "loss": 2.0467, "step": 2273 }, { "epoch": 3.6384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8381495475769043, + "learning_rate": 0.00056175, + "loss": 2.2694, "step": 2274 }, { "epoch": 3.64, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5025103092193604, + "learning_rate": 0.0005616, + "loss": 2.7829, "step": 2275 }, { "epoch": 3.6416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.791909217834473, + "learning_rate": 0.0005614499999999999, + "loss": 2.5232, "step": 2276 }, { "epoch": 3.6432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.33229660987854, + "learning_rate": 0.0005612999999999999, + "loss": 1.9347, "step": 2277 }, { "epoch": 3.6448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.1871113777160645, + "learning_rate": 0.00056115, + "loss": 2.4298, "step": 2278 }, { "epoch": 3.6464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.01583194732666, + "learning_rate": 0.000561, + "loss": 2.3458, "step": 2279 }, { "epoch": 3.648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6337729692459106, + "learning_rate": 0.0005608499999999999, + "loss": 2.2367, "step": 2280 }, { "epoch": 3.6496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.28971004486084, + "learning_rate": 0.0005606999999999999, + "loss": 2.3382, "step": 2281 }, { "epoch": 3.6512000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2333521842956543, + "learning_rate": 0.0005605499999999999, + "loss": 2.0587, "step": 2282 }, { "epoch": 3.6528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1958487033843994, + "learning_rate": 0.0005604, + "loss": 1.8286, "step": 2283 }, { "epoch": 3.6544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8140027523040771, + "learning_rate": 0.00056025, + "loss": 1.9269, "step": 2284 }, { "epoch": 3.656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.204829216003418, + "learning_rate": 0.0005600999999999999, + "loss": 1.7265, "step": 2285 }, { "epoch": 3.6576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2901850938796997, + "learning_rate": 0.0005599499999999999, + "loss": 2.0729, "step": 2286 }, { "epoch": 3.6592000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9441965818405151, + "learning_rate": 0.0005598, + "loss": 1.7402, "step": 2287 }, { "epoch": 3.6608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9471627473831177, + "learning_rate": 0.00055965, + "loss": 1.9046, "step": 2288 }, { "epoch": 3.6624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4211928844451904, + "learning_rate": 0.0005595, + "loss": 1.7967, "step": 2289 }, { "epoch": 3.664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5628670454025269, + "learning_rate": 0.00055935, + "loss": 1.5733, "step": 2290 }, { "epoch": 3.6656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8597891330718994, + "learning_rate": 0.0005591999999999999, + "loss": 1.8503, "step": 2291 }, { "epoch": 3.6672000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4017963409423828, + "learning_rate": 0.00055905, + "loss": 1.8248, "step": 2292 }, { "epoch": 3.6688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8284274339675903, + "learning_rate": 0.0005589, + "loss": 1.9808, "step": 2293 }, { "epoch": 3.6704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6716489791870117, + "learning_rate": 0.00055875, + "loss": 1.756, "step": 2294 }, { "epoch": 3.672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6753265857696533, + "learning_rate": 0.0005586, + "loss": 1.9613, "step": 2295 }, { "epoch": 3.6736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1374127864837646, + "learning_rate": 0.0005584499999999999, + "loss": 1.699, "step": 2296 }, { "epoch": 3.6752000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8114046454429626, + "learning_rate": 0.0005583, + "loss": 1.8101, "step": 2297 }, { "epoch": 3.6768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2519382238388062, + "learning_rate": 0.00055815, + "loss": 1.9894, "step": 2298 }, { "epoch": 3.6784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3280014991760254, + "learning_rate": 0.000558, + "loss": 1.6685, "step": 2299 }, { "epoch": 3.68, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2300 - }, - { - "epoch": 3.68, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 158.3126, - "eval_samples_per_second": 19.809, - "eval_steps_per_second": 1.238, - "eval_wer": 1.0, + "grad_norm": 0.5929081439971924, + "learning_rate": 0.0005578499999999999, + "loss": 1.8905, "step": 2300 }, { "epoch": 3.6816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.48221904039382935, + "learning_rate": 0.0005577, + "loss": 1.9267, "step": 2301 }, { "epoch": 3.6832000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7531263828277588, + "learning_rate": 0.00055755, + "loss": 1.6909, "step": 2302 }, { "epoch": 3.6848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1919063329696655, + "learning_rate": 0.0005574, + "loss": 1.6607, "step": 2303 }, { "epoch": 3.6864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6339486837387085, + "learning_rate": 0.0005572499999999999, + "loss": 2.0789, "step": 2304 }, { "epoch": 3.6879999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7811375260353088, + "learning_rate": 0.0005570999999999999, + "loss": 2.0737, "step": 2305 }, { "epoch": 3.6896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5296683311462402, + "learning_rate": 0.00055695, + "loss": 2.0396, "step": 2306 }, { "epoch": 3.6912000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6752074360847473, + "learning_rate": 0.0005568, + "loss": 1.8817, "step": 2307 }, { "epoch": 3.6928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5748279094696045, + "learning_rate": 0.0005566499999999999, + "loss": 1.7463, "step": 2308 }, { "epoch": 3.6944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6146976947784424, + "learning_rate": 0.0005564999999999999, + "loss": 1.7846, "step": 2309 }, { "epoch": 3.6959999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5607794523239136, + "learning_rate": 0.0005563499999999999, + "loss": 2.0738, "step": 2310 }, { "epoch": 3.6976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7432671785354614, + "learning_rate": 0.0005562, + "loss": 1.8349, "step": 2311 }, { "epoch": 3.6992000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9773478507995605, + "learning_rate": 0.0005560499999999999, + "loss": 2.1008, "step": 2312 }, { "epoch": 3.7008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4571824073791504, + "learning_rate": 0.0005558999999999999, + "loss": 2.1606, "step": 2313 }, { "epoch": 3.7024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5633294582366943, + "learning_rate": 0.0005557499999999999, + "loss": 1.7593, "step": 2314 }, { "epoch": 3.7039999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5776718258857727, + "learning_rate": 0.0005556, + "loss": 1.8431, "step": 2315 }, { "epoch": 3.7056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6635186672210693, + "learning_rate": 0.00055545, + "loss": 2.0499, "step": 2316 }, { "epoch": 3.7072000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.93550968170166, + "learning_rate": 0.0005552999999999999, + "loss": 2.1281, "step": 2317 }, { "epoch": 3.7088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0490763187408447, + "learning_rate": 0.0005551499999999999, + "loss": 1.8437, "step": 2318 }, { "epoch": 3.7104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8951698541641235, + "learning_rate": 0.0005549999999999999, + "loss": 2.1411, "step": 2319 }, { "epoch": 3.7119999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.894776463508606, + "learning_rate": 0.00055485, + "loss": 2.3416, "step": 2320 }, { "epoch": 3.7136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8387718796730042, + "learning_rate": 0.0005547, + "loss": 2.3502, "step": 2321 }, { "epoch": 3.7152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7882157564163208, + "learning_rate": 0.00055455, + "loss": 2.3886, "step": 2322 }, { "epoch": 3.7168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4860434532165527, + "learning_rate": 0.0005544, + "loss": 1.9161, "step": 2323 }, { "epoch": 3.7184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8020563125610352, + "learning_rate": 0.00055425, + "loss": 1.8318, "step": 2324 }, { "epoch": 3.7199999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.03687047958374, + "learning_rate": 0.0005541, + "loss": 2.7315, "step": 2325 }, { "epoch": 3.7216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.88319993019104, + "learning_rate": 0.00055395, + "loss": 2.3845, "step": 2326 }, { "epoch": 3.7232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.421705722808838, + "learning_rate": 0.0005538, + "loss": 2.0725, "step": 2327 }, { "epoch": 3.7248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.533794403076172, + "learning_rate": 0.0005536499999999999, + "loss": 2.5097, "step": 2328 }, { "epoch": 3.7264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7621231079101562, + "learning_rate": 0.0005535, + "loss": 1.9029, "step": 2329 }, { "epoch": 3.7279999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.015029430389404, + "learning_rate": 0.00055335, + "loss": 2.3865, "step": 2330 }, { "epoch": 3.7296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4987126588821411, + "learning_rate": 0.0005532, + "loss": 2.0924, "step": 2331 }, { "epoch": 3.7312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7394864559173584, + "learning_rate": 0.0005530499999999999, + "loss": 1.9813, "step": 2332 }, { "epoch": 3.7328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7396926879882812, + "learning_rate": 0.0005528999999999999, + "loss": 2.0473, "step": 2333 }, { "epoch": 3.7344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7807384133338928, + "learning_rate": 0.00055275, + "loss": 1.8939, "step": 2334 }, { "epoch": 3.7359999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.359540939331055, + "learning_rate": 0.0005526, + "loss": 2.2214, "step": 2335 }, { "epoch": 3.7376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0581278800964355, + "learning_rate": 0.0005524499999999999, + "loss": 1.8789, "step": 2336 }, { "epoch": 3.7392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1891355514526367, + "learning_rate": 0.0005522999999999999, + "loss": 1.8932, "step": 2337 }, { "epoch": 3.7408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2215185165405273, + "learning_rate": 0.00055215, + "loss": 2.0718, "step": 2338 }, { "epoch": 3.7424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1517174243927, + "learning_rate": 0.000552, + "loss": 1.9519, "step": 2339 }, { "epoch": 3.7439999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9815576076507568, + "learning_rate": 0.0005518499999999999, + "loss": 2.1945, "step": 2340 }, { "epoch": 3.7456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4585610628128052, + "learning_rate": 0.0005516999999999999, + "loss": 1.7822, "step": 2341 }, { "epoch": 3.7472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6124539971351624, + "learning_rate": 0.0005515499999999999, + "loss": 1.8715, "step": 2342 }, { "epoch": 3.7488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5956341624259949, + "learning_rate": 0.0005514, + "loss": 1.7431, "step": 2343 }, { "epoch": 3.7504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5568941831588745, + "learning_rate": 0.0005512499999999999, + "loss": 1.6602, "step": 2344 }, { "epoch": 3.752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7756979465484619, + "learning_rate": 0.0005510999999999999, + "loss": 1.7601, "step": 2345 }, { "epoch": 3.7536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4702774286270142, + "learning_rate": 0.0005509499999999999, + "loss": 1.7106, "step": 2346 }, { "epoch": 3.7552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.420515239238739, + "learning_rate": 0.0005507999999999999, + "loss": 1.8738, "step": 2347 }, { "epoch": 3.7568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6160723567008972, + "learning_rate": 0.00055065, + "loss": 1.7276, "step": 2348 }, { "epoch": 3.7584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9224544167518616, + "learning_rate": 0.0005505, + "loss": 1.9053, "step": 2349 }, { "epoch": 3.76, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9273090362548828, + "learning_rate": 0.0005503499999999999, + "loss": 2.0036, "step": 2350 }, { "epoch": 3.7616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6562850475311279, + "learning_rate": 0.0005501999999999999, + "loss": 1.875, "step": 2351 }, { "epoch": 3.7632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0540215969085693, + "learning_rate": 0.00055005, + "loss": 1.6405, "step": 2352 }, { "epoch": 3.7648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9246771335601807, + "learning_rate": 0.0005499, + "loss": 1.6913, "step": 2353 }, { "epoch": 3.7664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6716334819793701, + "learning_rate": 0.00054975, + "loss": 1.7426, "step": 2354 }, { "epoch": 3.768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.676240861415863, + "learning_rate": 0.0005496, + "loss": 1.9785, "step": 2355 }, { "epoch": 3.7696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9691559076309204, + "learning_rate": 0.0005494499999999999, + "loss": 1.7538, "step": 2356 }, { "epoch": 3.7712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.520119309425354, + "learning_rate": 0.0005493, + "loss": 1.8912, "step": 2357 }, { "epoch": 3.7728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9442250728607178, + "learning_rate": 0.00054915, + "loss": 2.0052, "step": 2358 }, { "epoch": 3.7744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3105292320251465, + "learning_rate": 0.000549, + "loss": 2.0257, "step": 2359 }, { "epoch": 3.776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8196446895599365, + "learning_rate": 0.0005488499999999999, + "loss": 1.8295, "step": 2360 }, { "epoch": 3.7776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.038273334503174, + "learning_rate": 0.0005487, + "loss": 1.7075, "step": 2361 }, { "epoch": 3.7792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.246741771697998, + "learning_rate": 0.00054855, + "loss": 2.0361, "step": 2362 }, { "epoch": 3.7808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7947719693183899, + "learning_rate": 0.0005484, + "loss": 2.0181, "step": 2363 }, { "epoch": 3.7824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7120828032493591, + "learning_rate": 0.0005482499999999999, + "loss": 1.6842, "step": 2364 }, { "epoch": 3.784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5462130308151245, + "learning_rate": 0.0005480999999999999, + "loss": 1.9249, "step": 2365 }, { "epoch": 3.7856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.064439296722412, + "learning_rate": 0.00054795, + "loss": 2.1631, "step": 2366 }, { "epoch": 3.7872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4512231349945068, + "learning_rate": 0.0005478, + "loss": 1.8168, "step": 2367 }, { "epoch": 3.7888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4550942182540894, + "learning_rate": 0.0005476499999999999, + "loss": 1.8208, "step": 2368 }, { "epoch": 3.7904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.692131280899048, + "learning_rate": 0.0005474999999999999, + "loss": 2.1885, "step": 2369 }, { "epoch": 3.792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1041882038116455, + "learning_rate": 0.0005473499999999999, + "loss": 2.4662, "step": 2370 }, { "epoch": 3.7936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.566340684890747, + "learning_rate": 0.0005472, + "loss": 2.2271, "step": 2371 }, { "epoch": 3.7952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1705924272537231, + "learning_rate": 0.0005470499999999999, + "loss": 2.176, "step": 2372 }, { "epoch": 3.7968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.118368148803711, + "learning_rate": 0.0005468999999999999, + "loss": 1.7705, "step": 2373 }, { "epoch": 3.7984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.025905132293701, + "learning_rate": 0.0005467499999999999, + "loss": 2.2127, "step": 2374 }, { "epoch": 3.8, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.0005467499999999999, + "loss": 2.5841, "step": 2375 }, { "epoch": 3.8016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.623035430908203, + "learning_rate": 0.0005466, + "loss": 2.6382, "step": 2376 }, { "epoch": 3.8032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.104719161987305, + "learning_rate": 0.00054645, + "loss": 2.9251, "step": 2377 }, { "epoch": 3.8048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7435858845710754, + "learning_rate": 0.0005462999999999999, + "loss": 2.1993, "step": 2378 }, { "epoch": 3.8064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5204355716705322, + "learning_rate": 0.0005461499999999999, + "loss": 2.5489, "step": 2379 }, { "epoch": 3.808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5795775651931763, + "learning_rate": 0.0005459999999999999, + "loss": 2.2741, "step": 2380 }, { "epoch": 3.8096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9056549072265625, + "learning_rate": 0.00054585, + "loss": 1.968, "step": 2381 }, { "epoch": 3.8112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2619552612304688, + "learning_rate": 0.0005457, + "loss": 2.1325, "step": 2382 }, { "epoch": 3.8128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6153170466423035, + "learning_rate": 0.00054555, + "loss": 2.2253, "step": 2383 }, { "epoch": 3.8144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7306612730026245, + "learning_rate": 0.0005453999999999999, + "loss": 2.5022, "step": 2384 }, { "epoch": 3.816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6599177122116089, + "learning_rate": 0.0005452499999999999, + "loss": 1.8768, "step": 2385 }, { "epoch": 3.8176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.967004120349884, + "learning_rate": 0.0005451, + "loss": 2.0821, "step": 2386 }, { "epoch": 3.8192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.32717764377594, + "learning_rate": 0.00054495, + "loss": 1.6733, "step": 2387 }, { "epoch": 3.8208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4059408903121948, + "learning_rate": 0.0005448, + "loss": 1.9364, "step": 2388 }, { "epoch": 3.8224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1379108428955078, + "learning_rate": 0.0005446499999999999, + "loss": 1.742, "step": 2389 }, { "epoch": 3.824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.164423942565918, + "learning_rate": 0.0005445, + "loss": 1.9497, "step": 2390 }, { "epoch": 3.8256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.832740843296051, + "learning_rate": 0.00054435, + "loss": 1.7181, "step": 2391 }, { "epoch": 3.8272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2295310497283936, + "learning_rate": 0.0005442, + "loss": 1.8497, "step": 2392 }, { "epoch": 3.8288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0332822799682617, + "learning_rate": 0.0005440499999999999, + "loss": 1.8861, "step": 2393 }, { "epoch": 3.8304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0031321048736572, + "learning_rate": 0.0005438999999999999, + "loss": 1.7992, "step": 2394 }, { "epoch": 3.832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8428364396095276, + "learning_rate": 0.00054375, + "loss": 1.8351, "step": 2395 }, { "epoch": 3.8336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3388116359710693, + "learning_rate": 0.0005436, + "loss": 1.5866, "step": 2396 }, { "epoch": 3.8352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.911939859390259, + "learning_rate": 0.00054345, + "loss": 1.8575, "step": 2397 }, { "epoch": 3.8368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5401694178581238, + "learning_rate": 0.0005432999999999999, + "loss": 1.8289, "step": 2398 }, { "epoch": 3.8384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1643283367156982, + "learning_rate": 0.00054315, + "loss": 1.8512, "step": 2399 }, { "epoch": 3.84, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2400 - }, - { - "epoch": 3.84, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 158.5906, - "eval_samples_per_second": 19.774, - "eval_steps_per_second": 1.236, - "eval_wer": 1.0, + "grad_norm": 0.6926487684249878, + "learning_rate": 0.000543, + "loss": 1.762, "step": 2400 }, { "epoch": 3.8416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6354236006736755, + "learning_rate": 0.00054285, + "loss": 1.9255, "step": 2401 }, { "epoch": 3.8432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6662388443946838, + "learning_rate": 0.0005426999999999999, + "loss": 1.777, "step": 2402 }, { "epoch": 3.8448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7354152798652649, + "learning_rate": 0.0005425499999999999, + "loss": 1.7863, "step": 2403 }, { "epoch": 3.8464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2617355585098267, + "learning_rate": 0.0005424, + "loss": 1.6884, "step": 2404 }, { "epoch": 3.848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6013496518135071, + "learning_rate": 0.00054225, + "loss": 1.8549, "step": 2405 }, { "epoch": 3.8496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9593853950500488, + "learning_rate": 0.0005420999999999999, + "loss": 1.9073, "step": 2406 }, { "epoch": 3.8512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4911746978759766, + "learning_rate": 0.0005419499999999999, + "loss": 1.8498, "step": 2407 }, { "epoch": 3.8528000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5293194055557251, + "learning_rate": 0.0005417999999999999, + "loss": 2.0972, "step": 2408 }, { "epoch": 3.8544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1277806758880615, + "learning_rate": 0.00054165, + "loss": 1.5233, "step": 2409 }, { "epoch": 3.856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8829110264778137, + "learning_rate": 0.0005414999999999999, + "loss": 1.8209, "step": 2410 }, { "epoch": 3.8576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.754530668258667, + "learning_rate": 0.0005413499999999999, + "loss": 1.8229, "step": 2411 }, { "epoch": 3.8592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7509057521820068, + "learning_rate": 0.0005411999999999999, + "loss": 1.6897, "step": 2412 }, { "epoch": 3.8608000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7444249391555786, + "learning_rate": 0.00054105, + "loss": 1.761, "step": 2413 }, { "epoch": 3.8624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0965861082077026, + "learning_rate": 0.0005409, + "loss": 1.8965, "step": 2414 }, { "epoch": 3.864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9176476001739502, + "learning_rate": 0.00054075, + "loss": 1.7672, "step": 2415 }, { "epoch": 3.8656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6273837089538574, + "learning_rate": 0.0005405999999999999, + "loss": 2.1376, "step": 2416 }, { "epoch": 3.8672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.04819917678833, + "learning_rate": 0.0005404499999999999, + "loss": 1.6846, "step": 2417 }, { "epoch": 3.8688000000000002, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.0005404499999999999, + "loss": 0.9789, "step": 2418 }, { "epoch": 3.8704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5052590370178223, + "learning_rate": 0.0005403, + "loss": 2.0238, "step": 2419 }, { "epoch": 3.872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7614632248878479, + "learning_rate": 0.00054015, + "loss": 1.9592, "step": 2420 }, { "epoch": 3.8736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.096020460128784, + "learning_rate": 0.00054, + "loss": 2.0654, "step": 2421 }, { "epoch": 3.8752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6296525001525879, + "learning_rate": 0.00053985, + "loss": 1.896, "step": 2422 }, { "epoch": 3.8768000000000002, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4501912593841553, + "learning_rate": 0.0005396999999999999, + "loss": 2.1727, "step": 2423 }, { "epoch": 3.8784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7454720139503479, + "learning_rate": 0.00053955, + "loss": 2.0744, "step": 2424 }, { "epoch": 3.88, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1214377880096436, + "learning_rate": 0.0005394, + "loss": 2.232, "step": 2425 }, { "epoch": 3.8816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 25.60105323791504, + "learning_rate": 0.00053925, + "loss": 3.8323, "step": 2426 }, { "epoch": 3.8832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.382150888442993, + "learning_rate": 0.0005390999999999999, + "loss": 2.3454, "step": 2427 }, { "epoch": 3.8848000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5768654346466064, + "learning_rate": 0.00053895, + "loss": 2.6715, "step": 2428 }, { "epoch": 3.8864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9383482933044434, + "learning_rate": 0.0005388, + "loss": 2.5674, "step": 2429 }, { "epoch": 3.888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8392831087112427, + "learning_rate": 0.00053865, + "loss": 2.3529, "step": 2430 }, { "epoch": 3.8895999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.674063682556152, + "learning_rate": 0.0005384999999999999, + "loss": 2.0441, "step": 2431 }, { "epoch": 3.8912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.914694786071777, + "learning_rate": 0.0005383499999999999, + "loss": 2.2088, "step": 2432 }, { "epoch": 3.8928000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.900940418243408, + "learning_rate": 0.0005382, + "loss": 2.1332, "step": 2433 }, { "epoch": 3.8944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.249759674072266, + "learning_rate": 0.00053805, + "loss": 2.1121, "step": 2434 }, { "epoch": 3.896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5094621181488037, + "learning_rate": 0.0005378999999999999, + "loss": 1.928, "step": 2435 }, { "epoch": 3.8975999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.613595962524414, + "learning_rate": 0.0005377499999999999, + "loss": 2.1583, "step": 2436 }, { "epoch": 3.8992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.618670463562012, + "learning_rate": 0.0005376, + "loss": 2.1516, "step": 2437 }, { "epoch": 3.9008000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.50870943069458, + "learning_rate": 0.00053745, + "loss": 2.1445, "step": 2438 }, { "epoch": 3.9024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8594318628311157, + "learning_rate": 0.0005372999999999999, + "loss": 1.9006, "step": 2439 }, { "epoch": 3.904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5116372108459473, + "learning_rate": 0.0005371499999999999, + "loss": 1.8591, "step": 2440 }, { "epoch": 3.9055999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8958240747451782, + "learning_rate": 0.0005369999999999999, + "loss": 1.736, "step": 2441 }, { "epoch": 3.9072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5704692602157593, + "learning_rate": 0.00053685, + "loss": 1.7145, "step": 2442 }, { "epoch": 3.9088000000000003, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9452965259552002, + "learning_rate": 0.0005367, + "loss": 1.63, "step": 2443 }, { "epoch": 3.9104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.6801676750183105, + "learning_rate": 0.0005365499999999999, + "loss": 1.8781, "step": 2444 }, { "epoch": 3.912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4948861598968506, + "learning_rate": 0.0005363999999999999, + "loss": 1.8972, "step": 2445 }, { "epoch": 3.9135999999999997, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4566302299499512, + "learning_rate": 0.0005362499999999999, + "loss": 1.9978, "step": 2446 }, { "epoch": 3.9152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9597867131233215, + "learning_rate": 0.0005361, + "loss": 1.8151, "step": 2447 }, { "epoch": 3.9168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.585303544998169, + "learning_rate": 0.00053595, + "loss": 1.8052, "step": 2448 }, { "epoch": 3.9184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8317283391952515, + "learning_rate": 0.0005358, + "loss": 2.0335, "step": 2449 }, { "epoch": 3.92, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6935200691223145, + "learning_rate": 0.0005356499999999999, + "loss": 1.8919, "step": 2450 }, { "epoch": 3.9215999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9726048111915588, + "learning_rate": 0.0005355, + "loss": 1.7266, "step": 2451 }, { "epoch": 3.9232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2094123363494873, + "learning_rate": 0.00053535, + "loss": 1.815, "step": 2452 }, { "epoch": 3.9248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.217449426651001, + "learning_rate": 0.0005352, + "loss": 1.9153, "step": 2453 }, { "epoch": 3.9264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6052618622779846, + "learning_rate": 0.00053505, + "loss": 1.8094, "step": 2454 }, { "epoch": 3.928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7380341291427612, + "learning_rate": 0.0005348999999999999, + "loss": 1.7991, "step": 2455 }, { "epoch": 3.9295999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1621791124343872, + "learning_rate": 0.00053475, + "loss": 1.6631, "step": 2456 }, { "epoch": 3.9312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3532447814941406, + "learning_rate": 0.0005346, + "loss": 2.0495, "step": 2457 }, { "epoch": 3.9328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9457495808601379, + "learning_rate": 0.00053445, + "loss": 1.7621, "step": 2458 }, { "epoch": 3.9344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.45427045226097107, + "learning_rate": 0.0005342999999999999, + "loss": 1.7291, "step": 2459 }, { "epoch": 3.936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4032189846038818, + "learning_rate": 0.00053415, + "loss": 2.1577, "step": 2460 }, { "epoch": 3.9375999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6300065517425537, + "learning_rate": 0.000534, + "loss": 1.7902, "step": 2461 }, { "epoch": 3.9392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.3903911113739014, + "learning_rate": 0.00053385, + "loss": 1.8668, "step": 2462 }, { "epoch": 3.9408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4470239877700806, + "learning_rate": 0.0005336999999999999, + "loss": 1.9876, "step": 2463 }, { "epoch": 3.9424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.315370798110962, + "learning_rate": 0.0005335499999999999, + "loss": 1.9871, "step": 2464 }, { "epoch": 3.944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4395367503166199, + "learning_rate": 0.0005334, + "loss": 1.9859, "step": 2465 }, { "epoch": 3.9455999999999998, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0911147594451904, + "learning_rate": 0.00053325, + "loss": 1.6294, "step": 2466 }, { "epoch": 3.9472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0740833282470703, + "learning_rate": 0.0005330999999999999, + "loss": 1.7948, "step": 2467 }, { "epoch": 3.9488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.919568657875061, + "learning_rate": 0.0005329499999999999, + "loss": 1.8349, "step": 2468 }, { "epoch": 3.9504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3979318141937256, + "learning_rate": 0.0005327999999999999, + "loss": 1.6861, "step": 2469 }, { "epoch": 3.952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6491036415100098, + "learning_rate": 0.00053265, + "loss": 1.9822, "step": 2470 }, { "epoch": 3.9536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.751991868019104, + "learning_rate": 0.0005324999999999999, + "loss": 2.215, "step": 2471 }, { "epoch": 3.9552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5577988624572754, + "learning_rate": 0.0005323499999999999, + "loss": 2.007, "step": 2472 }, { "epoch": 3.9568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3308780193328857, + "learning_rate": 0.0005321999999999999, + "loss": 2.1599, "step": 2473 }, { "epoch": 3.9584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2450164556503296, + "learning_rate": 0.00053205, + "loss": 2.0319, "step": 2474 }, { "epoch": 3.96, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.967799186706543, + "learning_rate": 0.0005319, + "loss": 2.9772, "step": 2475 }, { "epoch": 3.9616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.091135263442993, + "learning_rate": 0.00053175, + "loss": 2.1827, "step": 2476 }, { "epoch": 3.9632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1590499877929688, + "learning_rate": 0.0005315999999999999, + "loss": 2.5292, "step": 2477 }, { "epoch": 3.9648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2901257276535034, + "learning_rate": 0.0005314499999999999, + "loss": 1.9916, "step": 2478 }, { "epoch": 3.9664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5774224400520325, + "learning_rate": 0.0005313, + "loss": 1.9182, "step": 2479 }, { "epoch": 3.968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7556675672531128, + "learning_rate": 0.00053115, + "loss": 1.7674, "step": 2480 }, { "epoch": 3.9696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9686545133590698, + "learning_rate": 0.000531, + "loss": 1.8127, "step": 2481 }, { "epoch": 3.9712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6765801906585693, + "learning_rate": 0.00053085, + "loss": 1.8753, "step": 2482 }, { "epoch": 3.9728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5046657919883728, + "learning_rate": 0.0005306999999999999, + "loss": 1.7438, "step": 2483 }, { "epoch": 3.9744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2890503406524658, + "learning_rate": 0.00053055, + "loss": 1.7384, "step": 2484 }, { "epoch": 3.976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7676361799240112, + "learning_rate": 0.0005304, + "loss": 1.7624, "step": 2485 }, { "epoch": 3.9776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7966771125793457, + "learning_rate": 0.00053025, + "loss": 1.6265, "step": 2486 }, { "epoch": 3.9792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9140196442604065, + "learning_rate": 0.0005300999999999999, + "loss": 1.8388, "step": 2487 }, { "epoch": 3.9808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5415698289871216, + "learning_rate": 0.00052995, + "loss": 2.0654, "step": 2488 }, { "epoch": 3.9824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7051250338554382, + "learning_rate": 0.0005298, + "loss": 1.6277, "step": 2489 }, { "epoch": 3.984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6715264916419983, + "learning_rate": 0.00052965, + "loss": 1.9315, "step": 2490 }, { "epoch": 3.9856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7733835577964783, + "learning_rate": 0.0005294999999999999, + "loss": 1.9635, "step": 2491 }, { "epoch": 3.9872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0471179485321045, + "learning_rate": 0.0005293499999999999, + "loss": 2.0734, "step": 2492 }, { "epoch": 3.9888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.750136137008667, + "learning_rate": 0.0005292, + "loss": 1.7293, "step": 2493 }, { "epoch": 3.9904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5307488441467285, + "learning_rate": 0.00052905, + "loss": 1.8903, "step": 2494 }, { "epoch": 3.992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4690338969230652, + "learning_rate": 0.0005288999999999999, + "loss": 1.8027, "step": 2495 }, { "epoch": 3.9936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9644952416419983, + "learning_rate": 0.0005287499999999999, + "loss": 1.7084, "step": 2496 }, { "epoch": 3.9952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2549068927764893, + "learning_rate": 0.0005286, + "loss": 1.8686, "step": 2497 }, { "epoch": 3.9968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7961026430130005, + "learning_rate": 0.00052845, + "loss": 1.9095, "step": 2498 }, { "epoch": 3.9984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.194058418273926, + "learning_rate": 0.0005282999999999999, + "loss": 2.1499, "step": 2499 }, { "epoch": 4.0, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2500 - }, - { - "epoch": 4.0, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 158.647, - "eval_samples_per_second": 19.767, - "eval_steps_per_second": 1.235, - "eval_wer": 1.0, + "learning_rate": 0.0005282999999999999, + "loss": 2.388, "step": 2500 }, { "epoch": 4.0016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.620855450630188, + "learning_rate": 0.0005281499999999999, + "loss": 2.3597, "step": 2501 }, { "epoch": 4.0032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7790733575820923, + "learning_rate": 0.0005279999999999999, + "loss": 2.3832, "step": 2502 }, { "epoch": 4.0048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5202178955078125, + "learning_rate": 0.00052785, + "loss": 1.8287, "step": 2503 }, { "epoch": 4.0064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0778002738952637, + "learning_rate": 0.0005276999999999999, + "loss": 2.0969, "step": 2504 }, { "epoch": 4.008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.827845335006714, + "learning_rate": 0.0005275499999999999, + "loss": 1.8303, "step": 2505 }, { "epoch": 4.0096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7472437620162964, + "learning_rate": 0.0005273999999999999, + "loss": 2.1248, "step": 2506 }, { "epoch": 4.0112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.076861381530762, + "learning_rate": 0.0005272499999999999, + "loss": 2.0531, "step": 2507 }, { "epoch": 4.0128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7650519013404846, + "learning_rate": 0.0005271, + "loss": 2.0667, "step": 2508 }, { "epoch": 4.0144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6342208385467529, + "learning_rate": 0.00052695, + "loss": 2.0258, "step": 2509 }, { "epoch": 4.016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6137911081314087, + "learning_rate": 0.0005267999999999999, + "loss": 2.0024, "step": 2510 }, { "epoch": 4.0176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.357777714729309, + "learning_rate": 0.0005266499999999999, + "loss": 1.7083, "step": 2511 }, { "epoch": 4.0192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2996630668640137, + "learning_rate": 0.0005265, + "loss": 1.8742, "step": 2512 }, { "epoch": 4.0208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.800222396850586, + "learning_rate": 0.00052635, + "loss": 1.8311, "step": 2513 }, { "epoch": 4.0224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6833199262619019, + "learning_rate": 0.0005262, + "loss": 1.8099, "step": 2514 }, { "epoch": 4.024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0427937507629395, + "learning_rate": 0.00052605, + "loss": 1.673, "step": 2515 }, { "epoch": 4.0256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4635120630264282, + "learning_rate": 0.0005258999999999999, + "loss": 1.7433, "step": 2516 }, { "epoch": 4.0272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7591537833213806, + "learning_rate": 0.00052575, + "loss": 1.9307, "step": 2517 }, { "epoch": 4.0288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.746607780456543, + "learning_rate": 0.0005256, + "loss": 2.116, "step": 2518 }, { "epoch": 4.0304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2699731588363647, + "learning_rate": 0.00052545, + "loss": 1.7969, "step": 2519 }, { "epoch": 4.032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5821665525436401, + "learning_rate": 0.0005252999999999999, + "loss": 1.8846, "step": 2520 }, { "epoch": 4.0336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0592167377471924, + "learning_rate": 0.0005251499999999999, + "loss": 1.7655, "step": 2521 }, { "epoch": 4.0352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6359199285507202, + "learning_rate": 0.000525, + "loss": 1.8698, "step": 2522 }, { "epoch": 4.0368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7579365968704224, + "learning_rate": 0.00052485, + "loss": 1.5655, "step": 2523 }, { "epoch": 4.0384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8781223297119141, + "learning_rate": 0.0005247, + "loss": 1.4955, "step": 2524 }, { "epoch": 4.04, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2994422912597656, + "learning_rate": 0.0005245499999999999, + "loss": 1.8925, "step": 2525 }, { "epoch": 4.0416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.251488447189331, + "learning_rate": 0.0005244, + "loss": 1.8904, "step": 2526 }, { "epoch": 4.0432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7512491941452026, + "learning_rate": 0.00052425, + "loss": 1.8389, "step": 2527 }, { "epoch": 4.0448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4011669158935547, + "learning_rate": 0.0005241, + "loss": 1.4594, "step": 2528 }, { "epoch": 4.0464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0186192989349365, + "learning_rate": 0.0005239499999999999, + "loss": 1.5813, "step": 2529 }, { "epoch": 4.048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7245293259620667, + "learning_rate": 0.0005237999999999999, + "loss": 1.8401, "step": 2530 }, { "epoch": 4.0496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.343597412109375, + "learning_rate": 0.00052365, + "loss": 2.2162, "step": 2531 }, { "epoch": 4.0512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9441453814506531, + "learning_rate": 0.0005235, + "loss": 1.87, "step": 2532 }, { "epoch": 4.0528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8596580624580383, + "learning_rate": 0.0005233499999999999, + "loss": 2.0299, "step": 2533 }, { "epoch": 4.0544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0425570011138916, + "learning_rate": 0.0005231999999999999, + "loss": 1.7958, "step": 2534 }, { "epoch": 4.056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5851573944091797, + "learning_rate": 0.00052305, + "loss": 2.0469, "step": 2535 }, { "epoch": 4.0576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8094300031661987, + "learning_rate": 0.0005229, + "loss": 1.9261, "step": 2536 }, { "epoch": 4.0592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7694682478904724, + "learning_rate": 0.0005227499999999999, + "loss": 1.8697, "step": 2537 }, { "epoch": 4.0608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.58815598487854, + "learning_rate": 0.0005225999999999999, + "loss": 2.1828, "step": 2538 }, { "epoch": 4.0624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3614821434020996, + "learning_rate": 0.0005224499999999999, + "loss": 1.8252, "step": 2539 }, { "epoch": 4.064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.085479497909546, + "learning_rate": 0.0005223, + "loss": 1.7422, "step": 2540 }, { "epoch": 4.0656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0933291912078857, + "learning_rate": 0.00052215, + "loss": 1.8915, "step": 2541 }, { "epoch": 4.0672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1529048681259155, + "learning_rate": 0.000522, + "loss": 1.9153, "step": 2542 }, { "epoch": 4.0688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6861165165901184, + "learning_rate": 0.0005218499999999999, + "loss": 2.0807, "step": 2543 }, { "epoch": 4.0704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9590578079223633, + "learning_rate": 0.0005216999999999999, + "loss": 1.6799, "step": 2544 }, { "epoch": 4.072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5041584968566895, + "learning_rate": 0.00052155, + "loss": 1.7953, "step": 2545 }, { "epoch": 4.0736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7498306035995483, + "learning_rate": 0.0005214, + "loss": 2.0022, "step": 2546 }, { "epoch": 4.0752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0625548362731934, + "learning_rate": 0.00052125, + "loss": 1.9091, "step": 2547 }, { "epoch": 4.0768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7577870488166809, + "learning_rate": 0.0005211, + "loss": 1.9865, "step": 2548 }, { "epoch": 4.0784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.3451220989227295, + "learning_rate": 0.00052095, + "loss": 2.2004, "step": 2549 }, { "epoch": 4.08, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8755617141723633, + "learning_rate": 0.0005208, + "loss": 2.4375, "step": 2550 }, { "epoch": 4.0816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 15.953272819519043, + "learning_rate": 0.00052065, + "loss": 3.0587, "step": 2551 }, { "epoch": 4.0832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.910642147064209, + "learning_rate": 0.0005205, + "loss": 2.5175, "step": 2552 }, { "epoch": 4.0848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.549471378326416, + "learning_rate": 0.0005203499999999999, + "loss": 2.3515, "step": 2553 }, { "epoch": 4.0864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.504085063934326, + "learning_rate": 0.0005202, + "loss": 2.2878, "step": 2554 }, { "epoch": 4.088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.7265090942382812, + "learning_rate": 0.00052005, + "loss": 2.1208, "step": 2555 }, { "epoch": 4.0896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5239818096160889, + "learning_rate": 0.0005199, + "loss": 1.9764, "step": 2556 }, { "epoch": 4.0912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3443516492843628, + "learning_rate": 0.0005197499999999999, + "loss": 1.8913, "step": 2557 }, { "epoch": 4.0928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3654530048370361, + "learning_rate": 0.0005195999999999999, + "loss": 1.8563, "step": 2558 }, { "epoch": 4.0944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.616000652313232, + "learning_rate": 0.00051945, + "loss": 2.4474, "step": 2559 }, { "epoch": 4.096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2244317531585693, + "learning_rate": 0.0005193, + "loss": 2.0492, "step": 2560 }, { "epoch": 4.0976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2767560482025146, + "learning_rate": 0.0005191499999999999, + "loss": 1.9614, "step": 2561 }, { "epoch": 4.0992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6678037047386169, + "learning_rate": 0.0005189999999999999, + "loss": 1.9511, "step": 2562 }, { "epoch": 4.1008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.193560242652893, + "learning_rate": 0.00051885, + "loss": 1.6949, "step": 2563 }, { "epoch": 4.1024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6942158937454224, + "learning_rate": 0.0005187, + "loss": 1.6269, "step": 2564 }, { "epoch": 4.104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2404253482818604, + "learning_rate": 0.0005185499999999999, + "loss": 1.7762, "step": 2565 }, { "epoch": 4.1056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.07823646068573, + "learning_rate": 0.0005183999999999999, + "loss": 1.7197, "step": 2566 }, { "epoch": 4.1072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7477423548698425, + "learning_rate": 0.0005182499999999999, + "loss": 1.5912, "step": 2567 }, { "epoch": 4.1088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1995832920074463, + "learning_rate": 0.0005181, + "loss": 1.8297, "step": 2568 }, { "epoch": 4.1104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0321600437164307, + "learning_rate": 0.0005179499999999999, + "loss": 1.7202, "step": 2569 }, { "epoch": 4.112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0413057804107666, + "learning_rate": 0.0005177999999999999, + "loss": 1.6415, "step": 2570 }, { "epoch": 4.1136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.817716598510742, + "learning_rate": 0.0005176499999999999, + "loss": 1.7745, "step": 2571 }, { "epoch": 4.1152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.430630683898926, + "learning_rate": 0.0005175, + "loss": 1.96, "step": 2572 }, { "epoch": 4.1168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.758425712585449, + "learning_rate": 0.00051735, + "loss": 1.6291, "step": 2573 }, { "epoch": 4.1184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5142273902893066, + "learning_rate": 0.0005172, + "loss": 2.0679, "step": 2574 }, { "epoch": 4.12, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.235620141029358, + "learning_rate": 0.00051705, + "loss": 1.828, "step": 2575 }, { "epoch": 4.1216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9212775230407715, + "learning_rate": 0.0005168999999999999, + "loss": 1.7722, "step": 2576 }, { "epoch": 4.1232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6647447943687439, + "learning_rate": 0.00051675, + "loss": 1.906, "step": 2577 }, { "epoch": 4.1248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5452325344085693, + "learning_rate": 0.0005166, + "loss": 2.1333, "step": 2578 }, { "epoch": 4.1264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7366634011268616, + "learning_rate": 0.00051645, + "loss": 1.8803, "step": 2579 }, { "epoch": 4.128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7835949659347534, + "learning_rate": 0.0005163, + "loss": 1.9943, "step": 2580 }, { "epoch": 4.1296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.6397596001625061, + "learning_rate": 0.0005161499999999999, + "loss": 1.7529, "step": 2581 }, { "epoch": 4.1312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.079551935195923, + "learning_rate": 0.000516, + "loss": 1.9347, "step": 2582 }, { "epoch": 4.1328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6539299488067627, + "learning_rate": 0.00051585, + "loss": 1.7913, "step": 2583 }, { "epoch": 4.1344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2330212593078613, + "learning_rate": 0.0005157, + "loss": 2.1188, "step": 2584 }, { "epoch": 4.136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.305746078491211, + "learning_rate": 0.0005155499999999999, + "loss": 1.9467, "step": 2585 }, { "epoch": 4.1376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9980270862579346, + "learning_rate": 0.0005154, + "loss": 2.1646, "step": 2586 }, { "epoch": 4.1392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.092460036277771, + "learning_rate": 0.00051525, + "loss": 1.8353, "step": 2587 }, { "epoch": 4.1408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8863208889961243, + "learning_rate": 0.0005151, + "loss": 2.0309, "step": 2588 }, { "epoch": 4.1424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.000679850578308, + "learning_rate": 0.0005149499999999999, + "loss": 1.8691, "step": 2589 }, { "epoch": 4.144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8714463710784912, + "learning_rate": 0.0005147999999999999, + "loss": 1.8734, "step": 2590 }, { "epoch": 4.1456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2173759937286377, + "learning_rate": 0.00051465, + "loss": 2.3323, "step": 2591 }, { "epoch": 4.1472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8660641312599182, + "learning_rate": 0.0005145, + "loss": 2.1977, "step": 2592 }, { "epoch": 4.1488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4730968475341797, + "learning_rate": 0.0005143499999999999, + "loss": 2.1989, "step": 2593 }, { "epoch": 4.1504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7288036942481995, + "learning_rate": 0.0005141999999999999, + "loss": 1.7003, "step": 2594 }, { "epoch": 4.152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9436076283454895, + "learning_rate": 0.0005140499999999999, + "loss": 2.1883, "step": 2595 }, { "epoch": 4.1536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6240314245224, + "learning_rate": 0.0005139, + "loss": 1.9301, "step": 2596 }, { "epoch": 4.1552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.359818935394287, + "learning_rate": 0.0005137499999999999, + "loss": 2.1822, "step": 2597 }, { "epoch": 4.1568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.464782238006592, + "learning_rate": 0.0005135999999999999, + "loss": 2.2363, "step": 2598 }, { "epoch": 4.1584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7934373617172241, + "learning_rate": 0.0005134499999999999, + "loss": 2.5678, "step": 2599 }, { "epoch": 4.16, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2600 - }, - { - "epoch": 4.16, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 159.8763, - "eval_samples_per_second": 19.615, - "eval_steps_per_second": 1.226, - "eval_wer": 1.0, + "learning_rate": 0.0005134499999999999, + "loss": 2.6063, "step": 2600 }, { "epoch": 4.1616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.552441358566284, + "learning_rate": 0.0005133, + "loss": 1.8414, "step": 2601 }, { "epoch": 4.1632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0714163780212402, + "learning_rate": 0.00051315, + "loss": 2.1064, "step": 2602 }, { "epoch": 4.1648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.196826934814453, + "learning_rate": 0.0005129999999999999, + "loss": 2.618, "step": 2603 }, { "epoch": 4.1664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.336667060852051, + "learning_rate": 0.0005128499999999999, + "loss": 2.388, "step": 2604 }, { "epoch": 4.168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.398655891418457, + "learning_rate": 0.0005126999999999999, + "loss": 2.3414, "step": 2605 }, { "epoch": 4.1696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.885204315185547, + "learning_rate": 0.00051255, + "loss": 2.0417, "step": 2606 }, { "epoch": 4.1712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9722228050231934, + "learning_rate": 0.0005124, + "loss": 2.0161, "step": 2607 }, { "epoch": 4.1728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.073782444000244, + "learning_rate": 0.00051225, + "loss": 1.8719, "step": 2608 }, { "epoch": 4.1744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.423966884613037, + "learning_rate": 0.0005120999999999999, + "loss": 1.9612, "step": 2609 }, { "epoch": 4.176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.711324214935303, + "learning_rate": 0.00051195, + "loss": 2.8242, "step": 2610 }, { "epoch": 4.1776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0616002082824707, + "learning_rate": 0.0005118, + "loss": 1.7886, "step": 2611 }, { "epoch": 4.1792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0508960485458374, + "learning_rate": 0.00051165, + "loss": 1.651, "step": 2612 }, { "epoch": 4.1808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.420872211456299, + "learning_rate": 0.0005115, + "loss": 1.6546, "step": 2613 }, { "epoch": 4.1824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8311823606491089, + "learning_rate": 0.0005113499999999999, + "loss": 1.7491, "step": 2614 }, { "epoch": 4.184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1865077018737793, + "learning_rate": 0.0005112, + "loss": 1.8246, "step": 2615 }, { "epoch": 4.1856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.867258071899414, + "learning_rate": 0.00051105, + "loss": 1.8609, "step": 2616 }, { "epoch": 4.1872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9281314611434937, + "learning_rate": 0.0005109, + "loss": 1.7332, "step": 2617 }, { "epoch": 4.1888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.820015907287598, + "learning_rate": 0.0005107499999999999, + "loss": 1.9586, "step": 2618 }, { "epoch": 4.1904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.031200885772705, + "learning_rate": 0.0005105999999999999, + "loss": 1.5983, "step": 2619 }, { "epoch": 4.192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.678032398223877, + "learning_rate": 0.00051045, + "loss": 1.7672, "step": 2620 }, { "epoch": 4.1936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.8487162590026855, + "learning_rate": 0.0005103, + "loss": 2.5603, "step": 2621 }, { "epoch": 4.1952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8613600730895996, + "learning_rate": 0.0005101499999999999, + "loss": 1.747, "step": 2622 }, { "epoch": 4.1968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7120552659034729, + "learning_rate": 0.0005099999999999999, + "loss": 1.8262, "step": 2623 }, { "epoch": 4.1984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.790601968765259, + "learning_rate": 0.00050985, + "loss": 2.1263, "step": 2624 }, { "epoch": 4.2, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.589802622795105, + "learning_rate": 0.0005097, + "loss": 1.7034, "step": 2625 }, { "epoch": 4.2016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7992726564407349, + "learning_rate": 0.0005095499999999999, + "loss": 1.9422, "step": 2626 }, { "epoch": 4.2032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7839942574501038, + "learning_rate": 0.0005093999999999999, + "loss": 1.6971, "step": 2627 }, { "epoch": 4.2048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5990935564041138, + "learning_rate": 0.0005092499999999999, + "loss": 1.7538, "step": 2628 }, { "epoch": 4.2064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8034536838531494, + "learning_rate": 0.0005091, + "loss": 1.7504, "step": 2629 }, { "epoch": 4.208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9381115436553955, + "learning_rate": 0.0005089499999999999, + "loss": 2.0219, "step": 2630 }, { "epoch": 4.2096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.685932159423828, + "learning_rate": 0.0005087999999999999, + "loss": 1.7687, "step": 2631 }, { "epoch": 4.2112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8656719923019409, + "learning_rate": 0.0005086499999999999, + "loss": 1.5446, "step": 2632 }, { "epoch": 4.2128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.400219202041626, + "learning_rate": 0.0005085, + "loss": 2.0702, "step": 2633 }, { "epoch": 4.2144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.720746397972107, + "learning_rate": 0.00050835, + "loss": 2.0072, "step": 2634 }, { "epoch": 4.216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3523253202438354, + "learning_rate": 0.0005082, + "loss": 1.8526, "step": 2635 }, { "epoch": 4.2176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1961480379104614, + "learning_rate": 0.0005080499999999999, + "loss": 1.9631, "step": 2636 }, { "epoch": 4.2192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8764796257019043, + "learning_rate": 0.0005078999999999999, + "loss": 1.8485, "step": 2637 }, { "epoch": 4.2208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8735462427139282, + "learning_rate": 0.00050775, + "loss": 2.1159, "step": 2638 }, { "epoch": 4.2224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1814446449279785, + "learning_rate": 0.0005076, + "loss": 2.0074, "step": 2639 }, { "epoch": 4.224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8434025645256042, + "learning_rate": 0.00050745, + "loss": 1.7273, "step": 2640 }, { "epoch": 4.2256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.152543544769287, + "learning_rate": 0.0005073, + "loss": 2.4096, "step": 2641 }, { "epoch": 4.2272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9265046715736389, + "learning_rate": 0.0005071499999999999, + "loss": 2.0438, "step": 2642 }, { "epoch": 4.2288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5280780792236328, + "learning_rate": 0.000507, + "loss": 1.8898, "step": 2643 }, { "epoch": 4.2304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.905768096446991, + "learning_rate": 0.00050685, + "loss": 2.3321, "step": 2644 }, { "epoch": 4.232, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.00050685, + "loss": 1.1, "step": 2645 }, { "epoch": 4.2336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8761323690414429, + "learning_rate": 0.0005067, + "loss": 1.9053, "step": 2646 }, { "epoch": 4.2352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.441887617111206, + "learning_rate": 0.0005065499999999999, + "loss": 2.1834, "step": 2647 }, { "epoch": 4.2368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4189505577087402, + "learning_rate": 0.0005064, + "loss": 2.2359, "step": 2648 }, { "epoch": 4.2384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.740108013153076, + "learning_rate": 0.00050625, + "loss": 2.0901, "step": 2649 }, { "epoch": 4.24, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9102271795272827, + "learning_rate": 0.0005061, + "loss": 2.6585, "step": 2650 }, { "epoch": 4.2416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 14.789002418518066, + "learning_rate": 0.00050595, + "loss": 2.8544, "step": 2651 }, { "epoch": 4.2432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1927456855773926, + "learning_rate": 0.0005057999999999999, + "loss": 2.0763, "step": 2652 }, { "epoch": 4.2448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.375650882720947, + "learning_rate": 0.00050565, + "loss": 2.3458, "step": 2653 }, { "epoch": 4.2464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2653048038482666, + "learning_rate": 0.0005055, + "loss": 2.3287, "step": 2654 }, { "epoch": 4.248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7491447925567627, + "learning_rate": 0.00050535, + "loss": 2.1713, "step": 2655 }, { "epoch": 4.2496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.120514392852783, + "learning_rate": 0.0005051999999999999, + "loss": 1.9272, "step": 2656 }, { "epoch": 4.2512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3644896745681763, + "learning_rate": 0.0005050499999999999, + "loss": 2.2318, "step": 2657 }, { "epoch": 4.2528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0127274990081787, + "learning_rate": 0.0005049, + "loss": 2.2203, "step": 2658 }, { "epoch": 4.2544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9749441146850586, + "learning_rate": 0.00050475, + "loss": 1.941, "step": 2659 }, { "epoch": 4.256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2064893245697021, + "learning_rate": 0.0005045999999999999, + "loss": 1.9854, "step": 2660 }, { "epoch": 4.2576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9935476779937744, + "learning_rate": 0.0005044499999999999, + "loss": 1.8353, "step": 2661 }, { "epoch": 4.2592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.09097146987915, + "learning_rate": 0.0005043, + "loss": 1.9936, "step": 2662 }, { "epoch": 4.2608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7409591674804688, + "learning_rate": 0.00050415, + "loss": 1.7158, "step": 2663 }, { "epoch": 4.2624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9653496742248535, + "learning_rate": 0.0005039999999999999, + "loss": 1.6247, "step": 2664 }, { "epoch": 4.264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4377232789993286, + "learning_rate": 0.0005038499999999999, + "loss": 1.796, "step": 2665 }, { "epoch": 4.2656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.103083610534668, + "learning_rate": 0.0005036999999999999, + "loss": 1.7619, "step": 2666 }, { "epoch": 4.2672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.7207717895507812, + "learning_rate": 0.00050355, + "loss": 1.7422, "step": 2667 }, { "epoch": 4.2688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.597609043121338, + "learning_rate": 0.0005034, + "loss": 2.0255, "step": 2668 }, { "epoch": 4.2704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.644750118255615, + "learning_rate": 0.0005032499999999999, + "loss": 1.7337, "step": 2669 }, { "epoch": 4.272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.096191883087158, + "learning_rate": 0.0005030999999999999, + "loss": 1.7898, "step": 2670 }, { "epoch": 4.2736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.152255058288574, + "learning_rate": 0.00050295, + "loss": 1.6478, "step": 2671 }, { "epoch": 4.2752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.358015537261963, + "learning_rate": 0.0005028, + "loss": 1.6115, "step": 2672 }, { "epoch": 4.2768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.264955759048462, + "learning_rate": 0.00050265, + "loss": 1.8123, "step": 2673 }, { "epoch": 4.2783999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8399302959442139, + "learning_rate": 0.0005025, + "loss": 1.97, "step": 2674 }, { "epoch": 4.28, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8927841186523438, + "learning_rate": 0.0005023499999999999, + "loss": 1.7362, "step": 2675 }, { "epoch": 4.2816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1136553287506104, + "learning_rate": 0.0005022, + "loss": 1.5103, "step": 2676 }, { "epoch": 4.2832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.200005054473877, + "learning_rate": 0.00050205, + "loss": 1.8307, "step": 2677 }, { "epoch": 4.2848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8706302642822266, + "learning_rate": 0.0005019, + "loss": 2.0445, "step": 2678 }, { "epoch": 4.2864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9827845096588135, + "learning_rate": 0.00050175, + "loss": 1.6169, "step": 2679 }, { "epoch": 4.288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5512272119522095, + "learning_rate": 0.0005015999999999999, + "loss": 2.0119, "step": 2680 }, { "epoch": 4.2896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1824843883514404, + "learning_rate": 0.00050145, + "loss": 2.1293, "step": 2681 }, { "epoch": 4.2912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.637053370475769, + "learning_rate": 0.0005013, + "loss": 1.7512, "step": 2682 }, { "epoch": 4.2928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.891478955745697, + "learning_rate": 0.00050115, + "loss": 1.6766, "step": 2683 }, { "epoch": 4.2943999999999996, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.968799352645874, + "learning_rate": 0.0005009999999999999, + "loss": 1.7987, "step": 2684 }, { "epoch": 4.296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8135929107666016, + "learning_rate": 0.00050085, + "loss": 1.513, "step": 2685 }, { "epoch": 4.2976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.973942995071411, + "learning_rate": 0.0005007, + "loss": 1.9517, "step": 2686 }, { "epoch": 4.2992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5238656997680664, + "learning_rate": 0.00050055, + "loss": 1.871, "step": 2687 }, { "epoch": 4.3008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8395425081253052, + "learning_rate": 0.0005003999999999999, + "loss": 2.3133, "step": 2688 }, { "epoch": 4.3024000000000004, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2735295295715332, + "learning_rate": 0.0005002499999999999, + "loss": 2.0122, "step": 2689 }, { "epoch": 4.304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.705003023147583, + "learning_rate": 0.0005001, + "loss": 1.7076, "step": 2690 }, { "epoch": 4.3056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0689762830734253, + "learning_rate": 0.00049995, + "loss": 1.8977, "step": 2691 }, { "epoch": 4.3072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1098942756652832, + "learning_rate": 0.0004997999999999999, + "loss": 2.1884, "step": 2692 }, { "epoch": 4.3088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3457235097885132, + "learning_rate": 0.0004996499999999999, + "loss": 1.9898, "step": 2693 }, { "epoch": 4.3104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3474767208099365, + "learning_rate": 0.0004994999999999999, + "loss": 2.1797, "step": 2694 }, { "epoch": 4.312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.194025754928589, + "learning_rate": 0.00049935, + "loss": 1.7558, "step": 2695 }, { "epoch": 4.3136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.322290420532227, + "learning_rate": 0.0004991999999999999, + "loss": 2.1153, "step": 2696 }, { "epoch": 4.3152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1777799129486084, + "learning_rate": 0.0004990499999999999, + "loss": 1.9527, "step": 2697 }, { "epoch": 4.3168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.741779088973999, + "learning_rate": 0.0004988999999999999, + "loss": 2.2402, "step": 2698 }, { "epoch": 4.3184000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0727362632751465, + "learning_rate": 0.00049875, + "loss": 2.1972, "step": 2699 }, { "epoch": 4.32, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2700 - }, - { - "epoch": 4.32, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 159.7893, - "eval_samples_per_second": 19.626, - "eval_steps_per_second": 1.227, - "eval_wer": 1.0, + "grad_norm": 2.3544955253601074, + "learning_rate": 0.0004986, + "loss": 2.1574, "step": 2700 }, { "epoch": 4.3216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.276401996612549, + "learning_rate": 0.00049845, + "loss": 2.4749, "step": 2701 }, { "epoch": 4.3232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 12.329696655273438, + "learning_rate": 0.0004982999999999999, + "loss": 3.0003, "step": 2702 }, { "epoch": 4.3248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4918222427368164, + "learning_rate": 0.0004981499999999999, + "loss": 2.1861, "step": 2703 }, { "epoch": 4.3264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.620767116546631, + "learning_rate": 0.000498, + "loss": 2.1404, "step": 2704 }, { "epoch": 4.328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1579036712646484, + "learning_rate": 0.00049785, + "loss": 1.9692, "step": 2705 }, { "epoch": 4.3296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.295342206954956, + "learning_rate": 0.0004977, + "loss": 1.9675, "step": 2706 }, { "epoch": 4.3312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0219806432724, + "learning_rate": 0.00049755, + "loss": 1.7226, "step": 2707 }, { "epoch": 4.3328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3120267391204834, + "learning_rate": 0.0004974, + "loss": 2.0358, "step": 2708 }, { "epoch": 4.3344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3944854736328125, + "learning_rate": 0.00049725, + "loss": 2.0762, "step": 2709 }, { "epoch": 4.336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1443352699279785, + "learning_rate": 0.0004971, + "loss": 1.9482, "step": 2710 }, { "epoch": 4.3376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1645523309707642, + "learning_rate": 0.00049695, + "loss": 1.7209, "step": 2711 }, { "epoch": 4.3392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0434651374816895, + "learning_rate": 0.0004967999999999999, + "loss": 2.124, "step": 2712 }, { "epoch": 4.3408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5835245847702026, + "learning_rate": 0.00049665, + "loss": 1.9263, "step": 2713 }, { "epoch": 4.3424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5039459466934204, + "learning_rate": 0.0004965, + "loss": 1.9301, "step": 2714 }, { "epoch": 4.344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2301478385925293, + "learning_rate": 0.00049635, + "loss": 1.8303, "step": 2715 }, { "epoch": 4.3456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2232022285461426, + "learning_rate": 0.0004961999999999999, + "loss": 1.9568, "step": 2716 }, { "epoch": 4.3472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9055202603340149, + "learning_rate": 0.0004960499999999999, + "loss": 1.6978, "step": 2717 }, { "epoch": 4.3488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9296348690986633, + "learning_rate": 0.0004959, + "loss": 1.5963, "step": 2718 }, { "epoch": 4.3504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9808449745178223, + "learning_rate": 0.00049575, + "loss": 1.5989, "step": 2719 }, { "epoch": 4.352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.236293315887451, + "learning_rate": 0.0004955999999999999, + "loss": 1.9088, "step": 2720 }, { "epoch": 4.3536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9127609729766846, + "learning_rate": 0.0004954499999999999, + "loss": 1.6453, "step": 2721 }, { "epoch": 4.3552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.41826868057251, + "learning_rate": 0.0004953, + "loss": 1.9933, "step": 2722 }, { "epoch": 4.3568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.878091812133789, + "learning_rate": 0.00049515, + "loss": 1.7266, "step": 2723 }, { "epoch": 4.3584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5814834833145142, + "learning_rate": 0.0004949999999999999, + "loss": 1.7923, "step": 2724 }, { "epoch": 4.36, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.397496461868286, + "learning_rate": 0.0004948499999999999, + "loss": 1.7079, "step": 2725 }, { "epoch": 4.3616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9779950380325317, + "learning_rate": 0.0004946999999999999, + "loss": 1.9622, "step": 2726 }, { "epoch": 4.3632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6545337438583374, + "learning_rate": 0.00049455, + "loss": 2.1439, "step": 2727 }, { "epoch": 4.3648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8302526473999023, + "learning_rate": 0.0004944, + "loss": 2.0157, "step": 2728 }, { "epoch": 4.3664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.294466257095337, + "learning_rate": 0.0004942499999999999, + "loss": 1.6352, "step": 2729 }, { "epoch": 4.368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.946574091911316, + "learning_rate": 0.0004940999999999999, + "loss": 1.7012, "step": 2730 }, { "epoch": 4.3696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.769045889377594, + "learning_rate": 0.0004939499999999999, + "loss": 1.6464, "step": 2731 }, { "epoch": 4.3712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6270204782485962, + "learning_rate": 0.0004938, + "loss": 1.9476, "step": 2732 }, { "epoch": 4.3728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.318570137023926, + "learning_rate": 0.00049365, + "loss": 1.7405, "step": 2733 }, { "epoch": 4.3744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.380737066268921, + "learning_rate": 0.0004935, + "loss": 1.8886, "step": 2734 }, { "epoch": 4.376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8267993927001953, + "learning_rate": 0.0004933499999999999, + "loss": 1.7565, "step": 2735 }, { "epoch": 4.3776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3764233589172363, + "learning_rate": 0.0004932, + "loss": 1.8319, "step": 2736 }, { "epoch": 4.3792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.615099549293518, + "learning_rate": 0.00049305, + "loss": 1.5021, "step": 2737 }, { "epoch": 4.3808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2117981910705566, + "learning_rate": 0.0004929, + "loss": 1.7998, "step": 2738 }, { "epoch": 4.3824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9510959386825562, + "learning_rate": 0.00049275, + "loss": 1.9194, "step": 2739 }, { "epoch": 4.384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5256377458572388, + "learning_rate": 0.0004925999999999999, + "loss": 2.4345, "step": 2740 }, { "epoch": 4.3856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9747059941291809, + "learning_rate": 0.00049245, + "loss": 2.1591, "step": 2741 }, { "epoch": 4.3872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8639597296714783, + "learning_rate": 0.0004923, + "loss": 1.7427, "step": 2742 }, { "epoch": 4.3888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0038577318191528, + "learning_rate": 0.00049215, + "loss": 1.7629, "step": 2743 }, { "epoch": 4.3904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7393220663070679, + "learning_rate": 0.0004919999999999999, + "loss": 2.0587, "step": 2744 }, { "epoch": 4.392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.120962619781494, + "learning_rate": 0.00049185, + "loss": 1.7976, "step": 2745 }, { "epoch": 4.3936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9911690950393677, + "learning_rate": 0.0004917, + "loss": 2.1529, "step": 2746 }, { "epoch": 4.3952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.39595365524292, + "learning_rate": 0.00049155, + "loss": 2.2529, "step": 2747 }, { "epoch": 4.3968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0449539422988892, + "learning_rate": 0.0004913999999999999, + "loss": 2.2485, "step": 2748 }, { "epoch": 4.3984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2274101972579956, + "learning_rate": 0.0004912499999999999, + "loss": 2.6684, "step": 2749 }, { "epoch": 4.4, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6110919713974, + "learning_rate": 0.0004911, + "loss": 2.3316, "step": 2750 }, { "epoch": 4.4016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 31.37392234802246, + "learning_rate": 0.00049095, + "loss": 3.3606, "step": 2751 }, { "epoch": 4.4032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.821800231933594, + "learning_rate": 0.0004907999999999999, + "loss": 2.6849, "step": 2752 }, { "epoch": 4.4048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9470937252044678, + "learning_rate": 0.0004906499999999999, + "loss": 2.1471, "step": 2753 }, { "epoch": 4.4064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6643931865692139, + "learning_rate": 0.0004904999999999999, + "loss": 2.1997, "step": 2754 }, { "epoch": 4.408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0974528789520264, + "learning_rate": 0.00049035, + "loss": 2.2333, "step": 2755 }, { "epoch": 4.4096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.75770092010498, + "learning_rate": 0.0004901999999999999, + "loss": 2.7282, "step": 2756 }, { "epoch": 4.4112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.481705665588379, + "learning_rate": 0.0004900499999999999, + "loss": 2.1102, "step": 2757 }, { "epoch": 4.4128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0984652042388916, + "learning_rate": 0.0004898999999999999, + "loss": 2.0301, "step": 2758 }, { "epoch": 4.4144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.651517391204834, + "learning_rate": 0.00048975, + "loss": 1.9718, "step": 2759 }, { "epoch": 4.416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.137758493423462, + "learning_rate": 0.0004896, + "loss": 1.8473, "step": 2760 }, { "epoch": 4.4176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6671268939971924, + "learning_rate": 0.0004894499999999999, + "loss": 2.1645, "step": 2761 }, { "epoch": 4.4192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.523720383644104, + "learning_rate": 0.0004892999999999999, + "loss": 1.8029, "step": 2762 }, { "epoch": 4.4208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.891670227050781, + "learning_rate": 0.0004891499999999999, + "loss": 2.1067, "step": 2763 }, { "epoch": 4.4224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.942965030670166, + "learning_rate": 0.000489, + "loss": 1.6909, "step": 2764 }, { "epoch": 4.424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.303746223449707, + "learning_rate": 0.00048885, + "loss": 2.1012, "step": 2765 }, { "epoch": 4.4256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8003095388412476, + "learning_rate": 0.0004887, + "loss": 2.0751, "step": 2766 }, { "epoch": 4.4272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.143497943878174, + "learning_rate": 0.0004885499999999999, + "loss": 1.866, "step": 2767 }, { "epoch": 4.4288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6083755493164062, + "learning_rate": 0.0004883999999999999, + "loss": 1.8097, "step": 2768 }, { "epoch": 4.4304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3343874216079712, + "learning_rate": 0.0004882499999999999, + "loss": 1.8029, "step": 2769 }, { "epoch": 4.432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.8573079109191895, + "learning_rate": 0.00048809999999999994, + "loss": 1.7581, "step": 2770 }, { "epoch": 4.4336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.798096656799316, + "learning_rate": 0.00048794999999999996, + "loss": 2.0261, "step": 2771 }, { "epoch": 4.4352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.447819232940674, + "learning_rate": 0.00048779999999999993, + "loss": 1.6978, "step": 2772 }, { "epoch": 4.4368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9434682130813599, + "learning_rate": 0.00048764999999999995, + "loss": 1.5958, "step": 2773 }, { "epoch": 4.4384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6188219785690308, + "learning_rate": 0.0004875, + "loss": 2.3481, "step": 2774 }, { "epoch": 4.44, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8848549127578735, + "learning_rate": 0.00048734999999999995, + "loss": 1.8844, "step": 2775 }, { "epoch": 4.4416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.049158811569214, + "learning_rate": 0.00048719999999999997, + "loss": 1.8728, "step": 2776 }, { "epoch": 4.4432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0514065027236938, + "learning_rate": 0.00048704999999999994, + "loss": 1.8622, "step": 2777 }, { "epoch": 4.4448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5144424438476562, + "learning_rate": 0.00048689999999999996, + "loss": 1.887, "step": 2778 }, { "epoch": 4.4464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3987104892730713, + "learning_rate": 0.00048675, + "loss": 1.8795, "step": 2779 }, { "epoch": 4.448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4233217239379883, + "learning_rate": 0.0004866, + "loss": 1.9542, "step": 2780 }, { "epoch": 4.4496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9357428550720215, + "learning_rate": 0.0004864499999999999, + "loss": 1.8799, "step": 2781 }, { "epoch": 4.4512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.104844093322754, + "learning_rate": 0.00048629999999999995, + "loss": 1.6141, "step": 2782 }, { "epoch": 4.4528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3217804431915283, + "learning_rate": 0.00048614999999999997, + "loss": 1.7236, "step": 2783 }, { "epoch": 4.4544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2796238660812378, + "learning_rate": 0.000486, + "loss": 2.0714, "step": 2784 }, { "epoch": 4.456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0485546588897705, + "learning_rate": 0.00048584999999999996, + "loss": 1.8654, "step": 2785 }, { "epoch": 4.4576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.945281028747559, + "learning_rate": 0.00048569999999999993, + "loss": 1.7194, "step": 2786 }, { "epoch": 4.4592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.205522298812866, + "learning_rate": 0.00048554999999999996, + "loss": 1.8191, "step": 2787 }, { "epoch": 4.4608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.893653392791748, + "learning_rate": 0.0004854, + "loss": 1.9642, "step": 2788 }, { "epoch": 4.4624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2910706996917725, + "learning_rate": 0.00048524999999999995, + "loss": 2.0733, "step": 2789 }, { "epoch": 4.464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.876647710800171, + "learning_rate": 0.0004851, + "loss": 2.0299, "step": 2790 }, { "epoch": 4.4656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0618470907211304, + "learning_rate": 0.00048495, + "loss": 1.8538, "step": 2791 }, { "epoch": 4.4672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4978771209716797, + "learning_rate": 0.00048479999999999997, + "loss": 1.8727, "step": 2792 }, { "epoch": 4.4688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.992602825164795, + "learning_rate": 0.00048464999999999994, + "loss": 2.0375, "step": 2793 }, { "epoch": 4.4704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.787044525146484, + "learning_rate": 0.00048449999999999996, + "loss": 2.1719, "step": 2794 }, { "epoch": 4.4719999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2934472560882568, + "learning_rate": 0.00048435, + "loss": 1.6233, "step": 2795 }, { "epoch": 4.4736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5270168781280518, + "learning_rate": 0.0004842, + "loss": 2.1265, "step": 2796 }, { "epoch": 4.4752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.369183540344238, + "learning_rate": 0.0004840499999999999, + "loss": 1.9866, "step": 2797 }, { "epoch": 4.4768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1386245489120483, + "learning_rate": 0.00048389999999999994, + "loss": 2.4311, "step": 2798 }, { "epoch": 4.4784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0232652425765991, + "learning_rate": 0.00048374999999999997, + "loss": 1.8705, "step": 2799 }, { "epoch": 4.48, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2800 - }, - { - "epoch": 4.48, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 158.6637, - "eval_samples_per_second": 19.765, - "eval_steps_per_second": 1.235, - "eval_wer": 1.0, + "grad_norm": 3.219254970550537, + "learning_rate": 0.0004836, + "loss": 2.4248, "step": 2800 }, { "epoch": 4.4816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 20.291139602661133, + "learning_rate": 0.00048344999999999996, + "loss": 3.2457, "step": 2801 }, { "epoch": 4.4832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5292768478393555, + "learning_rate": 0.00048329999999999993, + "loss": 1.8494, "step": 2802 }, { "epoch": 4.4848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 16.353145599365234, + "learning_rate": 0.00048314999999999995, + "loss": 2.4255, "step": 2803 }, { "epoch": 4.4864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 27.58832550048828, + "learning_rate": 0.000483, + "loss": 3.1663, "step": 2804 }, { "epoch": 4.4879999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.299684524536133, + "learning_rate": 0.00048284999999999995, + "loss": 2.4567, "step": 2805 }, { "epoch": 4.4896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7438368797302246, + "learning_rate": 0.00048269999999999997, + "loss": 2.0492, "step": 2806 }, { "epoch": 4.4912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.070641040802002, + "learning_rate": 0.00048255, + "loss": 2.2044, "step": 2807 }, { "epoch": 4.4928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.334407925605774, + "learning_rate": 0.00048239999999999996, + "loss": 1.9938, "step": 2808 }, { "epoch": 4.4944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9917300939559937, + "learning_rate": 0.00048224999999999993, + "loss": 1.8998, "step": 2809 }, { "epoch": 4.496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3776772022247314, + "learning_rate": 0.00048209999999999995, + "loss": 1.8164, "step": 2810 }, { "epoch": 4.4976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.257997751235962, + "learning_rate": 0.00048195, + "loss": 1.8108, "step": 2811 }, { "epoch": 4.4992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6873910427093506, + "learning_rate": 0.0004818, + "loss": 2.3107, "step": 2812 }, { "epoch": 4.5008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2140886783599854, + "learning_rate": 0.0004816499999999999, + "loss": 1.6474, "step": 2813 }, { "epoch": 4.5024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3614856004714966, + "learning_rate": 0.00048149999999999994, + "loss": 1.9323, "step": 2814 }, { "epoch": 4.504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0496727228164673, + "learning_rate": 0.00048134999999999996, + "loss": 1.6896, "step": 2815 }, { "epoch": 4.5056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8492391109466553, + "learning_rate": 0.0004812, + "loss": 1.748, "step": 2816 }, { "epoch": 4.5072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.207594394683838, + "learning_rate": 0.00048104999999999996, + "loss": 1.6843, "step": 2817 }, { "epoch": 4.5088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5215344429016113, + "learning_rate": 0.0004808999999999999, + "loss": 1.7607, "step": 2818 }, { "epoch": 4.5104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.886686325073242, + "learning_rate": 0.00048074999999999995, + "loss": 2.1518, "step": 2819 }, { "epoch": 4.5120000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.15547513961792, + "learning_rate": 0.00048059999999999997, + "loss": 2.1099, "step": 2820 }, { "epoch": 4.5136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1715599298477173, + "learning_rate": 0.00048044999999999994, + "loss": 1.8574, "step": 2821 }, { "epoch": 4.5152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2730066776275635, + "learning_rate": 0.00048029999999999997, + "loss": 2.2673, "step": 2822 }, { "epoch": 4.5168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0949064493179321, + "learning_rate": 0.00048015, + "loss": 1.7473, "step": 2823 }, { "epoch": 4.5184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.350168228149414, + "learning_rate": 0.00047999999999999996, + "loss": 1.712, "step": 2824 }, { "epoch": 4.52, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9208452701568604, + "learning_rate": 0.00047984999999999993, + "loss": 1.7704, "step": 2825 }, { "epoch": 4.5216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5936332941055298, + "learning_rate": 0.00047969999999999995, + "loss": 2.0433, "step": 2826 }, { "epoch": 4.5232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.146118402481079, + "learning_rate": 0.00047955, + "loss": 1.9635, "step": 2827 }, { "epoch": 4.5248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.349436044692993, + "learning_rate": 0.0004794, + "loss": 1.8187, "step": 2828 }, { "epoch": 4.5264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8291957378387451, + "learning_rate": 0.0004792499999999999, + "loss": 1.8534, "step": 2829 }, { "epoch": 4.5280000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6717575788497925, + "learning_rate": 0.00047909999999999994, + "loss": 1.7578, "step": 2830 }, { "epoch": 4.5296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.676916241645813, + "learning_rate": 0.00047894999999999996, + "loss": 2.0669, "step": 2831 }, { "epoch": 4.5312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7795640230178833, + "learning_rate": 0.0004788, + "loss": 1.8332, "step": 2832 }, { "epoch": 4.5328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9749438762664795, + "learning_rate": 0.00047864999999999995, + "loss": 1.8182, "step": 2833 }, { "epoch": 4.5344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5495363473892212, + "learning_rate": 0.0004785, + "loss": 2.0464, "step": 2834 }, { "epoch": 4.536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3625857830047607, + "learning_rate": 0.00047834999999999994, + "loss": 2.1856, "step": 2835 }, { "epoch": 4.5376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.267985820770264, + "learning_rate": 0.00047819999999999997, + "loss": 1.5406, "step": 2836 }, { "epoch": 4.5392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.528249263763428, + "learning_rate": 0.00047804999999999994, + "loss": 2.138, "step": 2837 }, { "epoch": 4.5408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.518407106399536, + "learning_rate": 0.00047789999999999996, + "loss": 1.7701, "step": 2838 }, { "epoch": 4.5424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.524309158325195, + "learning_rate": 0.00047775, + "loss": 1.8426, "step": 2839 }, { "epoch": 4.5440000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2349860668182373, + "learning_rate": 0.0004776, + "loss": 1.7634, "step": 2840 }, { "epoch": 4.5456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0253136157989502, + "learning_rate": 0.0004774499999999999, + "loss": 1.9, "step": 2841 }, { "epoch": 4.5472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.908934593200684, + "learning_rate": 0.00047729999999999995, + "loss": 2.2211, "step": 2842 }, { "epoch": 4.5488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.103787899017334, + "learning_rate": 0.00047714999999999997, + "loss": 1.7664, "step": 2843 }, { "epoch": 4.5504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5886802673339844, + "learning_rate": 0.000477, + "loss": 2.1891, "step": 2844 }, { "epoch": 4.552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.039703845977783, + "learning_rate": 0.0004768499999999999, + "loss": 2.2356, "step": 2845 }, { "epoch": 4.5536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9848023653030396, + "learning_rate": 0.00047669999999999993, + "loss": 1.7484, "step": 2846 }, { "epoch": 4.5552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5086679458618164, + "learning_rate": 0.00047654999999999996, + "loss": 2.4515, "step": 2847 }, { "epoch": 4.5568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0788493156433105, + "learning_rate": 0.0004764, + "loss": 2.3566, "step": 2848 }, { "epoch": 4.5584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1540278196334839, + "learning_rate": 0.00047624999999999995, + "loss": 2.5733, "step": 2849 }, { "epoch": 4.5600000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1869611740112305, + "learning_rate": 0.00047609999999999997, + "loss": 2.7406, "step": 2850 }, { "epoch": 4.5616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 18.052366256713867, + "learning_rate": 0.00047594999999999994, + "loss": 2.3533, "step": 2851 }, { "epoch": 4.5632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 12.693831443786621, + "learning_rate": 0.00047579999999999996, + "loss": 2.6903, "step": 2852 }, { "epoch": 4.5648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 12.301849365234375, + "learning_rate": 0.00047564999999999993, + "loss": 2.5039, "step": 2853 }, { "epoch": 4.5664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.84140682220459, + "learning_rate": 0.00047549999999999996, + "loss": 2.4382, "step": 2854 }, { "epoch": 4.568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.18992805480957, + "learning_rate": 0.00047535, + "loss": 2.265, "step": 2855 }, { "epoch": 4.5696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.735015392303467, + "learning_rate": 0.0004752, + "loss": 1.8568, "step": 2856 }, { "epoch": 4.5712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.784823179244995, + "learning_rate": 0.0004750499999999999, + "loss": 2.0006, "step": 2857 }, { "epoch": 4.5728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5699920654296875, + "learning_rate": 0.00047489999999999994, + "loss": 1.8837, "step": 2858 }, { "epoch": 4.5744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1189074516296387, + "learning_rate": 0.00047474999999999997, + "loss": 1.9481, "step": 2859 }, { "epoch": 4.576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3212521076202393, + "learning_rate": 0.0004746, + "loss": 1.5919, "step": 2860 }, { "epoch": 4.5776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.494596481323242, + "learning_rate": 0.0004744499999999999, + "loss": 2.3915, "step": 2861 }, { "epoch": 4.5792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6995341777801514, + "learning_rate": 0.00047429999999999993, + "loss": 1.9911, "step": 2862 }, { "epoch": 4.5808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6684291362762451, + "learning_rate": 0.00047414999999999995, + "loss": 2.093, "step": 2863 }, { "epoch": 4.5824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3050556182861328, + "learning_rate": 0.000474, + "loss": 1.8913, "step": 2864 }, { "epoch": 4.584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.187692880630493, + "learning_rate": 0.00047384999999999994, + "loss": 1.8978, "step": 2865 }, { "epoch": 4.5856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.71722412109375, + "learning_rate": 0.00047369999999999997, + "loss": 1.9785, "step": 2866 }, { "epoch": 4.5872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2038593292236328, + "learning_rate": 0.00047354999999999994, + "loss": 1.5634, "step": 2867 }, { "epoch": 4.5888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5433554649353027, + "learning_rate": 0.00047339999999999996, + "loss": 1.8322, "step": 2868 }, { "epoch": 4.5904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5715316534042358, + "learning_rate": 0.00047324999999999993, + "loss": 1.8705, "step": 2869 }, { "epoch": 4.592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.9844120144844055, + "learning_rate": 0.00047309999999999995, + "loss": 1.8422, "step": 2870 }, { "epoch": 4.5936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.68367338180542, + "learning_rate": 0.00047295, + "loss": 2.1635, "step": 2871 }, { "epoch": 4.5952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4707579612731934, + "learning_rate": 0.0004728, + "loss": 1.8738, "step": 2872 }, { "epoch": 4.5968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1058239936828613, + "learning_rate": 0.0004726499999999999, + "loss": 1.7108, "step": 2873 }, { "epoch": 4.5984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5762336254119873, + "learning_rate": 0.00047249999999999994, + "loss": 1.4881, "step": 2874 }, { "epoch": 4.6, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.265877366065979, + "learning_rate": 0.00047234999999999996, + "loss": 2.0167, "step": 2875 }, { "epoch": 4.6016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.916661024093628, + "learning_rate": 0.0004722, + "loss": 1.6457, "step": 2876 }, { "epoch": 4.6032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.10941219329834, + "learning_rate": 0.00047204999999999995, + "loss": 1.7151, "step": 2877 }, { "epoch": 4.6048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0412096977233887, + "learning_rate": 0.0004718999999999999, + "loss": 1.5545, "step": 2878 }, { "epoch": 4.6064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.154681921005249, + "learning_rate": 0.00047174999999999995, + "loss": 1.5489, "step": 2879 }, { "epoch": 4.608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9898120164871216, + "learning_rate": 0.00047159999999999997, + "loss": 2.1657, "step": 2880 }, { "epoch": 4.6096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4565964937210083, + "learning_rate": 0.00047144999999999994, + "loss": 1.453, "step": 2881 }, { "epoch": 4.6112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1306456327438354, + "learning_rate": 0.00047129999999999996, + "loss": 1.9696, "step": 2882 }, { "epoch": 4.6128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4046789407730103, + "learning_rate": 0.00047115, + "loss": 1.9966, "step": 2883 }, { "epoch": 4.6144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3461419343948364, + "learning_rate": 0.00047099999999999996, + "loss": 1.576, "step": 2884 }, { "epoch": 4.616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1443793773651123, + "learning_rate": 0.0004708499999999999, + "loss": 2.1933, "step": 2885 }, { "epoch": 4.6176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0423054695129395, + "learning_rate": 0.00047069999999999995, + "loss": 1.9633, "step": 2886 }, { "epoch": 4.6192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0566394329071045, + "learning_rate": 0.00047054999999999997, + "loss": 1.8235, "step": 2887 }, { "epoch": 4.6208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.431408166885376, + "learning_rate": 0.0004704, + "loss": 1.7626, "step": 2888 }, { "epoch": 4.6224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2627670764923096, + "learning_rate": 0.0004702499999999999, + "loss": 2.1956, "step": 2889 }, { "epoch": 4.624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.180410146713257, + "learning_rate": 0.00047009999999999993, + "loss": 2.3186, "step": 2890 }, { "epoch": 4.6256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5140712261199951, + "learning_rate": 0.00046994999999999996, + "loss": 2.1441, "step": 2891 }, { "epoch": 4.6272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.021548271179199, + "learning_rate": 0.0004698, + "loss": 2.1141, "step": 2892 }, { "epoch": 4.6288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0709006786346436, + "learning_rate": 0.00046964999999999995, + "loss": 1.9173, "step": 2893 }, { "epoch": 4.6304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4512224197387695, + "learning_rate": 0.0004694999999999999, + "loss": 2.2301, "step": 2894 }, { "epoch": 4.632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.720222234725952, + "learning_rate": 0.00046934999999999994, + "loss": 2.0691, "step": 2895 }, { "epoch": 4.6336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0958988666534424, + "learning_rate": 0.00046919999999999997, + "loss": 1.8199, "step": 2896 }, { "epoch": 4.6352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1114755868911743, + "learning_rate": 0.00046904999999999994, + "loss": 1.9451, "step": 2897 }, { "epoch": 4.6368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1838823556900024, + "learning_rate": 0.00046889999999999996, + "loss": 2.1416, "step": 2898 }, { "epoch": 4.6384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0702041387557983, + "learning_rate": 0.00046875, + "loss": 2.1344, "step": 2899 }, { "epoch": 4.64, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 2900 - }, - { - "epoch": 4.64, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 156.9728, - "eval_samples_per_second": 19.978, - "eval_steps_per_second": 1.249, - "eval_wer": 1.0, + "grad_norm": 2.476487398147583, + "learning_rate": 0.00046859999999999995, + "loss": 2.2368, "step": 2900 }, { "epoch": 4.6416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 26.568328857421875, + "learning_rate": 0.00046845, + "loss": 3.5976, "step": 2901 }, { "epoch": 4.6432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8742352724075317, + "learning_rate": 0.00046829999999999994, + "loss": 2.0308, "step": 2902 }, { "epoch": 4.6448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 14.89755630493164, + "learning_rate": 0.00046814999999999997, + "loss": 2.6192, "step": 2903 }, { "epoch": 4.6464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.758499145507812, + "learning_rate": 0.000468, + "loss": 2.4896, "step": 2904 }, { "epoch": 4.648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8912417888641357, + "learning_rate": 0.00046785, + "loss": 2.2466, "step": 2905 }, { "epoch": 4.6495999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 15.772459030151367, + "learning_rate": 0.00046769999999999993, + "loss": 2.9467, "step": 2906 }, { "epoch": 4.6512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.318552494049072, + "learning_rate": 0.00046754999999999995, + "loss": 2.4392, "step": 2907 }, { "epoch": 4.6528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.18526029586792, + "learning_rate": 0.0004674, + "loss": 2.5809, "step": 2908 }, { "epoch": 4.6544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1041102409362793, + "learning_rate": 0.00046725, + "loss": 2.1863, "step": 2909 }, { "epoch": 4.656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.712016582489014, + "learning_rate": 0.0004670999999999999, + "loss": 1.8193, "step": 2910 }, { "epoch": 4.6576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.29626727104187, + "learning_rate": 0.00046694999999999994, + "loss": 1.9008, "step": 2911 }, { "epoch": 4.6592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.208040714263916, + "learning_rate": 0.00046679999999999996, + "loss": 1.9267, "step": 2912 }, { "epoch": 4.6608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9327380657196045, + "learning_rate": 0.00046665, + "loss": 1.9868, "step": 2913 }, { "epoch": 4.6624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5247790813446045, + "learning_rate": 0.00046649999999999996, + "loss": 1.7713, "step": 2914 }, { "epoch": 4.664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.439487934112549, + "learning_rate": 0.00046635, + "loss": 2.0238, "step": 2915 }, { "epoch": 4.6655999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8292617797851562, + "learning_rate": 0.00046619999999999995, + "loss": 1.7811, "step": 2916 }, { "epoch": 4.6672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1137208938598633, + "learning_rate": 0.00046604999999999997, + "loss": 1.8638, "step": 2917 }, { "epoch": 4.6688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3321573734283447, + "learning_rate": 0.00046589999999999994, + "loss": 1.8959, "step": 2918 }, { "epoch": 4.6704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2265563011169434, + "learning_rate": 0.00046574999999999996, + "loss": 1.7412, "step": 2919 }, { "epoch": 4.672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4950315952301025, + "learning_rate": 0.0004656, + "loss": 1.7304, "step": 2920 }, { "epoch": 4.6736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3217532634735107, + "learning_rate": 0.00046545, + "loss": 1.6422, "step": 2921 }, { "epoch": 4.6752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4180880784988403, + "learning_rate": 0.0004652999999999999, + "loss": 2.1201, "step": 2922 }, { "epoch": 4.6768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.970060110092163, + "learning_rate": 0.00046514999999999995, + "loss": 1.7879, "step": 2923 }, { "epoch": 4.6784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.532371997833252, + "learning_rate": 0.00046499999999999997, + "loss": 2.1768, "step": 2924 }, { "epoch": 4.68, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.154553174972534, + "learning_rate": 0.00046485, + "loss": 1.8605, "step": 2925 }, { "epoch": 4.6815999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9632632732391357, + "learning_rate": 0.00046469999999999997, + "loss": 2.1543, "step": 2926 }, { "epoch": 4.6832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.718745708465576, + "learning_rate": 0.00046454999999999993, + "loss": 1.7158, "step": 2927 }, { "epoch": 4.6848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.1586384773254395, + "learning_rate": 0.00046439999999999996, + "loss": 1.8619, "step": 2928 }, { "epoch": 4.6864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.7387466430664062, + "learning_rate": 0.00046425, + "loss": 1.7637, "step": 2929 }, { "epoch": 4.688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1410984992980957, + "learning_rate": 0.00046409999999999995, + "loss": 1.689, "step": 2930 }, { "epoch": 4.6896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7668966054916382, + "learning_rate": 0.00046395, + "loss": 2.3125, "step": 2931 }, { "epoch": 4.6912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.661885976791382, + "learning_rate": 0.0004638, + "loss": 1.8615, "step": 2932 }, { "epoch": 4.6928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0198774337768555, + "learning_rate": 0.00046364999999999997, + "loss": 2.1258, "step": 2933 }, { "epoch": 4.6944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6803481578826904, + "learning_rate": 0.00046349999999999994, + "loss": 1.838, "step": 2934 }, { "epoch": 4.696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.10424542427063, + "learning_rate": 0.00046334999999999996, + "loss": 2.2768, "step": 2935 }, { "epoch": 4.6975999999999996, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8201396465301514, + "learning_rate": 0.0004632, + "loss": 1.6918, "step": 2936 }, { "epoch": 4.6992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3550063371658325, + "learning_rate": 0.00046305, + "loss": 1.9115, "step": 2937 }, { "epoch": 4.7008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.885533332824707, + "learning_rate": 0.0004628999999999999, + "loss": 1.8377, "step": 2938 }, { "epoch": 4.7024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.513812065124512, + "learning_rate": 0.00046274999999999995, + "loss": 2.2139, "step": 2939 }, { "epoch": 4.704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.13215970993042, + "learning_rate": 0.00046259999999999997, + "loss": 2.3479, "step": 2940 }, { "epoch": 4.7056000000000004, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0954887866973877, + "learning_rate": 0.00046245, + "loss": 1.9406, "step": 2941 }, { "epoch": 4.7072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6229615211486816, + "learning_rate": 0.00046229999999999996, + "loss": 1.9179, "step": 2942 }, { "epoch": 4.7088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.992983341217041, + "learning_rate": 0.00046214999999999993, + "loss": 1.7419, "step": 2943 }, { "epoch": 4.7104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7511937618255615, + "learning_rate": 0.00046199999999999995, + "loss": 2.2057, "step": 2944 }, { "epoch": 4.712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7940322160720825, + "learning_rate": 0.00046185, + "loss": 2.1836, "step": 2945 }, { "epoch": 4.7136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.443817615509033, + "learning_rate": 0.00046169999999999995, + "loss": 1.7678, "step": 2946 }, { "epoch": 4.7152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4557013511657715, + "learning_rate": 0.00046154999999999997, + "loss": 1.7645, "step": 2947 }, { "epoch": 4.7168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7131155729293823, + "learning_rate": 0.0004614, + "loss": 1.9185, "step": 2948 }, { "epoch": 4.7184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.7961368560791016, + "learning_rate": 0.00046124999999999996, + "loss": 2.414, "step": 2949 }, { "epoch": 4.72, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0249550342559814, + "learning_rate": 0.00046109999999999993, + "loss": 2.5392, "step": 2950 }, { "epoch": 4.7216000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 15.6370210647583, + "learning_rate": 0.00046094999999999996, + "loss": 2.5412, "step": 2951 }, { "epoch": 4.7232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 12.782587051391602, + "learning_rate": 0.0004608, + "loss": 2.6349, "step": 2952 }, { "epoch": 4.7248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.132637023925781, + "learning_rate": 0.00046065, + "loss": 2.2466, "step": 2953 }, { "epoch": 4.7264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 15.21384048461914, + "learning_rate": 0.0004604999999999999, + "loss": 2.4299, "step": 2954 }, { "epoch": 4.728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.304612159729004, + "learning_rate": 0.00046034999999999994, + "loss": 1.9851, "step": 2955 }, { "epoch": 4.7296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.457035541534424, + "learning_rate": 0.00046019999999999996, + "loss": 1.9655, "step": 2956 }, { "epoch": 4.7312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.9795613288879395, + "learning_rate": 0.00046005, + "loss": 1.9755, "step": 2957 }, { "epoch": 4.7328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.6748929023742676, + "learning_rate": 0.00045989999999999996, + "loss": 1.9936, "step": 2958 }, { "epoch": 4.7344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5004043579101562, + "learning_rate": 0.0004597499999999999, + "loss": 2.254, "step": 2959 }, { "epoch": 4.736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3329801559448242, + "learning_rate": 0.00045959999999999995, + "loss": 1.7511, "step": 2960 }, { "epoch": 4.7376000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.246700286865234, + "learning_rate": 0.00045945, + "loss": 1.6901, "step": 2961 }, { "epoch": 4.7392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.481882095336914, + "learning_rate": 0.00045929999999999994, + "loss": 2.0359, "step": 2962 }, { "epoch": 4.7408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 12.636874198913574, + "learning_rate": 0.00045914999999999997, + "loss": 2.1203, "step": 2963 }, { "epoch": 4.7424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.530386924743652, + "learning_rate": 0.000459, + "loss": 1.78, "step": 2964 }, { "epoch": 4.744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.78328275680542, + "learning_rate": 0.00045884999999999996, + "loss": 1.8293, "step": 2965 }, { "epoch": 4.7456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4835050106048584, + "learning_rate": 0.00045869999999999993, + "loss": 1.9996, "step": 2966 }, { "epoch": 4.7472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7223535776138306, + "learning_rate": 0.00045854999999999995, + "loss": 2.1396, "step": 2967 }, { "epoch": 4.7488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6066192388534546, + "learning_rate": 0.0004584, + "loss": 1.7808, "step": 2968 }, { "epoch": 4.7504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.09505558013916, + "learning_rate": 0.00045825, + "loss": 1.6402, "step": 2969 }, { "epoch": 4.752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1714239120483398, + "learning_rate": 0.0004580999999999999, + "loss": 1.8108, "step": 2970 }, { "epoch": 4.7536000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.506039619445801, + "learning_rate": 0.00045794999999999994, + "loss": 1.9285, "step": 2971 }, { "epoch": 4.7552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0846710205078125, + "learning_rate": 0.00045779999999999996, + "loss": 1.6564, "step": 2972 }, { "epoch": 4.7568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5491540431976318, + "learning_rate": 0.00045765, + "loss": 1.6533, "step": 2973 }, { "epoch": 4.7584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.191540241241455, + "learning_rate": 0.00045749999999999995, + "loss": 1.6933, "step": 2974 }, { "epoch": 4.76, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.992797374725342, + "learning_rate": 0.00045735, + "loss": 1.886, "step": 2975 }, { "epoch": 4.7616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2760454416275024, + "learning_rate": 0.00045719999999999995, + "loss": 1.9759, "step": 2976 }, { "epoch": 4.7632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1154367923736572, + "learning_rate": 0.00045704999999999997, + "loss": 1.8352, "step": 2977 }, { "epoch": 4.7648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2374048233032227, + "learning_rate": 0.00045689999999999994, + "loss": 2.0876, "step": 2978 }, { "epoch": 4.7664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.916267991065979, + "learning_rate": 0.00045674999999999996, + "loss": 1.8537, "step": 2979 }, { "epoch": 4.768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9522725343704224, + "learning_rate": 0.0004566, + "loss": 1.8254, "step": 2980 }, { "epoch": 4.7696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4546539783477783, + "learning_rate": 0.00045645, + "loss": 2.0879, "step": 2981 }, { "epoch": 4.7712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.203273296356201, + "learning_rate": 0.0004562999999999999, + "loss": 2.0429, "step": 2982 }, { "epoch": 4.7728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.57181978225708, + "learning_rate": 0.00045614999999999995, + "loss": 2.3272, "step": 2983 }, { "epoch": 4.7744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5816924571990967, + "learning_rate": 0.00045599999999999997, + "loss": 1.9874, "step": 2984 }, { "epoch": 4.776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3904832601547241, + "learning_rate": 0.00045585, + "loss": 1.8478, "step": 2985 }, { "epoch": 4.7776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9760186672210693, + "learning_rate": 0.0004556999999999999, + "loss": 2.0731, "step": 2986 }, { "epoch": 4.7792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3998148441314697, + "learning_rate": 0.00045554999999999993, + "loss": 2.1853, "step": 2987 }, { "epoch": 4.7808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3389252424240112, + "learning_rate": 0.00045539999999999996, + "loss": 2.0391, "step": 2988 }, { "epoch": 4.7824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.23905086517334, + "learning_rate": 0.00045525, + "loss": 2.2752, "step": 2989 }, { "epoch": 4.784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6538360118865967, + "learning_rate": 0.00045509999999999995, + "loss": 1.9852, "step": 2990 }, { "epoch": 4.7856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.154603004455566, + "learning_rate": 0.00045494999999999997, + "loss": 1.9189, "step": 2991 }, { "epoch": 4.7872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.789768695831299, + "learning_rate": 0.00045479999999999994, + "loss": 2.0853, "step": 2992 }, { "epoch": 4.7888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.0123066902160645, + "learning_rate": 0.00045464999999999997, + "loss": 2.1091, "step": 2993 }, { "epoch": 4.7904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.259772777557373, + "learning_rate": 0.00045449999999999993, + "loss": 1.8027, "step": 2994 }, { "epoch": 4.792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9710586071014404, + "learning_rate": 0.00045434999999999996, + "loss": 2.2043, "step": 2995 }, { "epoch": 4.7936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4823061227798462, + "learning_rate": 0.0004542, + "loss": 1.6574, "step": 2996 }, { "epoch": 4.7952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.3044240474700928, + "learning_rate": 0.00045405, + "loss": 2.3636, "step": 2997 }, { "epoch": 4.7968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.694255828857422, + "learning_rate": 0.0004538999999999999, + "loss": 2.4637, "step": 2998 }, { "epoch": 4.7984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.3031346797943115, + "learning_rate": 0.00045374999999999994, + "loss": 2.1207, "step": 2999 }, { "epoch": 4.8, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5991824865341187, + "learning_rate": 0.00045359999999999997, + "loss": 2.3755, "step": 3000 }, { "epoch": 4.8, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 160.2639, - "eval_samples_per_second": 19.568, + "eval_cer": 0.4432534778969187, + "eval_loss": 2.37933349609375, + "eval_runtime": 160.2371, + "eval_samples_per_second": 19.571, "eval_steps_per_second": 1.223, - "eval_wer": 1.0, + "eval_wer": 0.7270127447932857, "step": 3000 }, { "epoch": 4.8016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 24.620410919189453, + "learning_rate": 0.00045345, + "loss": 3.1706, "step": 3001 }, { "epoch": 4.8032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.307674407958984, + "learning_rate": 0.0004532999999999999, + "loss": 2.1999, "step": 3002 }, { "epoch": 4.8048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 23.469606399536133, + "learning_rate": 0.00045314999999999993, + "loss": 3.5088, "step": 3003 }, { "epoch": 4.8064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.592309832572937, + "learning_rate": 0.00045299999999999995, + "loss": 2.1195, "step": 3004 }, { "epoch": 4.808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.467976093292236, + "learning_rate": 0.00045285, + "loss": 2.3242, "step": 3005 }, { "epoch": 4.8096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7775919437408447, + "learning_rate": 0.00045269999999999994, + "loss": 1.9178, "step": 3006 }, { "epoch": 4.8112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.592406272888184, + "learning_rate": 0.00045254999999999997, + "loss": 2.5356, "step": 3007 }, { "epoch": 4.8128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.714765548706055, + "learning_rate": 0.00045239999999999994, + "loss": 1.8963, "step": 3008 }, { "epoch": 4.8144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2107770442962646, + "learning_rate": 0.00045224999999999996, + "loss": 1.5007, "step": 3009 }, { "epoch": 4.816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.1892650127410889, + "learning_rate": 0.00045209999999999993, + "loss": 1.7101, "step": 3010 }, { "epoch": 4.8176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.4028902053833, + "learning_rate": 0.00045194999999999995, + "loss": 2.0486, "step": 3011 }, { "epoch": 4.8192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.556225299835205, + "learning_rate": 0.0004518, + "loss": 1.8077, "step": 3012 }, { "epoch": 4.8208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.0722336769104, + "learning_rate": 0.00045165, + "loss": 1.9737, "step": 3013 }, { "epoch": 4.8224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.667191505432129, + "learning_rate": 0.0004514999999999999, + "loss": 1.7824, "step": 3014 }, { "epoch": 4.824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2074222564697266, + "learning_rate": 0.00045134999999999994, + "loss": 1.8937, "step": 3015 }, { "epoch": 4.8256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.051270484924316, + "learning_rate": 0.00045119999999999996, + "loss": 1.846, "step": 3016 }, { "epoch": 4.8272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.784696340560913, + "learning_rate": 0.00045105, + "loss": 1.7694, "step": 3017 }, { "epoch": 4.8288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.480818748474121, + "learning_rate": 0.0004508999999999999, + "loss": 1.9753, "step": 3018 }, { "epoch": 4.8304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1542861461639404, + "learning_rate": 0.0004507499999999999, + "loss": 1.6415, "step": 3019 }, { "epoch": 4.832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.286095142364502, + "learning_rate": 0.00045059999999999995, + "loss": 2.0295, "step": 3020 }, { "epoch": 4.8336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7635704278945923, + "learning_rate": 0.00045044999999999997, + "loss": 1.6449, "step": 3021 }, { "epoch": 4.8352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3237788677215576, + "learning_rate": 0.00045029999999999994, + "loss": 2.4472, "step": 3022 }, { "epoch": 4.8368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.521028995513916, + "learning_rate": 0.00045014999999999996, + "loss": 1.9126, "step": 3023 }, { "epoch": 4.8384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.178478717803955, + "learning_rate": 0.00045, + "loss": 2.2155, "step": 3024 }, { "epoch": 4.84, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7699393033981323, + "learning_rate": 0.00044984999999999996, + "loss": 1.9071, "step": 3025 }, { "epoch": 4.8416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4859039783477783, + "learning_rate": 0.0004497, + "loss": 1.7341, "step": 3026 }, { "epoch": 4.8431999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2301384210586548, + "learning_rate": 0.00044954999999999995, + "loss": 1.6586, "step": 3027 }, { "epoch": 4.8448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.274512529373169, + "learning_rate": 0.0004494, + "loss": 2.1292, "step": 3028 }, { "epoch": 4.8464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0131479501724243, + "learning_rate": 0.00044925, + "loss": 1.7007, "step": 3029 }, { "epoch": 4.848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.0874526500701904, + "learning_rate": 0.0004491, + "loss": 1.692, "step": 3030 }, { "epoch": 4.8496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2531495094299316, + "learning_rate": 0.00044894999999999994, + "loss": 1.7062, "step": 3031 }, { "epoch": 4.8512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9190071821212769, + "learning_rate": 0.00044879999999999996, + "loss": 1.8297, "step": 3032 }, { "epoch": 4.8528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5107418298721313, + "learning_rate": 0.00044865, + "loss": 1.8974, "step": 3033 }, { "epoch": 4.8544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.797076940536499, + "learning_rate": 0.0004485, + "loss": 2.1044, "step": 3034 }, { "epoch": 4.856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9107513427734375, + "learning_rate": 0.0004483499999999999, + "loss": 1.901, "step": 3035 }, { "epoch": 4.8576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.097909927368164, + "learning_rate": 0.00044819999999999994, + "loss": 1.984, "step": 3036 }, { "epoch": 4.8591999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7260329723358154, + "learning_rate": 0.00044804999999999997, + "loss": 1.6391, "step": 3037 }, { "epoch": 4.8608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.3167763948440552, + "learning_rate": 0.0004479, + "loss": 1.8456, "step": 3038 }, { "epoch": 4.8624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.121819019317627, + "learning_rate": 0.00044774999999999996, + "loss": 2.1833, "step": 3039 }, { "epoch": 4.864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3631556034088135, + "learning_rate": 0.0004476, + "loss": 1.5566, "step": 3040 }, { "epoch": 4.8656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2354460954666138, + "learning_rate": 0.00044744999999999995, + "loss": 1.9648, "step": 3041 }, { "epoch": 4.8672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4966917037963867, + "learning_rate": 0.0004473, + "loss": 1.7334, "step": 3042 }, { "epoch": 4.8688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9379713535308838, + "learning_rate": 0.00044714999999999995, + "loss": 2.5152, "step": 3043 }, { "epoch": 4.8704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.353835105895996, + "learning_rate": 0.00044699999999999997, + "loss": 1.9858, "step": 3044 }, { "epoch": 4.872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.308361291885376, + "learning_rate": 0.00044685, + "loss": 1.7178, "step": 3045 }, { "epoch": 4.8736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9239517450332642, + "learning_rate": 0.0004467, + "loss": 2.0712, "step": 3046 }, { "epoch": 4.8751999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6877199411392212, + "learning_rate": 0.00044654999999999993, + "loss": 1.9809, "step": 3047 }, { "epoch": 4.8768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.460870742797852, + "learning_rate": 0.00044639999999999995, + "loss": 2.3111, "step": 3048 }, { "epoch": 4.8784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2702107429504395, + "learning_rate": 0.00044625, + "loss": 2.2167, "step": 3049 }, { "epoch": 4.88, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.00044625, + "loss": 2.7315, "step": 3050 }, { "epoch": 4.8816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 16.57255744934082, + "learning_rate": 0.0004461, + "loss": 2.9102, "step": 3051 }, { "epoch": 4.8832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.352267265319824, + "learning_rate": 0.0004459499999999999, + "loss": 2.5403, "step": 3052 }, { "epoch": 4.8848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.948455810546875, + "learning_rate": 0.00044579999999999994, + "loss": 2.0783, "step": 3053 }, { "epoch": 4.8864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.270328998565674, + "learning_rate": 0.00044564999999999996, + "loss": 2.7684, "step": 3054 }, { "epoch": 4.888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1044552326202393, + "learning_rate": 0.0004455, + "loss": 2.26, "step": 3055 }, { "epoch": 4.8896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.126490592956543, + "learning_rate": 0.00044534999999999996, + "loss": 1.9853, "step": 3056 }, { "epoch": 4.8911999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.923595905303955, + "learning_rate": 0.0004452, + "loss": 2.5213, "step": 3057 }, { "epoch": 4.8928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.534145832061768, + "learning_rate": 0.00044504999999999995, + "loss": 1.9294, "step": 3058 }, { "epoch": 4.8944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.250170707702637, + "learning_rate": 0.00044489999999999997, + "loss": 2.1167, "step": 3059 }, { "epoch": 4.896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8796231746673584, + "learning_rate": 0.00044474999999999994, + "loss": 2.0915, "step": 3060 }, { "epoch": 4.8976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8455438613891602, + "learning_rate": 0.00044459999999999996, + "loss": 2.5508, "step": 3061 }, { "epoch": 4.8992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.386833190917969, + "learning_rate": 0.00044445, + "loss": 1.7572, "step": 3062 }, { "epoch": 4.9008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.531754016876221, + "learning_rate": 0.0004443, + "loss": 1.9147, "step": 3063 }, { "epoch": 4.9024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6494725942611694, + "learning_rate": 0.0004441499999999999, + "loss": 1.7609, "step": 3064 }, { "epoch": 4.904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8058489561080933, + "learning_rate": 0.00044399999999999995, + "loss": 1.8494, "step": 3065 }, { "epoch": 4.9056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1291446685791016, + "learning_rate": 0.00044385, + "loss": 1.6648, "step": 3066 }, { "epoch": 4.9072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5931782722473145, + "learning_rate": 0.0004437, + "loss": 1.8376, "step": 3067 }, { "epoch": 4.9088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.7532620429992676, + "learning_rate": 0.0004435499999999999, + "loss": 1.7616, "step": 3068 }, { "epoch": 4.9104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9374960660934448, + "learning_rate": 0.00044339999999999994, + "loss": 1.8737, "step": 3069 }, { "epoch": 4.912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7475881576538086, + "learning_rate": 0.00044324999999999996, + "loss": 1.9106, "step": 3070 }, { "epoch": 4.9136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.425243854522705, + "learning_rate": 0.0004431, + "loss": 1.5245, "step": 3071 }, { "epoch": 4.9152000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2945805788040161, + "learning_rate": 0.00044294999999999995, + "loss": 1.8072, "step": 3072 }, { "epoch": 4.9168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6314753293991089, + "learning_rate": 0.0004428, + "loss": 2.1144, "step": 3073 }, { "epoch": 4.9184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.375238060951233, + "learning_rate": 0.00044264999999999994, + "loss": 2.2158, "step": 3074 }, { "epoch": 4.92, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8653980493545532, + "learning_rate": 0.00044249999999999997, + "loss": 1.9081, "step": 3075 }, { "epoch": 4.9216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.507571697235107, + "learning_rate": 0.00044234999999999994, + "loss": 1.8373, "step": 3076 }, { "epoch": 4.9232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9098926782608032, + "learning_rate": 0.00044219999999999996, + "loss": 1.7228, "step": 3077 }, { "epoch": 4.9248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.42020845413208, + "learning_rate": 0.00044205, + "loss": 1.6555, "step": 3078 }, { "epoch": 4.9264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.94321346282959, + "learning_rate": 0.0004419, + "loss": 1.7539, "step": 3079 }, { "epoch": 4.928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5416208505630493, + "learning_rate": 0.0004417499999999999, + "loss": 1.4654, "step": 3080 }, { "epoch": 4.9296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7197470664978027, + "learning_rate": 0.00044159999999999995, + "loss": 2.085, "step": 3081 }, { "epoch": 4.9312000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6691932678222656, + "learning_rate": 0.00044144999999999997, + "loss": 2.0641, "step": 3082 }, { "epoch": 4.9328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6235414743423462, + "learning_rate": 0.0004413, + "loss": 2.285, "step": 3083 }, { "epoch": 4.9344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4483048915863037, + "learning_rate": 0.00044114999999999996, + "loss": 2.0029, "step": 3084 }, { "epoch": 4.936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2496280670166016, + "learning_rate": 0.00044099999999999993, + "loss": 1.6248, "step": 3085 }, { "epoch": 4.9376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6654266119003296, + "learning_rate": 0.00044084999999999996, + "loss": 2.0614, "step": 3086 }, { "epoch": 4.9392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0901358127593994, + "learning_rate": 0.0004407, + "loss": 2.2341, "step": 3087 }, { "epoch": 4.9408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6806238889694214, + "learning_rate": 0.00044054999999999995, + "loss": 1.7547, "step": 3088 }, { "epoch": 4.9424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5384247303009033, + "learning_rate": 0.00044039999999999997, + "loss": 1.7914, "step": 3089 }, { "epoch": 4.944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.296206951141357, + "learning_rate": 0.00044025, + "loss": 1.9357, "step": 3090 }, { "epoch": 4.9456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6634535789489746, + "learning_rate": 0.00044009999999999996, + "loss": 1.9188, "step": 3091 }, { "epoch": 4.9472000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.649627923965454, + "learning_rate": 0.00043994999999999993, + "loss": 1.9103, "step": 3092 }, { "epoch": 4.9488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5803511142730713, + "learning_rate": 0.00043979999999999996, + "loss": 1.8764, "step": 3093 }, { "epoch": 4.9504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5392707586288452, + "learning_rate": 0.00043965, + "loss": 2.0883, "step": 3094 }, { "epoch": 4.952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.35639488697052, + "learning_rate": 0.0004395, + "loss": 2.2488, "step": 3095 }, { "epoch": 4.9536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5602680444717407, + "learning_rate": 0.0004393499999999999, + "loss": 1.6585, "step": 3096 }, { "epoch": 4.9552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.285512924194336, + "learning_rate": 0.00043919999999999994, + "loss": 2.3513, "step": 3097 }, { "epoch": 4.9568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6870580911636353, + "learning_rate": 0.00043904999999999997, + "loss": 2.8381, "step": 3098 }, { "epoch": 4.9584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5827438831329346, + "learning_rate": 0.0004389, + "loss": 2.2782, "step": 3099 }, { "epoch": 4.96, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 3100 - }, - { - "epoch": 4.96, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 156.866, - "eval_samples_per_second": 19.992, - "eval_steps_per_second": 1.249, - "eval_wer": 1.0, + "learning_rate": 0.0004389, + "loss": 1.7011, "step": 3100 }, { "epoch": 4.9616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.932778358459473, + "learning_rate": 0.00043874999999999996, + "loss": 2.8354, "step": 3101 }, { "epoch": 4.9632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.474605560302734, + "learning_rate": 0.00043859999999999993, + "loss": 3.0243, "step": 3102 }, { "epoch": 4.9648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.513943195343018, + "learning_rate": 0.00043844999999999995, + "loss": 2.1145, "step": 3103 }, { "epoch": 4.9664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.522618293762207, + "learning_rate": 0.0004383, + "loss": 2.5017, "step": 3104 }, { "epoch": 4.968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.3348541259765625, + "learning_rate": 0.00043814999999999994, + "loss": 1.8663, "step": 3105 }, { "epoch": 4.9696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.547016143798828, + "learning_rate": 0.00043799999999999997, + "loss": 1.9593, "step": 3106 }, { "epoch": 4.9712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8356808423995972, + "learning_rate": 0.00043785, + "loss": 1.6838, "step": 3107 }, { "epoch": 4.9728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.7317214012145996, + "learning_rate": 0.00043769999999999996, + "loss": 1.8651, "step": 3108 }, { "epoch": 4.9744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4681026935577393, + "learning_rate": 0.00043754999999999993, + "loss": 2.0261, "step": 3109 }, { "epoch": 4.976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7375612258911133, + "learning_rate": 0.00043739999999999995, + "loss": 1.9152, "step": 3110 }, { "epoch": 4.9776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.383476972579956, + "learning_rate": 0.00043725, + "loss": 2.047, "step": 3111 }, { "epoch": 4.9792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.8656511306762695, + "learning_rate": 0.0004371, + "loss": 2.43, "step": 3112 }, { "epoch": 4.9808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.090378761291504, + "learning_rate": 0.0004369499999999999, + "loss": 2.1424, "step": 3113 }, { "epoch": 4.9824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.096062421798706, + "learning_rate": 0.00043679999999999994, + "loss": 1.7151, "step": 3114 }, { "epoch": 4.984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4025139808654785, + "learning_rate": 0.00043664999999999996, + "loss": 1.809, "step": 3115 }, { "epoch": 4.9856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3236799240112305, + "learning_rate": 0.0004365, + "loss": 2.0313, "step": 3116 }, { "epoch": 4.9872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.579554557800293, + "learning_rate": 0.00043634999999999995, + "loss": 1.7257, "step": 3117 }, { "epoch": 4.9888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1674931049346924, + "learning_rate": 0.0004361999999999999, + "loss": 2.0792, "step": 3118 }, { "epoch": 4.9904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9446090459823608, + "learning_rate": 0.00043604999999999995, + "loss": 1.8795, "step": 3119 }, { "epoch": 4.992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.445674419403076, + "learning_rate": 0.00043589999999999997, + "loss": 2.0259, "step": 3120 }, { "epoch": 4.9936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.725457668304443, + "learning_rate": 0.00043574999999999994, + "loss": 2.0193, "step": 3121 }, { "epoch": 4.9952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.952589988708496, + "learning_rate": 0.00043559999999999996, + "loss": 2.2579, "step": 3122 }, { "epoch": 4.9968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.328230142593384, + "learning_rate": 0.00043545, + "loss": 2.8075, "step": 3123 }, { "epoch": 4.9984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.565820574760437, + "learning_rate": 0.00043529999999999996, + "loss": 2.2939, "step": 3124 }, { "epoch": 5.0, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.119185209274292, + "learning_rate": 0.0004351499999999999, + "loss": 2.4498, "step": 3125 }, { "epoch": 5.0016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.970066547393799, + "learning_rate": 0.00043499999999999995, + "loss": 2.4477, "step": 3126 }, { "epoch": 5.0032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.783778190612793, + "learning_rate": 0.00043484999999999997, + "loss": 2.3359, "step": 3127 }, { "epoch": 5.0048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 19.11392593383789, + "learning_rate": 0.0004347, + "loss": 2.7559, "step": 3128 }, { "epoch": 5.0064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7798497676849365, + "learning_rate": 0.0004345499999999999, + "loss": 2.0971, "step": 3129 }, { "epoch": 5.008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.793581962585449, + "learning_rate": 0.00043439999999999993, + "loss": 2.6192, "step": 3130 }, { "epoch": 5.0096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.100231170654297, + "learning_rate": 0.00043424999999999996, + "loss": 1.9119, "step": 3131 }, { "epoch": 5.0112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.948700904846191, + "learning_rate": 0.0004341, + "loss": 2.3903, "step": 3132 }, { "epoch": 5.0128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.955105781555176, + "learning_rate": 0.00043394999999999995, + "loss": 2.0834, "step": 3133 }, { "epoch": 5.0144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.116028070449829, + "learning_rate": 0.0004338, + "loss": 1.9087, "step": 3134 }, { "epoch": 5.016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.022517442703247, + "learning_rate": 0.00043364999999999994, + "loss": 1.8392, "step": 3135 }, { "epoch": 5.0176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.67773962020874, + "learning_rate": 0.00043349999999999997, + "loss": 1.9662, "step": 3136 }, { "epoch": 5.0192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.792007446289062, + "learning_rate": 0.00043334999999999994, + "loss": 2.2113, "step": 3137 }, { "epoch": 5.0208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.263298034667969, + "learning_rate": 0.00043319999999999996, + "loss": 1.8671, "step": 3138 }, { "epoch": 5.0224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 11.039977073669434, + "learning_rate": 0.00043305, + "loss": 2.3769, "step": 3139 }, { "epoch": 5.024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.120280265808105, + "learning_rate": 0.0004329, + "loss": 2.0752, "step": 3140 }, { "epoch": 5.0256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.3145546913146973, + "learning_rate": 0.0004327499999999999, + "loss": 1.9827, "step": 3141 }, { "epoch": 5.0272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8486971855163574, + "learning_rate": 0.00043259999999999994, + "loss": 1.7318, "step": 3142 }, { "epoch": 5.0288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 11.292924880981445, + "learning_rate": 0.00043244999999999997, + "loss": 2.1877, "step": 3143 }, { "epoch": 5.0304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.601168632507324, + "learning_rate": 0.0004323, + "loss": 1.7993, "step": 3144 }, { "epoch": 5.032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0242254734039307, + "learning_rate": 0.0004321499999999999, + "loss": 1.6663, "step": 3145 }, { "epoch": 5.0336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0984086990356445, + "learning_rate": 0.00043199999999999993, + "loss": 1.9235, "step": 3146 }, { "epoch": 5.0352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6991688013076782, + "learning_rate": 0.00043184999999999995, + "loss": 1.6364, "step": 3147 }, { "epoch": 5.0368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7893872261047363, + "learning_rate": 0.0004317, + "loss": 2.1728, "step": 3148 }, { "epoch": 5.0384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.034099102020264, + "learning_rate": 0.00043154999999999995, + "loss": 2.162, "step": 3149 }, { "epoch": 5.04, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.309537410736084, + "learning_rate": 0.00043139999999999997, + "loss": 1.7779, "step": 3150 }, { "epoch": 5.0416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.693896532058716, + "learning_rate": 0.00043124999999999994, + "loss": 1.6807, "step": 3151 }, { "epoch": 5.0432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.089795112609863, + "learning_rate": 0.00043109999999999996, + "loss": 1.9001, "step": 3152 }, { "epoch": 5.0448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.010385513305664, + "learning_rate": 0.00043095, + "loss": 1.5703, "step": 3153 }, { "epoch": 5.0464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8490614891052246, + "learning_rate": 0.00043079999999999995, + "loss": 1.9197, "step": 3154 }, { "epoch": 5.048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.640694499015808, + "learning_rate": 0.00043065, + "loss": 1.6976, "step": 3155 }, { "epoch": 5.0496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.294865846633911, + "learning_rate": 0.0004305, + "loss": 1.8106, "step": 3156 }, { "epoch": 5.0512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1499030590057373, + "learning_rate": 0.00043034999999999997, + "loss": 2.1928, "step": 3157 }, { "epoch": 5.0528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.309077739715576, + "learning_rate": 0.00043019999999999994, + "loss": 1.7319, "step": 3158 }, { "epoch": 5.0544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.6867856979370117, + "learning_rate": 0.00043004999999999996, + "loss": 2.0171, "step": 3159 }, { "epoch": 5.056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8424367904663086, + "learning_rate": 0.0004299, + "loss": 2.0746, "step": 3160 }, { "epoch": 5.0576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5826704502105713, + "learning_rate": 0.00042975, + "loss": 2.0328, "step": 3161 }, { "epoch": 5.0592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.853431224822998, + "learning_rate": 0.0004295999999999999, + "loss": 1.9964, "step": 3162 }, { "epoch": 5.0608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6717617511749268, + "learning_rate": 0.00042944999999999995, + "loss": 2.0824, "step": 3163 }, { "epoch": 5.0624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.649266004562378, + "learning_rate": 0.00042929999999999997, + "loss": 2.3392, "step": 3164 }, { "epoch": 5.064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8702266216278076, + "learning_rate": 0.00042915, + "loss": 2.1132, "step": 3165 }, { "epoch": 5.0656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.922991752624512, + "learning_rate": 0.00042899999999999997, + "loss": 2.4097, "step": 3166 }, { "epoch": 5.0672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.757828712463379, + "learning_rate": 0.00042884999999999993, + "loss": 2.3162, "step": 3167 }, { "epoch": 5.0688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.166485071182251, + "learning_rate": 0.00042869999999999996, + "loss": 2.0998, "step": 3168 }, { "epoch": 5.0704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8233377933502197, + "learning_rate": 0.00042855, + "loss": 2.1253, "step": 3169 }, { "epoch": 5.072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8624346256256104, + "learning_rate": 0.00042839999999999995, + "loss": 2.0939, "step": 3170 }, { "epoch": 5.0736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.785826683044434, + "learning_rate": 0.00042825, + "loss": 2.0271, "step": 3171 }, { "epoch": 5.0752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.498620510101318, + "learning_rate": 0.0004281, + "loss": 2.0795, "step": 3172 }, { "epoch": 5.0768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1600935459136963, + "learning_rate": 0.00042794999999999997, + "loss": 2.3353, "step": 3173 }, { "epoch": 5.0784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.206274032592773, + "learning_rate": 0.00042779999999999994, + "loss": 2.7881, "step": 3174 }, { "epoch": 5.08, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.14153790473938, + "learning_rate": 0.00042764999999999996, + "loss": 2.886, "step": 3175 }, { "epoch": 5.0816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.464966773986816, + "learning_rate": 0.0004275, + "loss": 2.7726, "step": 3176 }, { "epoch": 5.0832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2620949745178223, + "learning_rate": 0.00042735, + "loss": 2.3795, "step": 3177 }, { "epoch": 5.0848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8024942874908447, + "learning_rate": 0.0004271999999999999, + "loss": 2.4685, "step": 3178 }, { "epoch": 5.0864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.8808012008667, + "learning_rate": 0.00042704999999999994, + "loss": 3.0535, "step": 3179 }, { "epoch": 5.088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9684244394302368, + "learning_rate": 0.00042689999999999997, + "loss": 2.4331, "step": 3180 }, { "epoch": 5.0896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.007668495178223, + "learning_rate": 0.00042675, + "loss": 2.2903, "step": 3181 }, { "epoch": 5.0912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.786413192749023, + "learning_rate": 0.00042659999999999996, + "loss": 2.0783, "step": 3182 }, { "epoch": 5.0928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8741869926452637, + "learning_rate": 0.00042645, + "loss": 2.7336, "step": 3183 }, { "epoch": 5.0944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.79109001159668, + "learning_rate": 0.00042629999999999995, + "loss": 2.399, "step": 3184 }, { "epoch": 5.096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3625593185424805, + "learning_rate": 0.00042615, + "loss": 1.9723, "step": 3185 }, { "epoch": 5.0976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4920530319213867, + "learning_rate": 0.00042599999999999995, + "loss": 2.0308, "step": 3186 }, { "epoch": 5.0992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7185099124908447, + "learning_rate": 0.00042584999999999997, + "loss": 2.0599, "step": 3187 }, { "epoch": 5.1008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5361008644104004, + "learning_rate": 0.0004257, + "loss": 1.9269, "step": 3188 }, { "epoch": 5.1024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.932170033454895, + "learning_rate": 0.00042555, + "loss": 2.0408, "step": 3189 }, { "epoch": 5.104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8861477375030518, + "learning_rate": 0.00042539999999999993, + "loss": 2.1108, "step": 3190 }, { "epoch": 5.1056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9711568355560303, + "learning_rate": 0.00042524999999999996, + "loss": 1.92, "step": 3191 }, { "epoch": 5.1072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2188079357147217, + "learning_rate": 0.0004251, + "loss": 2.0846, "step": 3192 }, { "epoch": 5.1088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8576302528381348, + "learning_rate": 0.00042495, + "loss": 2.1029, "step": 3193 }, { "epoch": 5.1104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.3662407398223877, + "learning_rate": 0.0004247999999999999, + "loss": 1.9164, "step": 3194 }, { "epoch": 5.112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2516047954559326, + "learning_rate": 0.00042464999999999994, + "loss": 1.779, "step": 3195 }, { "epoch": 5.1136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.81559419631958, + "learning_rate": 0.00042449999999999996, + "loss": 1.7815, "step": 3196 }, { "epoch": 5.1152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.766302108764648, + "learning_rate": 0.00042435, + "loss": 2.1049, "step": 3197 }, { "epoch": 5.1168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.273326873779297, + "learning_rate": 0.00042419999999999996, + "loss": 2.5158, "step": 3198 }, { "epoch": 5.1184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.278658866882324, + "learning_rate": 0.00042405, + "loss": 2.0045, "step": 3199 }, { "epoch": 5.12, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 3200 - }, - { - "epoch": 5.12, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 157.8432, - "eval_samples_per_second": 19.868, - "eval_steps_per_second": 1.242, - "eval_wer": 1.0, + "grad_norm": 1.9064677953720093, + "learning_rate": 0.00042389999999999995, + "loss": 2.1329, "step": 3200 }, { "epoch": 5.1216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0119874477386475, + "learning_rate": 0.00042375, + "loss": 1.7553, "step": 3201 }, { "epoch": 5.1232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9547804594039917, + "learning_rate": 0.00042359999999999994, + "loss": 1.8721, "step": 3202 }, { "epoch": 5.1248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.477467656135559, + "learning_rate": 0.00042344999999999997, + "loss": 1.98, "step": 3203 }, { "epoch": 5.1264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.984140396118164, + "learning_rate": 0.0004233, + "loss": 2.4288, "step": 3204 }, { "epoch": 5.128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.811163067817688, + "learning_rate": 0.00042315, + "loss": 2.2058, "step": 3205 }, { "epoch": 5.1296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.115196704864502, + "learning_rate": 0.00042299999999999993, + "loss": 1.8964, "step": 3206 }, { "epoch": 5.1312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3901374340057373, + "learning_rate": 0.00042284999999999995, + "loss": 2.0045, "step": 3207 }, { "epoch": 5.1328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.495665550231934, + "learning_rate": 0.0004227, + "loss": 1.905, "step": 3208 }, { "epoch": 5.1344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.345617294311523, + "learning_rate": 0.00042255, + "loss": 2.1364, "step": 3209 }, { "epoch": 5.136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.81538987159729, + "learning_rate": 0.0004223999999999999, + "loss": 2.1264, "step": 3210 }, { "epoch": 5.1376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2732255458831787, + "learning_rate": 0.00042224999999999994, + "loss": 2.0875, "step": 3211 }, { "epoch": 5.1392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7457600831985474, + "learning_rate": 0.00042209999999999996, + "loss": 1.9938, "step": 3212 }, { "epoch": 5.1408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7632317543029785, + "learning_rate": 0.00042195, + "loss": 2.0523, "step": 3213 }, { "epoch": 5.1424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.035547733306885, + "learning_rate": 0.00042179999999999995, + "loss": 2.2168, "step": 3214 }, { "epoch": 5.144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.828839659690857, + "learning_rate": 0.00042165, + "loss": 3.0716, "step": 3215 }, { "epoch": 5.1456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.513362407684326, + "learning_rate": 0.00042149999999999995, + "loss": 2.0964, "step": 3216 }, { "epoch": 5.1472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.751229286193848, + "learning_rate": 0.00042134999999999997, + "loss": 1.8053, "step": 3217 }, { "epoch": 5.1488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.622213125228882, + "learning_rate": 0.00042119999999999994, + "loss": 1.9891, "step": 3218 }, { "epoch": 5.1504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1266136169433594, + "learning_rate": 0.00042104999999999996, + "loss": 2.2839, "step": 3219 }, { "epoch": 5.152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8702733516693115, + "learning_rate": 0.0004209, + "loss": 1.7316, "step": 3220 }, { "epoch": 5.1536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3464245796203613, + "learning_rate": 0.00042075, + "loss": 2.4212, "step": 3221 }, { "epoch": 5.1552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9818917512893677, + "learning_rate": 0.0004205999999999999, + "loss": 2.3367, "step": 3222 }, { "epoch": 5.1568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5446746349334717, + "learning_rate": 0.00042044999999999995, + "loss": 1.878, "step": 3223 }, { "epoch": 5.1584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.786088943481445, + "learning_rate": 0.00042029999999999997, + "loss": 2.8948, "step": 3224 }, { "epoch": 5.16, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.00042029999999999997, + "loss": 2.5519, "step": 3225 }, { "epoch": 5.1616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.343994140625, + "learning_rate": 0.00042015, + "loss": 2.784, "step": 3226 }, { "epoch": 5.1632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.6946587562561035, + "learning_rate": 0.00041999999999999996, + "loss": 2.3843, "step": 3227 }, { "epoch": 5.1648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.742859125137329, + "learning_rate": 0.00041984999999999993, + "loss": 2.1749, "step": 3228 }, { "epoch": 5.1664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.633478164672852, + "learning_rate": 0.00041969999999999996, + "loss": 2.3875, "step": 3229 }, { "epoch": 5.168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2521109580993652, + "learning_rate": 0.00041955, + "loss": 2.3516, "step": 3230 }, { "epoch": 5.1696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 21.70244598388672, + "learning_rate": 0.00041939999999999995, + "loss": 2.8914, "step": 3231 }, { "epoch": 5.1712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.249197244644165, + "learning_rate": 0.00041924999999999997, + "loss": 2.1809, "step": 3232 }, { "epoch": 5.1728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.294630765914917, + "learning_rate": 0.0004191, + "loss": 2.3573, "step": 3233 }, { "epoch": 5.1744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.855384349822998, + "learning_rate": 0.00041894999999999996, + "loss": 2.3276, "step": 3234 }, { "epoch": 5.176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1429007053375244, + "learning_rate": 0.00041879999999999993, + "loss": 1.8539, "step": 3235 }, { "epoch": 5.1776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8854267597198486, + "learning_rate": 0.00041864999999999996, + "loss": 2.1204, "step": 3236 }, { "epoch": 5.1792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.373618125915527, + "learning_rate": 0.0004185, + "loss": 2.8097, "step": 3237 }, { "epoch": 5.1808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.8381149768829346, + "learning_rate": 0.00041835, + "loss": 1.9282, "step": 3238 }, { "epoch": 5.1824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.196768045425415, + "learning_rate": 0.0004181999999999999, + "loss": 2.1187, "step": 3239 }, { "epoch": 5.184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.200750350952148, + "learning_rate": 0.00041804999999999994, + "loss": 1.8194, "step": 3240 }, { "epoch": 5.1856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9149186611175537, + "learning_rate": 0.00041789999999999997, + "loss": 2.2479, "step": 3241 }, { "epoch": 5.1872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7128161191940308, + "learning_rate": 0.00041775, + "loss": 1.8903, "step": 3242 }, { "epoch": 5.1888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5912396907806396, + "learning_rate": 0.00041759999999999996, + "loss": 2.1393, "step": 3243 }, { "epoch": 5.1904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.645106315612793, + "learning_rate": 0.00041744999999999993, + "loss": 1.9836, "step": 3244 }, { "epoch": 5.192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.632805347442627, + "learning_rate": 0.00041729999999999995, + "loss": 1.9239, "step": 3245 }, { "epoch": 5.1936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.526568412780762, + "learning_rate": 0.00041715, + "loss": 2.2065, "step": 3246 }, { "epoch": 5.1952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.276612281799316, + "learning_rate": 0.00041699999999999994, + "loss": 1.8527, "step": 3247 }, { "epoch": 5.1968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1258394718170166, + "learning_rate": 0.00041684999999999997, + "loss": 1.4672, "step": 3248 }, { "epoch": 5.1984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9970532655715942, + "learning_rate": 0.0004167, + "loss": 2.665, "step": 3249 }, { "epoch": 5.2, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5611498355865479, + "learning_rate": 0.00041654999999999996, + "loss": 2.3542, "step": 3250 }, { "epoch": 5.2016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6552271842956543, + "learning_rate": 0.00041639999999999993, + "loss": 2.312, "step": 3251 }, { "epoch": 5.2032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3266196250915527, + "learning_rate": 0.00041624999999999995, + "loss": 1.9171, "step": 3252 }, { "epoch": 5.2048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.908831834793091, + "learning_rate": 0.0004161, + "loss": 1.6583, "step": 3253 }, { "epoch": 5.2064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.550462245941162, + "learning_rate": 0.00041595, + "loss": 1.9416, "step": 3254 }, { "epoch": 5.208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.041257381439209, + "learning_rate": 0.0004157999999999999, + "loss": 1.8542, "step": 3255 }, { "epoch": 5.2096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1703436374664307, + "learning_rate": 0.00041564999999999994, + "loss": 2.1299, "step": 3256 }, { "epoch": 5.2112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.600370168685913, + "learning_rate": 0.00041549999999999996, + "loss": 2.3074, "step": 3257 }, { "epoch": 5.2128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4538733959198, + "learning_rate": 0.00041535, + "loss": 1.8745, "step": 3258 }, { "epoch": 5.2144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.525084018707275, + "learning_rate": 0.00041519999999999995, + "loss": 2.2919, "step": 3259 }, { "epoch": 5.216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9094107151031494, + "learning_rate": 0.0004150499999999999, + "loss": 2.9068, "step": 3260 }, { "epoch": 5.2176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6780409812927246, + "learning_rate": 0.00041489999999999995, + "loss": 2.0547, "step": 3261 }, { "epoch": 5.2192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.290412664413452, + "learning_rate": 0.00041474999999999997, + "loss": 2.5694, "step": 3262 }, { "epoch": 5.2208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.3367204666137695, + "learning_rate": 0.00041459999999999994, + "loss": 1.9833, "step": 3263 }, { "epoch": 5.2224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.052272319793701, + "learning_rate": 0.00041444999999999996, + "loss": 1.9832, "step": 3264 }, { "epoch": 5.224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0450096130371094, + "learning_rate": 0.0004143, + "loss": 1.9363, "step": 3265 }, { "epoch": 5.2256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9471709728240967, + "learning_rate": 0.00041414999999999996, + "loss": 2.1991, "step": 3266 }, { "epoch": 5.2272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.396254062652588, + "learning_rate": 0.0004139999999999999, + "loss": 2.2656, "step": 3267 }, { "epoch": 5.2288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.650744915008545, + "learning_rate": 0.00041384999999999995, + "loss": 2.6204, "step": 3268 }, { "epoch": 5.2304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.017385959625244, + "learning_rate": 0.00041369999999999997, + "loss": 2.0147, "step": 3269 }, { "epoch": 5.232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.572188854217529, + "learning_rate": 0.00041355, + "loss": 2.088, "step": 3270 }, { "epoch": 5.2336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2741122245788574, + "learning_rate": 0.0004133999999999999, + "loss": 2.1394, "step": 3271 }, { "epoch": 5.2352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6230545043945312, + "learning_rate": 0.00041324999999999993, + "loss": 2.0355, "step": 3272 }, { "epoch": 5.2368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.208142042160034, + "learning_rate": 0.00041309999999999996, + "loss": 2.2254, "step": 3273 }, { "epoch": 5.2384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.0993072986602783, + "learning_rate": 0.00041295, + "loss": 3.0118, "step": 3274 }, { "epoch": 5.24, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9154711961746216, + "learning_rate": 0.00041279999999999995, + "loss": 3.0291, "step": 3275 }, { "epoch": 5.2416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 24.453245162963867, + "learning_rate": 0.00041265, + "loss": 3.0981, "step": 3276 }, { "epoch": 5.2432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.712783813476562, + "learning_rate": 0.00041249999999999994, + "loss": 2.6749, "step": 3277 }, { "epoch": 5.2448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 14.625894546508789, + "learning_rate": 0.00041234999999999997, + "loss": 3.1377, "step": 3278 }, { "epoch": 5.2464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 19.11041259765625, + "learning_rate": 0.0004122, + "loss": 3.5501, "step": 3279 }, { "epoch": 5.248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8894535303115845, + "learning_rate": 0.00041204999999999996, + "loss": 2.2125, "step": 3280 }, { "epoch": 5.2496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0875790119171143, + "learning_rate": 0.0004119, + "loss": 2.1383, "step": 3281 }, { "epoch": 5.2512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.760862112045288, + "learning_rate": 0.00041175, + "loss": 2.1078, "step": 3282 }, { "epoch": 5.2528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.318727731704712, + "learning_rate": 0.0004116, + "loss": 3.0934, "step": 3283 }, { "epoch": 5.2544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.975752830505371, + "learning_rate": 0.00041144999999999995, + "loss": 2.2509, "step": 3284 }, { "epoch": 5.256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.216862916946411, + "learning_rate": 0.00041129999999999997, + "loss": 2.1086, "step": 3285 }, { "epoch": 5.2576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.52018404006958, + "learning_rate": 0.00041115, + "loss": 2.0231, "step": 3286 }, { "epoch": 5.2592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.959451675415039, + "learning_rate": 0.000411, + "loss": 1.9253, "step": 3287 }, { "epoch": 5.2608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.943277359008789, + "learning_rate": 0.00041084999999999993, + "loss": 2.074, "step": 3288 }, { "epoch": 5.2624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.6196024417877197, + "learning_rate": 0.00041069999999999995, + "loss": 1.9088, "step": 3289 }, { "epoch": 5.264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.83423376083374, + "learning_rate": 0.00041055, + "loss": 2.3001, "step": 3290 }, { "epoch": 5.2656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.978485107421875, + "learning_rate": 0.0004104, + "loss": 1.9418, "step": 3291 }, { "epoch": 5.2672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1441781520843506, + "learning_rate": 0.00041024999999999997, + "loss": 2.1912, "step": 3292 }, { "epoch": 5.2688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7500698566436768, + "learning_rate": 0.00041009999999999994, + "loss": 1.9621, "step": 3293 }, { "epoch": 5.2704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.964430570602417, + "learning_rate": 0.00040994999999999996, + "loss": 1.9461, "step": 3294 }, { "epoch": 5.272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8824044466018677, + "learning_rate": 0.0004098, + "loss": 1.7739, "step": 3295 }, { "epoch": 5.2736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.2687458992004395, + "learning_rate": 0.00040964999999999996, + "loss": 2.1379, "step": 3296 }, { "epoch": 5.2752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.864386558532715, + "learning_rate": 0.0004095, + "loss": 2.4708, "step": 3297 }, { "epoch": 5.2768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9345788955688477, + "learning_rate": 0.00040935, + "loss": 1.6851, "step": 3298 }, { "epoch": 5.2783999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8477360010147095, + "learning_rate": 0.00040919999999999997, + "loss": 2.1697, "step": 3299 }, { "epoch": 5.28, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 3300 - }, - { - "epoch": 5.28, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 157.0827, - "eval_samples_per_second": 19.964, - "eval_steps_per_second": 1.248, - "eval_wer": 1.0, + "grad_norm": 2.215449810028076, + "learning_rate": 0.00040904999999999994, + "loss": 2.0753, "step": 3300 }, { "epoch": 5.2816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.495771408081055, + "learning_rate": 0.00040889999999999996, + "loss": 2.2299, "step": 3301 }, { "epoch": 5.2832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.886584520339966, + "learning_rate": 0.00040875, + "loss": 2.0877, "step": 3302 }, { "epoch": 5.2848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.448219299316406, + "learning_rate": 0.0004086, + "loss": 2.1397, "step": 3303 }, { "epoch": 5.2864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2363290786743164, + "learning_rate": 0.0004084499999999999, + "loss": 1.6692, "step": 3304 }, { "epoch": 5.288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9133745431900024, + "learning_rate": 0.00040829999999999995, + "loss": 2.0965, "step": 3305 }, { "epoch": 5.2896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.235393524169922, + "learning_rate": 0.00040815, + "loss": 1.719, "step": 3306 }, { "epoch": 5.2912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2992868423461914, + "learning_rate": 0.000408, + "loss": 1.8493, "step": 3307 }, { "epoch": 5.2928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.721340179443359, + "learning_rate": 0.00040784999999999997, + "loss": 2.3836, "step": 3308 }, { "epoch": 5.2943999999999996, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3495800495147705, + "learning_rate": 0.00040769999999999994, + "loss": 2.3834, "step": 3309 }, { "epoch": 5.296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6632156372070312, + "learning_rate": 0.00040754999999999996, + "loss": 2.0363, "step": 3310 }, { "epoch": 5.2976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5388312339782715, + "learning_rate": 0.0004074, + "loss": 1.9148, "step": 3311 }, { "epoch": 5.2992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6432439088821411, + "learning_rate": 0.00040724999999999995, + "loss": 2.4588, "step": 3312 }, { "epoch": 5.3008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.365741729736328, + "learning_rate": 0.0004071, + "loss": 2.131, "step": 3313 }, { "epoch": 5.3024000000000004, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5189990997314453, + "learning_rate": 0.00040695, + "loss": 2.0987, "step": 3314 }, { "epoch": 5.304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2603230476379395, + "learning_rate": 0.00040679999999999997, + "loss": 2.0255, "step": 3315 }, { "epoch": 5.3056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1874077320098877, + "learning_rate": 0.00040664999999999994, + "loss": 2.3192, "step": 3316 }, { "epoch": 5.3072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8295190334320068, + "learning_rate": 0.00040649999999999996, + "loss": 2.5205, "step": 3317 }, { "epoch": 5.3088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.067250967025757, + "learning_rate": 0.00040635, + "loss": 2.5474, "step": 3318 }, { "epoch": 5.3104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.5971553325653076, + "learning_rate": 0.0004062, + "loss": 2.0922, "step": 3319 }, { "epoch": 5.312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9171907901763916, + "learning_rate": 0.0004060499999999999, + "loss": 1.9282, "step": 3320 }, { "epoch": 5.3136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9470739364624023, + "learning_rate": 0.00040589999999999995, + "loss": 1.7039, "step": 3321 }, { "epoch": 5.3152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.5318472385406494, + "learning_rate": 0.00040574999999999997, + "loss": 2.2137, "step": 3322 }, { "epoch": 5.3168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.8926937580108643, + "learning_rate": 0.0004056, + "loss": 1.9639, "step": 3323 }, { "epoch": 5.3184000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.356811285018921, + "learning_rate": 0.00040544999999999996, + "loss": 2.893, "step": 3324 }, { "epoch": 5.32, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1108455657958984, + "learning_rate": 0.00040529999999999993, + "loss": 2.7859, "step": 3325 }, { "epoch": 5.3216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 11.323676109313965, + "learning_rate": 0.00040514999999999995, + "loss": 3.1925, "step": 3326 }, { "epoch": 5.3232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.932748794555664, + "learning_rate": 0.000405, + "loss": 3.6857, "step": 3327 }, { "epoch": 5.3248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.096813917160034, + "learning_rate": 0.00040484999999999995, + "loss": 2.5022, "step": 3328 }, { "epoch": 5.3264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.869088888168335, + "learning_rate": 0.00040469999999999997, + "loss": 3.3066, "step": 3329 }, { "epoch": 5.328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.629360198974609, + "learning_rate": 0.00040455, + "loss": 2.8505, "step": 3330 }, { "epoch": 5.3296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.395150661468506, + "learning_rate": 0.0004044, + "loss": 2.4143, "step": 3331 }, { "epoch": 5.3312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4230806827545166, + "learning_rate": 0.00040424999999999993, + "loss": 2.1592, "step": 3332 }, { "epoch": 5.3328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8740898370742798, + "learning_rate": 0.00040409999999999996, + "loss": 2.2058, "step": 3333 }, { "epoch": 5.3344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7961093187332153, + "learning_rate": 0.00040395, + "loss": 2.0633, "step": 3334 }, { "epoch": 5.336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.026645183563232, + "learning_rate": 0.0004038, + "loss": 2.2895, "step": 3335 }, { "epoch": 5.3376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 16.91562843322754, + "learning_rate": 0.0004036499999999999, + "loss": 2.7079, "step": 3336 }, { "epoch": 5.3392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.92512321472168, + "learning_rate": 0.00040349999999999994, + "loss": 2.4684, "step": 3337 }, { "epoch": 5.3408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.841024398803711, + "learning_rate": 0.00040334999999999997, + "loss": 2.1537, "step": 3338 }, { "epoch": 5.3424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.517487525939941, + "learning_rate": 0.0004032, + "loss": 2.3195, "step": 3339 }, { "epoch": 5.344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.065592765808105, + "learning_rate": 0.00040304999999999996, + "loss": 2.2745, "step": 3340 }, { "epoch": 5.3456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.872082233428955, + "learning_rate": 0.0004029, + "loss": 2.0909, "step": 3341 }, { "epoch": 5.3472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.373950242996216, + "learning_rate": 0.00040274999999999995, + "loss": 2.1205, "step": 3342 }, { "epoch": 5.3488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.094278335571289, + "learning_rate": 0.0004026, + "loss": 1.5954, "step": 3343 }, { "epoch": 5.3504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.5784664154052734, + "learning_rate": 0.00040244999999999994, + "loss": 2.2211, "step": 3344 }, { "epoch": 5.352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7646541595458984, + "learning_rate": 0.00040229999999999997, + "loss": 2.3838, "step": 3345 }, { "epoch": 5.3536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.288341760635376, + "learning_rate": 0.00040215, + "loss": 2.14, "step": 3346 }, { "epoch": 5.3552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.88338565826416, + "learning_rate": 0.000402, + "loss": 2.1205, "step": 3347 }, { "epoch": 5.3568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1948492527008057, + "learning_rate": 0.00040184999999999993, + "loss": 1.8892, "step": 3348 }, { "epoch": 5.3584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.734545707702637, + "learning_rate": 0.00040169999999999995, + "loss": 2.4243, "step": 3349 }, { "epoch": 5.36, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.707056045532227, + "learning_rate": 0.00040155, + "loss": 2.1569, "step": 3350 }, { "epoch": 5.3616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.175364971160889, + "learning_rate": 0.0004014, + "loss": 2.235, "step": 3351 }, { "epoch": 5.3632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.4466357231140137, + "learning_rate": 0.0004012499999999999, + "loss": 2.2742, "step": 3352 }, { "epoch": 5.3648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7935688495635986, + "learning_rate": 0.00040109999999999994, + "loss": 1.9754, "step": 3353 }, { "epoch": 5.3664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.9915544986724854, + "learning_rate": 0.00040094999999999996, + "loss": 1.8166, "step": 3354 }, { "epoch": 5.368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.6275420188903809, + "learning_rate": 0.0004008, + "loss": 2.0787, "step": 3355 }, { "epoch": 5.3696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.187203884124756, + "learning_rate": 0.00040064999999999995, + "loss": 2.2121, "step": 3356 }, { "epoch": 5.3712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.500221014022827, + "learning_rate": 0.0004005, + "loss": 2.1276, "step": 3357 }, { "epoch": 5.3728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.457571983337402, + "learning_rate": 0.00040034999999999995, + "loss": 2.1685, "step": 3358 }, { "epoch": 5.3744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.376873016357422, + "learning_rate": 0.00040019999999999997, + "loss": 1.9855, "step": 3359 }, { "epoch": 5.376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6002018451690674, + "learning_rate": 0.00040004999999999994, + "loss": 2.1904, "step": 3360 }, { "epoch": 5.3776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.4764018058776855, + "learning_rate": 0.00039989999999999996, + "loss": 2.3107, "step": 3361 }, { "epoch": 5.3792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1630468368530273, + "learning_rate": 0.00039975, + "loss": 2.4739, "step": 3362 }, { "epoch": 5.3808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.243349552154541, + "learning_rate": 0.0003996, + "loss": 2.2559, "step": 3363 }, { "epoch": 5.3824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1403260231018066, + "learning_rate": 0.0003994499999999999, + "loss": 2.0141, "step": 3364 }, { "epoch": 5.384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.301318883895874, + "learning_rate": 0.00039929999999999995, + "loss": 2.1598, "step": 3365 }, { "epoch": 5.3856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.172241687774658, + "learning_rate": 0.00039914999999999997, + "loss": 2.2622, "step": 3366 }, { "epoch": 5.3872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.561786413192749, + "learning_rate": 0.000399, + "loss": 2.2528, "step": 3367 }, { "epoch": 5.3888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.7395148277282715, + "learning_rate": 0.0003988499999999999, + "loss": 2.0952, "step": 3368 }, { "epoch": 5.3904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.103436470031738, + "learning_rate": 0.00039869999999999993, + "loss": 2.119, "step": 3369 }, { "epoch": 5.392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.751752853393555, + "learning_rate": 0.00039854999999999996, + "loss": 2.4361, "step": 3370 }, { "epoch": 5.3936, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.00039854999999999996, + "loss": 1.1379, "step": 3371 }, { "epoch": 5.3952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6950693130493164, + "learning_rate": 0.0003984, + "loss": 2.5055, "step": 3372 }, { "epoch": 5.3968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.234592914581299, + "learning_rate": 0.00039824999999999995, + "loss": 2.8761, "step": 3373 }, { "epoch": 5.3984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.145230770111084, + "learning_rate": 0.0003981, + "loss": 2.7371, "step": 3374 }, { "epoch": 5.4, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.621272087097168, + "learning_rate": 0.00039794999999999994, + "loss": 1.9957, "step": 3375 }, { "epoch": 5.4016, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 24.97957420349121, + "learning_rate": 0.00039779999999999997, + "loss": 3.3891, "step": 3376 }, { "epoch": 5.4032, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 16.701343536376953, + "learning_rate": 0.00039764999999999993, + "loss": 2.8649, "step": 3377 }, { "epoch": 5.4048, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.291032791137695, + "learning_rate": 0.00039749999999999996, + "loss": 2.7527, "step": 3378 }, { "epoch": 5.4064, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 19.17793083190918, + "learning_rate": 0.00039735, + "loss": 3.1116, "step": 3379 }, { "epoch": 5.408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.143411636352539, + "learning_rate": 0.0003972, + "loss": 2.7099, "step": 3380 }, { "epoch": 5.4096, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.2232489585876465, + "learning_rate": 0.0003970499999999999, + "loss": 2.1449, "step": 3381 }, { "epoch": 5.4112, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.380960464477539, + "learning_rate": 0.00039689999999999994, + "loss": 2.5124, "step": 3382 }, { "epoch": 5.4128, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.133636474609375, + "learning_rate": 0.00039674999999999997, + "loss": 2.6039, "step": 3383 }, { "epoch": 5.4144, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.810513973236084, + "learning_rate": 0.0003966, + "loss": 2.3203, "step": 3384 }, { "epoch": 5.416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.090081214904785, + "learning_rate": 0.00039644999999999996, + "loss": 2.132, "step": 3385 }, { "epoch": 5.4176, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.350454330444336, + "learning_rate": 0.00039629999999999993, + "loss": 2.3344, "step": 3386 }, { "epoch": 5.4192, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 11.359246253967285, + "learning_rate": 0.00039614999999999995, + "loss": 2.8971, "step": 3387 }, { "epoch": 5.4208, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.318723678588867, + "learning_rate": 0.000396, + "loss": 2.6054, "step": 3388 }, { "epoch": 5.4224, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 13.670693397521973, + "learning_rate": 0.00039584999999999995, + "loss": 2.8112, "step": 3389 }, { "epoch": 5.424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.342107772827148, + "learning_rate": 0.00039569999999999997, + "loss": 2.1677, "step": 3390 }, { "epoch": 5.4256, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.924628257751465, + "learning_rate": 0.00039555, + "loss": 2.0552, "step": 3391 }, { "epoch": 5.4272, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.202759742736816, + "learning_rate": 0.00039539999999999996, + "loss": 2.5917, "step": 3392 }, { "epoch": 5.4288, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.981536388397217, + "learning_rate": 0.00039524999999999993, + "loss": 2.2385, "step": 3393 }, { "epoch": 5.4304, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.929697513580322, + "learning_rate": 0.00039509999999999995, + "loss": 2.6301, "step": 3394 }, { "epoch": 5.432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.891195774078369, + "learning_rate": 0.00039495, + "loss": 2.068, "step": 3395 }, { "epoch": 5.4336, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2171473503112793, + "learning_rate": 0.0003948, + "loss": 2.0446, "step": 3396 }, { "epoch": 5.4352, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.248323440551758, + "learning_rate": 0.0003946499999999999, + "loss": 2.7933, "step": 3397 }, { "epoch": 5.4368, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1067569255828857, + "learning_rate": 0.00039449999999999994, + "loss": 2.3266, "step": 3398 }, { "epoch": 5.4384, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.4660804271698, + "learning_rate": 0.00039434999999999996, + "loss": 1.958, "step": 3399 }, { "epoch": 5.44, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, - "step": 3400 - }, - { - "epoch": 5.44, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 156.8368, - "eval_samples_per_second": 19.995, - "eval_steps_per_second": 1.25, - "eval_wer": 1.0, + "grad_norm": 2.094003438949585, + "learning_rate": 0.0003942, + "loss": 1.9817, "step": 3400 }, { "epoch": 5.4416, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.3582420349121094, + "learning_rate": 0.00039404999999999996, + "loss": 2.4704, "step": 3401 }, { "epoch": 5.4432, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.880425930023193, + "learning_rate": 0.0003938999999999999, + "loss": 2.3661, "step": 3402 }, { "epoch": 5.4448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.494797706604004, + "learning_rate": 0.00039374999999999995, + "loss": 2.3579, "step": 3403 }, { "epoch": 5.4464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.800097465515137, + "learning_rate": 0.00039359999999999997, + "loss": 2.2806, "step": 3404 }, { "epoch": 5.448, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.084938049316406, + "learning_rate": 0.00039345, + "loss": 2.2602, "step": 3405 }, { "epoch": 5.4496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.3108110427856445, + "learning_rate": 0.00039329999999999996, + "loss": 2.6064, "step": 3406 }, { "epoch": 5.4512, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.094543218612671, + "learning_rate": 0.00039315, + "loss": 2.3647, "step": 3407 }, { "epoch": 5.4528, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6065499782562256, + "learning_rate": 0.00039299999999999996, + "loss": 2.1223, "step": 3408 }, { "epoch": 5.4544, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.361059665679932, + "learning_rate": 0.00039285, + "loss": 2.1135, "step": 3409 }, { "epoch": 5.456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1946380138397217, + "learning_rate": 0.00039269999999999995, + "loss": 2.2807, "step": 3410 }, { "epoch": 5.4576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.014134407043457, + "learning_rate": 0.00039255, + "loss": 2.1623, "step": 3411 }, { "epoch": 5.4592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.53760290145874, + "learning_rate": 0.0003924, + "loss": 2.2694, "step": 3412 }, { "epoch": 5.4608, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.014392614364624, + "learning_rate": 0.00039225, + "loss": 2.0437, "step": 3413 }, { "epoch": 5.4624, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.662132501602173, + "learning_rate": 0.00039209999999999994, + "loss": 2.2526, "step": 3414 }, { "epoch": 5.464, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.099125623703003, + "learning_rate": 0.00039194999999999996, + "loss": 2.0935, "step": 3415 }, { "epoch": 5.4656, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.434375762939453, + "learning_rate": 0.0003918, + "loss": 2.3328, "step": 3416 }, { "epoch": 5.4672, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.892804145812988, + "learning_rate": 0.00039165, + "loss": 2.2687, "step": 3417 }, { "epoch": 5.4688, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.694879531860352, + "learning_rate": 0.0003914999999999999, + "loss": 2.2931, "step": 3418 }, { "epoch": 5.4704, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.8057279586792, + "learning_rate": 0.00039134999999999994, + "loss": 2.3536, "step": 3419 }, { "epoch": 5.4719999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 15.479071617126465, + "learning_rate": 0.00039119999999999997, + "loss": 3.7015, "step": 3420 }, { "epoch": 5.4736, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.293941974639893, + "learning_rate": 0.00039105, + "loss": 2.3939, "step": 3421 }, { "epoch": 5.4752, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.1204912662506104, + "learning_rate": 0.00039089999999999996, + "loss": 2.637, "step": 3422 }, { "epoch": 5.4768, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.834355354309082, + "learning_rate": 0.00039075, + "loss": 1.9505, "step": 3423 }, { "epoch": 5.4784, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9132513999938965, + "learning_rate": 0.00039059999999999995, + "loss": 2.3591, "step": 3424 }, { "epoch": 5.48, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.00039059999999999995, + "loss": 1.9456, "step": 3425 }, { "epoch": 5.4816, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 30.53275489807129, + "learning_rate": 0.00039045, + "loss": 3.8161, "step": 3426 }, { "epoch": 5.4832, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 10.212854385375977, + "learning_rate": 0.00039029999999999995, + "loss": 2.9696, "step": 3427 }, { "epoch": 5.4848, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.351596832275391, + "learning_rate": 0.00039014999999999997, + "loss": 2.5098, "step": 3428 }, { "epoch": 5.4864, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6440775394439697, + "learning_rate": 0.00039, + "loss": 2.3737, "step": 3429 }, { "epoch": 5.4879999999999995, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7896642088890076, + "learning_rate": 0.00038985, + "loss": 2.6933, "step": 3430 }, { "epoch": 5.4896, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.894845008850098, + "learning_rate": 0.00038969999999999993, + "loss": 3.1879, "step": 3431 }, { "epoch": 5.4912, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.497279644012451, + "learning_rate": 0.00038954999999999995, + "loss": 3.2892, "step": 3432 }, { "epoch": 5.4928, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.8545002937316895, + "learning_rate": 0.0003894, + "loss": 2.6857, "step": 3433 }, { "epoch": 5.4944, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.6429190635681152, + "learning_rate": 0.00038925, + "loss": 2.8884, "step": 3434 }, { "epoch": 5.496, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.311533451080322, + "learning_rate": 0.00038909999999999997, + "loss": 2.7945, "step": 3435 }, { "epoch": 5.4976, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.832476615905762, + "learning_rate": 0.00038894999999999994, + "loss": 3.3598, "step": 3436 }, { "epoch": 5.4992, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.438668727874756, + "learning_rate": 0.00038879999999999996, + "loss": 2.9759, "step": 3437 }, { "epoch": 5.5008, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.911144733428955, + "learning_rate": 0.00038865, + "loss": 2.8816, "step": 3438 }, { "epoch": 5.5024, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.37494945526123, + "learning_rate": 0.00038849999999999996, + "loss": 2.6733, "step": 3439 }, { "epoch": 5.504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.502228736877441, + "learning_rate": 0.00038835, + "loss": 2.7077, "step": 3440 }, { "epoch": 5.5056, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.751497268676758, + "learning_rate": 0.0003882, + "loss": 2.3219, "step": 3441 }, { "epoch": 5.5072, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8348023891448975, + "learning_rate": 0.00038804999999999997, + "loss": 2.2012, "step": 3442 }, { "epoch": 5.5088, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.414168119430542, + "learning_rate": 0.00038789999999999994, + "loss": 2.2623, "step": 3443 }, { "epoch": 5.5104, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.748080253601074, + "learning_rate": 0.00038774999999999997, + "loss": 2.4969, "step": 3444 }, { "epoch": 5.5120000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.4296817779541016, + "learning_rate": 0.0003876, + "loss": 1.9788, "step": 3445 }, { "epoch": 5.5136, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 9.921895027160645, + "learning_rate": 0.00038745, + "loss": 2.5149, "step": 3446 }, { "epoch": 5.5152, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.484654426574707, + "learning_rate": 0.00038729999999999993, + "loss": 1.9065, "step": 3447 }, { "epoch": 5.5168, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.423427104949951, + "learning_rate": 0.00038714999999999995, + "loss": 2.3022, "step": 3448 }, { "epoch": 5.5184, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.991278171539307, + "learning_rate": 0.000387, + "loss": 2.2629, "step": 3449 }, { "epoch": 5.52, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.440671443939209, + "learning_rate": 0.00038685, + "loss": 2.3491, "step": 3450 }, { "epoch": 5.5216, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.21368408203125, + "learning_rate": 0.00038669999999999997, + "loss": 2.3761, "step": 3451 }, { "epoch": 5.5232, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.914760112762451, + "learning_rate": 0.00038654999999999994, + "loss": 1.9627, "step": 3452 }, { "epoch": 5.5248, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 5.628223419189453, + "learning_rate": 0.00038639999999999996, + "loss": 2.0523, "step": 3453 }, { "epoch": 5.5264, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 6.52740478515625, + "learning_rate": 0.00038625, + "loss": 2.0522, "step": 3454 }, { "epoch": 5.5280000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8526406288146973, + "learning_rate": 0.00038609999999999995, + "loss": 2.3311, "step": 3455 }, { "epoch": 5.5296, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.2300243377685547, + "learning_rate": 0.00038595, + "loss": 2.1947, "step": 3456 }, { "epoch": 5.5312, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.7482554912567139, + "learning_rate": 0.0003858, + "loss": 2.5054, "step": 3457 }, { "epoch": 5.5328, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.598001003265381, + "learning_rate": 0.00038564999999999997, + "loss": 3.2034, "step": 3458 }, { "epoch": 5.5344, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.464871644973755, + "learning_rate": 0.00038549999999999994, + "loss": 1.5426, "step": 3459 }, { "epoch": 5.536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.8509607315063477, + "learning_rate": 0.00038534999999999996, + "loss": 1.5835, "step": 3460 }, { "epoch": 5.5376, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.15891170501709, + "learning_rate": 0.0003852, + "loss": 2.5372, "step": 3461 }, { "epoch": 5.5392, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.5455851554870605, + "learning_rate": 0.00038505, + "loss": 2.0308, "step": 3462 }, { "epoch": 5.5408, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.959388494491577, + "learning_rate": 0.0003848999999999999, + "loss": 2.235, "step": 3463 }, { "epoch": 5.5424, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.463346481323242, + "learning_rate": 0.00038474999999999995, + "loss": 2.6703, "step": 3464 }, { "epoch": 5.5440000000000005, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 7.2167463302612305, + "learning_rate": 0.00038459999999999997, + "loss": 2.4555, "step": 3465 }, { "epoch": 5.5456, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.825636148452759, + "learning_rate": 0.00038445, + "loss": 2.6392, "step": 3466 }, { "epoch": 5.5472, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 14.59807300567627, + "learning_rate": 0.00038429999999999996, + "loss": 2.4381, "step": 3467 }, { "epoch": 5.5488, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.1133155822753906, + "learning_rate": 0.00038414999999999993, + "loss": 2.6275, "step": 3468 }, { "epoch": 5.5504, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 4.662886142730713, + "learning_rate": 0.00038399999999999996, + "loss": 2.5494, "step": 3469 }, { "epoch": 5.552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.9676240682601929, + "learning_rate": 0.00038385, + "loss": 1.9896, "step": 3470 }, { "epoch": 5.5536, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.780965805053711, + "learning_rate": 0.00038369999999999995, + "loss": 2.4608, "step": 3471 }, { "epoch": 5.5552, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 3.9176764488220215, + "learning_rate": 0.00038354999999999997, + "loss": 3.1066, "step": 3472 }, { "epoch": 5.5568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.076561450958252, + "learning_rate": 0.0003834, + "loss": 2.5004, "step": 3473 }, { "epoch": 5.5584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.902611255645752, + "learning_rate": 0.00038324999999999996, + "loss": 2.4972, "step": 3474 }, { "epoch": 5.5600000000000005, "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "learning_rate": 0.00038324999999999996, + "loss": 2.5864, "step": 3475 }, { "epoch": 5.5616, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 23.75803565979004, + "learning_rate": 0.00038309999999999993, + "loss": 3.8795, "step": 3476 }, { "epoch": 5.5632, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.7207627296447754, + "learning_rate": 0.00038294999999999996, + "loss": 4.0748, "step": 3477 }, { "epoch": 5.5648, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 2.0137767791748047, + "learning_rate": 0.0003828, + "loss": 2.3535, "step": 3478 }, { "epoch": 5.5664, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.7027060389518738, + "learning_rate": 0.00038265, + "loss": 3.2328, "step": 3479 }, { "epoch": 5.568, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.26203009486198425, + "learning_rate": 0.0003824999999999999, + "loss": 2.3086, "step": 3480 }, { "epoch": 5.5696, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.36936789751052856, + "learning_rate": 0.00038234999999999994, + "loss": 3.1161, "step": 3481 }, { "epoch": 5.5712, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 8.606059074401855, + "learning_rate": 0.00038219999999999997, + "loss": 2.9592, "step": 3482 }, { "epoch": 5.5728, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.1602068841457367, + "learning_rate": 0.00038205, + "loss": 3.157, "step": 3483 }, { "epoch": 5.5744, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.08841034024953842, + "learning_rate": 0.00038189999999999996, + "loss": 2.4829, "step": 3484 }, { "epoch": 5.576, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5950487852096558, + "learning_rate": 0.00038175, + "loss": 2.1481, "step": 3485 }, { "epoch": 5.5776, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.12550248205661774, + "learning_rate": 0.00038159999999999995, + "loss": 2.4758, "step": 3486 }, { "epoch": 5.5792, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.06971815228462219, + "learning_rate": 0.00038145, + "loss": 2.541, "step": 3487 }, { "epoch": 5.5808, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4338502585887909, + "learning_rate": 0.00038129999999999994, + "loss": 2.9192, "step": 3488 }, { "epoch": 5.5824, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.0, + "learning_rate": 0.00038114999999999997, + "loss": 2.3753, "step": 3489 }, { "epoch": 5.584, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.1862034946680069, + "learning_rate": 0.000381, + "loss": 2.557, "step": 3490 }, { "epoch": 5.5856, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.25080394744873047, + "learning_rate": 0.00038085, + "loss": 3.3535, "step": 3491 }, { "epoch": 5.5872, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.13513095676898956, + "learning_rate": 0.00038069999999999993, + "loss": 3.3913, "step": 3492 }, { "epoch": 5.5888, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.5417481064796448, + "learning_rate": 0.00038054999999999995, + "loss": 2.5585, "step": 3493 }, { "epoch": 5.5904, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.21010702848434448, + "learning_rate": 0.0003804, + "loss": 2.7133, "step": 3494 }, { "epoch": 5.592, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.4073018431663513, + "learning_rate": 0.00038025, + "loss": 2.8361, "step": 3495 }, { "epoch": 5.5936, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.24190618097782135, + "learning_rate": 0.0003800999999999999, + "loss": 2.6598, "step": 3496 }, { "epoch": 5.5952, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.671675980091095, + "learning_rate": 0.00037994999999999994, + "loss": 2.7558, "step": 3497 }, { "epoch": 5.5968, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.17936047911643982, + "learning_rate": 0.00037979999999999996, + "loss": 2.7637, "step": 3498 }, { "epoch": 5.5984, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 0.3960895538330078, + "learning_rate": 0.00037965, + "loss": 2.969, "step": 3499 }, { "epoch": 5.6, - "grad_norm": NaN, - "learning_rate": 0.0005752, - "loss": 0.0, + "grad_norm": 1.2539879083633423, + "learning_rate": 0.00037949999999999995, + "loss": 3.0468, "step": 3500 }, { - "epoch": 5.6, - "eval_cer": 1.0, - "eval_loss": NaN, - "eval_runtime": 157.361, - "eval_samples_per_second": 19.929, - "eval_steps_per_second": 1.246, - "eval_wer": 1.0, - "step": 3500 + "epoch": 5.6016, + "grad_norm": 0.6254233717918396, + "learning_rate": 0.00037935, + "loss": 2.3933, + "step": 3501 }, { - "epoch": 5.6, - "step": 3500, - "total_flos": 2.158464150901847e+19, - "train_loss": 1.3790143143790108, - "train_runtime": 12492.843, - "train_samples_per_second": 8.965, - "train_steps_per_second": 0.28 + "epoch": 5.6032, + "grad_norm": 0.4243174195289612, + "learning_rate": 0.00037919999999999995, + "loss": 2.109, + "step": 3502 + }, + { + "epoch": 5.6048, + "grad_norm": 0.5422927737236023, + "learning_rate": 0.00037904999999999997, + "loss": 1.9256, + "step": 3503 + }, + { + "epoch": 5.6064, + "grad_norm": 0.2607356011867523, + "learning_rate": 0.00037889999999999994, + "loss": 2.259, + "step": 3504 + }, + { + "epoch": 5.608, + "grad_norm": 2.16225528717041, + "learning_rate": 0.00037874999999999996, + "loss": 2.3937, + "step": 3505 + }, + { + "epoch": 5.6096, + "grad_norm": 0.2814367115497589, + "learning_rate": 0.0003786, + "loss": 1.9617, + "step": 3506 + }, + { + "epoch": 5.6112, + "grad_norm": 1.0939712524414062, + "learning_rate": 0.00037845, + "loss": 2.1647, + "step": 3507 + }, + { + "epoch": 5.6128, + "grad_norm": 1.2848657369613647, + "learning_rate": 0.0003782999999999999, + "loss": 2.2722, + "step": 3508 + }, + { + "epoch": 5.6144, + "grad_norm": 2.4309964179992676, + "learning_rate": 0.00037814999999999995, + "loss": 2.2048, + "step": 3509 + }, + { + "epoch": 5.616, + "grad_norm": 1.734381079673767, + "learning_rate": 0.00037799999999999997, + "loss": 2.2176, + "step": 3510 + }, + { + "epoch": 5.6176, + "grad_norm": 0.7754119038581848, + "learning_rate": 0.00037785, + "loss": 1.9389, + "step": 3511 + }, + { + "epoch": 5.6192, + "grad_norm": 1.9875555038452148, + "learning_rate": 0.0003776999999999999, + "loss": 2.1693, + "step": 3512 + }, + { + "epoch": 5.6208, + "grad_norm": 1.4995112419128418, + "learning_rate": 0.00037754999999999993, + "loss": 2.4732, + "step": 3513 + }, + { + "epoch": 5.6224, + "grad_norm": 1.3808419704437256, + "learning_rate": 0.00037739999999999996, + "loss": 2.8265, + "step": 3514 + }, + { + "epoch": 5.624, + "grad_norm": 0.639369010925293, + "learning_rate": 0.00037725, + "loss": 2.1947, + "step": 3515 + }, + { + "epoch": 5.6256, + "grad_norm": 4.914621353149414, + "learning_rate": 0.00037709999999999995, + "loss": 3.165, + "step": 3516 + }, + { + "epoch": 5.6272, + "grad_norm": 1.8339734077453613, + "learning_rate": 0.00037695, + "loss": 2.7099, + "step": 3517 + }, + { + "epoch": 5.6288, + "grad_norm": 1.6919299364089966, + "learning_rate": 0.00037679999999999994, + "loss": 2.301, + "step": 3518 + }, + { + "epoch": 5.6304, + "grad_norm": 2.5645647048950195, + "learning_rate": 0.00037664999999999997, + "loss": 2.4862, + "step": 3519 + }, + { + "epoch": 5.632, + "grad_norm": 3.0452866554260254, + "learning_rate": 0.00037649999999999994, + "loss": 2.4736, + "step": 3520 + }, + { + "epoch": 5.6336, + "grad_norm": 1.4942389726638794, + "learning_rate": 0.00037634999999999996, + "loss": 1.8172, + "step": 3521 + }, + { + "epoch": 5.6352, + "grad_norm": 1.7136120796203613, + "learning_rate": 0.0003762, + "loss": 2.9583, + "step": 3522 + }, + { + "epoch": 5.6368, + "grad_norm": 2.3411545753479004, + "learning_rate": 0.00037605, + "loss": 3.5991, + "step": 3523 + }, + { + "epoch": 5.6384, + "grad_norm": 5.684845924377441, + "learning_rate": 0.0003758999999999999, + "loss": 3.5991, + "step": 3524 + }, + { + "epoch": 5.64, + "grad_norm": 8.414681434631348, + "learning_rate": 0.00037574999999999994, + "loss": 4.8554, + "step": 3525 + }, + { + "epoch": 5.6416, + "grad_norm": 23.659269332885742, + "learning_rate": 0.00037559999999999997, + "loss": 3.7371, + "step": 3526 + }, + { + "epoch": 5.6432, + "grad_norm": 10.667974472045898, + "learning_rate": 0.00037545, + "loss": 3.066, + "step": 3527 + }, + { + "epoch": 5.6448, + "grad_norm": 2.8220784664154053, + "learning_rate": 0.00037529999999999996, + "loss": 3.5351, + "step": 3528 + }, + { + "epoch": 5.6464, + "grad_norm": 3.089872121810913, + "learning_rate": 0.00037514999999999993, + "loss": 2.5887, + "step": 3529 + }, + { + "epoch": 5.648, + "grad_norm": 1.261480689048767, + "learning_rate": 0.00037499999999999995, + "loss": 2.7372, + "step": 3530 + }, + { + "epoch": 5.6495999999999995, + "grad_norm": 0.26717910170555115, + "learning_rate": 0.00037485, + "loss": 2.5624, + "step": 3531 + }, + { + "epoch": 5.6512, + "grad_norm": 0.0, + "learning_rate": 0.0003747, + "loss": 3.0436, + "step": 3532 + }, + { + "epoch": 5.6528, + "grad_norm": 0.7398553490638733, + "learning_rate": 0.00037454999999999997, + "loss": 2.7866, + "step": 3533 + }, + { + "epoch": 5.6544, + "grad_norm": 0.6313406229019165, + "learning_rate": 0.0003744, + "loss": 2.9158, + "step": 3534 + }, + { + "epoch": 5.656, + "grad_norm": 1.2333641052246094, + "learning_rate": 0.00037424999999999996, + "loss": 3.5131, + "step": 3535 + }, + { + "epoch": 5.6576, + "grad_norm": 1.7500239610671997, + "learning_rate": 0.0003741, + "loss": 3.7137, + "step": 3536 + }, + { + "epoch": 5.6592, + "grad_norm": 0.331387460231781, + "learning_rate": 0.00037394999999999996, + "loss": 3.2079, + "step": 3537 + }, + { + "epoch": 5.6608, + "grad_norm": 0.0, + "learning_rate": 0.0003738, + "loss": 2.5893, + "step": 3538 + }, + { + "epoch": 5.6624, + "grad_norm": 0.2625195384025574, + "learning_rate": 0.00037365, + "loss": 3.4963, + "step": 3539 + }, + { + "epoch": 5.664, + "grad_norm": 0.7120262384414673, + "learning_rate": 0.0003735, + "loss": 3.0325, + "step": 3540 + }, + { + "epoch": 5.6655999999999995, + "grad_norm": 0.2676083445549011, + "learning_rate": 0.00037334999999999994, + "loss": 3.2996, + "step": 3541 + }, + { + "epoch": 5.6672, + "grad_norm": 0.14166611433029175, + "learning_rate": 0.00037319999999999996, + "loss": 3.5891, + "step": 3542 + }, + { + "epoch": 5.6688, + "grad_norm": 0.0, + "learning_rate": 0.00037305, + "loss": 2.954, + "step": 3543 + }, + { + "epoch": 5.6704, + "grad_norm": 2.4091098308563232, + "learning_rate": 0.0003729, + "loss": 3.8448, + "step": 3544 + }, + { + "epoch": 5.672, + "grad_norm": 4.258470058441162, + "learning_rate": 0.0003727499999999999, + "loss": 3.1852, + "step": 3545 + }, + { + "epoch": 5.6736, + "grad_norm": 0.7352797985076904, + "learning_rate": 0.00037259999999999995, + "loss": 3.0091, + "step": 3546 + }, + { + "epoch": 5.6752, + "grad_norm": 1.294426679611206, + "learning_rate": 0.00037245, + "loss": 3.116, + "step": 3547 + }, + { + "epoch": 5.6768, + "grad_norm": 1.2180641889572144, + "learning_rate": 0.0003723, + "loss": 3.3635, + "step": 3548 + }, + { + "epoch": 5.6784, + "grad_norm": 1.7434768676757812, + "learning_rate": 0.00037214999999999997, + "loss": 3.1209, + "step": 3549 + }, + { + "epoch": 5.68, + "grad_norm": 0.20002809166908264, + "learning_rate": 0.000372, + "loss": 2.7624, + "step": 3550 + }, + { + "epoch": 5.6815999999999995, + "grad_norm": 0.7545135617256165, + "learning_rate": 0.00037184999999999996, + "loss": 2.5356, + "step": 3551 + }, + { + "epoch": 5.6832, + "grad_norm": 0.5763547420501709, + "learning_rate": 0.0003717, + "loss": 2.8453, + "step": 3552 + }, + { + "epoch": 5.6848, + "grad_norm": 0.379525363445282, + "learning_rate": 0.00037154999999999995, + "loss": 2.6852, + "step": 3553 + }, + { + "epoch": 5.6864, + "grad_norm": 0.5061140656471252, + "learning_rate": 0.0003714, + "loss": 2.3777, + "step": 3554 + }, + { + "epoch": 5.688, + "grad_norm": 0.7044528126716614, + "learning_rate": 0.00037125, + "loss": 2.4686, + "step": 3555 + }, + { + "epoch": 5.6896, + "grad_norm": 0.6468807458877563, + "learning_rate": 0.0003711, + "loss": 2.8022, + "step": 3556 + }, + { + "epoch": 5.6912, + "grad_norm": 1.682919979095459, + "learning_rate": 0.00037094999999999994, + "loss": 2.5016, + "step": 3557 + }, + { + "epoch": 5.6928, + "grad_norm": 3.406224012374878, + "learning_rate": 0.00037079999999999996, + "loss": 2.8903, + "step": 3558 + }, + { + "epoch": 5.6944, + "grad_norm": 0.8235665559768677, + "learning_rate": 0.00037065, + "loss": 2.4151, + "step": 3559 + }, + { + "epoch": 5.696, + "grad_norm": 4.509483814239502, + "learning_rate": 0.0003705, + "loss": 2.5733, + "step": 3560 + }, + { + "epoch": 5.6975999999999996, + "grad_norm": 0.9885743856430054, + "learning_rate": 0.0003703499999999999, + "loss": 2.2135, + "step": 3561 + }, + { + "epoch": 5.6992, + "grad_norm": 2.305175304412842, + "learning_rate": 0.00037019999999999995, + "loss": 2.3394, + "step": 3562 + }, + { + "epoch": 5.7008, + "grad_norm": 2.077007293701172, + "learning_rate": 0.00037004999999999997, + "loss": 2.4202, + "step": 3563 + }, + { + "epoch": 5.7024, + "grad_norm": 0.6968153715133667, + "learning_rate": 0.0003699, + "loss": 2.2318, + "step": 3564 + }, + { + "epoch": 5.704, + "grad_norm": 2.7181806564331055, + "learning_rate": 0.00036974999999999996, + "loss": 2.3018, + "step": 3565 + }, + { + "epoch": 5.7056000000000004, + "grad_norm": 2.1074941158294678, + "learning_rate": 0.0003696, + "loss": 3.078, + "step": 3566 + }, + { + "epoch": 5.7072, + "grad_norm": 1.7561454772949219, + "learning_rate": 0.00036944999999999995, + "loss": 1.7743, + "step": 3567 + }, + { + "epoch": 5.7088, + "grad_norm": 1.8946216106414795, + "learning_rate": 0.0003693, + "loss": 2.3119, + "step": 3568 + }, + { + "epoch": 5.7104, + "grad_norm": 1.7721720933914185, + "learning_rate": 0.00036914999999999995, + "loss": 2.3399, + "step": 3569 + }, + { + "epoch": 5.712, + "grad_norm": 2.348559617996216, + "learning_rate": 0.00036899999999999997, + "loss": 2.8106, + "step": 3570 + }, + { + "epoch": 5.7136, + "grad_norm": 3.1461150646209717, + "learning_rate": 0.00036885, + "loss": 2.2077, + "step": 3571 + }, + { + "epoch": 5.7152, + "grad_norm": 2.274102210998535, + "learning_rate": 0.0003687, + "loss": 2.4829, + "step": 3572 + }, + { + "epoch": 5.7168, + "grad_norm": 2.2827303409576416, + "learning_rate": 0.00036854999999999993, + "loss": 3.999, + "step": 3573 + }, + { + "epoch": 5.7184, + "grad_norm": 2.7197787761688232, + "learning_rate": 0.00036839999999999996, + "loss": 2.9606, + "step": 3574 + }, + { + "epoch": 5.72, + "grad_norm": 2.6948750019073486, + "learning_rate": 0.00036825, + "loss": 3.1148, + "step": 3575 + }, + { + "epoch": 5.7216000000000005, + "grad_norm": 0.272156298160553, + "learning_rate": 0.0003681, + "loss": 2.705, + "step": 3576 + }, + { + "epoch": 5.7232, + "grad_norm": 3.451700448989868, + "learning_rate": 0.0003679499999999999, + "loss": 3.3249, + "step": 3577 + }, + { + "epoch": 5.7248, + "grad_norm": 3.6281368732452393, + "learning_rate": 0.00036779999999999994, + "loss": 3.0924, + "step": 3578 + }, + { + "epoch": 5.7264, + "grad_norm": 10.167502403259277, + "learning_rate": 0.00036764999999999996, + "loss": 3.0343, + "step": 3579 + }, + { + "epoch": 5.728, + "grad_norm": 0.0, + "learning_rate": 0.0003675, + "loss": 2.8255, + "step": 3580 + }, + { + "epoch": 5.7296, + "grad_norm": 0.0, + "learning_rate": 0.00036734999999999996, + "loss": 3.0536, + "step": 3581 + }, + { + "epoch": 5.7312, + "grad_norm": 7.263051509857178, + "learning_rate": 0.0003672, + "loss": 3.2911, + "step": 3582 + }, + { + "epoch": 5.7328, + "grad_norm": 0.07337801903486252, + "learning_rate": 0.00036705, + "loss": 2.583, + "step": 3583 + }, + { + "epoch": 5.7344, + "grad_norm": 3.855539321899414, + "learning_rate": 0.0003669, + "loss": 2.5903, + "step": 3584 + }, + { + "epoch": 5.736, + "grad_norm": 0.20969152450561523, + "learning_rate": 0.00036674999999999994, + "loss": 2.6072, + "step": 3585 + }, + { + "epoch": 5.7376000000000005, + "grad_norm": 1.757079839706421, + "learning_rate": 0.00036659999999999997, + "loss": 2.3773, + "step": 3586 + }, + { + "epoch": 5.7392, + "grad_norm": 4.414907932281494, + "learning_rate": 0.00036645, + "loss": 2.956, + "step": 3587 + }, + { + "epoch": 5.7408, + "grad_norm": 0.1729482263326645, + "learning_rate": 0.0003663, + "loss": 2.659, + "step": 3588 + }, + { + "epoch": 5.7424, + "grad_norm": 0.10146791487932205, + "learning_rate": 0.00036614999999999993, + "loss": 2.9967, + "step": 3589 + }, + { + "epoch": 5.744, + "grad_norm": 0.11373867094516754, + "learning_rate": 0.00036599999999999995, + "loss": 2.4121, + "step": 3590 + }, + { + "epoch": 5.7456, + "grad_norm": 0.0, + "learning_rate": 0.00036585, + "loss": 2.4116, + "step": 3591 + }, + { + "epoch": 5.7472, + "grad_norm": 0.2513590455055237, + "learning_rate": 0.0003657, + "loss": 2.8289, + "step": 3592 + }, + { + "epoch": 5.7488, + "grad_norm": 0.7403852343559265, + "learning_rate": 0.00036554999999999997, + "loss": 2.9567, + "step": 3593 + }, + { + "epoch": 5.7504, + "grad_norm": 0.31389737129211426, + "learning_rate": 0.00036539999999999994, + "loss": 2.6591, + "step": 3594 + }, + { + "epoch": 5.752, + "grad_norm": 0.8933272957801819, + "learning_rate": 0.00036524999999999996, + "loss": 2.7838, + "step": 3595 + }, + { + "epoch": 5.7536000000000005, + "grad_norm": 1.0159043073654175, + "learning_rate": 0.0003651, + "loss": 2.7835, + "step": 3596 + }, + { + "epoch": 5.7552, + "grad_norm": 0.21672067046165466, + "learning_rate": 0.00036494999999999995, + "loss": 3.1109, + "step": 3597 + }, + { + "epoch": 5.7568, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0003648, + "loss": 2.7365, + "step": 3598 + }, + { + "epoch": 5.7584, + "grad_norm": 0.7378922700881958, + "learning_rate": 0.00036465, + "loss": 3.2703, + "step": 3599 + }, + { + "epoch": 5.76, + "grad_norm": 2.0105292797088623, + "learning_rate": 0.00036449999999999997, + "loss": 3.349, + "step": 3600 + }, + { + "epoch": 5.7616, + "grad_norm": 1.5307704210281372, + "learning_rate": 0.00036434999999999994, + "loss": 2.8438, + "step": 3601 + }, + { + "epoch": 5.7632, + "grad_norm": 1.4970020055770874, + "learning_rate": 0.00036419999999999996, + "loss": 3.2691, + "step": 3602 + }, + { + "epoch": 5.7648, + "grad_norm": 0.9958275556564331, + "learning_rate": 0.00036405, + "loss": 2.8416, + "step": 3603 + }, + { + "epoch": 5.7664, + "grad_norm": 0.8194541931152344, + "learning_rate": 0.0003639, + "loss": 2.6063, + "step": 3604 + }, + { + "epoch": 5.768, + "grad_norm": 1.198455572128296, + "learning_rate": 0.0003637499999999999, + "loss": 3.0273, + "step": 3605 + }, + { + "epoch": 5.7696, + "grad_norm": 0.8404093384742737, + "learning_rate": 0.00036359999999999995, + "loss": 2.5248, + "step": 3606 + }, + { + "epoch": 5.7712, + "grad_norm": 0.1331993043422699, + "learning_rate": 0.00036344999999999997, + "loss": 2.1761, + "step": 3607 + }, + { + "epoch": 5.7728, + "grad_norm": 1.9269537925720215, + "learning_rate": 0.0003633, + "loss": 2.9078, + "step": 3608 + }, + { + "epoch": 5.7744, + "grad_norm": 2.229436159133911, + "learning_rate": 0.00036314999999999996, + "loss": 2.3954, + "step": 3609 + }, + { + "epoch": 5.776, + "grad_norm": 1.4860639572143555, + "learning_rate": 0.00036299999999999993, + "loss": 2.3718, + "step": 3610 + }, + { + "epoch": 5.7776, + "grad_norm": 1.9439291954040527, + "learning_rate": 0.00036284999999999996, + "loss": 3.4277, + "step": 3611 + }, + { + "epoch": 5.7792, + "grad_norm": 1.416441798210144, + "learning_rate": 0.0003627, + "loss": 2.1693, + "step": 3612 + }, + { + "epoch": 5.7808, + "grad_norm": 1.086266279220581, + "learning_rate": 0.00036254999999999995, + "loss": 2.3808, + "step": 3613 + }, + { + "epoch": 5.7824, + "grad_norm": 0.8500319719314575, + "learning_rate": 0.00036239999999999997, + "loss": 2.603, + "step": 3614 + }, + { + "epoch": 5.784, + "grad_norm": 2.137688398361206, + "learning_rate": 0.00036225, + "loss": 2.4094, + "step": 3615 + }, + { + "epoch": 5.7856, + "grad_norm": 1.138893961906433, + "learning_rate": 0.00036209999999999997, + "loss": 2.5953, + "step": 3616 + }, + { + "epoch": 5.7872, + "grad_norm": 2.8222107887268066, + "learning_rate": 0.00036194999999999993, + "loss": 2.2547, + "step": 3617 + }, + { + "epoch": 5.7888, + "grad_norm": 1.6344839334487915, + "learning_rate": 0.00036179999999999996, + "loss": 2.3833, + "step": 3618 + }, + { + "epoch": 5.7904, + "grad_norm": 1.5543922185897827, + "learning_rate": 0.00036165, + "loss": 2.8417, + "step": 3619 + }, + { + "epoch": 5.792, + "grad_norm": 1.5485179424285889, + "learning_rate": 0.0003615, + "loss": 2.4023, + "step": 3620 + }, + { + "epoch": 5.7936, + "grad_norm": 1.6071698665618896, + "learning_rate": 0.0003613499999999999, + "loss": 2.8954, + "step": 3621 + }, + { + "epoch": 5.7952, + "grad_norm": 1.7881412506103516, + "learning_rate": 0.00036119999999999994, + "loss": 2.5974, + "step": 3622 + }, + { + "epoch": 5.7968, + "grad_norm": 3.4796767234802246, + "learning_rate": 0.00036104999999999997, + "loss": 2.6164, + "step": 3623 + }, + { + "epoch": 5.7984, + "grad_norm": 3.5168821811676025, + "learning_rate": 0.0003609, + "loss": 2.58, + "step": 3624 + }, + { + "epoch": 5.8, + "grad_norm": 2.84468150138855, + "learning_rate": 0.00036074999999999996, + "loss": 3.1729, + "step": 3625 + }, + { + "epoch": 5.8016, + "grad_norm": 27.352258682250977, + "learning_rate": 0.00036059999999999993, + "loss": 5.0092, + "step": 3626 + }, + { + "epoch": 5.8032, + "grad_norm": 7.655608654022217, + "learning_rate": 0.00036044999999999995, + "loss": 3.3475, + "step": 3627 + }, + { + "epoch": 5.8048, + "grad_norm": 1.47767972946167, + "learning_rate": 0.0003603, + "loss": 3.133, + "step": 3628 + }, + { + "epoch": 5.8064, + "grad_norm": 0.7489213347434998, + "learning_rate": 0.00036014999999999995, + "loss": 2.7145, + "step": 3629 + }, + { + "epoch": 5.808, + "grad_norm": 9.507126808166504, + "learning_rate": 0.00035999999999999997, + "loss": 2.7299, + "step": 3630 + }, + { + "epoch": 5.8096, + "grad_norm": 1.0871667861938477, + "learning_rate": 0.00035985, + "loss": 3.3557, + "step": 3631 + }, + { + "epoch": 5.8112, + "grad_norm": 0.6891421675682068, + "learning_rate": 0.00035969999999999996, + "loss": 2.453, + "step": 3632 + }, + { + "epoch": 5.8128, + "grad_norm": 0.11214147508144379, + "learning_rate": 0.00035954999999999993, + "loss": 2.9922, + "step": 3633 + }, + { + "epoch": 5.8144, + "grad_norm": 0.45183080434799194, + "learning_rate": 0.00035939999999999995, + "loss": 3.4575, + "step": 3634 + }, + { + "epoch": 5.816, + "grad_norm": 0.42742919921875, + "learning_rate": 0.00035925, + "loss": 2.6807, + "step": 3635 + }, + { + "epoch": 5.8176, + "grad_norm": 0.3587559163570404, + "learning_rate": 0.0003591, + "loss": 2.689, + "step": 3636 + }, + { + "epoch": 5.8192, + "grad_norm": 0.12587009370326996, + "learning_rate": 0.0003589499999999999, + "loss": 2.9673, + "step": 3637 + }, + { + "epoch": 5.8208, + "grad_norm": 0.16796007752418518, + "learning_rate": 0.00035879999999999994, + "loss": 2.9272, + "step": 3638 + }, + { + "epoch": 5.8224, + "grad_norm": 0.8416668176651001, + "learning_rate": 0.00035864999999999996, + "loss": 3.1985, + "step": 3639 + }, + { + "epoch": 5.824, + "grad_norm": 1.0569196939468384, + "learning_rate": 0.0003585, + "loss": 3.362, + "step": 3640 + }, + { + "epoch": 5.8256, + "grad_norm": 0.45992952585220337, + "learning_rate": 0.00035834999999999996, + "loss": 3.0646, + "step": 3641 + }, + { + "epoch": 5.8272, + "grad_norm": 0.04979749768972397, + "learning_rate": 0.0003582, + "loss": 2.7772, + "step": 3642 + }, + { + "epoch": 5.8288, + "grad_norm": 2.914752960205078, + "learning_rate": 0.00035804999999999995, + "loss": 3.0556, + "step": 3643 + }, + { + "epoch": 5.8304, + "grad_norm": 0.5485931634902954, + "learning_rate": 0.00035789999999999997, + "loss": 2.7492, + "step": 3644 + }, + { + "epoch": 5.832, + "grad_norm": 0.425957053899765, + "learning_rate": 0.00035774999999999994, + "loss": 2.737, + "step": 3645 + }, + { + "epoch": 5.8336, + "grad_norm": 0.3038780987262726, + "learning_rate": 0.00035759999999999996, + "loss": 3.1534, + "step": 3646 + }, + { + "epoch": 5.8352, + "grad_norm": 0.6664896011352539, + "learning_rate": 0.00035745, + "loss": 2.8512, + "step": 3647 + }, + { + "epoch": 5.8368, + "grad_norm": 0.5831376910209656, + "learning_rate": 0.0003573, + "loss": 2.8303, + "step": 3648 + }, + { + "epoch": 5.8384, + "grad_norm": 0.7792990803718567, + "learning_rate": 0.0003571499999999999, + "loss": 2.8147, + "step": 3649 + }, + { + "epoch": 5.84, + "grad_norm": 0.7054243683815002, + "learning_rate": 0.00035699999999999995, + "loss": 2.6731, + "step": 3650 + }, + { + "epoch": 5.8416, + "grad_norm": 0.44943201541900635, + "learning_rate": 0.00035685, + "loss": 2.6966, + "step": 3651 + }, + { + "epoch": 5.8431999999999995, + "grad_norm": 0.7449806928634644, + "learning_rate": 0.0003567, + "loss": 2.7262, + "step": 3652 + }, + { + "epoch": 5.8448, + "grad_norm": 0.5415761470794678, + "learning_rate": 0.0003565499999999999, + "loss": 2.1463, + "step": 3653 + }, + { + "epoch": 5.8464, + "grad_norm": 0.5867460370063782, + "learning_rate": 0.00035639999999999994, + "loss": 2.2366, + "step": 3654 + }, + { + "epoch": 5.848, + "grad_norm": 0.15858949720859528, + "learning_rate": 0.00035624999999999996, + "loss": 2.8666, + "step": 3655 + }, + { + "epoch": 5.8496, + "grad_norm": 1.2337722778320312, + "learning_rate": 0.0003561, + "loss": 2.2596, + "step": 3656 + }, + { + "epoch": 5.8512, + "grad_norm": 0.874224066734314, + "learning_rate": 0.00035595, + "loss": 2.4329, + "step": 3657 + }, + { + "epoch": 5.8528, + "grad_norm": 1.4948076009750366, + "learning_rate": 0.0003558, + "loss": 3.2546, + "step": 3658 + }, + { + "epoch": 5.8544, + "grad_norm": 2.395164728164673, + "learning_rate": 0.00035564999999999994, + "loss": 2.2181, + "step": 3659 + }, + { + "epoch": 5.856, + "grad_norm": 0.8242649435997009, + "learning_rate": 0.00035549999999999997, + "loss": 2.1852, + "step": 3660 + }, + { + "epoch": 5.8576, + "grad_norm": 2.2238521575927734, + "learning_rate": 0.00035535, + "loss": 3.2672, + "step": 3661 + }, + { + "epoch": 5.8591999999999995, + "grad_norm": 0.8653325438499451, + "learning_rate": 0.00035519999999999996, + "loss": 2.174, + "step": 3662 + }, + { + "epoch": 5.8608, + "grad_norm": 1.0872834920883179, + "learning_rate": 0.00035505, + "loss": 2.316, + "step": 3663 + }, + { + "epoch": 5.8624, + "grad_norm": 1.8807929754257202, + "learning_rate": 0.0003549, + "loss": 2.8189, + "step": 3664 + }, + { + "epoch": 5.864, + "grad_norm": 1.446363925933838, + "learning_rate": 0.00035475, + "loss": 2.4262, + "step": 3665 + }, + { + "epoch": 5.8656, + "grad_norm": 2.251727819442749, + "learning_rate": 0.00035459999999999995, + "loss": 2.5645, + "step": 3666 + }, + { + "epoch": 5.8672, + "grad_norm": 1.9902764558792114, + "learning_rate": 0.00035444999999999997, + "loss": 2.4165, + "step": 3667 + }, + { + "epoch": 5.8688, + "grad_norm": 1.670762062072754, + "learning_rate": 0.0003543, + "loss": 2.6425, + "step": 3668 + }, + { + "epoch": 5.8704, + "grad_norm": 8.156818389892578, + "learning_rate": 0.00035415, + "loss": 2.1805, + "step": 3669 + }, + { + "epoch": 5.872, + "grad_norm": 1.7316776514053345, + "learning_rate": 0.00035399999999999993, + "loss": 2.6375, + "step": 3670 + }, + { + "epoch": 5.8736, + "grad_norm": 3.32999587059021, + "learning_rate": 0.00035384999999999995, + "loss": 2.8535, + "step": 3671 + }, + { + "epoch": 5.8751999999999995, + "grad_norm": 1.9965382814407349, + "learning_rate": 0.0003537, + "loss": 2.7322, + "step": 3672 + }, + { + "epoch": 5.8768, + "grad_norm": 2.408020257949829, + "learning_rate": 0.00035355, + "loss": 2.3558, + "step": 3673 + }, + { + "epoch": 5.8784, + "grad_norm": 1.9624814987182617, + "learning_rate": 0.00035339999999999997, + "loss": 2.6974, + "step": 3674 + }, + { + "epoch": 5.88, + "grad_norm": NaN, + "learning_rate": 0.00035339999999999997, + "loss": 2.638, + "step": 3675 + }, + { + "epoch": 5.8816, + "grad_norm": 0.5264666676521301, + "learning_rate": 0.00035324999999999994, + "loss": 3.2748, + "step": 3676 + }, + { + "epoch": 5.8832, + "grad_norm": 0.0, + "learning_rate": 0.00035309999999999996, + "loss": 3.4704, + "step": 3677 + }, + { + "epoch": 5.8848, + "grad_norm": 6.098517894744873, + "learning_rate": 0.00035295, + "loss": 3.4468, + "step": 3678 + }, + { + "epoch": 5.8864, + "grad_norm": 0.0, + "learning_rate": 0.00035279999999999996, + "loss": 3.5018, + "step": 3679 + }, + { + "epoch": 5.888, + "grad_norm": 0.74627685546875, + "learning_rate": 0.00035265, + "loss": 2.7214, + "step": 3680 + }, + { + "epoch": 5.8896, + "grad_norm": 9.594038963317871, + "learning_rate": 0.0003525, + "loss": 4.2435, + "step": 3681 + }, + { + "epoch": 5.8911999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00035234999999999997, + "loss": 3.0054, + "step": 3682 + }, + { + "epoch": 5.8928, + "grad_norm": 0.0, + "learning_rate": 0.00035219999999999994, + "loss": 3.3124, + "step": 3683 + }, + { + "epoch": 5.8944, + "grad_norm": 0.0, + "learning_rate": 0.00035204999999999996, + "loss": 2.5724, + "step": 3684 + }, + { + "epoch": 5.896, + "grad_norm": 1.967746615409851, + "learning_rate": 0.0003519, + "loss": 3.026, + "step": 3685 + }, + { + "epoch": 5.8976, + "grad_norm": 0.0, + "learning_rate": 0.00035175, + "loss": 2.5008, + "step": 3686 + }, + { + "epoch": 5.8992, + "grad_norm": 0.0, + "learning_rate": 0.0003515999999999999, + "loss": 2.7721, + "step": 3687 + }, + { + "epoch": 5.9008, + "grad_norm": 0.0, + "learning_rate": 0.00035144999999999995, + "loss": 2.6447, + "step": 3688 + }, + { + "epoch": 5.9024, + "grad_norm": 0.0, + "learning_rate": 0.0003513, + "loss": 2.1775, + "step": 3689 + }, + { + "epoch": 5.904, + "grad_norm": 5.827794551849365, + "learning_rate": 0.00035115, + "loss": 2.6886, + "step": 3690 + }, + { + "epoch": 5.9056, + "grad_norm": 0.3083203434944153, + "learning_rate": 0.00035099999999999997, + "loss": 2.7191, + "step": 3691 + }, + { + "epoch": 5.9072, + "grad_norm": 0.0, + "learning_rate": 0.00035085, + "loss": 2.4568, + "step": 3692 + }, + { + "epoch": 5.9088, + "grad_norm": 0.0, + "learning_rate": 0.00035069999999999996, + "loss": 2.2762, + "step": 3693 + }, + { + "epoch": 5.9104, + "grad_norm": 0.0, + "learning_rate": 0.00035055, + "loss": 2.6573, + "step": 3694 + }, + { + "epoch": 5.912, + "grad_norm": 0.0, + "learning_rate": 0.00035039999999999995, + "loss": 2.92, + "step": 3695 + }, + { + "epoch": 5.9136, + "grad_norm": 0.0, + "learning_rate": 0.00035025, + "loss": 2.3955, + "step": 3696 + }, + { + "epoch": 5.9152000000000005, + "grad_norm": 0.0, + "learning_rate": 0.0003501, + "loss": 2.8471, + "step": 3697 + }, + { + "epoch": 5.9168, + "grad_norm": 0.0, + "learning_rate": 0.00034995, + "loss": 2.3392, + "step": 3698 + }, + { + "epoch": 5.9184, + "grad_norm": 0.0, + "learning_rate": 0.00034979999999999994, + "loss": 2.4649, + "step": 3699 + }, + { + "epoch": 5.92, + "grad_norm": 0.0, + "learning_rate": 0.00034964999999999996, + "loss": 2.9227, + "step": 3700 + }, + { + "epoch": 5.9216, + "grad_norm": 0.0, + "learning_rate": 0.0003495, + "loss": 2.2749, + "step": 3701 + }, + { + "epoch": 5.9232, + "grad_norm": 0.0, + "learning_rate": 0.00034935, + "loss": 2.2452, + "step": 3702 + }, + { + "epoch": 5.9248, + "grad_norm": 0.0, + "learning_rate": 0.0003491999999999999, + "loss": 2.3583, + "step": 3703 + }, + { + "epoch": 5.9264, + "grad_norm": 0.0, + "learning_rate": 0.00034904999999999995, + "loss": 2.7241, + "step": 3704 + }, + { + "epoch": 5.928, + "grad_norm": 0.5584777593612671, + "learning_rate": 0.00034889999999999997, + "loss": 2.6595, + "step": 3705 + }, + { + "epoch": 5.9296, + "grad_norm": 0.4647485315799713, + "learning_rate": 0.00034875, + "loss": 3.1368, + "step": 3706 + }, + { + "epoch": 5.9312000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00034859999999999996, + "loss": 2.6697, + "step": 3707 + }, + { + "epoch": 5.9328, + "grad_norm": 0.32228565216064453, + "learning_rate": 0.00034845, + "loss": 3.4953, + "step": 3708 + }, + { + "epoch": 5.9344, + "grad_norm": 0.7425710558891296, + "learning_rate": 0.00034829999999999996, + "loss": 2.7956, + "step": 3709 + }, + { + "epoch": 5.936, + "grad_norm": 0.0, + "learning_rate": 0.00034815, + "loss": 2.9307, + "step": 3710 + }, + { + "epoch": 5.9376, + "grad_norm": 1.3797106742858887, + "learning_rate": 0.00034799999999999995, + "loss": 3.6561, + "step": 3711 + }, + { + "epoch": 5.9392, + "grad_norm": 0.0, + "learning_rate": 0.00034784999999999997, + "loss": 2.8536, + "step": 3712 + }, + { + "epoch": 5.9408, + "grad_norm": 0.0, + "learning_rate": 0.0003477, + "loss": 3.0708, + "step": 3713 + }, + { + "epoch": 5.9424, + "grad_norm": 0.6898555755615234, + "learning_rate": 0.00034755, + "loss": 2.8878, + "step": 3714 + }, + { + "epoch": 5.944, + "grad_norm": 0.3775753080844879, + "learning_rate": 0.00034739999999999993, + "loss": 2.4076, + "step": 3715 + }, + { + "epoch": 5.9456, + "grad_norm": 1.3761353492736816, + "learning_rate": 0.00034724999999999996, + "loss": 4.4482, + "step": 3716 + }, + { + "epoch": 5.9472000000000005, + "grad_norm": 1.383660912513733, + "learning_rate": 0.0003471, + "loss": 3.4384, + "step": 3717 + }, + { + "epoch": 5.9488, + "grad_norm": 0.6061517000198364, + "learning_rate": 0.00034695, + "loss": 3.4682, + "step": 3718 + }, + { + "epoch": 5.9504, + "grad_norm": 0.4879741072654724, + "learning_rate": 0.0003467999999999999, + "loss": 2.7721, + "step": 3719 + }, + { + "epoch": 5.952, + "grad_norm": 0.30795058608055115, + "learning_rate": 0.00034664999999999994, + "loss": 2.9665, + "step": 3720 + }, + { + "epoch": 5.9536, + "grad_norm": 0.7795411944389343, + "learning_rate": 0.00034649999999999997, + "loss": 3.0061, + "step": 3721 + }, + { + "epoch": 5.9552, + "grad_norm": 0.41862058639526367, + "learning_rate": 0.00034635, + "loss": 3.0164, + "step": 3722 + }, + { + "epoch": 5.9568, + "grad_norm": 0.9294652342796326, + "learning_rate": 0.00034619999999999996, + "loss": 3.2634, + "step": 3723 + }, + { + "epoch": 5.9584, + "grad_norm": 1.671874761581421, + "learning_rate": 0.00034605, + "loss": 2.8787, + "step": 3724 + }, + { + "epoch": 5.96, + "grad_norm": 1.3849936723709106, + "learning_rate": 0.00034589999999999995, + "loss": 3.0871, + "step": 3725 + }, + { + "epoch": 5.9616, + "grad_norm": 2.8756697177886963, + "learning_rate": 0.00034575, + "loss": 3.8418, + "step": 3726 + }, + { + "epoch": 5.9632, + "grad_norm": 5.069089412689209, + "learning_rate": 0.00034559999999999994, + "loss": 3.3647, + "step": 3727 + }, + { + "epoch": 5.9648, + "grad_norm": 0.0, + "learning_rate": 0.00034544999999999997, + "loss": 3.2391, + "step": 3728 + }, + { + "epoch": 5.9664, + "grad_norm": 0.6909729242324829, + "learning_rate": 0.0003453, + "loss": 3.0748, + "step": 3729 + }, + { + "epoch": 5.968, + "grad_norm": 0.24212801456451416, + "learning_rate": 0.00034515, + "loss": 3.2098, + "step": 3730 + }, + { + "epoch": 5.9696, + "grad_norm": 0.0, + "learning_rate": 0.00034499999999999993, + "loss": 3.4712, + "step": 3731 + }, + { + "epoch": 5.9712, + "grad_norm": 0.0, + "learning_rate": 0.00034484999999999995, + "loss": 2.919, + "step": 3732 + }, + { + "epoch": 5.9728, + "grad_norm": 0.0, + "learning_rate": 0.0003447, + "loss": 2.4222, + "step": 3733 + }, + { + "epoch": 5.9744, + "grad_norm": 0.0, + "learning_rate": 0.00034455, + "loss": 2.9553, + "step": 3734 + }, + { + "epoch": 5.976, + "grad_norm": 0.0, + "learning_rate": 0.00034439999999999997, + "loss": 2.2691, + "step": 3735 + }, + { + "epoch": 5.9776, + "grad_norm": 0.0, + "learning_rate": 0.00034424999999999994, + "loss": 2.6048, + "step": 3736 + }, + { + "epoch": 5.9792, + "grad_norm": 0.0, + "learning_rate": 0.00034409999999999996, + "loss": 3.181, + "step": 3737 + }, + { + "epoch": 5.9808, + "grad_norm": 0.5898072719573975, + "learning_rate": 0.00034395, + "loss": 2.2676, + "step": 3738 + }, + { + "epoch": 5.9824, + "grad_norm": 0.0, + "learning_rate": 0.00034379999999999995, + "loss": 2.8159, + "step": 3739 + }, + { + "epoch": 5.984, + "grad_norm": 0.0, + "learning_rate": 0.00034365, + "loss": 2.8115, + "step": 3740 + }, + { + "epoch": 5.9856, + "grad_norm": 3.8790807723999023, + "learning_rate": 0.0003435, + "loss": 3.8424, + "step": 3741 + }, + { + "epoch": 5.9872, + "grad_norm": 0.0, + "learning_rate": 0.00034334999999999997, + "loss": 3.3567, + "step": 3742 + }, + { + "epoch": 5.9888, + "grad_norm": 0.0, + "learning_rate": 0.00034319999999999994, + "loss": 3.3293, + "step": 3743 + }, + { + "epoch": 5.9904, + "grad_norm": 0.0, + "learning_rate": 0.00034304999999999996, + "loss": 2.1649, + "step": 3744 + }, + { + "epoch": 5.992, + "grad_norm": 0.0, + "learning_rate": 0.0003429, + "loss": 2.4547, + "step": 3745 + }, + { + "epoch": 5.9936, + "grad_norm": 1.2165772914886475, + "learning_rate": 0.00034275, + "loss": 3.6393, + "step": 3746 + }, + { + "epoch": 5.9952, + "grad_norm": 0.3224068582057953, + "learning_rate": 0.0003425999999999999, + "loss": 2.719, + "step": 3747 + }, + { + "epoch": 5.9968, + "grad_norm": 0.7286518216133118, + "learning_rate": 0.00034244999999999995, + "loss": 2.3758, + "step": 3748 + }, + { + "epoch": 5.9984, + "grad_norm": 1.9613587856292725, + "learning_rate": 0.00034229999999999997, + "loss": 3.2195, + "step": 3749 + }, + { + "epoch": 6.0, + "grad_norm": 1.1014471054077148, + "learning_rate": 0.00034215, + "loss": 2.6696, + "step": 3750 + }, + { + "epoch": 6.0016, + "grad_norm": 0.0, + "learning_rate": 0.00034199999999999996, + "loss": 4.3507, + "step": 3751 + }, + { + "epoch": 6.0032, + "grad_norm": 2.0164687633514404, + "learning_rate": 0.00034184999999999993, + "loss": 3.3653, + "step": 3752 + }, + { + "epoch": 6.0048, + "grad_norm": 0.0, + "learning_rate": 0.00034169999999999996, + "loss": 3.2864, + "step": 3753 + }, + { + "epoch": 6.0064, + "grad_norm": 0.0, + "learning_rate": 0.00034155, + "loss": 3.4196, + "step": 3754 + }, + { + "epoch": 6.008, + "grad_norm": 0.8604856729507446, + "learning_rate": 0.00034139999999999995, + "loss": 3.6826, + "step": 3755 + }, + { + "epoch": 6.0096, + "grad_norm": 0.0, + "learning_rate": 0.00034125, + "loss": 3.0471, + "step": 3756 + }, + { + "epoch": 6.0112, + "grad_norm": 4.856592178344727, + "learning_rate": 0.0003411, + "loss": 3.9458, + "step": 3757 + }, + { + "epoch": 6.0128, + "grad_norm": 0.0, + "learning_rate": 0.00034094999999999997, + "loss": 3.4694, + "step": 3758 + }, + { + "epoch": 6.0144, + "grad_norm": 0.0, + "learning_rate": 0.00034079999999999994, + "loss": 3.995, + "step": 3759 + }, + { + "epoch": 6.016, + "grad_norm": 0.0, + "learning_rate": 0.00034064999999999996, + "loss": 3.4592, + "step": 3760 + }, + { + "epoch": 6.0176, + "grad_norm": 0.0, + "learning_rate": 0.0003405, + "loss": 3.7419, + "step": 3761 + }, + { + "epoch": 6.0192, + "grad_norm": 0.0, + "learning_rate": 0.00034035, + "loss": 3.5671, + "step": 3762 + }, + { + "epoch": 6.0208, + "grad_norm": 0.0, + "learning_rate": 0.0003401999999999999, + "loss": 4.0311, + "step": 3763 + }, + { + "epoch": 6.0224, + "grad_norm": 0.0, + "learning_rate": 0.00034004999999999994, + "loss": 3.654, + "step": 3764 + }, + { + "epoch": 6.024, + "grad_norm": 0.0, + "learning_rate": 0.00033989999999999997, + "loss": 3.0512, + "step": 3765 + }, + { + "epoch": 6.0256, + "grad_norm": 0.0, + "learning_rate": 0.00033975, + "loss": 3.817, + "step": 3766 + }, + { + "epoch": 6.0272, + "grad_norm": 0.0, + "learning_rate": 0.00033959999999999996, + "loss": 4.0627, + "step": 3767 + }, + { + "epoch": 6.0288, + "grad_norm": 0.0, + "learning_rate": 0.00033944999999999993, + "loss": 4.4509, + "step": 3768 + }, + { + "epoch": 6.0304, + "grad_norm": 0.0, + "learning_rate": 0.00033929999999999995, + "loss": 3.3457, + "step": 3769 + }, + { + "epoch": 6.032, + "grad_norm": 0.0, + "learning_rate": 0.00033915, + "loss": 4.3662, + "step": 3770 + }, + { + "epoch": 6.0336, + "grad_norm": 0.0, + "learning_rate": 0.00033899999999999995, + "loss": 4.1323, + "step": 3771 + }, + { + "epoch": 6.0352, + "grad_norm": 0.27863427996635437, + "learning_rate": 0.00033884999999999997, + "loss": 3.6353, + "step": 3772 + }, + { + "epoch": 6.0368, + "grad_norm": 0.0, + "learning_rate": 0.0003387, + "loss": 3.9024, + "step": 3773 + }, + { + "epoch": 6.0384, + "grad_norm": 1.2410098314285278, + "learning_rate": 0.00033854999999999996, + "loss": 3.7343, + "step": 3774 + }, + { + "epoch": 6.04, + "grad_norm": 0.0, + "learning_rate": 0.00033839999999999993, + "loss": 3.529, + "step": 3775 + }, + { + "epoch": 6.0416, + "grad_norm": 0.0, + "learning_rate": 0.00033824999999999995, + "loss": 4.2161, + "step": 3776 + }, + { + "epoch": 6.0432, + "grad_norm": 0.49260056018829346, + "learning_rate": 0.0003381, + "loss": 4.5646, + "step": 3777 + }, + { + "epoch": 6.0448, + "grad_norm": 0.0, + "learning_rate": 0.00033795, + "loss": 3.6236, + "step": 3778 + }, + { + "epoch": 6.0464, + "grad_norm": 0.7572435140609741, + "learning_rate": 0.0003377999999999999, + "loss": 3.7258, + "step": 3779 + }, + { + "epoch": 6.048, + "grad_norm": 0.0, + "learning_rate": 0.00033764999999999994, + "loss": 3.9813, + "step": 3780 + }, + { + "epoch": 6.0496, + "grad_norm": 0.10314198583364487, + "learning_rate": 0.00033749999999999996, + "loss": 3.7357, + "step": 3781 + }, + { + "epoch": 6.0512, + "grad_norm": 0.7457315921783447, + "learning_rate": 0.00033735, + "loss": 3.8844, + "step": 3782 + }, + { + "epoch": 6.0528, + "grad_norm": 0.0, + "learning_rate": 0.0003372, + "loss": 4.0611, + "step": 3783 + }, + { + "epoch": 6.0544, + "grad_norm": 0.0, + "learning_rate": 0.00033705, + "loss": 3.2851, + "step": 3784 + }, + { + "epoch": 6.056, + "grad_norm": 0.45380157232284546, + "learning_rate": 0.00033689999999999995, + "loss": 3.5424, + "step": 3785 + }, + { + "epoch": 6.0576, + "grad_norm": 0.24705453217029572, + "learning_rate": 0.00033674999999999997, + "loss": 3.7474, + "step": 3786 + }, + { + "epoch": 6.0592, + "grad_norm": 0.26397302746772766, + "learning_rate": 0.0003366, + "loss": 3.1184, + "step": 3787 + }, + { + "epoch": 6.0608, + "grad_norm": 0.4187479019165039, + "learning_rate": 0.00033644999999999997, + "loss": 3.7795, + "step": 3788 + }, + { + "epoch": 6.0624, + "grad_norm": 0.23197272419929504, + "learning_rate": 0.0003363, + "loss": 2.6915, + "step": 3789 + }, + { + "epoch": 6.064, + "grad_norm": 0.2890470027923584, + "learning_rate": 0.00033615, + "loss": 3.2712, + "step": 3790 + }, + { + "epoch": 6.0656, + "grad_norm": 0.8943280577659607, + "learning_rate": 0.000336, + "loss": 2.5483, + "step": 3791 + }, + { + "epoch": 6.0672, + "grad_norm": 0.8919143080711365, + "learning_rate": 0.00033584999999999995, + "loss": 2.3608, + "step": 3792 + }, + { + "epoch": 6.0688, + "grad_norm": 0.0, + "learning_rate": 0.0003357, + "loss": 2.5619, + "step": 3793 + }, + { + "epoch": 6.0704, + "grad_norm": 0.5600613951683044, + "learning_rate": 0.00033555, + "loss": 3.0782, + "step": 3794 + }, + { + "epoch": 6.072, + "grad_norm": 0.7343853712081909, + "learning_rate": 0.0003354, + "loss": 3.122, + "step": 3795 + }, + { + "epoch": 6.0736, + "grad_norm": 0.7387804985046387, + "learning_rate": 0.00033524999999999994, + "loss": 2.9913, + "step": 3796 + }, + { + "epoch": 6.0752, + "grad_norm": 1.676078200340271, + "learning_rate": 0.00033509999999999996, + "loss": 2.456, + "step": 3797 + }, + { + "epoch": 6.0768, + "grad_norm": 1.1079598665237427, + "learning_rate": 0.00033495, + "loss": 2.82, + "step": 3798 + }, + { + "epoch": 6.0784, + "grad_norm": 0.6876312494277954, + "learning_rate": 0.0003348, + "loss": 2.9411, + "step": 3799 + }, + { + "epoch": 6.08, + "grad_norm": NaN, + "learning_rate": 0.0003348, + "loss": 3.124, + "step": 3800 + }, + { + "epoch": 6.0816, + "grad_norm": 0.0, + "learning_rate": 0.00033465, + "loss": 4.1995, + "step": 3801 + }, + { + "epoch": 6.0832, + "grad_norm": 0.0, + "learning_rate": 0.00033449999999999994, + "loss": 3.8703, + "step": 3802 + }, + { + "epoch": 6.0848, + "grad_norm": 0.0, + "learning_rate": 0.00033434999999999997, + "loss": 3.5055, + "step": 3803 + }, + { + "epoch": 6.0864, + "grad_norm": 0.0, + "learning_rate": 0.0003342, + "loss": 2.6898, + "step": 3804 + }, + { + "epoch": 6.088, + "grad_norm": 0.0, + "learning_rate": 0.00033404999999999996, + "loss": 3.3677, + "step": 3805 + }, + { + "epoch": 6.0896, + "grad_norm": 0.0, + "learning_rate": 0.0003339, + "loss": 3.135, + "step": 3806 + }, + { + "epoch": 6.0912, + "grad_norm": 0.0, + "learning_rate": 0.00033375, + "loss": 3.8218, + "step": 3807 + }, + { + "epoch": 6.0928, + "grad_norm": 0.0, + "learning_rate": 0.0003336, + "loss": 3.883, + "step": 3808 + }, + { + "epoch": 6.0944, + "grad_norm": 0.0, + "learning_rate": 0.00033344999999999995, + "loss": 3.4605, + "step": 3809 + }, + { + "epoch": 6.096, + "grad_norm": 0.0, + "learning_rate": 0.00033329999999999997, + "loss": 4.6752, + "step": 3810 + }, + { + "epoch": 6.0976, + "grad_norm": 0.0, + "learning_rate": 0.00033315, + "loss": 3.5125, + "step": 3811 + }, + { + "epoch": 6.0992, + "grad_norm": 0.0, + "learning_rate": 0.000333, + "loss": 4.1268, + "step": 3812 + }, + { + "epoch": 6.1008, + "grad_norm": 0.0, + "learning_rate": 0.00033284999999999993, + "loss": 3.7619, + "step": 3813 + }, + { + "epoch": 6.1024, + "grad_norm": 0.0, + "learning_rate": 0.00033269999999999996, + "loss": 4.4734, + "step": 3814 + }, + { + "epoch": 6.104, + "grad_norm": 0.0, + "learning_rate": 0.00033255, + "loss": 3.1928, + "step": 3815 + }, + { + "epoch": 6.1056, + "grad_norm": 0.0, + "learning_rate": 0.0003324, + "loss": 3.406, + "step": 3816 + }, + { + "epoch": 6.1072, + "grad_norm": 0.0, + "learning_rate": 0.00033224999999999997, + "loss": 4.1266, + "step": 3817 + }, + { + "epoch": 6.1088, + "grad_norm": 0.0, + "learning_rate": 0.00033209999999999994, + "loss": 4.3521, + "step": 3818 + }, + { + "epoch": 6.1104, + "grad_norm": 0.0, + "learning_rate": 0.00033194999999999996, + "loss": 3.9209, + "step": 3819 + }, + { + "epoch": 6.112, + "grad_norm": 0.0, + "learning_rate": 0.0003318, + "loss": 3.5629, + "step": 3820 + }, + { + "epoch": 6.1136, + "grad_norm": 1.4520596265792847, + "learning_rate": 0.00033164999999999996, + "loss": 4.3292, + "step": 3821 + }, + { + "epoch": 6.1152, + "grad_norm": 0.0, + "learning_rate": 0.0003315, + "loss": 3.9747, + "step": 3822 + }, + { + "epoch": 6.1168, + "grad_norm": 0.0, + "learning_rate": 0.00033135, + "loss": 3.5212, + "step": 3823 + }, + { + "epoch": 6.1184, + "grad_norm": 0.0, + "learning_rate": 0.0003312, + "loss": 4.3496, + "step": 3824 + }, + { + "epoch": 6.12, + "grad_norm": 0.0, + "learning_rate": 0.00033104999999999994, + "loss": 4.2499, + "step": 3825 + }, + { + "epoch": 6.1216, + "grad_norm": 0.0, + "learning_rate": 0.00033089999999999997, + "loss": 4.6441, + "step": 3826 + }, + { + "epoch": 6.1232, + "grad_norm": 0.0, + "learning_rate": 0.00033075, + "loss": 4.1746, + "step": 3827 + }, + { + "epoch": 6.1248, + "grad_norm": 0.0, + "learning_rate": 0.0003306, + "loss": 4.3512, + "step": 3828 + }, + { + "epoch": 6.1264, + "grad_norm": 0.0, + "learning_rate": 0.00033044999999999993, + "loss": 3.6166, + "step": 3829 + }, + { + "epoch": 6.128, + "grad_norm": 0.0, + "learning_rate": 0.00033029999999999995, + "loss": 3.8687, + "step": 3830 + }, + { + "epoch": 6.1296, + "grad_norm": 0.0, + "learning_rate": 0.00033015, + "loss": 3.8298, + "step": 3831 + }, + { + "epoch": 6.1312, + "grad_norm": 0.0, + "learning_rate": 0.00033, + "loss": 3.991, + "step": 3832 + }, + { + "epoch": 6.1328, + "grad_norm": 0.0, + "learning_rate": 0.00032984999999999997, + "loss": 4.028, + "step": 3833 + }, + { + "epoch": 6.1344, + "grad_norm": 2.680119037628174, + "learning_rate": 0.0003297, + "loss": 3.7545, + "step": 3834 + }, + { + "epoch": 6.136, + "grad_norm": 0.0, + "learning_rate": 0.00032954999999999996, + "loss": 3.6376, + "step": 3835 + }, + { + "epoch": 6.1376, + "grad_norm": 0.0, + "learning_rate": 0.0003294, + "loss": 3.226, + "step": 3836 + }, + { + "epoch": 6.1392, + "grad_norm": 0.0, + "learning_rate": 0.00032924999999999995, + "loss": 3.8085, + "step": 3837 + }, + { + "epoch": 6.1408, + "grad_norm": 0.0, + "learning_rate": 0.0003291, + "loss": 3.4092, + "step": 3838 + }, + { + "epoch": 6.1424, + "grad_norm": 0.0, + "learning_rate": 0.00032895, + "loss": 3.266, + "step": 3839 + }, + { + "epoch": 6.144, + "grad_norm": 0.0, + "learning_rate": 0.0003288, + "loss": 4.4733, + "step": 3840 + }, + { + "epoch": 6.1456, + "grad_norm": 0.0, + "learning_rate": 0.00032864999999999994, + "loss": 3.4369, + "step": 3841 + }, + { + "epoch": 6.1472, + "grad_norm": 0.0, + "learning_rate": 0.00032849999999999996, + "loss": 3.8578, + "step": 3842 + }, + { + "epoch": 6.1488, + "grad_norm": 0.0, + "learning_rate": 0.00032835, + "loss": 3.9243, + "step": 3843 + }, + { + "epoch": 6.1504, + "grad_norm": 0.0, + "learning_rate": 0.0003282, + "loss": 3.7415, + "step": 3844 + }, + { + "epoch": 6.152, + "grad_norm": 0.0, + "learning_rate": 0.0003280499999999999, + "loss": 2.8646, + "step": 3845 + }, + { + "epoch": 6.1536, + "grad_norm": 0.0, + "learning_rate": 0.00032789999999999995, + "loss": 3.7953, + "step": 3846 + }, + { + "epoch": 6.1552, + "grad_norm": 0.2975400686264038, + "learning_rate": 0.00032774999999999997, + "loss": 3.0622, + "step": 3847 + }, + { + "epoch": 6.1568, + "grad_norm": 0.0, + "learning_rate": 0.0003276, + "loss": 3.1223, + "step": 3848 + }, + { + "epoch": 6.1584, + "grad_norm": 2.694124221801758, + "learning_rate": 0.00032744999999999996, + "loss": 4.4976, + "step": 3849 + }, + { + "epoch": 6.16, + "grad_norm": NaN, + "learning_rate": 0.00032744999999999996, + "loss": 3.6087, + "step": 3850 + }, + { + "epoch": 6.1616, + "grad_norm": 0.0, + "learning_rate": 0.0003273, + "loss": 7.9388, + "step": 3851 + }, + { + "epoch": 6.1632, + "grad_norm": 0.0, + "learning_rate": 0.00032714999999999996, + "loss": 5.2248, + "step": 3852 + }, + { + "epoch": 6.1648, + "grad_norm": 0.0, + "learning_rate": 0.000327, + "loss": 3.9534, + "step": 3853 + }, + { + "epoch": 6.1664, + "grad_norm": 0.0, + "learning_rate": 0.00032684999999999995, + "loss": 5.0484, + "step": 3854 + }, + { + "epoch": 6.168, + "grad_norm": 0.0, + "learning_rate": 0.00032669999999999997, + "loss": 3.3562, + "step": 3855 + }, + { + "epoch": 6.1696, + "grad_norm": 0.0, + "learning_rate": 0.00032655, + "loss": 3.0022, + "step": 3856 + }, + { + "epoch": 6.1712, + "grad_norm": 0.0, + "learning_rate": 0.0003264, + "loss": 3.0776, + "step": 3857 + }, + { + "epoch": 6.1728, + "grad_norm": 0.0, + "learning_rate": 0.00032624999999999993, + "loss": 4.0646, + "step": 3858 + }, + { + "epoch": 6.1744, + "grad_norm": 0.0, + "learning_rate": 0.00032609999999999996, + "loss": 2.6194, + "step": 3859 + }, + { + "epoch": 6.176, + "grad_norm": 0.0, + "learning_rate": 0.00032595, + "loss": 3.1227, + "step": 3860 + }, + { + "epoch": 6.1776, + "grad_norm": 0.0, + "learning_rate": 0.0003258, + "loss": 3.2917, + "step": 3861 + }, + { + "epoch": 6.1792, + "grad_norm": 0.0, + "learning_rate": 0.0003256499999999999, + "loss": 3.45, + "step": 3862 + }, + { + "epoch": 6.1808, + "grad_norm": 0.0, + "learning_rate": 0.00032549999999999994, + "loss": 2.762, + "step": 3863 + }, + { + "epoch": 6.1824, + "grad_norm": 0.0, + "learning_rate": 0.00032534999999999997, + "loss": 3.0529, + "step": 3864 + }, + { + "epoch": 6.184, + "grad_norm": 0.0, + "learning_rate": 0.0003252, + "loss": 2.7619, + "step": 3865 + }, + { + "epoch": 6.1856, + "grad_norm": 0.0, + "learning_rate": 0.00032504999999999996, + "loss": 3.0824, + "step": 3866 + }, + { + "epoch": 6.1872, + "grad_norm": 0.0, + "learning_rate": 0.0003249, + "loss": 3.1386, + "step": 3867 + }, + { + "epoch": 6.1888, + "grad_norm": 0.0, + "learning_rate": 0.00032474999999999995, + "loss": 2.5669, + "step": 3868 + }, + { + "epoch": 6.1904, + "grad_norm": 0.0, + "learning_rate": 0.0003246, + "loss": 3.0091, + "step": 3869 + }, + { + "epoch": 6.192, + "grad_norm": 0.0, + "learning_rate": 0.00032444999999999994, + "loss": 2.4578, + "step": 3870 + }, + { + "epoch": 6.1936, + "grad_norm": 0.0, + "learning_rate": 0.00032429999999999997, + "loss": 2.9423, + "step": 3871 + }, + { + "epoch": 6.1952, + "grad_norm": 0.0, + "learning_rate": 0.00032415, + "loss": 3.1918, + "step": 3872 + }, + { + "epoch": 6.1968, + "grad_norm": 0.0, + "learning_rate": 0.000324, + "loss": 2.8123, + "step": 3873 + }, + { + "epoch": 6.1984, + "grad_norm": 0.0, + "learning_rate": 0.00032384999999999993, + "loss": 3.1704, + "step": 3874 + }, + { + "epoch": 6.2, + "grad_norm": 0.0, + "learning_rate": 0.00032369999999999995, + "loss": 2.6135, + "step": 3875 + }, + { + "epoch": 6.2016, + "grad_norm": 0.0, + "learning_rate": 0.00032355, + "loss": 3.4235, + "step": 3876 + }, + { + "epoch": 6.2032, + "grad_norm": 0.0, + "learning_rate": 0.0003234, + "loss": 2.5807, + "step": 3877 + }, + { + "epoch": 6.2048, + "grad_norm": 0.0, + "learning_rate": 0.0003232499999999999, + "loss": 2.2725, + "step": 3878 + }, + { + "epoch": 6.2064, + "grad_norm": 0.0, + "learning_rate": 0.00032309999999999994, + "loss": 3.3268, + "step": 3879 + }, + { + "epoch": 6.208, + "grad_norm": 0.0, + "learning_rate": 0.00032294999999999996, + "loss": 2.7539, + "step": 3880 + }, + { + "epoch": 6.2096, + "grad_norm": 0.0, + "learning_rate": 0.0003228, + "loss": 2.1264, + "step": 3881 + }, + { + "epoch": 6.2112, + "grad_norm": 0.0, + "learning_rate": 0.00032264999999999996, + "loss": 2.7397, + "step": 3882 + }, + { + "epoch": 6.2128, + "grad_norm": 0.0, + "learning_rate": 0.0003225, + "loss": 2.8085, + "step": 3883 + }, + { + "epoch": 6.2144, + "grad_norm": 0.0, + "learning_rate": 0.00032235, + "loss": 2.5252, + "step": 3884 + }, + { + "epoch": 6.216, + "grad_norm": 0.0, + "learning_rate": 0.00032219999999999997, + "loss": 2.4381, + "step": 3885 + }, + { + "epoch": 6.2176, + "grad_norm": 0.0, + "learning_rate": 0.00032204999999999994, + "loss": 2.9784, + "step": 3886 + }, + { + "epoch": 6.2192, + "grad_norm": 0.0, + "learning_rate": 0.00032189999999999996, + "loss": 2.8678, + "step": 3887 + }, + { + "epoch": 6.2208, + "grad_norm": 0.0, + "learning_rate": 0.00032175, + "loss": 2.3483, + "step": 3888 + }, + { + "epoch": 6.2224, + "grad_norm": 0.0, + "learning_rate": 0.0003216, + "loss": 2.6857, + "step": 3889 + }, + { + "epoch": 6.224, + "grad_norm": 0.0, + "learning_rate": 0.0003214499999999999, + "loss": 3.1904, + "step": 3890 + }, + { + "epoch": 6.2256, + "grad_norm": 0.0, + "learning_rate": 0.00032129999999999995, + "loss": 4.1369, + "step": 3891 + }, + { + "epoch": 6.2272, + "grad_norm": 0.0, + "learning_rate": 0.00032114999999999997, + "loss": 3.0205, + "step": 3892 + }, + { + "epoch": 6.2288, + "grad_norm": 0.0, + "learning_rate": 0.000321, + "loss": 2.5759, + "step": 3893 + }, + { + "epoch": 6.2304, + "grad_norm": 0.0, + "learning_rate": 0.00032084999999999997, + "loss": 2.9976, + "step": 3894 + }, + { + "epoch": 6.232, + "grad_norm": 0.0, + "learning_rate": 0.00032069999999999993, + "loss": 3.7825, + "step": 3895 + }, + { + "epoch": 6.2336, + "grad_norm": 0.0, + "learning_rate": 0.00032054999999999996, + "loss": 3.623, + "step": 3896 + }, + { + "epoch": 6.2352, + "grad_norm": 0.0, + "learning_rate": 0.0003204, + "loss": 3.3718, + "step": 3897 + }, + { + "epoch": 6.2368, + "grad_norm": 0.0, + "learning_rate": 0.00032024999999999995, + "loss": 2.7264, + "step": 3898 + }, + { + "epoch": 6.2384, + "grad_norm": 0.0, + "learning_rate": 0.0003201, + "loss": 4.1795, + "step": 3899 + }, + { + "epoch": 6.24, + "grad_norm": 0.0, + "learning_rate": 0.00031995, + "loss": 3.4573, + "step": 3900 + }, + { + "epoch": 6.2416, + "grad_norm": 0.0, + "learning_rate": 0.00031979999999999997, + "loss": 3.8604, + "step": 3901 + }, + { + "epoch": 6.2432, + "grad_norm": 0.0, + "learning_rate": 0.00031964999999999994, + "loss": 3.1814, + "step": 3902 + }, + { + "epoch": 6.2448, + "grad_norm": 0.0, + "learning_rate": 0.00031949999999999996, + "loss": 3.407, + "step": 3903 + }, + { + "epoch": 6.2464, + "grad_norm": 0.0, + "learning_rate": 0.00031935, + "loss": 3.8178, + "step": 3904 + }, + { + "epoch": 6.248, + "grad_norm": 0.0, + "learning_rate": 0.0003192, + "loss": 3.2901, + "step": 3905 + }, + { + "epoch": 6.2496, + "grad_norm": 0.0, + "learning_rate": 0.0003190499999999999, + "loss": 5.0435, + "step": 3906 + }, + { + "epoch": 6.2512, + "grad_norm": 0.0, + "learning_rate": 0.00031889999999999995, + "loss": 3.0831, + "step": 3907 + }, + { + "epoch": 6.2528, + "grad_norm": 0.0, + "learning_rate": 0.00031874999999999997, + "loss": 3.5521, + "step": 3908 + }, + { + "epoch": 6.2544, + "grad_norm": 0.0, + "learning_rate": 0.0003186, + "loss": 2.5273, + "step": 3909 + }, + { + "epoch": 6.256, + "grad_norm": 0.0, + "learning_rate": 0.00031845, + "loss": 2.1646, + "step": 3910 + }, + { + "epoch": 6.2576, + "grad_norm": 0.0, + "learning_rate": 0.00031829999999999993, + "loss": 2.8208, + "step": 3911 + }, + { + "epoch": 6.2592, + "grad_norm": 0.0, + "learning_rate": 0.00031814999999999995, + "loss": 3.3235, + "step": 3912 + }, + { + "epoch": 6.2608, + "grad_norm": 0.0, + "learning_rate": 0.000318, + "loss": 2.6847, + "step": 3913 + }, + { + "epoch": 6.2624, + "grad_norm": 0.0, + "learning_rate": 0.00031785, + "loss": 2.4997, + "step": 3914 + }, + { + "epoch": 6.264, + "grad_norm": 0.0, + "learning_rate": 0.00031769999999999997, + "loss": 2.3451, + "step": 3915 + }, + { + "epoch": 6.2656, + "grad_norm": 0.0, + "learning_rate": 0.00031755, + "loss": 3.2673, + "step": 3916 + }, + { + "epoch": 6.2672, + "grad_norm": 0.0, + "learning_rate": 0.00031739999999999996, + "loss": 2.8985, + "step": 3917 + }, + { + "epoch": 6.2688, + "grad_norm": 0.0, + "learning_rate": 0.00031725, + "loss": 2.4833, + "step": 3918 + }, + { + "epoch": 6.2704, + "grad_norm": 0.0, + "learning_rate": 0.00031709999999999996, + "loss": 2.8448, + "step": 3919 + }, + { + "epoch": 6.272, + "grad_norm": 0.0, + "learning_rate": 0.00031695, + "loss": 2.5166, + "step": 3920 + }, + { + "epoch": 6.2736, + "grad_norm": 0.0, + "learning_rate": 0.0003168, + "loss": 3.3755, + "step": 3921 + }, + { + "epoch": 6.2752, + "grad_norm": 0.0, + "learning_rate": 0.00031665, + "loss": 2.3369, + "step": 3922 + }, + { + "epoch": 6.2768, + "grad_norm": 0.0, + "learning_rate": 0.00031649999999999994, + "loss": 2.9503, + "step": 3923 + }, + { + "epoch": 6.2783999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00031634999999999996, + "loss": 2.8635, + "step": 3924 + }, + { + "epoch": 6.28, + "grad_norm": 0.0, + "learning_rate": 0.0003162, + "loss": 2.9627, + "step": 3925 + }, + { + "epoch": 6.2816, + "grad_norm": 0.0, + "learning_rate": 0.00031605, + "loss": 2.7479, + "step": 3926 + }, + { + "epoch": 6.2832, + "grad_norm": 0.0, + "learning_rate": 0.0003158999999999999, + "loss": 2.2887, + "step": 3927 + }, + { + "epoch": 6.2848, + "grad_norm": 0.0, + "learning_rate": 0.00031574999999999995, + "loss": 3.7974, + "step": 3928 + }, + { + "epoch": 6.2864, + "grad_norm": 0.0, + "learning_rate": 0.0003156, + "loss": 3.4341, + "step": 3929 + }, + { + "epoch": 6.288, + "grad_norm": 0.0, + "learning_rate": 0.00031545, + "loss": 2.3601, + "step": 3930 + }, + { + "epoch": 6.2896, + "grad_norm": 0.0, + "learning_rate": 0.00031529999999999997, + "loss": 3.3701, + "step": 3931 + }, + { + "epoch": 6.2912, + "grad_norm": 0.0, + "learning_rate": 0.00031515, + "loss": 3.1594, + "step": 3932 + }, + { + "epoch": 6.2928, + "grad_norm": 0.7066333889961243, + "learning_rate": 0.00031499999999999996, + "loss": 2.9424, + "step": 3933 + }, + { + "epoch": 6.2943999999999996, + "grad_norm": 0.0, + "learning_rate": 0.00031485, + "loss": 2.5434, + "step": 3934 + }, + { + "epoch": 6.296, + "grad_norm": 0.0, + "learning_rate": 0.00031469999999999995, + "loss": 2.9575, + "step": 3935 + }, + { + "epoch": 6.2976, + "grad_norm": 0.0, + "learning_rate": 0.00031455, + "loss": 2.647, + "step": 3936 + }, + { + "epoch": 6.2992, + "grad_norm": 0.0, + "learning_rate": 0.0003144, + "loss": 2.5617, + "step": 3937 + }, + { + "epoch": 6.3008, + "grad_norm": 0.0, + "learning_rate": 0.00031425, + "loss": 4.2255, + "step": 3938 + }, + { + "epoch": 6.3024000000000004, + "grad_norm": 0.0, + "learning_rate": 0.00031409999999999994, + "loss": 4.08, + "step": 3939 + }, + { + "epoch": 6.304, + "grad_norm": 0.0, + "learning_rate": 0.00031394999999999996, + "loss": 2.8675, + "step": 3940 + }, + { + "epoch": 6.3056, + "grad_norm": 0.0, + "learning_rate": 0.0003138, + "loss": 2.7682, + "step": 3941 + }, + { + "epoch": 6.3072, + "grad_norm": 0.0, + "learning_rate": 0.00031365, + "loss": 2.8993, + "step": 3942 + }, + { + "epoch": 6.3088, + "grad_norm": 0.0, + "learning_rate": 0.0003135, + "loss": 3.5349, + "step": 3943 + }, + { + "epoch": 6.3104, + "grad_norm": 0.0, + "learning_rate": 0.00031334999999999995, + "loss": 3.5218, + "step": 3944 + }, + { + "epoch": 6.312, + "grad_norm": 0.0, + "learning_rate": 0.00031319999999999997, + "loss": 3.2293, + "step": 3945 + }, + { + "epoch": 6.3136, + "grad_norm": 0.0, + "learning_rate": 0.00031305, + "loss": 2.6338, + "step": 3946 + }, + { + "epoch": 6.3152, + "grad_norm": 0.0, + "learning_rate": 0.00031289999999999996, + "loss": 3.2263, + "step": 3947 + }, + { + "epoch": 6.3168, + "grad_norm": 0.0, + "learning_rate": 0.00031275, + "loss": 2.486, + "step": 3948 + }, + { + "epoch": 6.3184000000000005, + "grad_norm": 0.0, + "learning_rate": 0.0003126, + "loss": 3.832, + "step": 3949 + }, + { + "epoch": 6.32, + "grad_norm": 0.0, + "learning_rate": 0.00031245, + "loss": 3.0282, + "step": 3950 + }, + { + "epoch": 6.3216, + "grad_norm": 0.0, + "learning_rate": 0.00031229999999999995, + "loss": 4.0251, + "step": 3951 + }, + { + "epoch": 6.3232, + "grad_norm": 0.0, + "learning_rate": 0.00031214999999999997, + "loss": 4.8373, + "step": 3952 + }, + { + "epoch": 6.3248, + "grad_norm": 0.0, + "learning_rate": 0.000312, + "loss": 3.9881, + "step": 3953 + }, + { + "epoch": 6.3264, + "grad_norm": 0.0, + "learning_rate": 0.00031185, + "loss": 3.7715, + "step": 3954 + }, + { + "epoch": 6.328, + "grad_norm": 0.0, + "learning_rate": 0.00031169999999999993, + "loss": 3.0073, + "step": 3955 + }, + { + "epoch": 6.3296, + "grad_norm": 0.0, + "learning_rate": 0.00031154999999999996, + "loss": 3.559, + "step": 3956 + }, + { + "epoch": 6.3312, + "grad_norm": 0.0, + "learning_rate": 0.0003114, + "loss": 3.7248, + "step": 3957 + }, + { + "epoch": 6.3328, + "grad_norm": 0.0, + "learning_rate": 0.00031125, + "loss": 3.5672, + "step": 3958 + }, + { + "epoch": 6.3344, + "grad_norm": 0.0, + "learning_rate": 0.00031109999999999997, + "loss": 2.905, + "step": 3959 + }, + { + "epoch": 6.336, + "grad_norm": 0.0, + "learning_rate": 0.00031094999999999994, + "loss": 3.4616, + "step": 3960 + }, + { + "epoch": 6.3376, + "grad_norm": 0.0, + "learning_rate": 0.00031079999999999997, + "loss": 3.2915, + "step": 3961 + }, + { + "epoch": 6.3392, + "grad_norm": 0.0, + "learning_rate": 0.00031065, + "loss": 3.3367, + "step": 3962 + }, + { + "epoch": 6.3408, + "grad_norm": 0.0, + "learning_rate": 0.00031049999999999996, + "loss": 2.7731, + "step": 3963 + }, + { + "epoch": 6.3424, + "grad_norm": 0.0, + "learning_rate": 0.00031035, + "loss": 2.9709, + "step": 3964 + }, + { + "epoch": 6.344, + "grad_norm": 0.0, + "learning_rate": 0.0003102, + "loss": 3.2344, + "step": 3965 + }, + { + "epoch": 6.3456, + "grad_norm": 0.0, + "learning_rate": 0.00031005, + "loss": 3.5747, + "step": 3966 + }, + { + "epoch": 6.3472, + "grad_norm": 0.0, + "learning_rate": 0.00030989999999999994, + "loss": 3.3665, + "step": 3967 + }, + { + "epoch": 6.3488, + "grad_norm": 0.0, + "learning_rate": 0.00030974999999999997, + "loss": 3.1321, + "step": 3968 + }, + { + "epoch": 6.3504, + "grad_norm": 0.0, + "learning_rate": 0.0003096, + "loss": 2.6795, + "step": 3969 + }, + { + "epoch": 6.352, + "grad_norm": 0.0, + "learning_rate": 0.00030945, + "loss": 3.3899, + "step": 3970 + }, + { + "epoch": 6.3536, + "grad_norm": 0.0, + "learning_rate": 0.00030929999999999993, + "loss": 2.9092, + "step": 3971 + }, + { + "epoch": 6.3552, + "grad_norm": 0.0, + "learning_rate": 0.00030914999999999995, + "loss": 4.6425, + "step": 3972 + }, + { + "epoch": 6.3568, + "grad_norm": 0.0, + "learning_rate": 0.000309, + "loss": 3.0781, + "step": 3973 + }, + { + "epoch": 6.3584, + "grad_norm": 0.0, + "learning_rate": 0.00030885, + "loss": 2.8645, + "step": 3974 + }, + { + "epoch": 6.36, + "grad_norm": 0.0, + "learning_rate": 0.00030869999999999997, + "loss": 2.3022, + "step": 3975 + }, + { + "epoch": 6.3616, + "grad_norm": 0.0, + "learning_rate": 0.00030854999999999994, + "loss": 2.7279, + "step": 3976 + }, + { + "epoch": 6.3632, + "grad_norm": 0.0, + "learning_rate": 0.00030839999999999996, + "loss": 2.7442, + "step": 3977 + }, + { + "epoch": 6.3648, + "grad_norm": 0.0, + "learning_rate": 0.00030825, + "loss": 3.519, + "step": 3978 + }, + { + "epoch": 6.3664, + "grad_norm": 0.0, + "learning_rate": 0.00030809999999999995, + "loss": 2.9618, + "step": 3979 + }, + { + "epoch": 6.368, + "grad_norm": 0.0, + "learning_rate": 0.00030795, + "loss": 3.3267, + "step": 3980 + }, + { + "epoch": 6.3696, + "grad_norm": 0.0, + "learning_rate": 0.0003078, + "loss": 2.9491, + "step": 3981 + }, + { + "epoch": 6.3712, + "grad_norm": 3.947984457015991, + "learning_rate": 0.00030764999999999997, + "loss": 3.9896, + "step": 3982 + }, + { + "epoch": 6.3728, + "grad_norm": 0.0, + "learning_rate": 0.00030749999999999994, + "loss": 3.7426, + "step": 3983 + }, + { + "epoch": 6.3744, + "grad_norm": 0.0, + "learning_rate": 0.00030734999999999996, + "loss": 2.8783, + "step": 3984 + }, + { + "epoch": 6.376, + "grad_norm": 0.0, + "learning_rate": 0.0003072, + "loss": 2.4963, + "step": 3985 + }, + { + "epoch": 6.3776, + "grad_norm": 0.0, + "learning_rate": 0.00030705, + "loss": 3.3298, + "step": 3986 + }, + { + "epoch": 6.3792, + "grad_norm": 0.0, + "learning_rate": 0.0003068999999999999, + "loss": 3.0442, + "step": 3987 + }, + { + "epoch": 6.3808, + "grad_norm": 0.0, + "learning_rate": 0.00030674999999999995, + "loss": 2.7951, + "step": 3988 + }, + { + "epoch": 6.3824, + "grad_norm": 0.0, + "learning_rate": 0.00030659999999999997, + "loss": 3.2513, + "step": 3989 + }, + { + "epoch": 6.384, + "grad_norm": 0.0, + "learning_rate": 0.00030645, + "loss": 3.1854, + "step": 3990 + }, + { + "epoch": 6.3856, + "grad_norm": 0.0, + "learning_rate": 0.00030629999999999996, + "loss": 2.8245, + "step": 3991 + }, + { + "epoch": 6.3872, + "grad_norm": 0.0, + "learning_rate": 0.00030615, + "loss": 3.0166, + "step": 3992 + }, + { + "epoch": 6.3888, + "grad_norm": 0.0, + "learning_rate": 0.00030599999999999996, + "loss": 3.1391, + "step": 3993 + }, + { + "epoch": 6.3904, + "grad_norm": 0.0, + "learning_rate": 0.00030585, + "loss": 3.8282, + "step": 3994 + }, + { + "epoch": 6.392, + "grad_norm": 0.0, + "learning_rate": 0.00030569999999999995, + "loss": 2.4871, + "step": 3995 + }, + { + "epoch": 6.3936, + "grad_norm": 0.0, + "learning_rate": 0.00030555, + "loss": 3.2249, + "step": 3996 + }, + { + "epoch": 6.3952, + "grad_norm": 0.0, + "learning_rate": 0.0003054, + "loss": 3.1221, + "step": 3997 + }, + { + "epoch": 6.3968, + "grad_norm": 0.0, + "learning_rate": 0.00030525, + "loss": 3.1722, + "step": 3998 + }, + { + "epoch": 6.3984, + "grad_norm": 0.0, + "learning_rate": 0.00030509999999999994, + "loss": 3.5031, + "step": 3999 + }, + { + "epoch": 6.4, + "grad_norm": NaN, + "learning_rate": 0.00030509999999999994, + "loss": 3.3134, + "step": 4000 + }, + { + "epoch": 6.4, + "eval_cer": 0.5204408657870611, + "eval_loss": 3.302504777908325, + "eval_runtime": 159.6303, + "eval_samples_per_second": 19.645, + "eval_steps_per_second": 1.228, + "eval_wer": 0.8032742720961559, + "step": 4000 + }, + { + "epoch": 6.4016, + "grad_norm": 0.0, + "learning_rate": 0.00030494999999999996, + "loss": 3.7485, + "step": 4001 + }, + { + "epoch": 6.4032, + "grad_norm": 0.0, + "learning_rate": 0.0003048, + "loss": 3.6302, + "step": 4002 + }, + { + "epoch": 6.4048, + "grad_norm": 0.0, + "learning_rate": 0.00030465, + "loss": 3.312, + "step": 4003 + }, + { + "epoch": 6.4064, + "grad_norm": 0.0, + "learning_rate": 0.0003044999999999999, + "loss": 4.4313, + "step": 4004 + }, + { + "epoch": 6.408, + "grad_norm": 0.0, + "learning_rate": 0.00030434999999999994, + "loss": 3.9922, + "step": 4005 + }, + { + "epoch": 6.4096, + "grad_norm": 0.0, + "learning_rate": 0.00030419999999999997, + "loss": 3.5101, + "step": 4006 + }, + { + "epoch": 6.4112, + "grad_norm": 0.0, + "learning_rate": 0.00030405, + "loss": 3.2452, + "step": 4007 + }, + { + "epoch": 6.4128, + "grad_norm": 0.0, + "learning_rate": 0.00030389999999999996, + "loss": 3.2645, + "step": 4008 + }, + { + "epoch": 6.4144, + "grad_norm": 0.0, + "learning_rate": 0.00030375, + "loss": 3.4592, + "step": 4009 + }, + { + "epoch": 6.416, + "grad_norm": 0.0, + "learning_rate": 0.00030359999999999995, + "loss": 4.1544, + "step": 4010 + }, + { + "epoch": 6.4176, + "grad_norm": 0.0, + "learning_rate": 0.00030345, + "loss": 3.4224, + "step": 4011 + }, + { + "epoch": 6.4192, + "grad_norm": 0.0, + "learning_rate": 0.00030329999999999995, + "loss": 2.963, + "step": 4012 + }, + { + "epoch": 6.4208, + "grad_norm": 0.0, + "learning_rate": 0.00030314999999999997, + "loss": 3.6251, + "step": 4013 + }, + { + "epoch": 6.4224, + "grad_norm": 0.0, + "learning_rate": 0.000303, + "loss": 2.4698, + "step": 4014 + }, + { + "epoch": 6.424, + "grad_norm": 0.0, + "learning_rate": 0.00030285, + "loss": 2.7059, + "step": 4015 + }, + { + "epoch": 6.4256, + "grad_norm": 0.0, + "learning_rate": 0.00030269999999999993, + "loss": 3.056, + "step": 4016 + }, + { + "epoch": 6.4272, + "grad_norm": 0.0, + "learning_rate": 0.00030254999999999995, + "loss": 3.0482, + "step": 4017 + }, + { + "epoch": 6.4288, + "grad_norm": 0.0, + "learning_rate": 0.0003024, + "loss": 2.4599, + "step": 4018 + }, + { + "epoch": 6.4304, + "grad_norm": 0.0, + "learning_rate": 0.00030225, + "loss": 2.67, + "step": 4019 + }, + { + "epoch": 6.432, + "grad_norm": 0.0, + "learning_rate": 0.0003020999999999999, + "loss": 3.273, + "step": 4020 + }, + { + "epoch": 6.4336, + "grad_norm": 0.0, + "learning_rate": 0.00030194999999999994, + "loss": 2.584, + "step": 4021 + }, + { + "epoch": 6.4352, + "grad_norm": 0.0, + "learning_rate": 0.00030179999999999996, + "loss": 2.8512, + "step": 4022 + }, + { + "epoch": 6.4368, + "grad_norm": 0.0, + "learning_rate": 0.00030165, + "loss": 3.4812, + "step": 4023 + }, + { + "epoch": 6.4384, + "grad_norm": 0.0, + "learning_rate": 0.00030149999999999996, + "loss": 3.0326, + "step": 4024 + }, + { + "epoch": 6.44, + "grad_norm": 0.0, + "learning_rate": 0.00030135, + "loss": 2.572, + "step": 4025 + }, + { + "epoch": 6.4416, + "grad_norm": 0.0, + "learning_rate": 0.00030119999999999995, + "loss": 2.8446, + "step": 4026 + }, + { + "epoch": 6.4432, + "grad_norm": 0.0, + "learning_rate": 0.00030104999999999997, + "loss": 3.3667, + "step": 4027 + }, + { + "epoch": 6.4448, + "grad_norm": 0.0, + "learning_rate": 0.00030089999999999994, + "loss": 2.9241, + "step": 4028 + }, + { + "epoch": 6.4464, + "grad_norm": 0.0, + "learning_rate": 0.00030074999999999996, + "loss": 3.1371, + "step": 4029 + }, + { + "epoch": 6.448, + "grad_norm": 0.0, + "learning_rate": 0.0003006, + "loss": 2.3366, + "step": 4030 + }, + { + "epoch": 6.4496, + "grad_norm": 0.0, + "learning_rate": 0.00030045, + "loss": 3.2662, + "step": 4031 + }, + { + "epoch": 6.4512, + "grad_norm": 0.0, + "learning_rate": 0.0003002999999999999, + "loss": 2.4098, + "step": 4032 + }, + { + "epoch": 6.4528, + "grad_norm": 0.0, + "learning_rate": 0.00030014999999999995, + "loss": 3.1829, + "step": 4033 + }, + { + "epoch": 6.4544, + "grad_norm": 0.0, + "learning_rate": 0.0003, + "loss": 3.1601, + "step": 4034 + }, + { + "epoch": 6.456, + "grad_norm": 0.0, + "learning_rate": 0.00029985, + "loss": 3.1077, + "step": 4035 + }, + { + "epoch": 6.4576, + "grad_norm": 0.0, + "learning_rate": 0.00029969999999999997, + "loss": 2.7685, + "step": 4036 + }, + { + "epoch": 6.4592, + "grad_norm": 0.0, + "learning_rate": 0.00029955, + "loss": 2.6426, + "step": 4037 + }, + { + "epoch": 6.4608, + "grad_norm": 0.0, + "learning_rate": 0.00029939999999999996, + "loss": 3.2698, + "step": 4038 + }, + { + "epoch": 6.4624, + "grad_norm": 0.0, + "learning_rate": 0.00029925, + "loss": 3.1552, + "step": 4039 + }, + { + "epoch": 6.464, + "grad_norm": 0.0, + "learning_rate": 0.00029909999999999995, + "loss": 3.6805, + "step": 4040 + }, + { + "epoch": 6.4656, + "grad_norm": NaN, + "learning_rate": 0.00029909999999999995, + "loss": 1.2008, + "step": 4041 + }, + { + "epoch": 6.4672, + "grad_norm": 0.0, + "learning_rate": 0.00029895, + "loss": 2.9159, + "step": 4042 + }, + { + "epoch": 6.4688, + "grad_norm": 0.0, + "learning_rate": 0.0002988, + "loss": 3.4111, + "step": 4043 + }, + { + "epoch": 6.4704, + "grad_norm": 0.0, + "learning_rate": 0.00029864999999999997, + "loss": 2.6915, + "step": 4044 + }, + { + "epoch": 6.4719999999999995, + "grad_norm": 0.0, + "learning_rate": 0.0002985, + "loss": 2.7053, + "step": 4045 + }, + { + "epoch": 6.4736, + "grad_norm": 0.0, + "learning_rate": 0.00029835, + "loss": 2.8908, + "step": 4046 + }, + { + "epoch": 6.4752, + "grad_norm": 0.0, + "learning_rate": 0.0002982, + "loss": 3.1514, + "step": 4047 + }, + { + "epoch": 6.4768, + "grad_norm": 0.0, + "learning_rate": 0.00029805, + "loss": 2.9263, + "step": 4048 + }, + { + "epoch": 6.4784, + "grad_norm": 0.0, + "learning_rate": 0.0002979, + "loss": 3.1634, + "step": 4049 + }, + { + "epoch": 6.48, + "grad_norm": 0.0, + "learning_rate": 0.00029775, + "loss": 3.4911, + "step": 4050 + }, + { + "epoch": 6.4816, + "grad_norm": 0.0, + "learning_rate": 0.00029759999999999997, + "loss": 4.6418, + "step": 4051 + }, + { + "epoch": 6.4832, + "grad_norm": 0.0, + "learning_rate": 0.00029745, + "loss": 4.2075, + "step": 4052 + }, + { + "epoch": 6.4848, + "grad_norm": 0.0, + "learning_rate": 0.00029729999999999996, + "loss": 3.1241, + "step": 4053 + }, + { + "epoch": 6.4864, + "grad_norm": 0.0, + "learning_rate": 0.00029715, + "loss": 3.7562, + "step": 4054 + }, + { + "epoch": 6.4879999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00029699999999999996, + "loss": 3.0509, + "step": 4055 + }, + { + "epoch": 6.4896, + "grad_norm": 0.0, + "learning_rate": 0.00029685, + "loss": 3.2858, + "step": 4056 + }, + { + "epoch": 6.4912, + "grad_norm": 0.0, + "learning_rate": 0.00029669999999999995, + "loss": 3.0402, + "step": 4057 + }, + { + "epoch": 6.4928, + "grad_norm": 0.0, + "learning_rate": 0.00029654999999999997, + "loss": 3.7944, + "step": 4058 + }, + { + "epoch": 6.4944, + "grad_norm": 0.0, + "learning_rate": 0.0002964, + "loss": 2.8896, + "step": 4059 + }, + { + "epoch": 6.496, + "grad_norm": 0.0, + "learning_rate": 0.00029624999999999996, + "loss": 2.9788, + "step": 4060 + }, + { + "epoch": 6.4976, + "grad_norm": 0.0, + "learning_rate": 0.0002961, + "loss": 3.1403, + "step": 4061 + }, + { + "epoch": 6.4992, + "grad_norm": 0.0, + "learning_rate": 0.00029595, + "loss": 3.5309, + "step": 4062 + }, + { + "epoch": 6.5008, + "grad_norm": 0.0, + "learning_rate": 0.0002958, + "loss": 2.9013, + "step": 4063 + }, + { + "epoch": 6.5024, + "grad_norm": 0.0, + "learning_rate": 0.00029565, + "loss": 3.0639, + "step": 4064 + }, + { + "epoch": 6.504, + "grad_norm": 0.0, + "learning_rate": 0.00029549999999999997, + "loss": 3.1012, + "step": 4065 + }, + { + "epoch": 6.5056, + "grad_norm": 0.0, + "learning_rate": 0.00029535, + "loss": 2.8293, + "step": 4066 + }, + { + "epoch": 6.5072, + "grad_norm": 0.0, + "learning_rate": 0.00029519999999999997, + "loss": 2.6492, + "step": 4067 + }, + { + "epoch": 6.5088, + "grad_norm": 0.0, + "learning_rate": 0.00029505, + "loss": 2.7127, + "step": 4068 + }, + { + "epoch": 6.5104, + "grad_norm": 0.0, + "learning_rate": 0.00029489999999999996, + "loss": 2.9354, + "step": 4069 + }, + { + "epoch": 6.5120000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00029475, + "loss": 3.0551, + "step": 4070 + }, + { + "epoch": 6.5136, + "grad_norm": 0.0, + "learning_rate": 0.00029459999999999995, + "loss": 2.4176, + "step": 4071 + }, + { + "epoch": 6.5152, + "grad_norm": 0.0, + "learning_rate": 0.00029445, + "loss": 2.4007, + "step": 4072 + }, + { + "epoch": 6.5168, + "grad_norm": 0.0, + "learning_rate": 0.00029429999999999994, + "loss": 2.4701, + "step": 4073 + }, + { + "epoch": 6.5184, + "grad_norm": 0.0, + "learning_rate": 0.00029414999999999997, + "loss": 2.8651, + "step": 4074 + }, + { + "epoch": 6.52, + "grad_norm": 0.0, + "learning_rate": 0.000294, + "loss": 2.2248, + "step": 4075 + }, + { + "epoch": 6.5216, + "grad_norm": 0.0, + "learning_rate": 0.00029384999999999996, + "loss": 2.7142, + "step": 4076 + }, + { + "epoch": 6.5232, + "grad_norm": 0.0, + "learning_rate": 0.0002937, + "loss": 3.0723, + "step": 4077 + }, + { + "epoch": 6.5248, + "grad_norm": 0.0, + "learning_rate": 0.00029355, + "loss": 2.6237, + "step": 4078 + }, + { + "epoch": 6.5264, + "grad_norm": 0.0, + "learning_rate": 0.0002934, + "loss": 2.7589, + "step": 4079 + }, + { + "epoch": 6.5280000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00029325, + "loss": 2.5206, + "step": 4080 + }, + { + "epoch": 6.5296, + "grad_norm": 0.0, + "learning_rate": 0.00029309999999999997, + "loss": 2.2624, + "step": 4081 + }, + { + "epoch": 6.5312, + "grad_norm": 0.0, + "learning_rate": 0.00029295, + "loss": 2.7337, + "step": 4082 + }, + { + "epoch": 6.5328, + "grad_norm": 0.0, + "learning_rate": 0.00029279999999999996, + "loss": 3.2515, + "step": 4083 + }, + { + "epoch": 6.5344, + "grad_norm": 0.0, + "learning_rate": 0.00029265, + "loss": 3.1189, + "step": 4084 + }, + { + "epoch": 6.536, + "grad_norm": 0.0, + "learning_rate": 0.00029249999999999995, + "loss": 3.1952, + "step": 4085 + }, + { + "epoch": 6.5376, + "grad_norm": 0.0, + "learning_rate": 0.00029235, + "loss": 3.007, + "step": 4086 + }, + { + "epoch": 6.5392, + "grad_norm": 0.0, + "learning_rate": 0.00029219999999999995, + "loss": 2.7831, + "step": 4087 + }, + { + "epoch": 6.5408, + "grad_norm": 0.0, + "learning_rate": 0.00029204999999999997, + "loss": 2.8143, + "step": 4088 + }, + { + "epoch": 6.5424, + "grad_norm": 0.0, + "learning_rate": 0.0002919, + "loss": 2.461, + "step": 4089 + }, + { + "epoch": 6.5440000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00029174999999999996, + "loss": 2.4208, + "step": 4090 + }, + { + "epoch": 6.5456, + "grad_norm": 0.0, + "learning_rate": 0.0002916, + "loss": 2.7209, + "step": 4091 + }, + { + "epoch": 6.5472, + "grad_norm": 0.0, + "learning_rate": 0.00029145, + "loss": 2.8859, + "step": 4092 + }, + { + "epoch": 6.5488, + "grad_norm": 0.0, + "learning_rate": 0.0002913, + "loss": 2.6131, + "step": 4093 + }, + { + "epoch": 6.5504, + "grad_norm": 0.0, + "learning_rate": 0.00029115, + "loss": 2.8062, + "step": 4094 + }, + { + "epoch": 6.552, + "grad_norm": 0.0, + "learning_rate": 0.00029099999999999997, + "loss": 2.6563, + "step": 4095 + }, + { + "epoch": 6.5536, + "grad_norm": 0.0, + "learning_rate": 0.00029085, + "loss": 2.732, + "step": 4096 + }, + { + "epoch": 6.5552, + "grad_norm": 0.0, + "learning_rate": 0.00029069999999999996, + "loss": 3.2932, + "step": 4097 + }, + { + "epoch": 6.5568, + "grad_norm": 0.0, + "learning_rate": 0.00029055, + "loss": 3.189, + "step": 4098 + }, + { + "epoch": 6.5584, + "grad_norm": 0.0, + "learning_rate": 0.00029039999999999996, + "loss": 3.0375, + "step": 4099 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00029025, + "loss": 4.1397, + "step": 4100 + }, + { + "epoch": 6.5616, + "grad_norm": 0.0, + "learning_rate": 0.00029009999999999995, + "loss": 4.448, + "step": 4101 + }, + { + "epoch": 6.5632, + "grad_norm": 0.0, + "learning_rate": 0.00028995, + "loss": 3.241, + "step": 4102 + }, + { + "epoch": 6.5648, + "grad_norm": 0.0, + "learning_rate": 0.00028979999999999994, + "loss": 3.3749, + "step": 4103 + }, + { + "epoch": 6.5664, + "grad_norm": 0.0, + "learning_rate": 0.00028964999999999997, + "loss": 3.301, + "step": 4104 + }, + { + "epoch": 6.568, + "grad_norm": 0.0, + "learning_rate": 0.0002895, + "loss": 3.2784, + "step": 4105 + }, + { + "epoch": 6.5696, + "grad_norm": 0.0, + "learning_rate": 0.00028934999999999996, + "loss": 3.8074, + "step": 4106 + }, + { + "epoch": 6.5712, + "grad_norm": 0.0, + "learning_rate": 0.0002892, + "loss": 3.412, + "step": 4107 + }, + { + "epoch": 6.5728, + "grad_norm": 0.0, + "learning_rate": 0.00028905, + "loss": 2.9879, + "step": 4108 + }, + { + "epoch": 6.5744, + "grad_norm": 0.0, + "learning_rate": 0.0002889, + "loss": 4.0027, + "step": 4109 + }, + { + "epoch": 6.576, + "grad_norm": 0.0, + "learning_rate": 0.00028875, + "loss": 2.685, + "step": 4110 + }, + { + "epoch": 6.5776, + "grad_norm": 0.0, + "learning_rate": 0.00028859999999999997, + "loss": 2.8833, + "step": 4111 + }, + { + "epoch": 6.5792, + "grad_norm": 0.0, + "learning_rate": 0.00028845, + "loss": 2.7789, + "step": 4112 + }, + { + "epoch": 6.5808, + "grad_norm": 0.0, + "learning_rate": 0.00028829999999999996, + "loss": 3.5811, + "step": 4113 + }, + { + "epoch": 6.5824, + "grad_norm": 0.0, + "learning_rate": 0.00028815, + "loss": 2.6637, + "step": 4114 + }, + { + "epoch": 6.584, + "grad_norm": 0.0, + "learning_rate": 0.00028799999999999995, + "loss": 2.8443, + "step": 4115 + }, + { + "epoch": 6.5856, + "grad_norm": 0.0, + "learning_rate": 0.00028785, + "loss": 3.1588, + "step": 4116 + }, + { + "epoch": 6.5872, + "grad_norm": 0.0, + "learning_rate": 0.00028769999999999995, + "loss": 2.6272, + "step": 4117 + }, + { + "epoch": 6.5888, + "grad_norm": 0.0, + "learning_rate": 0.00028754999999999997, + "loss": 3.33, + "step": 4118 + }, + { + "epoch": 6.5904, + "grad_norm": 0.0, + "learning_rate": 0.00028739999999999994, + "loss": 2.8541, + "step": 4119 + }, + { + "epoch": 6.592, + "grad_norm": 0.0, + "learning_rate": 0.00028724999999999996, + "loss": 2.3283, + "step": 4120 + }, + { + "epoch": 6.5936, + "grad_norm": 0.0, + "learning_rate": 0.0002871, + "loss": 2.9972, + "step": 4121 + }, + { + "epoch": 6.5952, + "grad_norm": 0.0, + "learning_rate": 0.00028694999999999995, + "loss": 3.2081, + "step": 4122 + }, + { + "epoch": 6.5968, + "grad_norm": 0.0, + "learning_rate": 0.0002868, + "loss": 3.4921, + "step": 4123 + }, + { + "epoch": 6.5984, + "grad_norm": 0.0, + "learning_rate": 0.00028665, + "loss": 2.5474, + "step": 4124 + }, + { + "epoch": 6.6, + "grad_norm": 0.0, + "learning_rate": 0.00028649999999999997, + "loss": 3.0886, + "step": 4125 + }, + { + "epoch": 6.6016, + "grad_norm": 0.0, + "learning_rate": 0.00028635, + "loss": 2.9268, + "step": 4126 + }, + { + "epoch": 6.6032, + "grad_norm": 0.0, + "learning_rate": 0.00028619999999999996, + "loss": 2.3568, + "step": 4127 + }, + { + "epoch": 6.6048, + "grad_norm": 0.0, + "learning_rate": 0.00028605, + "loss": 3.1724, + "step": 4128 + }, + { + "epoch": 6.6064, + "grad_norm": 0.0, + "learning_rate": 0.00028589999999999996, + "loss": 2.4307, + "step": 4129 + }, + { + "epoch": 6.608, + "grad_norm": 0.0, + "learning_rate": 0.00028575, + "loss": 2.8321, + "step": 4130 + }, + { + "epoch": 6.6096, + "grad_norm": 0.0, + "learning_rate": 0.00028559999999999995, + "loss": 2.5707, + "step": 4131 + }, + { + "epoch": 6.6112, + "grad_norm": 0.0, + "learning_rate": 0.00028544999999999997, + "loss": 3.0704, + "step": 4132 + }, + { + "epoch": 6.6128, + "grad_norm": 0.0, + "learning_rate": 0.00028529999999999994, + "loss": 3.2784, + "step": 4133 + }, + { + "epoch": 6.6144, + "grad_norm": 0.0, + "learning_rate": 0.00028514999999999997, + "loss": 3.3985, + "step": 4134 + }, + { + "epoch": 6.616, + "grad_norm": 0.0, + "learning_rate": 0.000285, + "loss": 2.8685, + "step": 4135 + }, + { + "epoch": 6.6176, + "grad_norm": 0.0, + "learning_rate": 0.00028484999999999996, + "loss": 3.0241, + "step": 4136 + }, + { + "epoch": 6.6192, + "grad_norm": 0.0, + "learning_rate": 0.0002847, + "loss": 2.4871, + "step": 4137 + }, + { + "epoch": 6.6208, + "grad_norm": 0.0, + "learning_rate": 0.00028455, + "loss": 2.8786, + "step": 4138 + }, + { + "epoch": 6.6224, + "grad_norm": 0.0, + "learning_rate": 0.0002844, + "loss": 2.8877, + "step": 4139 + }, + { + "epoch": 6.624, + "grad_norm": 0.0, + "learning_rate": 0.00028425, + "loss": 2.6461, + "step": 4140 + }, + { + "epoch": 6.6256, + "grad_norm": 0.0, + "learning_rate": 0.00028409999999999997, + "loss": 2.699, + "step": 4141 + }, + { + "epoch": 6.6272, + "grad_norm": 0.0, + "learning_rate": 0.00028395, + "loss": 2.8822, + "step": 4142 + }, + { + "epoch": 6.6288, + "grad_norm": 0.0, + "learning_rate": 0.00028379999999999996, + "loss": 2.7284, + "step": 4143 + }, + { + "epoch": 6.6304, + "grad_norm": 0.0, + "learning_rate": 0.00028365, + "loss": 2.8397, + "step": 4144 + }, + { + "epoch": 6.632, + "grad_norm": 0.0, + "learning_rate": 0.00028349999999999995, + "loss": 2.984, + "step": 4145 + }, + { + "epoch": 6.6336, + "grad_norm": 0.0, + "learning_rate": 0.00028335, + "loss": 3.0078, + "step": 4146 + }, + { + "epoch": 6.6352, + "grad_norm": 0.0, + "learning_rate": 0.00028319999999999994, + "loss": 3.1687, + "step": 4147 + }, + { + "epoch": 6.6368, + "grad_norm": 0.0, + "learning_rate": 0.00028304999999999997, + "loss": 3.0273, + "step": 4148 + }, + { + "epoch": 6.6384, + "grad_norm": 0.0, + "learning_rate": 0.00028289999999999994, + "loss": 3.1394, + "step": 4149 + }, + { + "epoch": 6.64, + "grad_norm": 0.0, + "learning_rate": 0.00028274999999999996, + "loss": 4.6158, + "step": 4150 + }, + { + "epoch": 6.6416, + "grad_norm": 0.0, + "learning_rate": 0.0002826, + "loss": 3.8414, + "step": 4151 + }, + { + "epoch": 6.6432, + "grad_norm": 0.0, + "learning_rate": 0.00028244999999999995, + "loss": 2.765, + "step": 4152 + }, + { + "epoch": 6.6448, + "grad_norm": 0.0, + "learning_rate": 0.0002823, + "loss": 3.6998, + "step": 4153 + }, + { + "epoch": 6.6464, + "grad_norm": 0.0, + "learning_rate": 0.00028215, + "loss": 5.3172, + "step": 4154 + }, + { + "epoch": 6.648, + "grad_norm": 0.0, + "learning_rate": 0.00028199999999999997, + "loss": 2.9695, + "step": 4155 + }, + { + "epoch": 6.6495999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00028185, + "loss": 3.8436, + "step": 4156 + }, + { + "epoch": 6.6512, + "grad_norm": 0.0, + "learning_rate": 0.00028169999999999996, + "loss": 3.3066, + "step": 4157 + }, + { + "epoch": 6.6528, + "grad_norm": 0.0, + "learning_rate": 0.00028155, + "loss": 3.0264, + "step": 4158 + }, + { + "epoch": 6.6544, + "grad_norm": 0.0, + "learning_rate": 0.00028139999999999996, + "loss": 2.5204, + "step": 4159 + }, + { + "epoch": 6.656, + "grad_norm": 0.0, + "learning_rate": 0.00028125, + "loss": 2.4504, + "step": 4160 + }, + { + "epoch": 6.6576, + "grad_norm": 0.0, + "learning_rate": 0.0002811, + "loss": 3.045, + "step": 4161 + }, + { + "epoch": 6.6592, + "grad_norm": 0.0, + "learning_rate": 0.00028094999999999997, + "loss": 3.8107, + "step": 4162 + }, + { + "epoch": 6.6608, + "grad_norm": 0.0, + "learning_rate": 0.0002808, + "loss": 2.8968, + "step": 4163 + }, + { + "epoch": 6.6624, + "grad_norm": 0.0, + "learning_rate": 0.00028064999999999996, + "loss": 2.5925, + "step": 4164 + }, + { + "epoch": 6.664, + "grad_norm": 0.0, + "learning_rate": 0.0002805, + "loss": 3.1626, + "step": 4165 + }, + { + "epoch": 6.6655999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00028034999999999996, + "loss": 3.3632, + "step": 4166 + }, + { + "epoch": 6.6672, + "grad_norm": 0.0, + "learning_rate": 0.0002802, + "loss": 2.757, + "step": 4167 + }, + { + "epoch": 6.6688, + "grad_norm": 0.0, + "learning_rate": 0.00028004999999999995, + "loss": 2.8508, + "step": 4168 + }, + { + "epoch": 6.6704, + "grad_norm": 0.0, + "learning_rate": 0.0002799, + "loss": 2.8919, + "step": 4169 + }, + { + "epoch": 6.672, + "grad_norm": 0.0, + "learning_rate": 0.00027975, + "loss": 2.7713, + "step": 4170 + }, + { + "epoch": 6.6736, + "grad_norm": 0.0, + "learning_rate": 0.00027959999999999997, + "loss": 3.0, + "step": 4171 + }, + { + "epoch": 6.6752, + "grad_norm": 0.0, + "learning_rate": 0.00027945, + "loss": 2.4242, + "step": 4172 + }, + { + "epoch": 6.6768, + "grad_norm": 0.0, + "learning_rate": 0.0002793, + "loss": 2.288, + "step": 4173 + }, + { + "epoch": 6.6784, + "grad_norm": 0.0, + "learning_rate": 0.00027915, + "loss": 2.9365, + "step": 4174 + }, + { + "epoch": 6.68, + "grad_norm": 0.0, + "learning_rate": 0.000279, + "loss": 2.5699, + "step": 4175 + }, + { + "epoch": 6.6815999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00027885, + "loss": 2.9192, + "step": 4176 + }, + { + "epoch": 6.6832, + "grad_norm": 0.0, + "learning_rate": 0.0002787, + "loss": 2.8393, + "step": 4177 + }, + { + "epoch": 6.6848, + "grad_norm": 0.0, + "learning_rate": 0.00027854999999999997, + "loss": 2.9024, + "step": 4178 + }, + { + "epoch": 6.6864, + "grad_norm": 0.0, + "learning_rate": 0.0002784, + "loss": 2.3331, + "step": 4179 + }, + { + "epoch": 6.688, + "grad_norm": 0.0, + "learning_rate": 0.00027824999999999996, + "loss": 2.8534, + "step": 4180 + }, + { + "epoch": 6.6896, + "grad_norm": 0.0, + "learning_rate": 0.0002781, + "loss": 2.1188, + "step": 4181 + }, + { + "epoch": 6.6912, + "grad_norm": 0.0, + "learning_rate": 0.00027794999999999995, + "loss": 2.5233, + "step": 4182 + }, + { + "epoch": 6.6928, + "grad_norm": 0.0, + "learning_rate": 0.0002778, + "loss": 2.1454, + "step": 4183 + }, + { + "epoch": 6.6944, + "grad_norm": 0.0, + "learning_rate": 0.00027764999999999995, + "loss": 2.5926, + "step": 4184 + }, + { + "epoch": 6.696, + "grad_norm": 0.0, + "learning_rate": 0.00027749999999999997, + "loss": 2.6491, + "step": 4185 + }, + { + "epoch": 6.6975999999999996, + "grad_norm": 0.0, + "learning_rate": 0.00027735, + "loss": 3.3521, + "step": 4186 + }, + { + "epoch": 6.6992, + "grad_norm": 0.0, + "learning_rate": 0.0002772, + "loss": 2.8787, + "step": 4187 + }, + { + "epoch": 6.7008, + "grad_norm": 0.0, + "learning_rate": 0.00027705, + "loss": 2.3401, + "step": 4188 + }, + { + "epoch": 6.7024, + "grad_norm": 0.0, + "learning_rate": 0.0002769, + "loss": 2.6153, + "step": 4189 + }, + { + "epoch": 6.704, + "grad_norm": 0.0, + "learning_rate": 0.00027675, + "loss": 2.562, + "step": 4190 + }, + { + "epoch": 6.7056000000000004, + "grad_norm": 0.0, + "learning_rate": 0.0002766, + "loss": 3.778, + "step": 4191 + }, + { + "epoch": 6.7072, + "grad_norm": 0.0, + "learning_rate": 0.00027644999999999997, + "loss": 2.895, + "step": 4192 + }, + { + "epoch": 6.7088, + "grad_norm": 0.0, + "learning_rate": 0.0002763, + "loss": 4.1391, + "step": 4193 + }, + { + "epoch": 6.7104, + "grad_norm": 0.0, + "learning_rate": 0.00027614999999999996, + "loss": 3.3999, + "step": 4194 + }, + { + "epoch": 6.712, + "grad_norm": 0.0, + "learning_rate": 0.000276, + "loss": 3.3463, + "step": 4195 + }, + { + "epoch": 6.7136, + "grad_norm": 0.0, + "learning_rate": 0.00027584999999999996, + "loss": 3.2327, + "step": 4196 + }, + { + "epoch": 6.7152, + "grad_norm": 0.0, + "learning_rate": 0.0002757, + "loss": 2.7379, + "step": 4197 + }, + { + "epoch": 6.7168, + "grad_norm": 0.0, + "learning_rate": 0.00027554999999999995, + "loss": 3.2524, + "step": 4198 + }, + { + "epoch": 6.7184, + "grad_norm": 0.0, + "learning_rate": 0.00027539999999999997, + "loss": 3.8617, + "step": 4199 + }, + { + "epoch": 6.72, + "grad_norm": 0.0, + "learning_rate": 0.00027525, + "loss": 3.601, + "step": 4200 + }, + { + "epoch": 6.7216000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00027509999999999996, + "loss": 4.4073, + "step": 4201 + }, + { + "epoch": 6.7232, + "grad_norm": 0.0, + "learning_rate": 0.00027495, + "loss": 3.4466, + "step": 4202 + }, + { + "epoch": 6.7248, + "grad_norm": 0.0, + "learning_rate": 0.0002748, + "loss": 3.2598, + "step": 4203 + }, + { + "epoch": 6.7264, + "grad_norm": 0.0, + "learning_rate": 0.00027465, + "loss": 3.9411, + "step": 4204 + }, + { + "epoch": 6.728, + "grad_norm": 0.0, + "learning_rate": 0.0002745, + "loss": 3.5236, + "step": 4205 + }, + { + "epoch": 6.7296, + "grad_norm": 0.0, + "learning_rate": 0.00027435, + "loss": 3.1362, + "step": 4206 + }, + { + "epoch": 6.7312, + "grad_norm": 0.0, + "learning_rate": 0.0002742, + "loss": 3.0027, + "step": 4207 + }, + { + "epoch": 6.7328, + "grad_norm": 0.0, + "learning_rate": 0.00027404999999999997, + "loss": 2.8268, + "step": 4208 + }, + { + "epoch": 6.7344, + "grad_norm": 0.0, + "learning_rate": 0.0002739, + "loss": 3.9959, + "step": 4209 + }, + { + "epoch": 6.736, + "grad_norm": 0.0, + "learning_rate": 0.00027374999999999996, + "loss": 2.7458, + "step": 4210 + }, + { + "epoch": 6.7376000000000005, + "grad_norm": 0.0, + "learning_rate": 0.0002736, + "loss": 3.1352, + "step": 4211 + }, + { + "epoch": 6.7392, + "grad_norm": 0.0, + "learning_rate": 0.00027344999999999995, + "loss": 3.0027, + "step": 4212 + }, + { + "epoch": 6.7408, + "grad_norm": 0.0, + "learning_rate": 0.0002733, + "loss": 2.5663, + "step": 4213 + }, + { + "epoch": 6.7424, + "grad_norm": 0.0, + "learning_rate": 0.00027314999999999994, + "loss": 2.8878, + "step": 4214 + }, + { + "epoch": 6.744, + "grad_norm": 0.0, + "learning_rate": 0.00027299999999999997, + "loss": 2.4712, + "step": 4215 + }, + { + "epoch": 6.7456, + "grad_norm": 0.0, + "learning_rate": 0.00027285, + "loss": 2.9749, + "step": 4216 + }, + { + "epoch": 6.7472, + "grad_norm": 0.0, + "learning_rate": 0.00027269999999999996, + "loss": 3.0081, + "step": 4217 + }, + { + "epoch": 6.7488, + "grad_norm": 0.0, + "learning_rate": 0.00027255, + "loss": 2.7849, + "step": 4218 + }, + { + "epoch": 6.7504, + "grad_norm": 0.0, + "learning_rate": 0.0002724, + "loss": 3.5248, + "step": 4219 + }, + { + "epoch": 6.752, + "grad_norm": 0.0, + "learning_rate": 0.00027225, + "loss": 2.1496, + "step": 4220 + }, + { + "epoch": 6.7536000000000005, + "grad_norm": 0.0, + "learning_rate": 0.0002721, + "loss": 2.3653, + "step": 4221 + }, + { + "epoch": 6.7552, + "grad_norm": 0.0, + "learning_rate": 0.00027194999999999997, + "loss": 2.6872, + "step": 4222 + }, + { + "epoch": 6.7568, + "grad_norm": 0.0, + "learning_rate": 0.0002718, + "loss": 2.7962, + "step": 4223 + }, + { + "epoch": 6.7584, + "grad_norm": 0.0, + "learning_rate": 0.00027164999999999996, + "loss": 2.6521, + "step": 4224 + }, + { + "epoch": 6.76, + "grad_norm": 0.0, + "learning_rate": 0.0002715, + "loss": 2.9779, + "step": 4225 + }, + { + "epoch": 6.7616, + "grad_norm": 0.0, + "learning_rate": 0.00027134999999999995, + "loss": 2.8511, + "step": 4226 + }, + { + "epoch": 6.7632, + "grad_norm": 0.0, + "learning_rate": 0.0002712, + "loss": 3.1687, + "step": 4227 + }, + { + "epoch": 6.7648, + "grad_norm": 0.0, + "learning_rate": 0.00027104999999999995, + "loss": 2.8407, + "step": 4228 + }, + { + "epoch": 6.7664, + "grad_norm": 0.0, + "learning_rate": 0.00027089999999999997, + "loss": 2.4979, + "step": 4229 + }, + { + "epoch": 6.768, + "grad_norm": 0.0, + "learning_rate": 0.00027074999999999994, + "loss": 3.0246, + "step": 4230 + }, + { + "epoch": 6.7696, + "grad_norm": 0.0, + "learning_rate": 0.00027059999999999996, + "loss": 2.3302, + "step": 4231 + }, + { + "epoch": 6.7712, + "grad_norm": 0.0, + "learning_rate": 0.00027045, + "loss": 3.2841, + "step": 4232 + }, + { + "epoch": 6.7728, + "grad_norm": 0.0, + "learning_rate": 0.00027029999999999996, + "loss": 2.9967, + "step": 4233 + }, + { + "epoch": 6.7744, + "grad_norm": 0.0, + "learning_rate": 0.00027015, + "loss": 3.3507, + "step": 4234 + }, + { + "epoch": 6.776, + "grad_norm": 0.0, + "learning_rate": 0.00027, + "loss": 2.5622, + "step": 4235 + }, + { + "epoch": 6.7776, + "grad_norm": 0.0, + "learning_rate": 0.00026984999999999997, + "loss": 2.9587, + "step": 4236 + }, + { + "epoch": 6.7792, + "grad_norm": 0.0, + "learning_rate": 0.0002697, + "loss": 3.1645, + "step": 4237 + }, + { + "epoch": 6.7808, + "grad_norm": 0.0, + "learning_rate": 0.00026954999999999997, + "loss": 2.9535, + "step": 4238 + }, + { + "epoch": 6.7824, + "grad_norm": 0.0, + "learning_rate": 0.0002694, + "loss": 2.8682, + "step": 4239 + }, + { + "epoch": 6.784, + "grad_norm": 0.0, + "learning_rate": 0.00026924999999999996, + "loss": 2.3655, + "step": 4240 + }, + { + "epoch": 6.7856, + "grad_norm": 0.0, + "learning_rate": 0.0002691, + "loss": 3.3084, + "step": 4241 + }, + { + "epoch": 6.7872, + "grad_norm": 0.0, + "learning_rate": 0.00026894999999999995, + "loss": 2.5359, + "step": 4242 + }, + { + "epoch": 6.7888, + "grad_norm": 0.0, + "learning_rate": 0.0002688, + "loss": 2.728, + "step": 4243 + }, + { + "epoch": 6.7904, + "grad_norm": 0.0, + "learning_rate": 0.00026864999999999994, + "loss": 2.4105, + "step": 4244 + }, + { + "epoch": 6.792, + "grad_norm": 0.0, + "learning_rate": 0.00026849999999999997, + "loss": 2.7605, + "step": 4245 + }, + { + "epoch": 6.7936, + "grad_norm": 0.0, + "learning_rate": 0.00026835, + "loss": 3.6181, + "step": 4246 + }, + { + "epoch": 6.7952, + "grad_norm": 0.0, + "learning_rate": 0.00026819999999999996, + "loss": 3.7172, + "step": 4247 + }, + { + "epoch": 6.7968, + "grad_norm": 0.0, + "learning_rate": 0.00026805, + "loss": 3.3299, + "step": 4248 + }, + { + "epoch": 6.7984, + "grad_norm": 0.0, + "learning_rate": 0.0002679, + "loss": 2.6778, + "step": 4249 + }, + { + "epoch": 6.8, + "grad_norm": 0.0, + "learning_rate": 0.00026775, + "loss": 3.0789, + "step": 4250 + }, + { + "epoch": 6.8016, + "grad_norm": 0.0, + "learning_rate": 0.0002676, + "loss": 5.0018, + "step": 4251 + }, + { + "epoch": 6.8032, + "grad_norm": 0.0, + "learning_rate": 0.00026744999999999997, + "loss": 4.1692, + "step": 4252 + }, + { + "epoch": 6.8048, + "grad_norm": 0.0, + "learning_rate": 0.0002673, + "loss": 4.7239, + "step": 4253 + }, + { + "epoch": 6.8064, + "grad_norm": 0.0, + "learning_rate": 0.00026714999999999996, + "loss": 3.4193, + "step": 4254 + }, + { + "epoch": 6.808, + "grad_norm": 0.0, + "learning_rate": 0.000267, + "loss": 3.4795, + "step": 4255 + }, + { + "epoch": 6.8096, + "grad_norm": 0.0, + "learning_rate": 0.00026684999999999995, + "loss": 3.4933, + "step": 4256 + }, + { + "epoch": 6.8112, + "grad_norm": 0.0, + "learning_rate": 0.0002667, + "loss": 3.6073, + "step": 4257 + }, + { + "epoch": 6.8128, + "grad_norm": 0.0, + "learning_rate": 0.00026654999999999995, + "loss": 3.0732, + "step": 4258 + }, + { + "epoch": 6.8144, + "grad_norm": 0.0, + "learning_rate": 0.00026639999999999997, + "loss": 3.1783, + "step": 4259 + }, + { + "epoch": 6.816, + "grad_norm": 0.0, + "learning_rate": 0.00026624999999999994, + "loss": 3.2414, + "step": 4260 + }, + { + "epoch": 6.8176, + "grad_norm": 0.0, + "learning_rate": 0.00026609999999999996, + "loss": 3.0902, + "step": 4261 + }, + { + "epoch": 6.8192, + "grad_norm": 0.0, + "learning_rate": 0.00026595, + "loss": 2.8653, + "step": 4262 + }, + { + "epoch": 6.8208, + "grad_norm": 0.0, + "learning_rate": 0.00026579999999999996, + "loss": 2.651, + "step": 4263 + }, + { + "epoch": 6.8224, + "grad_norm": 0.0, + "learning_rate": 0.00026565, + "loss": 3.1109, + "step": 4264 + }, + { + "epoch": 6.824, + "grad_norm": 0.0, + "learning_rate": 0.0002655, + "loss": 3.5091, + "step": 4265 + }, + { + "epoch": 6.8256, + "grad_norm": 0.0, + "learning_rate": 0.00026534999999999997, + "loss": 2.6144, + "step": 4266 + }, + { + "epoch": 6.8272, + "grad_norm": 0.0, + "learning_rate": 0.0002652, + "loss": 2.7945, + "step": 4267 + }, + { + "epoch": 6.8288, + "grad_norm": 0.0, + "learning_rate": 0.00026504999999999996, + "loss": 2.8182, + "step": 4268 + }, + { + "epoch": 6.8304, + "grad_norm": 0.0, + "learning_rate": 0.0002649, + "loss": 2.7363, + "step": 4269 + }, + { + "epoch": 6.832, + "grad_norm": 0.0, + "learning_rate": 0.00026474999999999996, + "loss": 3.0435, + "step": 4270 + }, + { + "epoch": 6.8336, + "grad_norm": 0.0, + "learning_rate": 0.0002646, + "loss": 2.7486, + "step": 4271 + }, + { + "epoch": 6.8352, + "grad_norm": 0.0, + "learning_rate": 0.00026444999999999995, + "loss": 2.567, + "step": 4272 + }, + { + "epoch": 6.8368, + "grad_norm": 0.0, + "learning_rate": 0.0002643, + "loss": 3.1096, + "step": 4273 + }, + { + "epoch": 6.8384, + "grad_norm": 0.0, + "learning_rate": 0.00026414999999999994, + "loss": 2.5198, + "step": 4274 + }, + { + "epoch": 6.84, + "grad_norm": 0.0, + "learning_rate": 0.00026399999999999997, + "loss": 3.0245, + "step": 4275 + }, + { + "epoch": 6.8416, + "grad_norm": 0.0, + "learning_rate": 0.00026384999999999994, + "loss": 3.0371, + "step": 4276 + }, + { + "epoch": 6.8431999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00026369999999999996, + "loss": 2.8618, + "step": 4277 + }, + { + "epoch": 6.8448, + "grad_norm": 0.0, + "learning_rate": 0.00026355, + "loss": 2.5902, + "step": 4278 + }, + { + "epoch": 6.8464, + "grad_norm": 0.0, + "learning_rate": 0.00026339999999999995, + "loss": 2.687, + "step": 4279 + }, + { + "epoch": 6.848, + "grad_norm": 0.0, + "learning_rate": 0.00026325, + "loss": 2.7124, + "step": 4280 + }, + { + "epoch": 6.8496, + "grad_norm": 0.0, + "learning_rate": 0.0002631, + "loss": 3.6682, + "step": 4281 + }, + { + "epoch": 6.8512, + "grad_norm": 0.0, + "learning_rate": 0.00026294999999999997, + "loss": 2.4604, + "step": 4282 + }, + { + "epoch": 6.8528, + "grad_norm": 0.0, + "learning_rate": 0.0002628, + "loss": 2.8002, + "step": 4283 + }, + { + "epoch": 6.8544, + "grad_norm": 0.0, + "learning_rate": 0.00026264999999999996, + "loss": 2.6395, + "step": 4284 + }, + { + "epoch": 6.856, + "grad_norm": 0.0, + "learning_rate": 0.0002625, + "loss": 2.4311, + "step": 4285 + }, + { + "epoch": 6.8576, + "grad_norm": 0.0, + "learning_rate": 0.00026235, + "loss": 2.9462, + "step": 4286 + }, + { + "epoch": 6.8591999999999995, + "grad_norm": 0.0, + "learning_rate": 0.0002622, + "loss": 3.6116, + "step": 4287 + }, + { + "epoch": 6.8608, + "grad_norm": 0.0, + "learning_rate": 0.00026205, + "loss": 3.5986, + "step": 4288 + }, + { + "epoch": 6.8624, + "grad_norm": 0.0, + "learning_rate": 0.00026189999999999997, + "loss": 2.9106, + "step": 4289 + }, + { + "epoch": 6.864, + "grad_norm": 0.0, + "learning_rate": 0.00026175, + "loss": 3.1669, + "step": 4290 + }, + { + "epoch": 6.8656, + "grad_norm": 0.0, + "learning_rate": 0.00026159999999999996, + "loss": 3.6225, + "step": 4291 + }, + { + "epoch": 6.8672, + "grad_norm": 0.0, + "learning_rate": 0.00026145, + "loss": 3.0465, + "step": 4292 + }, + { + "epoch": 6.8688, + "grad_norm": 0.0, + "learning_rate": 0.00026129999999999995, + "loss": 3.0488, + "step": 4293 + }, + { + "epoch": 6.8704, + "grad_norm": 0.0, + "learning_rate": 0.00026115, + "loss": 2.2928, + "step": 4294 + }, + { + "epoch": 6.872, + "grad_norm": 0.0, + "learning_rate": 0.000261, + "loss": 2.9981, + "step": 4295 + }, + { + "epoch": 6.8736, + "grad_norm": 0.0, + "learning_rate": 0.00026084999999999997, + "loss": 2.8215, + "step": 4296 + }, + { + "epoch": 6.8751999999999995, + "grad_norm": 0.0, + "learning_rate": 0.0002607, + "loss": 2.9482, + "step": 4297 + }, + { + "epoch": 6.8768, + "grad_norm": 0.0, + "learning_rate": 0.00026055, + "loss": 2.9353, + "step": 4298 + }, + { + "epoch": 6.8784, + "grad_norm": 0.0, + "learning_rate": 0.0002604, + "loss": 2.7093, + "step": 4299 + }, + { + "epoch": 6.88, + "grad_norm": 0.0, + "learning_rate": 0.00026025, + "loss": 5.2044, + "step": 4300 + }, + { + "epoch": 6.8816, + "grad_norm": 0.0, + "learning_rate": 0.0002601, + "loss": 4.0005, + "step": 4301 + }, + { + "epoch": 6.8832, + "grad_norm": 0.0, + "learning_rate": 0.00025995, + "loss": 3.5939, + "step": 4302 + }, + { + "epoch": 6.8848, + "grad_norm": 0.0, + "learning_rate": 0.00025979999999999997, + "loss": 4.2119, + "step": 4303 + }, + { + "epoch": 6.8864, + "grad_norm": 0.0, + "learning_rate": 0.00025965, + "loss": 3.7944, + "step": 4304 + }, + { + "epoch": 6.888, + "grad_norm": 0.0, + "learning_rate": 0.00025949999999999997, + "loss": 3.7665, + "step": 4305 + }, + { + "epoch": 6.8896, + "grad_norm": 0.0, + "learning_rate": 0.00025935, + "loss": 3.5207, + "step": 4306 + }, + { + "epoch": 6.8911999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00025919999999999996, + "loss": 2.9676, + "step": 4307 + }, + { + "epoch": 6.8928, + "grad_norm": 0.0, + "learning_rate": 0.00025905, + "loss": 2.6527, + "step": 4308 + }, + { + "epoch": 6.8944, + "grad_norm": 0.0, + "learning_rate": 0.00025889999999999995, + "loss": 3.0158, + "step": 4309 + }, + { + "epoch": 6.896, + "grad_norm": 0.0, + "learning_rate": 0.00025875, + "loss": 2.931, + "step": 4310 + }, + { + "epoch": 6.8976, + "grad_norm": 0.0, + "learning_rate": 0.0002586, + "loss": 3.003, + "step": 4311 + }, + { + "epoch": 6.8992, + "grad_norm": 0.0, + "learning_rate": 0.00025844999999999997, + "loss": 3.6457, + "step": 4312 + }, + { + "epoch": 6.9008, + "grad_norm": 0.0, + "learning_rate": 0.0002583, + "loss": 2.887, + "step": 4313 + }, + { + "epoch": 6.9024, + "grad_norm": 0.0, + "learning_rate": 0.00025815, + "loss": 2.5508, + "step": 4314 + }, + { + "epoch": 6.904, + "grad_norm": 0.0, + "learning_rate": 0.000258, + "loss": 2.8492, + "step": 4315 + }, + { + "epoch": 6.9056, + "grad_norm": 0.0, + "learning_rate": 0.00025785, + "loss": 2.5418, + "step": 4316 + }, + { + "epoch": 6.9072, + "grad_norm": 0.0, + "learning_rate": 0.0002577, + "loss": 2.7807, + "step": 4317 + }, + { + "epoch": 6.9088, + "grad_norm": 0.0, + "learning_rate": 0.00025755, + "loss": 2.6149, + "step": 4318 + }, + { + "epoch": 6.9104, + "grad_norm": 0.0, + "learning_rate": 0.00025739999999999997, + "loss": 2.6781, + "step": 4319 + }, + { + "epoch": 6.912, + "grad_norm": 0.0, + "learning_rate": 0.00025725, + "loss": 2.3818, + "step": 4320 + }, + { + "epoch": 6.9136, + "grad_norm": 0.0, + "learning_rate": 0.00025709999999999996, + "loss": 3.4684, + "step": 4321 + }, + { + "epoch": 6.9152000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00025695, + "loss": 2.6856, + "step": 4322 + }, + { + "epoch": 6.9168, + "grad_norm": 0.0, + "learning_rate": 0.00025679999999999995, + "loss": 3.0432, + "step": 4323 + }, + { + "epoch": 6.9184, + "grad_norm": 0.0, + "learning_rate": 0.00025665, + "loss": 2.5531, + "step": 4324 + }, + { + "epoch": 6.92, + "grad_norm": 0.0, + "learning_rate": 0.00025649999999999995, + "loss": 2.0887, + "step": 4325 + }, + { + "epoch": 6.9216, + "grad_norm": 0.0, + "learning_rate": 0.00025634999999999997, + "loss": 3.3472, + "step": 4326 + }, + { + "epoch": 6.9232, + "grad_norm": 0.0, + "learning_rate": 0.0002562, + "loss": 2.7786, + "step": 4327 + }, + { + "epoch": 6.9248, + "grad_norm": 0.0, + "learning_rate": 0.00025604999999999996, + "loss": 3.0112, + "step": 4328 + }, + { + "epoch": 6.9264, + "grad_norm": 0.0, + "learning_rate": 0.0002559, + "loss": 3.2129, + "step": 4329 + }, + { + "epoch": 6.928, + "grad_norm": 0.0, + "learning_rate": 0.00025575, + "loss": 3.3005, + "step": 4330 + }, + { + "epoch": 6.9296, + "grad_norm": 0.0, + "learning_rate": 0.0002556, + "loss": 2.9553, + "step": 4331 + }, + { + "epoch": 6.9312000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00025545, + "loss": 3.0149, + "step": 4332 + }, + { + "epoch": 6.9328, + "grad_norm": 0.0, + "learning_rate": 0.00025529999999999997, + "loss": 2.7872, + "step": 4333 + }, + { + "epoch": 6.9344, + "grad_norm": 0.0, + "learning_rate": 0.00025515, + "loss": 2.9444, + "step": 4334 + }, + { + "epoch": 6.936, + "grad_norm": 0.0, + "learning_rate": 0.00025499999999999996, + "loss": 2.7883, + "step": 4335 + }, + { + "epoch": 6.9376, + "grad_norm": 0.0, + "learning_rate": 0.00025485, + "loss": 2.5147, + "step": 4336 + }, + { + "epoch": 6.9392, + "grad_norm": 0.0, + "learning_rate": 0.00025469999999999996, + "loss": 2.6618, + "step": 4337 + }, + { + "epoch": 6.9408, + "grad_norm": 0.0, + "learning_rate": 0.00025455, + "loss": 3.1769, + "step": 4338 + }, + { + "epoch": 6.9424, + "grad_norm": 0.0, + "learning_rate": 0.00025439999999999995, + "loss": 2.593, + "step": 4339 + }, + { + "epoch": 6.944, + "grad_norm": 0.0, + "learning_rate": 0.00025425, + "loss": 2.4697, + "step": 4340 + }, + { + "epoch": 6.9456, + "grad_norm": 0.0, + "learning_rate": 0.0002541, + "loss": 2.3677, + "step": 4341 + }, + { + "epoch": 6.9472000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00025394999999999997, + "loss": 3.4864, + "step": 4342 + }, + { + "epoch": 6.9488, + "grad_norm": 0.0, + "learning_rate": 0.0002538, + "loss": 2.7577, + "step": 4343 + }, + { + "epoch": 6.9504, + "grad_norm": 0.0, + "learning_rate": 0.00025365, + "loss": 2.7377, + "step": 4344 + }, + { + "epoch": 6.952, + "grad_norm": 0.0, + "learning_rate": 0.0002535, + "loss": 2.7041, + "step": 4345 + }, + { + "epoch": 6.9536, + "grad_norm": 0.0, + "learning_rate": 0.00025335, + "loss": 2.4056, + "step": 4346 + }, + { + "epoch": 6.9552, + "grad_norm": 0.0, + "learning_rate": 0.0002532, + "loss": 3.782, + "step": 4347 + }, + { + "epoch": 6.9568, + "grad_norm": 0.0, + "learning_rate": 0.00025305, + "loss": 3.3568, + "step": 4348 + }, + { + "epoch": 6.9584, + "grad_norm": 0.0, + "learning_rate": 0.00025289999999999997, + "loss": 4.0026, + "step": 4349 + }, + { + "epoch": 6.96, + "grad_norm": NaN, + "learning_rate": 0.00025289999999999997, + "loss": 3.1076, + "step": 4350 + }, + { + "epoch": 6.9616, + "grad_norm": 0.0, + "learning_rate": 0.00025275, + "loss": 4.3278, + "step": 4351 + }, + { + "epoch": 6.9632, + "grad_norm": 0.0, + "learning_rate": 0.00025259999999999996, + "loss": 3.6904, + "step": 4352 + }, + { + "epoch": 6.9648, + "grad_norm": 0.0, + "learning_rate": 0.00025245, + "loss": 2.9045, + "step": 4353 + }, + { + "epoch": 6.9664, + "grad_norm": 0.0, + "learning_rate": 0.00025229999999999995, + "loss": 4.1884, + "step": 4354 + }, + { + "epoch": 6.968, + "grad_norm": 0.0, + "learning_rate": 0.00025215, + "loss": 4.3218, + "step": 4355 + }, + { + "epoch": 6.9696, + "grad_norm": 0.0, + "learning_rate": 0.00025199999999999995, + "loss": 2.6043, + "step": 4356 + }, + { + "epoch": 6.9712, + "grad_norm": 0.0, + "learning_rate": 0.00025184999999999997, + "loss": 2.6978, + "step": 4357 + }, + { + "epoch": 6.9728, + "grad_norm": 0.0, + "learning_rate": 0.0002517, + "loss": 2.8147, + "step": 4358 + }, + { + "epoch": 6.9744, + "grad_norm": 0.0, + "learning_rate": 0.00025154999999999996, + "loss": 2.6158, + "step": 4359 + }, + { + "epoch": 6.976, + "grad_norm": 0.0, + "learning_rate": 0.0002514, + "loss": 2.6996, + "step": 4360 + }, + { + "epoch": 6.9776, + "grad_norm": 0.0, + "learning_rate": 0.00025125, + "loss": 3.5713, + "step": 4361 + }, + { + "epoch": 6.9792, + "grad_norm": 0.0, + "learning_rate": 0.0002511, + "loss": 3.194, + "step": 4362 + }, + { + "epoch": 6.9808, + "grad_norm": 0.0, + "learning_rate": 0.00025095, + "loss": 3.3608, + "step": 4363 + }, + { + "epoch": 6.9824, + "grad_norm": 0.0, + "learning_rate": 0.00025079999999999997, + "loss": 2.3152, + "step": 4364 + }, + { + "epoch": 6.984, + "grad_norm": 0.0, + "learning_rate": 0.00025065, + "loss": 2.8111, + "step": 4365 + }, + { + "epoch": 6.9856, + "grad_norm": 0.0, + "learning_rate": 0.00025049999999999996, + "loss": 3.17, + "step": 4366 + }, + { + "epoch": 6.9872, + "grad_norm": 0.0, + "learning_rate": 0.00025035, + "loss": 2.7564, + "step": 4367 + }, + { + "epoch": 6.9888, + "grad_norm": 0.0, + "learning_rate": 0.00025019999999999996, + "loss": 3.6902, + "step": 4368 + }, + { + "epoch": 6.9904, + "grad_norm": 0.0, + "learning_rate": 0.00025005, + "loss": 2.6898, + "step": 4369 + }, + { + "epoch": 6.992, + "grad_norm": 0.0, + "learning_rate": 0.00024989999999999995, + "loss": 3.0014, + "step": 4370 + }, + { + "epoch": 6.9936, + "grad_norm": 0.0, + "learning_rate": 0.00024974999999999997, + "loss": 2.8975, + "step": 4371 + }, + { + "epoch": 6.9952, + "grad_norm": 0.0, + "learning_rate": 0.00024959999999999994, + "loss": 2.2888, + "step": 4372 + }, + { + "epoch": 6.9968, + "grad_norm": 0.0, + "learning_rate": 0.00024944999999999996, + "loss": 3.0213, + "step": 4373 + }, + { + "epoch": 6.9984, + "grad_norm": 0.0, + "learning_rate": 0.0002493, + "loss": 2.0513, + "step": 4374 + }, + { + "epoch": 7.0, + "grad_norm": 0.0, + "learning_rate": 0.00024914999999999996, + "loss": 3.1802, + "step": 4375 + }, + { + "epoch": 7.0016, + "grad_norm": 0.0, + "learning_rate": 0.000249, + "loss": 4.4198, + "step": 4376 + }, + { + "epoch": 7.0032, + "grad_norm": 0.0, + "learning_rate": 0.00024885, + "loss": 4.4398, + "step": 4377 + }, + { + "epoch": 7.0048, + "grad_norm": 0.0, + "learning_rate": 0.0002487, + "loss": 3.1124, + "step": 4378 + }, + { + "epoch": 7.0064, + "grad_norm": 0.0, + "learning_rate": 0.00024855, + "loss": 4.2911, + "step": 4379 + }, + { + "epoch": 7.008, + "grad_norm": 0.0, + "learning_rate": 0.00024839999999999997, + "loss": 4.1504, + "step": 4380 + }, + { + "epoch": 7.0096, + "grad_norm": 0.0, + "learning_rate": 0.00024825, + "loss": 2.849, + "step": 4381 + }, + { + "epoch": 7.0112, + "grad_norm": 0.0, + "learning_rate": 0.00024809999999999996, + "loss": 2.7981, + "step": 4382 + }, + { + "epoch": 7.0128, + "grad_norm": 0.0, + "learning_rate": 0.00024795, + "loss": 3.4686, + "step": 4383 + }, + { + "epoch": 7.0144, + "grad_norm": 0.0, + "learning_rate": 0.00024779999999999995, + "loss": 3.4509, + "step": 4384 + }, + { + "epoch": 7.016, + "grad_norm": 0.0, + "learning_rate": 0.00024765, + "loss": 3.4504, + "step": 4385 + }, + { + "epoch": 7.0176, + "grad_norm": 0.0, + "learning_rate": 0.00024749999999999994, + "loss": 2.8491, + "step": 4386 + }, + { + "epoch": 7.0192, + "grad_norm": 0.0, + "learning_rate": 0.00024734999999999997, + "loss": 3.495, + "step": 4387 + }, + { + "epoch": 7.0208, + "grad_norm": 0.0, + "learning_rate": 0.0002472, + "loss": 2.9273, + "step": 4388 + }, + { + "epoch": 7.0224, + "grad_norm": 0.0, + "learning_rate": 0.00024704999999999996, + "loss": 3.0325, + "step": 4389 + }, + { + "epoch": 7.024, + "grad_norm": 0.0, + "learning_rate": 0.0002469, + "loss": 3.1798, + "step": 4390 + }, + { + "epoch": 7.0256, + "grad_norm": 0.0, + "learning_rate": 0.00024675, + "loss": 3.3611, + "step": 4391 + }, + { + "epoch": 7.0272, + "grad_norm": 0.0, + "learning_rate": 0.0002466, + "loss": 2.7285, + "step": 4392 + }, + { + "epoch": 7.0288, + "grad_norm": 0.0, + "learning_rate": 0.00024645, + "loss": 2.5132, + "step": 4393 + }, + { + "epoch": 7.0304, + "grad_norm": 0.0, + "learning_rate": 0.00024629999999999997, + "loss": 2.9996, + "step": 4394 + }, + { + "epoch": 7.032, + "grad_norm": 0.0, + "learning_rate": 0.00024615, + "loss": 2.6329, + "step": 4395 + }, + { + "epoch": 7.0336, + "grad_norm": 0.0, + "learning_rate": 0.00024599999999999996, + "loss": 2.4842, + "step": 4396 + }, + { + "epoch": 7.0352, + "grad_norm": 0.0, + "learning_rate": 0.00024585, + "loss": 2.9166, + "step": 4397 + }, + { + "epoch": 7.0368, + "grad_norm": 0.0, + "learning_rate": 0.00024569999999999995, + "loss": 2.2728, + "step": 4398 + }, + { + "epoch": 7.0384, + "grad_norm": 0.0, + "learning_rate": 0.00024555, + "loss": 3.182, + "step": 4399 + }, + { + "epoch": 7.04, + "grad_norm": 0.0, + "learning_rate": 0.00024539999999999995, + "loss": 3.4259, + "step": 4400 + }, + { + "epoch": 7.0416, + "grad_norm": 0.0, + "learning_rate": 0.00024524999999999997, + "loss": 2.6928, + "step": 4401 + }, + { + "epoch": 7.0432, + "grad_norm": 0.0, + "learning_rate": 0.00024509999999999994, + "loss": 2.9651, + "step": 4402 + }, + { + "epoch": 7.0448, + "grad_norm": 0.0, + "learning_rate": 0.00024494999999999996, + "loss": 2.5092, + "step": 4403 + }, + { + "epoch": 7.0464, + "grad_norm": 0.0, + "learning_rate": 0.0002448, + "loss": 2.5621, + "step": 4404 + }, + { + "epoch": 7.048, + "grad_norm": 0.0, + "learning_rate": 0.00024464999999999996, + "loss": 2.1604, + "step": 4405 + }, + { + "epoch": 7.0496, + "grad_norm": 0.0, + "learning_rate": 0.0002445, + "loss": 2.8104, + "step": 4406 + }, + { + "epoch": 7.0512, + "grad_norm": 0.0, + "learning_rate": 0.00024435, + "loss": 3.2445, + "step": 4407 + }, + { + "epoch": 7.0528, + "grad_norm": 0.0, + "learning_rate": 0.00024419999999999997, + "loss": 3.0785, + "step": 4408 + }, + { + "epoch": 7.0544, + "grad_norm": 0.0, + "learning_rate": 0.00024404999999999997, + "loss": 3.4991, + "step": 4409 + }, + { + "epoch": 7.056, + "grad_norm": 0.0, + "learning_rate": 0.00024389999999999997, + "loss": 2.883, + "step": 4410 + }, + { + "epoch": 7.0576, + "grad_norm": 0.0, + "learning_rate": 0.00024375, + "loss": 2.8117, + "step": 4411 + }, + { + "epoch": 7.0592, + "grad_norm": 0.0, + "learning_rate": 0.00024359999999999999, + "loss": 3.6825, + "step": 4412 + }, + { + "epoch": 7.0608, + "grad_norm": 0.0, + "learning_rate": 0.00024344999999999998, + "loss": 2.6544, + "step": 4413 + }, + { + "epoch": 7.0624, + "grad_norm": 0.0, + "learning_rate": 0.0002433, + "loss": 2.7173, + "step": 4414 + }, + { + "epoch": 7.064, + "grad_norm": 0.0, + "learning_rate": 0.00024314999999999997, + "loss": 3.1109, + "step": 4415 + }, + { + "epoch": 7.0656, + "grad_norm": 0.0, + "learning_rate": 0.000243, + "loss": 3.0858, + "step": 4416 + }, + { + "epoch": 7.0672, + "grad_norm": 0.0, + "learning_rate": 0.00024284999999999997, + "loss": 2.5237, + "step": 4417 + }, + { + "epoch": 7.0688, + "grad_norm": 0.0, + "learning_rate": 0.0002427, + "loss": 2.5561, + "step": 4418 + }, + { + "epoch": 7.0704, + "grad_norm": 0.0, + "learning_rate": 0.00024255, + "loss": 3.4563, + "step": 4419 + }, + { + "epoch": 7.072, + "grad_norm": 0.0, + "learning_rate": 0.00024239999999999998, + "loss": 2.5289, + "step": 4420 + }, + { + "epoch": 7.0736, + "grad_norm": 0.0, + "learning_rate": 0.00024224999999999998, + "loss": 3.3271, + "step": 4421 + }, + { + "epoch": 7.0752, + "grad_norm": 0.0, + "learning_rate": 0.0002421, + "loss": 3.2885, + "step": 4422 + }, + { + "epoch": 7.0768, + "grad_norm": 0.0, + "learning_rate": 0.00024194999999999997, + "loss": 3.2276, + "step": 4423 + }, + { + "epoch": 7.0784, + "grad_norm": 0.0, + "learning_rate": 0.0002418, + "loss": 2.785, + "step": 4424 + }, + { + "epoch": 7.08, + "grad_norm": 0.0, + "learning_rate": 0.00024164999999999996, + "loss": 3.4965, + "step": 4425 + }, + { + "epoch": 7.0816, + "grad_norm": 0.0, + "learning_rate": 0.0002415, + "loss": 4.8508, + "step": 4426 + }, + { + "epoch": 7.0832, + "grad_norm": 0.0, + "learning_rate": 0.00024134999999999998, + "loss": 2.9179, + "step": 4427 + }, + { + "epoch": 7.0848, + "grad_norm": 0.0, + "learning_rate": 0.00024119999999999998, + "loss": 4.5402, + "step": 4428 + }, + { + "epoch": 7.0864, + "grad_norm": 0.0, + "learning_rate": 0.00024104999999999998, + "loss": 3.5119, + "step": 4429 + }, + { + "epoch": 7.088, + "grad_norm": 0.0, + "learning_rate": 0.0002409, + "loss": 3.4308, + "step": 4430 + }, + { + "epoch": 7.0896, + "grad_norm": 0.0, + "learning_rate": 0.00024074999999999997, + "loss": 2.709, + "step": 4431 + }, + { + "epoch": 7.0912, + "grad_norm": 0.0, + "learning_rate": 0.0002406, + "loss": 3.1521, + "step": 4432 + }, + { + "epoch": 7.0928, + "grad_norm": 0.0, + "learning_rate": 0.00024044999999999996, + "loss": 2.9411, + "step": 4433 + }, + { + "epoch": 7.0944, + "grad_norm": 0.0, + "learning_rate": 0.00024029999999999999, + "loss": 3.0063, + "step": 4434 + }, + { + "epoch": 7.096, + "grad_norm": 0.0, + "learning_rate": 0.00024014999999999998, + "loss": 3.4764, + "step": 4435 + }, + { + "epoch": 7.0976, + "grad_norm": 0.0, + "learning_rate": 0.00023999999999999998, + "loss": 3.1709, + "step": 4436 + }, + { + "epoch": 7.0992, + "grad_norm": 0.0, + "learning_rate": 0.00023984999999999998, + "loss": 2.8713, + "step": 4437 + }, + { + "epoch": 7.1008, + "grad_norm": 0.0, + "learning_rate": 0.0002397, + "loss": 2.8115, + "step": 4438 + }, + { + "epoch": 7.1024, + "grad_norm": 0.0, + "learning_rate": 0.00023954999999999997, + "loss": 2.7006, + "step": 4439 + }, + { + "epoch": 7.104, + "grad_norm": 0.0, + "learning_rate": 0.0002394, + "loss": 3.5017, + "step": 4440 + }, + { + "epoch": 7.1056, + "grad_norm": 0.0, + "learning_rate": 0.00023925, + "loss": 3.0051, + "step": 4441 + }, + { + "epoch": 7.1072, + "grad_norm": 0.0, + "learning_rate": 0.00023909999999999998, + "loss": 3.0562, + "step": 4442 + }, + { + "epoch": 7.1088, + "grad_norm": 0.0, + "learning_rate": 0.00023894999999999998, + "loss": 2.5437, + "step": 4443 + }, + { + "epoch": 7.1104, + "grad_norm": 0.0, + "learning_rate": 0.0002388, + "loss": 2.7862, + "step": 4444 + }, + { + "epoch": 7.112, + "grad_norm": 0.0, + "learning_rate": 0.00023864999999999997, + "loss": 2.838, + "step": 4445 + }, + { + "epoch": 7.1136, + "grad_norm": 0.0, + "learning_rate": 0.0002385, + "loss": 2.2052, + "step": 4446 + }, + { + "epoch": 7.1152, + "grad_norm": 0.0, + "learning_rate": 0.00023834999999999997, + "loss": 2.9081, + "step": 4447 + }, + { + "epoch": 7.1168, + "grad_norm": 0.0, + "learning_rate": 0.0002382, + "loss": 2.4304, + "step": 4448 + }, + { + "epoch": 7.1184, + "grad_norm": 0.0, + "learning_rate": 0.00023804999999999999, + "loss": 2.941, + "step": 4449 + }, + { + "epoch": 7.12, + "grad_norm": 0.0, + "learning_rate": 0.00023789999999999998, + "loss": 2.7017, + "step": 4450 + }, + { + "epoch": 7.1216, + "grad_norm": 0.0, + "learning_rate": 0.00023774999999999998, + "loss": 2.8525, + "step": 4451 + }, + { + "epoch": 7.1232, + "grad_norm": 0.0, + "learning_rate": 0.0002376, + "loss": 3.0987, + "step": 4452 + }, + { + "epoch": 7.1248, + "grad_norm": 0.0, + "learning_rate": 0.00023744999999999997, + "loss": 2.4817, + "step": 4453 + }, + { + "epoch": 7.1264, + "grad_norm": 0.0, + "learning_rate": 0.0002373, + "loss": 3.2059, + "step": 4454 + }, + { + "epoch": 7.128, + "grad_norm": 0.0, + "learning_rate": 0.00023714999999999996, + "loss": 2.5229, + "step": 4455 + }, + { + "epoch": 7.1296, + "grad_norm": 0.0, + "learning_rate": 0.000237, + "loss": 2.9222, + "step": 4456 + }, + { + "epoch": 7.1312, + "grad_norm": 0.0, + "learning_rate": 0.00023684999999999998, + "loss": 3.0885, + "step": 4457 + }, + { + "epoch": 7.1328, + "grad_norm": 0.0, + "learning_rate": 0.00023669999999999998, + "loss": 3.4679, + "step": 4458 + }, + { + "epoch": 7.1344, + "grad_norm": 0.0, + "learning_rate": 0.00023654999999999998, + "loss": 2.9408, + "step": 4459 + }, + { + "epoch": 7.136, + "grad_norm": 0.0, + "learning_rate": 0.0002364, + "loss": 2.8065, + "step": 4460 + }, + { + "epoch": 7.1376, + "grad_norm": 0.0, + "learning_rate": 0.00023624999999999997, + "loss": 3.0352, + "step": 4461 + }, + { + "epoch": 7.1392, + "grad_norm": 0.0, + "learning_rate": 0.0002361, + "loss": 2.9077, + "step": 4462 + }, + { + "epoch": 7.1408, + "grad_norm": 0.0, + "learning_rate": 0.00023594999999999996, + "loss": 3.1758, + "step": 4463 + }, + { + "epoch": 7.1424, + "grad_norm": 0.0, + "learning_rate": 0.00023579999999999999, + "loss": 4.3853, + "step": 4464 + }, + { + "epoch": 7.144, + "grad_norm": 0.0, + "learning_rate": 0.00023564999999999998, + "loss": 3.2723, + "step": 4465 + }, + { + "epoch": 7.1456, + "grad_norm": 0.0, + "learning_rate": 0.00023549999999999998, + "loss": 4.1234, + "step": 4466 + }, + { + "epoch": 7.1472, + "grad_norm": 0.0, + "learning_rate": 0.00023534999999999997, + "loss": 2.7587, + "step": 4467 + }, + { + "epoch": 7.1488, + "grad_norm": 0.0, + "learning_rate": 0.0002352, + "loss": 3.2122, + "step": 4468 + }, + { + "epoch": 7.1504, + "grad_norm": 0.0, + "learning_rate": 0.00023504999999999997, + "loss": 3.7312, + "step": 4469 + }, + { + "epoch": 7.152, + "grad_norm": 0.0, + "learning_rate": 0.0002349, + "loss": 3.0445, + "step": 4470 + }, + { + "epoch": 7.1536, + "grad_norm": 0.0, + "learning_rate": 0.00023474999999999996, + "loss": 2.5092, + "step": 4471 + }, + { + "epoch": 7.1552, + "grad_norm": 0.0, + "learning_rate": 0.00023459999999999998, + "loss": 2.8707, + "step": 4472 + }, + { + "epoch": 7.1568, + "grad_norm": 0.0, + "learning_rate": 0.00023444999999999998, + "loss": 3.4896, + "step": 4473 + }, + { + "epoch": 7.1584, + "grad_norm": 0.0, + "learning_rate": 0.00023429999999999998, + "loss": 2.7866, + "step": 4474 + }, + { + "epoch": 7.16, + "grad_norm": NaN, + "learning_rate": 0.00023429999999999998, + "loss": 3.4577, + "step": 4475 + }, + { + "epoch": 7.1616, + "grad_norm": 0.0, + "learning_rate": 0.00023414999999999997, + "loss": 4.0787, + "step": 4476 + }, + { + "epoch": 7.1632, + "grad_norm": 0.0, + "learning_rate": 0.000234, + "loss": 3.5122, + "step": 4477 + }, + { + "epoch": 7.1648, + "grad_norm": 0.0, + "learning_rate": 0.00023384999999999997, + "loss": 4.6246, + "step": 4478 + }, + { + "epoch": 7.1664, + "grad_norm": 0.0, + "learning_rate": 0.0002337, + "loss": 3.394, + "step": 4479 + }, + { + "epoch": 7.168, + "grad_norm": 0.0, + "learning_rate": 0.00023354999999999996, + "loss": 3.8507, + "step": 4480 + }, + { + "epoch": 7.1696, + "grad_norm": 0.0, + "learning_rate": 0.00023339999999999998, + "loss": 4.4262, + "step": 4481 + }, + { + "epoch": 7.1712, + "grad_norm": 0.0, + "learning_rate": 0.00023324999999999998, + "loss": 3.6319, + "step": 4482 + }, + { + "epoch": 7.1728, + "grad_norm": 0.0, + "learning_rate": 0.00023309999999999997, + "loss": 3.8157, + "step": 4483 + }, + { + "epoch": 7.1744, + "grad_norm": 0.0, + "learning_rate": 0.00023294999999999997, + "loss": 3.4546, + "step": 4484 + }, + { + "epoch": 7.176, + "grad_norm": 0.0, + "learning_rate": 0.0002328, + "loss": 2.7881, + "step": 4485 + }, + { + "epoch": 7.1776, + "grad_norm": 0.0, + "learning_rate": 0.00023264999999999996, + "loss": 3.051, + "step": 4486 + }, + { + "epoch": 7.1792, + "grad_norm": 0.0, + "learning_rate": 0.00023249999999999999, + "loss": 3.8072, + "step": 4487 + }, + { + "epoch": 7.1808, + "grad_norm": 0.0, + "learning_rate": 0.00023234999999999998, + "loss": 3.5231, + "step": 4488 + }, + { + "epoch": 7.1824, + "grad_norm": 0.0, + "learning_rate": 0.00023219999999999998, + "loss": 2.9368, + "step": 4489 + }, + { + "epoch": 7.184, + "grad_norm": 0.0, + "learning_rate": 0.00023204999999999998, + "loss": 2.2729, + "step": 4490 + }, + { + "epoch": 7.1856, + "grad_norm": 0.0, + "learning_rate": 0.0002319, + "loss": 2.8623, + "step": 4491 + }, + { + "epoch": 7.1872, + "grad_norm": 0.0, + "learning_rate": 0.00023174999999999997, + "loss": 2.7405, + "step": 4492 + }, + { + "epoch": 7.1888, + "grad_norm": 0.0, + "learning_rate": 0.0002316, + "loss": 3.0231, + "step": 4493 + }, + { + "epoch": 7.1904, + "grad_norm": 0.0, + "learning_rate": 0.00023144999999999996, + "loss": 2.6991, + "step": 4494 + }, + { + "epoch": 7.192, + "grad_norm": 0.0, + "learning_rate": 0.00023129999999999998, + "loss": 2.6831, + "step": 4495 + }, + { + "epoch": 7.1936, + "grad_norm": 0.0, + "learning_rate": 0.00023114999999999998, + "loss": 2.8213, + "step": 4496 + }, + { + "epoch": 7.1952, + "grad_norm": 0.0, + "learning_rate": 0.00023099999999999998, + "loss": 2.3582, + "step": 4497 + }, + { + "epoch": 7.1968, + "grad_norm": 0.0, + "learning_rate": 0.00023084999999999997, + "loss": 4.7609, + "step": 4498 + }, + { + "epoch": 7.1984, + "grad_norm": 0.0, + "learning_rate": 0.0002307, + "loss": 2.6573, + "step": 4499 + }, + { + "epoch": 7.2, + "grad_norm": 0.0, + "learning_rate": 0.00023054999999999997, + "loss": 2.6968, + "step": 4500 + }, + { + "epoch": 7.2016, + "grad_norm": 0.0, + "learning_rate": 0.0002304, + "loss": 2.8074, + "step": 4501 + }, + { + "epoch": 7.2032, + "grad_norm": 0.0, + "learning_rate": 0.00023024999999999996, + "loss": 3.4635, + "step": 4502 + }, + { + "epoch": 7.2048, + "grad_norm": 0.0, + "learning_rate": 0.00023009999999999998, + "loss": 3.3308, + "step": 4503 + }, + { + "epoch": 7.2064, + "grad_norm": 0.0, + "learning_rate": 0.00022994999999999998, + "loss": 2.6982, + "step": 4504 + }, + { + "epoch": 7.208, + "grad_norm": 0.0, + "learning_rate": 0.00022979999999999997, + "loss": 2.4931, + "step": 4505 + }, + { + "epoch": 7.2096, + "grad_norm": 0.0, + "learning_rate": 0.00022964999999999997, + "loss": 3.8077, + "step": 4506 + }, + { + "epoch": 7.2112, + "grad_norm": 0.0, + "learning_rate": 0.0002295, + "loss": 2.4663, + "step": 4507 + }, + { + "epoch": 7.2128, + "grad_norm": 0.0, + "learning_rate": 0.00022934999999999996, + "loss": 2.4444, + "step": 4508 + }, + { + "epoch": 7.2144, + "grad_norm": 0.0, + "learning_rate": 0.0002292, + "loss": 2.8851, + "step": 4509 + }, + { + "epoch": 7.216, + "grad_norm": 0.0, + "learning_rate": 0.00022904999999999996, + "loss": 3.0816, + "step": 4510 + }, + { + "epoch": 7.2176, + "grad_norm": 0.0, + "learning_rate": 0.00022889999999999998, + "loss": 2.8437, + "step": 4511 + }, + { + "epoch": 7.2192, + "grad_norm": 0.0, + "learning_rate": 0.00022874999999999998, + "loss": 2.5104, + "step": 4512 + }, + { + "epoch": 7.2208, + "grad_norm": 0.0, + "learning_rate": 0.00022859999999999997, + "loss": 3.1474, + "step": 4513 + }, + { + "epoch": 7.2224, + "grad_norm": 0.0, + "learning_rate": 0.00022844999999999997, + "loss": 3.1323, + "step": 4514 + }, + { + "epoch": 7.224, + "grad_norm": 0.0, + "learning_rate": 0.0002283, + "loss": 3.3355, + "step": 4515 + }, + { + "epoch": 7.2256, + "grad_norm": 0.0, + "learning_rate": 0.00022814999999999996, + "loss": 2.3959, + "step": 4516 + }, + { + "epoch": 7.2272, + "grad_norm": 0.0, + "learning_rate": 0.00022799999999999999, + "loss": 3.0254, + "step": 4517 + }, + { + "epoch": 7.2288, + "grad_norm": 0.0, + "learning_rate": 0.00022784999999999995, + "loss": 2.8069, + "step": 4518 + }, + { + "epoch": 7.2304, + "grad_norm": 0.0, + "learning_rate": 0.00022769999999999998, + "loss": 3.5841, + "step": 4519 + }, + { + "epoch": 7.232, + "grad_norm": 0.0, + "learning_rate": 0.00022754999999999997, + "loss": 2.5879, + "step": 4520 + }, + { + "epoch": 7.2336, + "grad_norm": 0.0, + "learning_rate": 0.00022739999999999997, + "loss": 3.2762, + "step": 4521 + }, + { + "epoch": 7.2352, + "grad_norm": 0.0, + "learning_rate": 0.00022724999999999997, + "loss": 2.0486, + "step": 4522 + }, + { + "epoch": 7.2368, + "grad_norm": 0.0, + "learning_rate": 0.0002271, + "loss": 2.6137, + "step": 4523 + }, + { + "epoch": 7.2384, + "grad_norm": 0.0, + "learning_rate": 0.00022694999999999996, + "loss": 3.9723, + "step": 4524 + }, + { + "epoch": 7.24, + "grad_norm": 0.0, + "learning_rate": 0.00022679999999999998, + "loss": 4.651, + "step": 4525 + }, + { + "epoch": 7.2416, + "grad_norm": 0.0, + "learning_rate": 0.00022664999999999995, + "loss": 4.3543, + "step": 4526 + }, + { + "epoch": 7.2432, + "grad_norm": 0.0, + "learning_rate": 0.00022649999999999998, + "loss": 3.9596, + "step": 4527 + }, + { + "epoch": 7.2448, + "grad_norm": 0.0, + "learning_rate": 0.00022634999999999997, + "loss": 3.3218, + "step": 4528 + }, + { + "epoch": 7.2464, + "grad_norm": 0.0, + "learning_rate": 0.00022619999999999997, + "loss": 3.7675, + "step": 4529 + }, + { + "epoch": 7.248, + "grad_norm": 0.0, + "learning_rate": 0.00022604999999999997, + "loss": 3.9477, + "step": 4530 + }, + { + "epoch": 7.2496, + "grad_norm": 0.0, + "learning_rate": 0.0002259, + "loss": 2.8677, + "step": 4531 + }, + { + "epoch": 7.2512, + "grad_norm": 0.0, + "learning_rate": 0.00022574999999999996, + "loss": 3.0615, + "step": 4532 + }, + { + "epoch": 7.2528, + "grad_norm": 0.0, + "learning_rate": 0.00022559999999999998, + "loss": 3.2326, + "step": 4533 + }, + { + "epoch": 7.2544, + "grad_norm": 0.0, + "learning_rate": 0.00022544999999999995, + "loss": 2.5745, + "step": 4534 + }, + { + "epoch": 7.256, + "grad_norm": 0.0, + "learning_rate": 0.00022529999999999997, + "loss": 3.1765, + "step": 4535 + }, + { + "epoch": 7.2576, + "grad_norm": 0.0, + "learning_rate": 0.00022514999999999997, + "loss": 3.271, + "step": 4536 + }, + { + "epoch": 7.2592, + "grad_norm": 0.0, + "learning_rate": 0.000225, + "loss": 3.1558, + "step": 4537 + }, + { + "epoch": 7.2608, + "grad_norm": 0.0, + "learning_rate": 0.00022485, + "loss": 2.5605, + "step": 4538 + }, + { + "epoch": 7.2624, + "grad_norm": 0.0, + "learning_rate": 0.0002247, + "loss": 2.6944, + "step": 4539 + }, + { + "epoch": 7.264, + "grad_norm": 0.0, + "learning_rate": 0.00022455, + "loss": 2.6869, + "step": 4540 + }, + { + "epoch": 7.2656, + "grad_norm": 0.0, + "learning_rate": 0.00022439999999999998, + "loss": 2.8473, + "step": 4541 + }, + { + "epoch": 7.2672, + "grad_norm": 0.0, + "learning_rate": 0.00022425, + "loss": 2.8917, + "step": 4542 + }, + { + "epoch": 7.2688, + "grad_norm": 0.0, + "learning_rate": 0.00022409999999999997, + "loss": 2.9746, + "step": 4543 + }, + { + "epoch": 7.2704, + "grad_norm": 0.0, + "learning_rate": 0.00022395, + "loss": 3.6293, + "step": 4544 + }, + { + "epoch": 7.272, + "grad_norm": 0.0, + "learning_rate": 0.0002238, + "loss": 2.8313, + "step": 4545 + }, + { + "epoch": 7.2736, + "grad_norm": 0.0, + "learning_rate": 0.00022365, + "loss": 2.3467, + "step": 4546 + }, + { + "epoch": 7.2752, + "grad_norm": 0.0, + "learning_rate": 0.00022349999999999998, + "loss": 2.8612, + "step": 4547 + }, + { + "epoch": 7.2768, + "grad_norm": 0.0, + "learning_rate": 0.00022335, + "loss": 2.7406, + "step": 4548 + }, + { + "epoch": 7.2783999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00022319999999999998, + "loss": 2.6151, + "step": 4549 + }, + { + "epoch": 7.28, + "grad_norm": 0.0, + "learning_rate": 0.00022305, + "loss": 2.3726, + "step": 4550 + }, + { + "epoch": 7.2816, + "grad_norm": 0.0, + "learning_rate": 0.00022289999999999997, + "loss": 2.4332, + "step": 4551 + }, + { + "epoch": 7.2832, + "grad_norm": 0.0, + "learning_rate": 0.00022275, + "loss": 2.9835, + "step": 4552 + }, + { + "epoch": 7.2848, + "grad_norm": 0.0, + "learning_rate": 0.0002226, + "loss": 2.5813, + "step": 4553 + }, + { + "epoch": 7.2864, + "grad_norm": 0.0, + "learning_rate": 0.00022244999999999999, + "loss": 2.9601, + "step": 4554 + }, + { + "epoch": 7.288, + "grad_norm": 0.0, + "learning_rate": 0.00022229999999999998, + "loss": 2.7316, + "step": 4555 + }, + { + "epoch": 7.2896, + "grad_norm": 0.0, + "learning_rate": 0.00022215, + "loss": 2.8432, + "step": 4556 + }, + { + "epoch": 7.2912, + "grad_norm": 0.0, + "learning_rate": 0.00022199999999999998, + "loss": 2.6512, + "step": 4557 + }, + { + "epoch": 7.2928, + "grad_norm": 0.0, + "learning_rate": 0.00022185, + "loss": 2.9372, + "step": 4558 + }, + { + "epoch": 7.2943999999999996, + "grad_norm": 0.0, + "learning_rate": 0.00022169999999999997, + "loss": 2.3597, + "step": 4559 + }, + { + "epoch": 7.296, + "grad_norm": 0.0, + "learning_rate": 0.00022155, + "loss": 2.6766, + "step": 4560 + }, + { + "epoch": 7.2976, + "grad_norm": 0.0, + "learning_rate": 0.0002214, + "loss": 2.6093, + "step": 4561 + }, + { + "epoch": 7.2992, + "grad_norm": 0.0, + "learning_rate": 0.00022124999999999998, + "loss": 3.3046, + "step": 4562 + }, + { + "epoch": 7.3008, + "grad_norm": 0.0, + "learning_rate": 0.00022109999999999998, + "loss": 2.4641, + "step": 4563 + }, + { + "epoch": 7.3024000000000004, + "grad_norm": 0.0, + "learning_rate": 0.00022095, + "loss": 3.1234, + "step": 4564 + }, + { + "epoch": 7.304, + "grad_norm": 0.0, + "learning_rate": 0.00022079999999999997, + "loss": 2.3525, + "step": 4565 + }, + { + "epoch": 7.3056, + "grad_norm": 0.0, + "learning_rate": 0.00022065, + "loss": 3.0467, + "step": 4566 + }, + { + "epoch": 7.3072, + "grad_norm": 0.0, + "learning_rate": 0.00022049999999999997, + "loss": 2.6771, + "step": 4567 + }, + { + "epoch": 7.3088, + "grad_norm": 0.0, + "learning_rate": 0.00022035, + "loss": 3.5626, + "step": 4568 + }, + { + "epoch": 7.3104, + "grad_norm": 0.0, + "learning_rate": 0.00022019999999999999, + "loss": 3.0666, + "step": 4569 + }, + { + "epoch": 7.312, + "grad_norm": 0.0, + "learning_rate": 0.00022004999999999998, + "loss": 3.374, + "step": 4570 + }, + { + "epoch": 7.3136, + "grad_norm": 0.0, + "learning_rate": 0.00021989999999999998, + "loss": 3.1645, + "step": 4571 + }, + { + "epoch": 7.3152, + "grad_norm": 0.0, + "learning_rate": 0.00021975, + "loss": 2.5583, + "step": 4572 + }, + { + "epoch": 7.3168, + "grad_norm": 0.0, + "learning_rate": 0.00021959999999999997, + "loss": 3.6608, + "step": 4573 + }, + { + "epoch": 7.3184000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00021945, + "loss": 4.1134, + "step": 4574 + }, + { + "epoch": 7.32, + "grad_norm": 0.0, + "learning_rate": 0.00021929999999999996, + "loss": 3.1932, + "step": 4575 + }, + { + "epoch": 7.3216, + "grad_norm": 0.0, + "learning_rate": 0.00021915, + "loss": 3.548, + "step": 4576 + }, + { + "epoch": 7.3232, + "grad_norm": 0.0, + "learning_rate": 0.00021899999999999998, + "loss": 3.1762, + "step": 4577 + }, + { + "epoch": 7.3248, + "grad_norm": 0.0, + "learning_rate": 0.00021884999999999998, + "loss": 4.0924, + "step": 4578 + }, + { + "epoch": 7.3264, + "grad_norm": 0.0, + "learning_rate": 0.00021869999999999998, + "loss": 3.4292, + "step": 4579 + }, + { + "epoch": 7.328, + "grad_norm": 0.0, + "learning_rate": 0.00021855, + "loss": 3.931, + "step": 4580 + }, + { + "epoch": 7.3296, + "grad_norm": 0.0, + "learning_rate": 0.00021839999999999997, + "loss": 2.9257, + "step": 4581 + }, + { + "epoch": 7.3312, + "grad_norm": 0.0, + "learning_rate": 0.00021825, + "loss": 3.6792, + "step": 4582 + }, + { + "epoch": 7.3328, + "grad_norm": 0.0, + "learning_rate": 0.00021809999999999996, + "loss": 3.305, + "step": 4583 + }, + { + "epoch": 7.3344, + "grad_norm": 0.0, + "learning_rate": 0.00021794999999999999, + "loss": 2.4825, + "step": 4584 + }, + { + "epoch": 7.336, + "grad_norm": 0.0, + "learning_rate": 0.00021779999999999998, + "loss": 3.573, + "step": 4585 + }, + { + "epoch": 7.3376, + "grad_norm": 0.0, + "learning_rate": 0.00021764999999999998, + "loss": 2.5883, + "step": 4586 + }, + { + "epoch": 7.3392, + "grad_norm": 0.0, + "learning_rate": 0.00021749999999999997, + "loss": 3.0585, + "step": 4587 + }, + { + "epoch": 7.3408, + "grad_norm": 0.0, + "learning_rate": 0.00021735, + "loss": 2.9204, + "step": 4588 + }, + { + "epoch": 7.3424, + "grad_norm": 0.0, + "learning_rate": 0.00021719999999999997, + "loss": 2.9738, + "step": 4589 + }, + { + "epoch": 7.344, + "grad_norm": 0.0, + "learning_rate": 0.00021705, + "loss": 2.8103, + "step": 4590 + }, + { + "epoch": 7.3456, + "grad_norm": 0.0, + "learning_rate": 0.0002169, + "loss": 2.9121, + "step": 4591 + }, + { + "epoch": 7.3472, + "grad_norm": 0.0, + "learning_rate": 0.00021674999999999998, + "loss": 2.4368, + "step": 4592 + }, + { + "epoch": 7.3488, + "grad_norm": 0.0, + "learning_rate": 0.00021659999999999998, + "loss": 3.1357, + "step": 4593 + }, + { + "epoch": 7.3504, + "grad_norm": 0.0, + "learning_rate": 0.00021645, + "loss": 2.927, + "step": 4594 + }, + { + "epoch": 7.352, + "grad_norm": 0.0, + "learning_rate": 0.00021629999999999997, + "loss": 2.7341, + "step": 4595 + }, + { + "epoch": 7.3536, + "grad_norm": 0.0, + "learning_rate": 0.00021615, + "loss": 2.8885, + "step": 4596 + }, + { + "epoch": 7.3552, + "grad_norm": 0.0, + "learning_rate": 0.00021599999999999996, + "loss": 2.8116, + "step": 4597 + }, + { + "epoch": 7.3568, + "grad_norm": 0.0, + "learning_rate": 0.00021585, + "loss": 2.6231, + "step": 4598 + }, + { + "epoch": 7.3584, + "grad_norm": 0.0, + "learning_rate": 0.00021569999999999998, + "loss": 2.3804, + "step": 4599 + }, + { + "epoch": 7.36, + "grad_norm": 0.0, + "learning_rate": 0.00021554999999999998, + "loss": 2.9015, + "step": 4600 + }, + { + "epoch": 7.3616, + "grad_norm": 0.0, + "learning_rate": 0.00021539999999999998, + "loss": 2.782, + "step": 4601 + }, + { + "epoch": 7.3632, + "grad_norm": 0.0, + "learning_rate": 0.00021525, + "loss": 2.5905, + "step": 4602 + }, + { + "epoch": 7.3648, + "grad_norm": 0.0, + "learning_rate": 0.00021509999999999997, + "loss": 2.6103, + "step": 4603 + }, + { + "epoch": 7.3664, + "grad_norm": 0.0, + "learning_rate": 0.00021495, + "loss": 2.575, + "step": 4604 + }, + { + "epoch": 7.368, + "grad_norm": 0.0, + "learning_rate": 0.00021479999999999996, + "loss": 3.0087, + "step": 4605 + }, + { + "epoch": 7.3696, + "grad_norm": 0.0, + "learning_rate": 0.00021464999999999999, + "loss": 2.3155, + "step": 4606 + }, + { + "epoch": 7.3712, + "grad_norm": 0.0, + "learning_rate": 0.00021449999999999998, + "loss": 2.8529, + "step": 4607 + }, + { + "epoch": 7.3728, + "grad_norm": 0.0, + "learning_rate": 0.00021434999999999998, + "loss": 2.7225, + "step": 4608 + }, + { + "epoch": 7.3744, + "grad_norm": 0.0, + "learning_rate": 0.00021419999999999998, + "loss": 2.8051, + "step": 4609 + }, + { + "epoch": 7.376, + "grad_norm": 0.0, + "learning_rate": 0.00021405, + "loss": 2.5828, + "step": 4610 + }, + { + "epoch": 7.3776, + "grad_norm": 0.0, + "learning_rate": 0.00021389999999999997, + "loss": 3.0833, + "step": 4611 + }, + { + "epoch": 7.3792, + "grad_norm": 0.0, + "learning_rate": 0.00021375, + "loss": 2.8253, + "step": 4612 + }, + { + "epoch": 7.3808, + "grad_norm": 0.0, + "learning_rate": 0.00021359999999999996, + "loss": 2.7968, + "step": 4613 + }, + { + "epoch": 7.3824, + "grad_norm": 0.0, + "learning_rate": 0.00021344999999999998, + "loss": 2.8949, + "step": 4614 + }, + { + "epoch": 7.384, + "grad_norm": 0.0, + "learning_rate": 0.00021329999999999998, + "loss": 2.5806, + "step": 4615 + }, + { + "epoch": 7.3856, + "grad_norm": 0.0, + "learning_rate": 0.00021314999999999998, + "loss": 3.7758, + "step": 4616 + }, + { + "epoch": 7.3872, + "grad_norm": 0.0, + "learning_rate": 0.00021299999999999997, + "loss": 2.7733, + "step": 4617 + }, + { + "epoch": 7.3888, + "grad_norm": 0.0, + "learning_rate": 0.00021285, + "loss": 3.2324, + "step": 4618 + }, + { + "epoch": 7.3904, + "grad_norm": 0.0, + "learning_rate": 0.00021269999999999997, + "loss": 2.5837, + "step": 4619 + }, + { + "epoch": 7.392, + "grad_norm": 0.0, + "learning_rate": 0.00021255, + "loss": 2.8423, + "step": 4620 + }, + { + "epoch": 7.3936, + "grad_norm": 0.0, + "learning_rate": 0.00021239999999999996, + "loss": 2.7471, + "step": 4621 + }, + { + "epoch": 7.3952, + "grad_norm": 0.0, + "learning_rate": 0.00021224999999999998, + "loss": 2.7898, + "step": 4622 + }, + { + "epoch": 7.3968, + "grad_norm": 0.0, + "learning_rate": 0.00021209999999999998, + "loss": 3.2034, + "step": 4623 + }, + { + "epoch": 7.3984, + "grad_norm": 0.0, + "learning_rate": 0.00021194999999999997, + "loss": 3.224, + "step": 4624 + }, + { + "epoch": 7.4, + "grad_norm": 0.0, + "learning_rate": 0.00021179999999999997, + "loss": 3.9561, + "step": 4625 + }, + { + "epoch": 7.4016, + "grad_norm": 0.0, + "learning_rate": 0.00021165, + "loss": 4.0521, + "step": 4626 + }, + { + "epoch": 7.4032, + "grad_norm": 0.0, + "learning_rate": 0.00021149999999999996, + "loss": 4.0914, + "step": 4627 + }, + { + "epoch": 7.4048, + "grad_norm": 0.0, + "learning_rate": 0.00021135, + "loss": 3.4127, + "step": 4628 + }, + { + "epoch": 7.4064, + "grad_norm": 0.0, + "learning_rate": 0.00021119999999999996, + "loss": 3.3387, + "step": 4629 + }, + { + "epoch": 7.408, + "grad_norm": 0.0, + "learning_rate": 0.00021104999999999998, + "loss": 3.7805, + "step": 4630 + }, + { + "epoch": 7.4096, + "grad_norm": 0.0, + "learning_rate": 0.00021089999999999998, + "loss": 3.903, + "step": 4631 + }, + { + "epoch": 7.4112, + "grad_norm": 0.0, + "learning_rate": 0.00021074999999999997, + "loss": 3.3901, + "step": 4632 + }, + { + "epoch": 7.4128, + "grad_norm": 0.0, + "learning_rate": 0.00021059999999999997, + "loss": 2.9554, + "step": 4633 + }, + { + "epoch": 7.4144, + "grad_norm": 0.0, + "learning_rate": 0.00021045, + "loss": 3.3087, + "step": 4634 + }, + { + "epoch": 7.416, + "grad_norm": 0.0, + "learning_rate": 0.00021029999999999996, + "loss": 2.9068, + "step": 4635 + }, + { + "epoch": 7.4176, + "grad_norm": 0.0, + "learning_rate": 0.00021014999999999999, + "loss": 2.7432, + "step": 4636 + }, + { + "epoch": 7.4192, + "grad_norm": 0.0, + "learning_rate": 0.00020999999999999998, + "loss": 2.867, + "step": 4637 + }, + { + "epoch": 7.4208, + "grad_norm": 0.0, + "learning_rate": 0.00020984999999999998, + "loss": 3.2016, + "step": 4638 + }, + { + "epoch": 7.4224, + "grad_norm": 0.0, + "learning_rate": 0.00020969999999999997, + "loss": 3.1958, + "step": 4639 + }, + { + "epoch": 7.424, + "grad_norm": 0.0, + "learning_rate": 0.00020955, + "loss": 3.1179, + "step": 4640 + }, + { + "epoch": 7.4256, + "grad_norm": 0.0, + "learning_rate": 0.00020939999999999997, + "loss": 2.5467, + "step": 4641 + }, + { + "epoch": 7.4272, + "grad_norm": 0.0, + "learning_rate": 0.00020925, + "loss": 3.0303, + "step": 4642 + }, + { + "epoch": 7.4288, + "grad_norm": 0.0, + "learning_rate": 0.00020909999999999996, + "loss": 2.5878, + "step": 4643 + }, + { + "epoch": 7.4304, + "grad_norm": 0.0, + "learning_rate": 0.00020894999999999998, + "loss": 3.0959, + "step": 4644 + }, + { + "epoch": 7.432, + "grad_norm": 0.0, + "learning_rate": 0.00020879999999999998, + "loss": 2.7835, + "step": 4645 + }, + { + "epoch": 7.4336, + "grad_norm": 0.0, + "learning_rate": 0.00020864999999999998, + "loss": 3.2442, + "step": 4646 + }, + { + "epoch": 7.4352, + "grad_norm": 0.0, + "learning_rate": 0.00020849999999999997, + "loss": 2.6535, + "step": 4647 + }, + { + "epoch": 7.4368, + "grad_norm": 0.0, + "learning_rate": 0.00020835, + "loss": 2.9818, + "step": 4648 + }, + { + "epoch": 7.4384, + "grad_norm": 0.0, + "learning_rate": 0.00020819999999999996, + "loss": 2.9469, + "step": 4649 + }, + { + "epoch": 7.44, + "grad_norm": 0.0, + "learning_rate": 0.00020805, + "loss": 3.2529, + "step": 4650 + }, + { + "epoch": 7.4416, + "grad_norm": 0.0, + "learning_rate": 0.00020789999999999996, + "loss": 3.6801, + "step": 4651 + }, + { + "epoch": 7.4432, + "grad_norm": 0.0, + "learning_rate": 0.00020774999999999998, + "loss": 2.682, + "step": 4652 + }, + { + "epoch": 7.4448, + "grad_norm": 0.0, + "learning_rate": 0.00020759999999999998, + "loss": 2.9149, + "step": 4653 + }, + { + "epoch": 7.4464, + "grad_norm": 0.0, + "learning_rate": 0.00020744999999999997, + "loss": 3.1497, + "step": 4654 + }, + { + "epoch": 7.448, + "grad_norm": 0.0, + "learning_rate": 0.00020729999999999997, + "loss": 2.9631, + "step": 4655 + }, + { + "epoch": 7.4496, + "grad_norm": 0.0, + "learning_rate": 0.00020715, + "loss": 2.767, + "step": 4656 + }, + { + "epoch": 7.4512, + "grad_norm": 0.0, + "learning_rate": 0.00020699999999999996, + "loss": 3.045, + "step": 4657 + }, + { + "epoch": 7.4528, + "grad_norm": 0.0, + "learning_rate": 0.00020684999999999999, + "loss": 2.5517, + "step": 4658 + }, + { + "epoch": 7.4544, + "grad_norm": 0.0, + "learning_rate": 0.00020669999999999996, + "loss": 3.8156, + "step": 4659 + }, + { + "epoch": 7.456, + "grad_norm": 0.0, + "learning_rate": 0.00020654999999999998, + "loss": 3.2374, + "step": 4660 + }, + { + "epoch": 7.4576, + "grad_norm": 0.0, + "learning_rate": 0.00020639999999999998, + "loss": 2.8912, + "step": 4661 + }, + { + "epoch": 7.4592, + "grad_norm": 0.0, + "learning_rate": 0.00020624999999999997, + "loss": 2.6571, + "step": 4662 + }, + { + "epoch": 7.4608, + "grad_norm": 0.0, + "learning_rate": 0.0002061, + "loss": 2.9825, + "step": 4663 + }, + { + "epoch": 7.4624, + "grad_norm": 0.0, + "learning_rate": 0.00020595, + "loss": 3.2419, + "step": 4664 + }, + { + "epoch": 7.464, + "grad_norm": 0.0, + "learning_rate": 0.0002058, + "loss": 2.3981, + "step": 4665 + }, + { + "epoch": 7.4656, + "grad_norm": 0.0, + "learning_rate": 0.00020564999999999998, + "loss": 2.7962, + "step": 4666 + }, + { + "epoch": 7.4672, + "grad_norm": 0.0, + "learning_rate": 0.0002055, + "loss": 3.2975, + "step": 4667 + }, + { + "epoch": 7.4688, + "grad_norm": 0.0, + "learning_rate": 0.00020534999999999998, + "loss": 3.0376, + "step": 4668 + }, + { + "epoch": 7.4704, + "grad_norm": 0.0, + "learning_rate": 0.0002052, + "loss": 3.1567, + "step": 4669 + }, + { + "epoch": 7.4719999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00020504999999999997, + "loss": 3.283, + "step": 4670 + }, + { + "epoch": 7.4736, + "grad_norm": 0.0, + "learning_rate": 0.0002049, + "loss": 2.2496, + "step": 4671 + }, + { + "epoch": 7.4752, + "grad_norm": 0.0, + "learning_rate": 0.00020475, + "loss": 2.6953, + "step": 4672 + }, + { + "epoch": 7.4768, + "grad_norm": 0.0, + "learning_rate": 0.00020459999999999999, + "loss": 3.5792, + "step": 4673 + }, + { + "epoch": 7.4784, + "grad_norm": 0.0, + "learning_rate": 0.00020444999999999998, + "loss": 3.2485, + "step": 4674 + }, + { + "epoch": 7.48, + "grad_norm": 0.0, + "learning_rate": 0.0002043, + "loss": 3.1137, + "step": 4675 + }, + { + "epoch": 7.4816, + "grad_norm": 0.0, + "learning_rate": 0.00020414999999999997, + "loss": 4.1653, + "step": 4676 + }, + { + "epoch": 7.4832, + "grad_norm": 0.0, + "learning_rate": 0.000204, + "loss": 4.5673, + "step": 4677 + }, + { + "epoch": 7.4848, + "grad_norm": 0.0, + "learning_rate": 0.00020384999999999997, + "loss": 3.773, + "step": 4678 + }, + { + "epoch": 7.4864, + "grad_norm": 0.0, + "learning_rate": 0.0002037, + "loss": 3.8248, + "step": 4679 + }, + { + "epoch": 7.4879999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00020355, + "loss": 3.3055, + "step": 4680 + }, + { + "epoch": 7.4896, + "grad_norm": 0.0, + "learning_rate": 0.00020339999999999998, + "loss": 3.5717, + "step": 4681 + }, + { + "epoch": 7.4912, + "grad_norm": 0.0, + "learning_rate": 0.00020324999999999998, + "loss": 3.0292, + "step": 4682 + }, + { + "epoch": 7.4928, + "grad_norm": 0.0, + "learning_rate": 0.0002031, + "loss": 4.1389, + "step": 4683 + }, + { + "epoch": 7.4944, + "grad_norm": 0.0, + "learning_rate": 0.00020294999999999997, + "loss": 2.9023, + "step": 4684 + }, + { + "epoch": 7.496, + "grad_norm": 0.0, + "learning_rate": 0.0002028, + "loss": 2.8841, + "step": 4685 + }, + { + "epoch": 7.4976, + "grad_norm": 0.0, + "learning_rate": 0.00020264999999999997, + "loss": 3.5379, + "step": 4686 + }, + { + "epoch": 7.4992, + "grad_norm": 0.0, + "learning_rate": 0.0002025, + "loss": 3.1102, + "step": 4687 + }, + { + "epoch": 7.5008, + "grad_norm": 0.0, + "learning_rate": 0.00020234999999999999, + "loss": 2.3866, + "step": 4688 + }, + { + "epoch": 7.5024, + "grad_norm": 0.0, + "learning_rate": 0.0002022, + "loss": 3.1948, + "step": 4689 + }, + { + "epoch": 7.504, + "grad_norm": 0.0, + "learning_rate": 0.00020204999999999998, + "loss": 2.9193, + "step": 4690 + }, + { + "epoch": 7.5056, + "grad_norm": 0.0, + "learning_rate": 0.0002019, + "loss": 2.6375, + "step": 4691 + }, + { + "epoch": 7.5072, + "grad_norm": 0.0, + "learning_rate": 0.00020174999999999997, + "loss": 2.7399, + "step": 4692 + }, + { + "epoch": 7.5088, + "grad_norm": 0.0, + "learning_rate": 0.0002016, + "loss": 2.7869, + "step": 4693 + }, + { + "epoch": 7.5104, + "grad_norm": 0.0, + "learning_rate": 0.00020145, + "loss": 2.7297, + "step": 4694 + }, + { + "epoch": 7.5120000000000005, + "grad_norm": 0.0, + "learning_rate": 0.0002013, + "loss": 3.0658, + "step": 4695 + }, + { + "epoch": 7.5136, + "grad_norm": 0.0, + "learning_rate": 0.00020114999999999998, + "loss": 2.5824, + "step": 4696 + }, + { + "epoch": 7.5152, + "grad_norm": 0.0, + "learning_rate": 0.000201, + "loss": 3.1716, + "step": 4697 + }, + { + "epoch": 7.5168, + "grad_norm": 0.0, + "learning_rate": 0.00020084999999999998, + "loss": 3.4955, + "step": 4698 + }, + { + "epoch": 7.5184, + "grad_norm": 0.0, + "learning_rate": 0.0002007, + "loss": 2.6723, + "step": 4699 + }, + { + "epoch": 7.52, + "grad_norm": 0.0, + "learning_rate": 0.00020054999999999997, + "loss": 3.1623, + "step": 4700 + }, + { + "epoch": 7.5216, + "grad_norm": 0.0, + "learning_rate": 0.0002004, + "loss": 2.3293, + "step": 4701 + }, + { + "epoch": 7.5232, + "grad_norm": 0.0, + "learning_rate": 0.00020025, + "loss": 3.2434, + "step": 4702 + }, + { + "epoch": 7.5248, + "grad_norm": 0.0, + "learning_rate": 0.00020009999999999998, + "loss": 3.1604, + "step": 4703 + }, + { + "epoch": 7.5264, + "grad_norm": 0.0, + "learning_rate": 0.00019994999999999998, + "loss": 2.4726, + "step": 4704 + }, + { + "epoch": 7.5280000000000005, + "grad_norm": 0.0, + "learning_rate": 0.0001998, + "loss": 3.4306, + "step": 4705 + }, + { + "epoch": 7.5296, + "grad_norm": 0.0, + "learning_rate": 0.00019964999999999997, + "loss": 2.6647, + "step": 4706 + }, + { + "epoch": 7.5312, + "grad_norm": 0.0, + "learning_rate": 0.0001995, + "loss": 2.991, + "step": 4707 + }, + { + "epoch": 7.5328, + "grad_norm": 0.0, + "learning_rate": 0.00019934999999999997, + "loss": 3.0528, + "step": 4708 + }, + { + "epoch": 7.5344, + "grad_norm": 0.0, + "learning_rate": 0.0001992, + "loss": 3.1222, + "step": 4709 + }, + { + "epoch": 7.536, + "grad_norm": 0.0, + "learning_rate": 0.00019905, + "loss": 3.3261, + "step": 4710 + }, + { + "epoch": 7.5376, + "grad_norm": 0.0, + "learning_rate": 0.00019889999999999998, + "loss": 2.8421, + "step": 4711 + }, + { + "epoch": 7.5392, + "grad_norm": 0.0, + "learning_rate": 0.00019874999999999998, + "loss": 2.7211, + "step": 4712 + }, + { + "epoch": 7.5408, + "grad_norm": 0.0, + "learning_rate": 0.0001986, + "loss": 2.951, + "step": 4713 + }, + { + "epoch": 7.5424, + "grad_norm": 0.0, + "learning_rate": 0.00019844999999999997, + "loss": 3.3668, + "step": 4714 + }, + { + "epoch": 7.5440000000000005, + "grad_norm": 0.0, + "learning_rate": 0.0001983, + "loss": 2.5635, + "step": 4715 + }, + { + "epoch": 7.5456, + "grad_norm": 0.0, + "learning_rate": 0.00019814999999999996, + "loss": 3.2249, + "step": 4716 + }, + { + "epoch": 7.5472, + "grad_norm": 0.0, + "learning_rate": 0.000198, + "loss": 2.7145, + "step": 4717 + }, + { + "epoch": 7.5488, + "grad_norm": 0.0, + "learning_rate": 0.00019784999999999998, + "loss": 2.9497, + "step": 4718 + }, + { + "epoch": 7.5504, + "grad_norm": 0.0, + "learning_rate": 0.00019769999999999998, + "loss": 3.2384, + "step": 4719 + }, + { + "epoch": 7.552, + "grad_norm": 0.0, + "learning_rate": 0.00019754999999999998, + "loss": 2.9618, + "step": 4720 + }, + { + "epoch": 7.5536, + "grad_norm": 0.0, + "learning_rate": 0.0001974, + "loss": 3.3049, + "step": 4721 + }, + { + "epoch": 7.5552, + "grad_norm": 0.0, + "learning_rate": 0.00019724999999999997, + "loss": 3.2177, + "step": 4722 + }, + { + "epoch": 7.5568, + "grad_norm": 0.0, + "learning_rate": 0.0001971, + "loss": 3.6352, + "step": 4723 + }, + { + "epoch": 7.5584, + "grad_norm": 0.0, + "learning_rate": 0.00019694999999999996, + "loss": 3.0243, + "step": 4724 + }, + { + "epoch": 7.5600000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00019679999999999999, + "loss": 4.1826, + "step": 4725 + }, + { + "epoch": 7.5616, + "grad_norm": 0.0, + "learning_rate": 0.00019664999999999998, + "loss": 4.4972, + "step": 4726 + }, + { + "epoch": 7.5632, + "grad_norm": 0.0, + "learning_rate": 0.00019649999999999998, + "loss": 4.1343, + "step": 4727 + }, + { + "epoch": 7.5648, + "grad_norm": 0.0, + "learning_rate": 0.00019634999999999998, + "loss": 3.8212, + "step": 4728 + }, + { + "epoch": 7.5664, + "grad_norm": 0.0, + "learning_rate": 0.0001962, + "loss": 3.3201, + "step": 4729 + }, + { + "epoch": 7.568, + "grad_norm": 0.0, + "learning_rate": 0.00019604999999999997, + "loss": 3.2459, + "step": 4730 + }, + { + "epoch": 7.5696, + "grad_norm": 0.0, + "learning_rate": 0.0001959, + "loss": 3.7524, + "step": 4731 + }, + { + "epoch": 7.5712, + "grad_norm": 0.0, + "learning_rate": 0.00019574999999999996, + "loss": 3.0224, + "step": 4732 + }, + { + "epoch": 7.5728, + "grad_norm": 0.0, + "learning_rate": 0.00019559999999999998, + "loss": 3.2785, + "step": 4733 + }, + { + "epoch": 7.5744, + "grad_norm": 0.0, + "learning_rate": 0.00019544999999999998, + "loss": 3.0942, + "step": 4734 + }, + { + "epoch": 7.576, + "grad_norm": 0.0, + "learning_rate": 0.00019529999999999998, + "loss": 3.1707, + "step": 4735 + }, + { + "epoch": 7.5776, + "grad_norm": 0.0, + "learning_rate": 0.00019514999999999997, + "loss": 2.686, + "step": 4736 + }, + { + "epoch": 7.5792, + "grad_norm": 0.0, + "learning_rate": 0.000195, + "loss": 2.6039, + "step": 4737 + }, + { + "epoch": 7.5808, + "grad_norm": 0.0, + "learning_rate": 0.00019484999999999997, + "loss": 3.097, + "step": 4738 + }, + { + "epoch": 7.5824, + "grad_norm": 0.0, + "learning_rate": 0.0001947, + "loss": 2.4248, + "step": 4739 + }, + { + "epoch": 7.584, + "grad_norm": 0.0, + "learning_rate": 0.00019454999999999999, + "loss": 2.9075, + "step": 4740 + }, + { + "epoch": 7.5856, + "grad_norm": 0.0, + "learning_rate": 0.00019439999999999998, + "loss": 2.7502, + "step": 4741 + }, + { + "epoch": 7.5872, + "grad_norm": 0.0, + "learning_rate": 0.00019424999999999998, + "loss": 3.0797, + "step": 4742 + }, + { + "epoch": 7.5888, + "grad_norm": 0.0, + "learning_rate": 0.0001941, + "loss": 3.082, + "step": 4743 + }, + { + "epoch": 7.5904, + "grad_norm": 0.0, + "learning_rate": 0.00019394999999999997, + "loss": 3.4958, + "step": 4744 + }, + { + "epoch": 7.592, + "grad_norm": 0.0, + "learning_rate": 0.0001938, + "loss": 2.8035, + "step": 4745 + }, + { + "epoch": 7.5936, + "grad_norm": 0.0, + "learning_rate": 0.00019364999999999996, + "loss": 2.5721, + "step": 4746 + }, + { + "epoch": 7.5952, + "grad_norm": 0.0, + "learning_rate": 0.0001935, + "loss": 2.7212, + "step": 4747 + }, + { + "epoch": 7.5968, + "grad_norm": 0.0, + "learning_rate": 0.00019334999999999998, + "loss": 3.8946, + "step": 4748 + }, + { + "epoch": 7.5984, + "grad_norm": 0.0, + "learning_rate": 0.00019319999999999998, + "loss": 2.6282, + "step": 4749 + }, + { + "epoch": 7.6, + "grad_norm": 0.0, + "learning_rate": 0.00019304999999999998, + "loss": 2.9679, + "step": 4750 + }, + { + "epoch": 7.6016, + "grad_norm": 0.0, + "learning_rate": 0.0001929, + "loss": 2.5958, + "step": 4751 + }, + { + "epoch": 7.6032, + "grad_norm": 0.0, + "learning_rate": 0.00019274999999999997, + "loss": 3.128, + "step": 4752 + }, + { + "epoch": 7.6048, + "grad_norm": 0.0, + "learning_rate": 0.0001926, + "loss": 2.3277, + "step": 4753 + }, + { + "epoch": 7.6064, + "grad_norm": 0.0, + "learning_rate": 0.00019244999999999996, + "loss": 2.5896, + "step": 4754 + }, + { + "epoch": 7.608, + "grad_norm": 0.0, + "learning_rate": 0.00019229999999999999, + "loss": 2.8047, + "step": 4755 + }, + { + "epoch": 7.6096, + "grad_norm": 0.0, + "learning_rate": 0.00019214999999999998, + "loss": 3.0452, + "step": 4756 + }, + { + "epoch": 7.6112, + "grad_norm": 0.0, + "learning_rate": 0.00019199999999999998, + "loss": 2.8145, + "step": 4757 + }, + { + "epoch": 7.6128, + "grad_norm": 0.0, + "learning_rate": 0.00019184999999999997, + "loss": 2.6334, + "step": 4758 + }, + { + "epoch": 7.6144, + "grad_norm": 0.0, + "learning_rate": 0.0001917, + "loss": 2.5752, + "step": 4759 + }, + { + "epoch": 7.616, + "grad_norm": 0.0, + "learning_rate": 0.00019154999999999997, + "loss": 3.1652, + "step": 4760 + }, + { + "epoch": 7.6176, + "grad_norm": 0.0, + "learning_rate": 0.0001914, + "loss": 2.452, + "step": 4761 + }, + { + "epoch": 7.6192, + "grad_norm": 0.0, + "learning_rate": 0.00019124999999999996, + "loss": 2.4447, + "step": 4762 + }, + { + "epoch": 7.6208, + "grad_norm": 0.0, + "learning_rate": 0.00019109999999999998, + "loss": 2.3181, + "step": 4763 + }, + { + "epoch": 7.6224, + "grad_norm": 0.0, + "learning_rate": 0.00019094999999999998, + "loss": 2.6932, + "step": 4764 + }, + { + "epoch": 7.624, + "grad_norm": 0.0, + "learning_rate": 0.00019079999999999998, + "loss": 3.6925, + "step": 4765 + }, + { + "epoch": 7.6256, + "grad_norm": 0.0, + "learning_rate": 0.00019064999999999997, + "loss": 4.25, + "step": 4766 + }, + { + "epoch": 7.6272, + "grad_norm": 0.0, + "learning_rate": 0.0001905, + "loss": 3.0461, + "step": 4767 + }, + { + "epoch": 7.6288, + "grad_norm": 0.0, + "learning_rate": 0.00019034999999999996, + "loss": 3.0939, + "step": 4768 + }, + { + "epoch": 7.6304, + "grad_norm": 0.0, + "learning_rate": 0.0001902, + "loss": 2.8625, + "step": 4769 + }, + { + "epoch": 7.632, + "grad_norm": 0.0, + "learning_rate": 0.00019004999999999996, + "loss": 3.7221, + "step": 4770 + }, + { + "epoch": 7.6336, + "grad_norm": 0.0, + "learning_rate": 0.00018989999999999998, + "loss": 2.636, + "step": 4771 + }, + { + "epoch": 7.6352, + "grad_norm": 0.0, + "learning_rate": 0.00018974999999999998, + "loss": 3.3965, + "step": 4772 + }, + { + "epoch": 7.6368, + "grad_norm": 0.0, + "learning_rate": 0.00018959999999999997, + "loss": 2.7437, + "step": 4773 + }, + { + "epoch": 7.6384, + "grad_norm": 0.0, + "learning_rate": 0.00018944999999999997, + "loss": 2.915, + "step": 4774 + }, + { + "epoch": 7.64, + "grad_norm": NaN, + "learning_rate": 0.00018944999999999997, + "loss": 2.9823, + "step": 4775 + }, + { + "epoch": 7.6416, + "grad_norm": 0.0, + "learning_rate": 0.0001893, + "loss": 3.0832, + "step": 4776 + }, + { + "epoch": 7.6432, + "grad_norm": 0.0, + "learning_rate": 0.00018914999999999996, + "loss": 3.059, + "step": 4777 + }, + { + "epoch": 7.6448, + "grad_norm": 0.0, + "learning_rate": 0.00018899999999999999, + "loss": 2.8947, + "step": 4778 + }, + { + "epoch": 7.6464, + "grad_norm": 0.0, + "learning_rate": 0.00018884999999999996, + "loss": 3.123, + "step": 4779 + }, + { + "epoch": 7.648, + "grad_norm": 0.0, + "learning_rate": 0.00018869999999999998, + "loss": 3.7362, + "step": 4780 + }, + { + "epoch": 7.6495999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00018854999999999998, + "loss": 3.4403, + "step": 4781 + }, + { + "epoch": 7.6512, + "grad_norm": 0.0, + "learning_rate": 0.00018839999999999997, + "loss": 2.7382, + "step": 4782 + }, + { + "epoch": 7.6528, + "grad_norm": 0.0, + "learning_rate": 0.00018824999999999997, + "loss": 3.2175, + "step": 4783 + }, + { + "epoch": 7.6544, + "grad_norm": 0.0, + "learning_rate": 0.0001881, + "loss": 3.1441, + "step": 4784 + }, + { + "epoch": 7.656, + "grad_norm": 0.0, + "learning_rate": 0.00018794999999999996, + "loss": 3.2875, + "step": 4785 + }, + { + "epoch": 7.6576, + "grad_norm": 0.0, + "learning_rate": 0.00018779999999999998, + "loss": 2.4975, + "step": 4786 + }, + { + "epoch": 7.6592, + "grad_norm": 0.0, + "learning_rate": 0.00018764999999999998, + "loss": 3.5703, + "step": 4787 + }, + { + "epoch": 7.6608, + "grad_norm": 0.0, + "learning_rate": 0.00018749999999999998, + "loss": 2.8902, + "step": 4788 + }, + { + "epoch": 7.6624, + "grad_norm": 0.0, + "learning_rate": 0.00018735, + "loss": 3.2099, + "step": 4789 + }, + { + "epoch": 7.664, + "grad_norm": 0.0, + "learning_rate": 0.0001872, + "loss": 2.9238, + "step": 4790 + }, + { + "epoch": 7.6655999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00018705, + "loss": 3.0309, + "step": 4791 + }, + { + "epoch": 7.6672, + "grad_norm": 0.0, + "learning_rate": 0.0001869, + "loss": 3.3599, + "step": 4792 + }, + { + "epoch": 7.6688, + "grad_norm": 0.0, + "learning_rate": 0.00018675, + "loss": 2.6549, + "step": 4793 + }, + { + "epoch": 7.6704, + "grad_norm": 0.0, + "learning_rate": 0.00018659999999999998, + "loss": 3.019, + "step": 4794 + }, + { + "epoch": 7.672, + "grad_norm": 0.0, + "learning_rate": 0.00018645, + "loss": 2.5279, + "step": 4795 + }, + { + "epoch": 7.6736, + "grad_norm": 0.0, + "learning_rate": 0.00018629999999999997, + "loss": 2.6226, + "step": 4796 + }, + { + "epoch": 7.6752, + "grad_norm": 0.0, + "learning_rate": 0.00018615, + "loss": 3.0384, + "step": 4797 + }, + { + "epoch": 7.6768, + "grad_norm": 0.0, + "learning_rate": 0.000186, + "loss": 2.7494, + "step": 4798 + }, + { + "epoch": 7.6784, + "grad_norm": 0.0, + "learning_rate": 0.00018585, + "loss": 2.9542, + "step": 4799 + }, + { + "epoch": 7.68, + "grad_norm": 0.0, + "learning_rate": 0.0001857, + "loss": 2.6497, + "step": 4800 + }, + { + "epoch": 7.6815999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00018555, + "loss": 2.7592, + "step": 4801 + }, + { + "epoch": 7.6832, + "grad_norm": 0.0, + "learning_rate": 0.00018539999999999998, + "loss": 2.4912, + "step": 4802 + }, + { + "epoch": 7.6848, + "grad_norm": 0.0, + "learning_rate": 0.00018525, + "loss": 2.8979, + "step": 4803 + }, + { + "epoch": 7.6864, + "grad_norm": 0.0, + "learning_rate": 0.00018509999999999997, + "loss": 3.2736, + "step": 4804 + }, + { + "epoch": 7.688, + "grad_norm": 0.0, + "learning_rate": 0.00018495, + "loss": 3.156, + "step": 4805 + }, + { + "epoch": 7.6896, + "grad_norm": 0.0, + "learning_rate": 0.0001848, + "loss": 2.873, + "step": 4806 + }, + { + "epoch": 7.6912, + "grad_norm": 0.0, + "learning_rate": 0.00018465, + "loss": 2.6056, + "step": 4807 + }, + { + "epoch": 7.6928, + "grad_norm": 0.0, + "learning_rate": 0.00018449999999999999, + "loss": 2.2569, + "step": 4808 + }, + { + "epoch": 7.6944, + "grad_norm": 0.0, + "learning_rate": 0.00018435, + "loss": 3.3233, + "step": 4809 + }, + { + "epoch": 7.696, + "grad_norm": 0.0, + "learning_rate": 0.00018419999999999998, + "loss": 2.7609, + "step": 4810 + }, + { + "epoch": 7.6975999999999996, + "grad_norm": 0.0, + "learning_rate": 0.00018405, + "loss": 2.8943, + "step": 4811 + }, + { + "epoch": 7.6992, + "grad_norm": 0.0, + "learning_rate": 0.00018389999999999997, + "loss": 2.8554, + "step": 4812 + }, + { + "epoch": 7.7008, + "grad_norm": 0.0, + "learning_rate": 0.00018375, + "loss": 3.2062, + "step": 4813 + }, + { + "epoch": 7.7024, + "grad_norm": 0.0, + "learning_rate": 0.0001836, + "loss": 3.4082, + "step": 4814 + }, + { + "epoch": 7.704, + "grad_norm": 0.0, + "learning_rate": 0.00018345, + "loss": 2.3895, + "step": 4815 + }, + { + "epoch": 7.7056000000000004, + "grad_norm": 0.0, + "learning_rate": 0.00018329999999999998, + "loss": 3.1006, + "step": 4816 + }, + { + "epoch": 7.7072, + "grad_norm": 0.0, + "learning_rate": 0.00018315, + "loss": 2.6601, + "step": 4817 + }, + { + "epoch": 7.7088, + "grad_norm": 0.0, + "learning_rate": 0.00018299999999999998, + "loss": 3.321, + "step": 4818 + }, + { + "epoch": 7.7104, + "grad_norm": 0.0, + "learning_rate": 0.00018285, + "loss": 3.0802, + "step": 4819 + }, + { + "epoch": 7.712, + "grad_norm": 0.0, + "learning_rate": 0.00018269999999999997, + "loss": 2.8503, + "step": 4820 + }, + { + "epoch": 7.7136, + "grad_norm": 0.0, + "learning_rate": 0.00018255, + "loss": 2.7178, + "step": 4821 + }, + { + "epoch": 7.7152, + "grad_norm": 0.0, + "learning_rate": 0.0001824, + "loss": 3.6901, + "step": 4822 + }, + { + "epoch": 7.7168, + "grad_norm": 0.0, + "learning_rate": 0.00018224999999999998, + "loss": 3.2879, + "step": 4823 + }, + { + "epoch": 7.7184, + "grad_norm": 0.0, + "learning_rate": 0.00018209999999999998, + "loss": 2.8951, + "step": 4824 + }, + { + "epoch": 7.72, + "grad_norm": 0.0, + "learning_rate": 0.00018195, + "loss": 3.6328, + "step": 4825 + }, + { + "epoch": 7.7216000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00018179999999999997, + "loss": 4.4299, + "step": 4826 + }, + { + "epoch": 7.7232, + "grad_norm": 0.0, + "learning_rate": 0.00018165, + "loss": 4.0601, + "step": 4827 + }, + { + "epoch": 7.7248, + "grad_norm": 0.0, + "learning_rate": 0.00018149999999999997, + "loss": 3.0605, + "step": 4828 + }, + { + "epoch": 7.7264, + "grad_norm": 0.0, + "learning_rate": 0.00018135, + "loss": 3.3661, + "step": 4829 + }, + { + "epoch": 7.728, + "grad_norm": 0.0, + "learning_rate": 0.00018119999999999999, + "loss": 3.6533, + "step": 4830 + }, + { + "epoch": 7.7296, + "grad_norm": 0.0, + "learning_rate": 0.00018104999999999998, + "loss": 4.6602, + "step": 4831 + }, + { + "epoch": 7.7312, + "grad_norm": 0.0, + "learning_rate": 0.00018089999999999998, + "loss": 3.2386, + "step": 4832 + }, + { + "epoch": 7.7328, + "grad_norm": 0.0, + "learning_rate": 0.00018075, + "loss": 3.4873, + "step": 4833 + }, + { + "epoch": 7.7344, + "grad_norm": 0.0, + "learning_rate": 0.00018059999999999997, + "loss": 3.9829, + "step": 4834 + }, + { + "epoch": 7.736, + "grad_norm": 0.0, + "learning_rate": 0.00018045, + "loss": 4.0437, + "step": 4835 + }, + { + "epoch": 7.7376000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00018029999999999996, + "loss": 2.6534, + "step": 4836 + }, + { + "epoch": 7.7392, + "grad_norm": 0.0, + "learning_rate": 0.00018015, + "loss": 3.3255, + "step": 4837 + }, + { + "epoch": 7.7408, + "grad_norm": 0.0, + "learning_rate": 0.00017999999999999998, + "loss": 2.3529, + "step": 4838 + }, + { + "epoch": 7.7424, + "grad_norm": 0.0, + "learning_rate": 0.00017984999999999998, + "loss": 2.8427, + "step": 4839 + }, + { + "epoch": 7.744, + "grad_norm": 0.0, + "learning_rate": 0.00017969999999999998, + "loss": 3.0223, + "step": 4840 + }, + { + "epoch": 7.7456, + "grad_norm": 0.0, + "learning_rate": 0.00017955, + "loss": 2.8049, + "step": 4841 + }, + { + "epoch": 7.7472, + "grad_norm": 0.0, + "learning_rate": 0.00017939999999999997, + "loss": 2.7341, + "step": 4842 + }, + { + "epoch": 7.7488, + "grad_norm": 0.0, + "learning_rate": 0.00017925, + "loss": 2.4303, + "step": 4843 + }, + { + "epoch": 7.7504, + "grad_norm": 0.0, + "learning_rate": 0.0001791, + "loss": 3.6557, + "step": 4844 + }, + { + "epoch": 7.752, + "grad_norm": 0.0, + "learning_rate": 0.00017894999999999999, + "loss": 2.5856, + "step": 4845 + }, + { + "epoch": 7.7536000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00017879999999999998, + "loss": 2.7199, + "step": 4846 + }, + { + "epoch": 7.7552, + "grad_norm": 0.0, + "learning_rate": 0.00017865, + "loss": 2.2769, + "step": 4847 + }, + { + "epoch": 7.7568, + "grad_norm": 0.0, + "learning_rate": 0.00017849999999999997, + "loss": 3.1895, + "step": 4848 + }, + { + "epoch": 7.7584, + "grad_norm": 0.0, + "learning_rate": 0.00017835, + "loss": 3.0962, + "step": 4849 + }, + { + "epoch": 7.76, + "grad_norm": 0.0, + "learning_rate": 0.00017819999999999997, + "loss": 2.5096, + "step": 4850 + }, + { + "epoch": 7.7616, + "grad_norm": 0.0, + "learning_rate": 0.00017805, + "loss": 3.0981, + "step": 4851 + }, + { + "epoch": 7.7632, + "grad_norm": 0.0, + "learning_rate": 0.0001779, + "loss": 2.7704, + "step": 4852 + }, + { + "epoch": 7.7648, + "grad_norm": 0.0, + "learning_rate": 0.00017774999999999998, + "loss": 2.6671, + "step": 4853 + }, + { + "epoch": 7.7664, + "grad_norm": 0.0, + "learning_rate": 0.00017759999999999998, + "loss": 3.685, + "step": 4854 + }, + { + "epoch": 7.768, + "grad_norm": 0.0, + "learning_rate": 0.00017745, + "loss": 2.9276, + "step": 4855 + }, + { + "epoch": 7.7696, + "grad_norm": 0.0, + "learning_rate": 0.00017729999999999997, + "loss": 2.6199, + "step": 4856 + }, + { + "epoch": 7.7712, + "grad_norm": 0.0, + "learning_rate": 0.00017715, + "loss": 3.2054, + "step": 4857 + }, + { + "epoch": 7.7728, + "grad_norm": 0.0, + "learning_rate": 0.00017699999999999997, + "loss": 2.7798, + "step": 4858 + }, + { + "epoch": 7.7744, + "grad_norm": 0.0, + "learning_rate": 0.00017685, + "loss": 2.1486, + "step": 4859 + }, + { + "epoch": 7.776, + "grad_norm": 0.0, + "learning_rate": 0.00017669999999999999, + "loss": 2.8584, + "step": 4860 + }, + { + "epoch": 7.7776, + "grad_norm": 0.0, + "learning_rate": 0.00017654999999999998, + "loss": 2.9491, + "step": 4861 + }, + { + "epoch": 7.7792, + "grad_norm": 0.0, + "learning_rate": 0.00017639999999999998, + "loss": 3.88, + "step": 4862 + }, + { + "epoch": 7.7808, + "grad_norm": 0.0, + "learning_rate": 0.00017625, + "loss": 3.6535, + "step": 4863 + }, + { + "epoch": 7.7824, + "grad_norm": 0.0, + "learning_rate": 0.00017609999999999997, + "loss": 3.1241, + "step": 4864 + }, + { + "epoch": 7.784, + "grad_norm": 0.0, + "learning_rate": 0.00017595, + "loss": 3.2194, + "step": 4865 + }, + { + "epoch": 7.7856, + "grad_norm": 0.0, + "learning_rate": 0.00017579999999999996, + "loss": 3.383, + "step": 4866 + }, + { + "epoch": 7.7872, + "grad_norm": 0.0, + "learning_rate": 0.00017565, + "loss": 3.2972, + "step": 4867 + }, + { + "epoch": 7.7888, + "grad_norm": 0.0, + "learning_rate": 0.00017549999999999998, + "loss": 2.985, + "step": 4868 + }, + { + "epoch": 7.7904, + "grad_norm": 0.0, + "learning_rate": 0.00017534999999999998, + "loss": 2.5855, + "step": 4869 + }, + { + "epoch": 7.792, + "grad_norm": 0.0, + "learning_rate": 0.00017519999999999998, + "loss": 3.1634, + "step": 4870 + }, + { + "epoch": 7.7936, + "grad_norm": 0.0, + "learning_rate": 0.00017505, + "loss": 3.1311, + "step": 4871 + }, + { + "epoch": 7.7952, + "grad_norm": 0.0, + "learning_rate": 0.00017489999999999997, + "loss": 2.5284, + "step": 4872 + }, + { + "epoch": 7.7968, + "grad_norm": 0.0, + "learning_rate": 0.00017475, + "loss": 3.1961, + "step": 4873 + }, + { + "epoch": 7.7984, + "grad_norm": 0.0, + "learning_rate": 0.00017459999999999996, + "loss": 3.5777, + "step": 4874 + }, + { + "epoch": 7.8, + "grad_norm": 0.0, + "learning_rate": 0.00017444999999999998, + "loss": 3.6377, + "step": 4875 + }, + { + "epoch": 7.8016, + "grad_norm": 0.0, + "learning_rate": 0.00017429999999999998, + "loss": 4.5462, + "step": 4876 + }, + { + "epoch": 7.8032, + "grad_norm": 0.0, + "learning_rate": 0.00017414999999999998, + "loss": 5.1288, + "step": 4877 + }, + { + "epoch": 7.8048, + "grad_norm": 0.0, + "learning_rate": 0.00017399999999999997, + "loss": 3.9374, + "step": 4878 + }, + { + "epoch": 7.8064, + "grad_norm": 0.0, + "learning_rate": 0.00017385, + "loss": 3.9448, + "step": 4879 + }, + { + "epoch": 7.808, + "grad_norm": 0.0, + "learning_rate": 0.00017369999999999997, + "loss": 3.4438, + "step": 4880 + }, + { + "epoch": 7.8096, + "grad_norm": 0.0, + "learning_rate": 0.00017355, + "loss": 3.3757, + "step": 4881 + }, + { + "epoch": 7.8112, + "grad_norm": 0.0, + "learning_rate": 0.00017339999999999996, + "loss": 3.2291, + "step": 4882 + }, + { + "epoch": 7.8128, + "grad_norm": 0.0, + "learning_rate": 0.00017324999999999998, + "loss": 3.4574, + "step": 4883 + }, + { + "epoch": 7.8144, + "grad_norm": 0.0, + "learning_rate": 0.00017309999999999998, + "loss": 3.1308, + "step": 4884 + }, + { + "epoch": 7.816, + "grad_norm": 0.0, + "learning_rate": 0.00017294999999999998, + "loss": 3.5847, + "step": 4885 + }, + { + "epoch": 7.8176, + "grad_norm": 0.0, + "learning_rate": 0.00017279999999999997, + "loss": 3.464, + "step": 4886 + }, + { + "epoch": 7.8192, + "grad_norm": 0.0, + "learning_rate": 0.00017265, + "loss": 2.862, + "step": 4887 + }, + { + "epoch": 7.8208, + "grad_norm": 0.0, + "learning_rate": 0.00017249999999999996, + "loss": 2.6696, + "step": 4888 + }, + { + "epoch": 7.8224, + "grad_norm": 0.0, + "learning_rate": 0.00017235, + "loss": 3.0537, + "step": 4889 + }, + { + "epoch": 7.824, + "grad_norm": 0.0, + "learning_rate": 0.00017219999999999998, + "loss": 3.1748, + "step": 4890 + }, + { + "epoch": 7.8256, + "grad_norm": 0.0, + "learning_rate": 0.00017204999999999998, + "loss": 2.8772, + "step": 4891 + }, + { + "epoch": 7.8272, + "grad_norm": 0.0, + "learning_rate": 0.00017189999999999998, + "loss": 2.8474, + "step": 4892 + }, + { + "epoch": 7.8288, + "grad_norm": 0.0, + "learning_rate": 0.00017175, + "loss": 2.9608, + "step": 4893 + }, + { + "epoch": 7.8304, + "grad_norm": 0.0, + "learning_rate": 0.00017159999999999997, + "loss": 3.2269, + "step": 4894 + }, + { + "epoch": 7.832, + "grad_norm": 0.0, + "learning_rate": 0.00017145, + "loss": 2.2713, + "step": 4895 + }, + { + "epoch": 7.8336, + "grad_norm": 0.0, + "learning_rate": 0.00017129999999999996, + "loss": 2.3742, + "step": 4896 + }, + { + "epoch": 7.8352, + "grad_norm": 0.0, + "learning_rate": 0.00017114999999999999, + "loss": 2.4596, + "step": 4897 + }, + { + "epoch": 7.8368, + "grad_norm": 0.0, + "learning_rate": 0.00017099999999999998, + "loss": 4.0418, + "step": 4898 + }, + { + "epoch": 7.8384, + "grad_norm": 0.0, + "learning_rate": 0.00017084999999999998, + "loss": 2.816, + "step": 4899 + }, + { + "epoch": 7.84, + "grad_norm": 0.0, + "learning_rate": 0.00017069999999999998, + "loss": 3.6281, + "step": 4900 + }, + { + "epoch": 7.8416, + "grad_norm": 0.0, + "learning_rate": 0.00017055, + "loss": 2.9427, + "step": 4901 + }, + { + "epoch": 7.8431999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00017039999999999997, + "loss": 3.0255, + "step": 4902 + }, + { + "epoch": 7.8448, + "grad_norm": 0.0, + "learning_rate": 0.00017025, + "loss": 2.7146, + "step": 4903 + }, + { + "epoch": 7.8464, + "grad_norm": 0.0, + "learning_rate": 0.00017009999999999996, + "loss": 2.7954, + "step": 4904 + }, + { + "epoch": 7.848, + "grad_norm": 0.0, + "learning_rate": 0.00016994999999999998, + "loss": 2.593, + "step": 4905 + }, + { + "epoch": 7.8496, + "grad_norm": 0.0, + "learning_rate": 0.00016979999999999998, + "loss": 2.5321, + "step": 4906 + }, + { + "epoch": 7.8512, + "grad_norm": 0.0, + "learning_rate": 0.00016964999999999998, + "loss": 2.776, + "step": 4907 + }, + { + "epoch": 7.8528, + "grad_norm": 0.0, + "learning_rate": 0.00016949999999999997, + "loss": 2.554, + "step": 4908 + }, + { + "epoch": 7.8544, + "grad_norm": 0.0, + "learning_rate": 0.00016935, + "loss": 2.8608, + "step": 4909 + }, + { + "epoch": 7.856, + "grad_norm": 0.0, + "learning_rate": 0.00016919999999999997, + "loss": 3.0325, + "step": 4910 + }, + { + "epoch": 7.8576, + "grad_norm": 0.0, + "learning_rate": 0.00016905, + "loss": 2.5525, + "step": 4911 + }, + { + "epoch": 7.8591999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00016889999999999996, + "loss": 2.5041, + "step": 4912 + }, + { + "epoch": 7.8608, + "grad_norm": 0.0, + "learning_rate": 0.00016874999999999998, + "loss": 2.4444, + "step": 4913 + }, + { + "epoch": 7.8624, + "grad_norm": 0.0, + "learning_rate": 0.0001686, + "loss": 2.978, + "step": 4914 + }, + { + "epoch": 7.864, + "grad_norm": 0.0, + "learning_rate": 0.00016844999999999997, + "loss": 2.5528, + "step": 4915 + }, + { + "epoch": 7.8656, + "grad_norm": 0.0, + "learning_rate": 0.0001683, + "loss": 3.7146, + "step": 4916 + }, + { + "epoch": 7.8672, + "grad_norm": NaN, + "learning_rate": 0.0001683, + "loss": 2.6056, + "step": 4917 + }, + { + "epoch": 7.8688, + "grad_norm": 0.0, + "learning_rate": 0.00016815, + "loss": 3.3739, + "step": 4918 + }, + { + "epoch": 7.8704, + "grad_norm": 0.0, + "learning_rate": 0.000168, + "loss": 3.0553, + "step": 4919 + }, + { + "epoch": 7.872, + "grad_norm": 0.0, + "learning_rate": 0.00016785, + "loss": 2.3268, + "step": 4920 + }, + { + "epoch": 7.8736, + "grad_norm": 0.0, + "learning_rate": 0.0001677, + "loss": 2.5411, + "step": 4921 + }, + { + "epoch": 7.8751999999999995, + "grad_norm": 0.0, + "learning_rate": 0.00016754999999999998, + "loss": 2.4827, + "step": 4922 + }, + { + "epoch": 7.8768, + "grad_norm": 0.0, + "learning_rate": 0.0001674, + "loss": 3.9451, + "step": 4923 + }, + { + "epoch": 7.8784, + "grad_norm": 0.0, + "learning_rate": 0.00016724999999999997, + "loss": 2.621, + "step": 4924 + }, + { + "epoch": 7.88, + "grad_norm": NaN, + "learning_rate": 0.00016724999999999997, + "loss": 2.7644, + "step": 4925 + }, + { + "epoch": 7.8816, + "grad_norm": 0.0, + "learning_rate": 0.0001671, + "loss": 4.6617, + "step": 4926 + }, + { + "epoch": 7.8832, + "grad_norm": 0.0, + "learning_rate": 0.00016695, + "loss": 3.873, + "step": 4927 + }, + { + "epoch": 7.8848, + "grad_norm": 0.0, + "learning_rate": 0.0001668, + "loss": 3.3357, + "step": 4928 + }, + { + "epoch": 7.8864, + "grad_norm": 0.0, + "learning_rate": 0.00016664999999999998, + "loss": 4.252, + "step": 4929 + }, + { + "epoch": 7.888, + "grad_norm": 0.0, + "learning_rate": 0.0001665, + "loss": 3.3949, + "step": 4930 + }, + { + "epoch": 7.8896, + "grad_norm": 0.0, + "learning_rate": 0.00016634999999999998, + "loss": 2.5891, + "step": 4931 + }, + { + "epoch": 7.8911999999999995, + "grad_norm": 0.0, + "learning_rate": 0.0001662, + "loss": 3.537, + "step": 4932 + }, + { + "epoch": 7.8928, + "grad_norm": 0.0, + "learning_rate": 0.00016604999999999997, + "loss": 3.3417, + "step": 4933 + }, + { + "epoch": 7.8944, + "grad_norm": 0.0, + "learning_rate": 0.0001659, + "loss": 2.878, + "step": 4934 + }, + { + "epoch": 7.896, + "grad_norm": 0.0, + "learning_rate": 0.00016575, + "loss": 3.7171, + "step": 4935 + }, + { + "epoch": 7.8976, + "grad_norm": 0.0, + "learning_rate": 0.0001656, + "loss": 2.6422, + "step": 4936 + }, + { + "epoch": 7.8992, + "grad_norm": 0.0, + "learning_rate": 0.00016544999999999998, + "loss": 3.0546, + "step": 4937 + }, + { + "epoch": 7.9008, + "grad_norm": 0.0, + "learning_rate": 0.0001653, + "loss": 2.7797, + "step": 4938 + }, + { + "epoch": 7.9024, + "grad_norm": 0.0, + "learning_rate": 0.00016514999999999998, + "loss": 2.5424, + "step": 4939 + }, + { + "epoch": 7.904, + "grad_norm": 0.0, + "learning_rate": 0.000165, + "loss": 2.8186, + "step": 4940 + }, + { + "epoch": 7.9056, + "grad_norm": 0.0, + "learning_rate": 0.00016485, + "loss": 2.8034, + "step": 4941 + }, + { + "epoch": 7.9072, + "grad_norm": 0.0, + "learning_rate": 0.0001647, + "loss": 3.2561, + "step": 4942 + }, + { + "epoch": 7.9088, + "grad_norm": 0.0, + "learning_rate": 0.00016455, + "loss": 2.6736, + "step": 4943 + }, + { + "epoch": 7.9104, + "grad_norm": 0.0, + "learning_rate": 0.0001644, + "loss": 2.5695, + "step": 4944 + }, + { + "epoch": 7.912, + "grad_norm": 0.0, + "learning_rate": 0.00016424999999999998, + "loss": 2.9508, + "step": 4945 + }, + { + "epoch": 7.9136, + "grad_norm": 0.0, + "learning_rate": 0.0001641, + "loss": 3.2022, + "step": 4946 + }, + { + "epoch": 7.9152000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00016394999999999997, + "loss": 2.4282, + "step": 4947 + }, + { + "epoch": 7.9168, + "grad_norm": 0.0, + "learning_rate": 0.0001638, + "loss": 2.4215, + "step": 4948 + }, + { + "epoch": 7.9184, + "grad_norm": 0.0, + "learning_rate": 0.00016365, + "loss": 3.4899, + "step": 4949 + }, + { + "epoch": 7.92, + "grad_norm": 0.0, + "learning_rate": 0.0001635, + "loss": 2.93, + "step": 4950 + }, + { + "epoch": 7.9216, + "grad_norm": 0.0, + "learning_rate": 0.00016334999999999999, + "loss": 2.541, + "step": 4951 + }, + { + "epoch": 7.9232, + "grad_norm": 0.0, + "learning_rate": 0.0001632, + "loss": 2.9613, + "step": 4952 + }, + { + "epoch": 7.9248, + "grad_norm": 0.0, + "learning_rate": 0.00016304999999999998, + "loss": 3.1947, + "step": 4953 + }, + { + "epoch": 7.9264, + "grad_norm": 0.0, + "learning_rate": 0.0001629, + "loss": 2.3242, + "step": 4954 + }, + { + "epoch": 7.928, + "grad_norm": 0.0, + "learning_rate": 0.00016274999999999997, + "loss": 2.7264, + "step": 4955 + }, + { + "epoch": 7.9296, + "grad_norm": 0.0, + "learning_rate": 0.0001626, + "loss": 3.0834, + "step": 4956 + }, + { + "epoch": 7.9312000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00016245, + "loss": 2.9288, + "step": 4957 + }, + { + "epoch": 7.9328, + "grad_norm": 0.0, + "learning_rate": 0.0001623, + "loss": 3.3546, + "step": 4958 + }, + { + "epoch": 7.9344, + "grad_norm": 0.0, + "learning_rate": 0.00016214999999999998, + "loss": 2.814, + "step": 4959 + }, + { + "epoch": 7.936, + "grad_norm": 0.0, + "learning_rate": 0.000162, + "loss": 3.0695, + "step": 4960 + }, + { + "epoch": 7.9376, + "grad_norm": 0.0, + "learning_rate": 0.00016184999999999998, + "loss": 2.59, + "step": 4961 + }, + { + "epoch": 7.9392, + "grad_norm": 0.0, + "learning_rate": 0.0001617, + "loss": 3.9635, + "step": 4962 + }, + { + "epoch": 7.9408, + "grad_norm": 0.0, + "learning_rate": 0.00016154999999999997, + "loss": 2.8278, + "step": 4963 + }, + { + "epoch": 7.9424, + "grad_norm": 0.0, + "learning_rate": 0.0001614, + "loss": 2.9744, + "step": 4964 + }, + { + "epoch": 7.944, + "grad_norm": 0.0, + "learning_rate": 0.00016125, + "loss": 2.4744, + "step": 4965 + }, + { + "epoch": 7.9456, + "grad_norm": 0.0, + "learning_rate": 0.00016109999999999999, + "loss": 2.4728, + "step": 4966 + }, + { + "epoch": 7.9472000000000005, + "grad_norm": 0.0, + "learning_rate": 0.00016094999999999998, + "loss": 2.719, + "step": 4967 + }, + { + "epoch": 7.9488, + "grad_norm": 0.0, + "learning_rate": 0.0001608, + "loss": 2.5483, + "step": 4968 + }, + { + "epoch": 7.9504, + "grad_norm": 0.0, + "learning_rate": 0.00016064999999999997, + "loss": 3.4515, + "step": 4969 + }, + { + "epoch": 7.952, + "grad_norm": 0.0, + "learning_rate": 0.0001605, + "loss": 3.9347, + "step": 4970 + }, + { + "epoch": 7.9536, + "grad_norm": 0.0, + "learning_rate": 0.00016034999999999997, + "loss": 2.8573, + "step": 4971 + }, + { + "epoch": 7.9552, + "grad_norm": 0.0, + "learning_rate": 0.0001602, + "loss": 3.3291, + "step": 4972 + }, + { + "epoch": 7.9568, + "grad_norm": 0.0, + "learning_rate": 0.00016005, + "loss": 3.4811, + "step": 4973 + }, + { + "epoch": 7.9584, + "grad_norm": 0.0, + "learning_rate": 0.00015989999999999998, + "loss": 3.1149, + "step": 4974 + }, + { + "epoch": 7.96, + "grad_norm": 0.0, + "learning_rate": 0.00015974999999999998, + "loss": 3.6624, + "step": 4975 + }, + { + "epoch": 7.9616, + "grad_norm": 0.0, + "learning_rate": 0.0001596, + "loss": 4.6154, + "step": 4976 + }, + { + "epoch": 7.9632, + "grad_norm": 0.0, + "learning_rate": 0.00015944999999999997, + "loss": 3.8381, + "step": 4977 + }, + { + "epoch": 7.9648, + "grad_norm": 0.0, + "learning_rate": 0.0001593, + "loss": 3.4302, + "step": 4978 + }, + { + "epoch": 7.9664, + "grad_norm": 0.0, + "learning_rate": 0.00015914999999999997, + "loss": 2.838, + "step": 4979 + }, + { + "epoch": 7.968, + "grad_norm": 0.0, + "learning_rate": 0.000159, + "loss": 3.2424, + "step": 4980 + }, + { + "epoch": 7.9696, + "grad_norm": 0.0, + "learning_rate": 0.00015884999999999999, + "loss": 3.268, + "step": 4981 + }, + { + "epoch": 7.9712, + "grad_norm": 0.0, + "learning_rate": 0.00015869999999999998, + "loss": 2.5788, + "step": 4982 + }, + { + "epoch": 7.9728, + "grad_norm": 0.0, + "learning_rate": 0.00015854999999999998, + "loss": 3.1136, + "step": 4983 + }, + { + "epoch": 7.9744, + "grad_norm": 0.0, + "learning_rate": 0.0001584, + "loss": 3.21, + "step": 4984 + }, + { + "epoch": 7.976, + "grad_norm": 0.0, + "learning_rate": 0.00015824999999999997, + "loss": 3.0565, + "step": 4985 + }, + { + "epoch": 7.9776, + "grad_norm": 0.0, + "learning_rate": 0.0001581, + "loss": 2.467, + "step": 4986 + }, + { + "epoch": 7.9792, + "grad_norm": 0.0, + "learning_rate": 0.00015794999999999996, + "loss": 2.4044, + "step": 4987 + }, + { + "epoch": 7.9808, + "grad_norm": 0.0, + "learning_rate": 0.0001578, + "loss": 2.5883, + "step": 4988 + }, + { + "epoch": 7.9824, + "grad_norm": 0.0, + "learning_rate": 0.00015764999999999998, + "loss": 2.5788, + "step": 4989 + }, + { + "epoch": 7.984, + "grad_norm": 0.0, + "learning_rate": 0.00015749999999999998, + "loss": 3.0362, + "step": 4990 + }, + { + "epoch": 7.9856, + "grad_norm": 0.0, + "learning_rate": 0.00015734999999999998, + "loss": 2.5262, + "step": 4991 + }, + { + "epoch": 7.9872, + "grad_norm": 0.0, + "learning_rate": 0.0001572, + "loss": 2.9619, + "step": 4992 + }, + { + "epoch": 7.9888, + "grad_norm": 0.0, + "learning_rate": 0.00015704999999999997, + "loss": 4.0168, + "step": 4993 + }, + { + "epoch": 7.9904, + "grad_norm": 0.0, + "learning_rate": 0.0001569, + "loss": 2.4526, + "step": 4994 + }, + { + "epoch": 7.992, + "grad_norm": 0.0, + "learning_rate": 0.00015675, + "loss": 3.016, + "step": 4995 + }, + { + "epoch": 7.9936, + "grad_norm": 0.0, + "learning_rate": 0.00015659999999999998, + "loss": 2.4731, + "step": 4996 + }, + { + "epoch": 7.9952, + "grad_norm": 0.0, + "learning_rate": 0.00015644999999999998, + "loss": 2.3664, + "step": 4997 + }, + { + "epoch": 7.9968, + "grad_norm": 0.0, + "learning_rate": 0.0001563, + "loss": 2.4883, + "step": 4998 + }, + { + "epoch": 7.9984, + "grad_norm": 0.0, + "learning_rate": 0.00015614999999999997, + "loss": 2.6371, + "step": 4999 + }, + { + "epoch": 8.0, + "grad_norm": 0.0, + "learning_rate": 0.000156, + "loss": 3.4098, + "step": 5000 + }, + { + "epoch": 8.0, + "eval_cer": 0.5195918204647826, + "eval_loss": 3.2885050773620605, + "eval_runtime": 161.1911, + "eval_samples_per_second": 19.455, + "eval_steps_per_second": 1.216, + "eval_wer": 0.8050150243498083, + "step": 5000 + }, + { + "epoch": 8.0016, + "grad_norm": 0.0, + "learning_rate": 0.00015584999999999997, + "loss": 4.2925, + "step": 5001 + }, + { + "epoch": 8.0032, + "grad_norm": 0.0, + "learning_rate": 0.0001557, + "loss": 4.3188, + "step": 5002 + }, + { + "epoch": 8.0048, + "grad_norm": 0.0, + "learning_rate": 0.00015554999999999999, + "loss": 3.3977, + "step": 5003 + }, + { + "epoch": 8.0064, + "grad_norm": 0.0, + "learning_rate": 0.00015539999999999998, + "loss": 3.9065, + "step": 5004 + }, + { + "epoch": 8.008, + "grad_norm": 0.0, + "learning_rate": 0.00015524999999999998, + "loss": 2.8307, + "step": 5005 + }, + { + "epoch": 8.0096, + "grad_norm": 0.0, + "learning_rate": 0.0001551, + "loss": 3.3355, + "step": 5006 + }, + { + "epoch": 8.0112, + "grad_norm": 0.0, + "learning_rate": 0.00015494999999999997, + "loss": 3.7054, + "step": 5007 + }, + { + "epoch": 8.0128, + "grad_norm": 0.0, + "learning_rate": 0.0001548, + "loss": 3.4316, + "step": 5008 + }, + { + "epoch": 8.0144, + "grad_norm": 0.0, + "learning_rate": 0.00015464999999999996, + "loss": 2.7287, + "step": 5009 + }, + { + "epoch": 8.016, + "grad_norm": 0.0, + "learning_rate": 0.0001545, + "loss": 2.9146, + "step": 5010 + }, + { + "epoch": 8.0176, + "grad_norm": 0.0, + "learning_rate": 0.00015434999999999998, + "loss": 3.566, + "step": 5011 + }, + { + "epoch": 8.0192, + "grad_norm": 0.0, + "learning_rate": 0.00015419999999999998, + "loss": 3.9366, + "step": 5012 + }, + { + "epoch": 8.0208, + "grad_norm": 0.0, + "learning_rate": 0.00015404999999999998, + "loss": 2.746, + "step": 5013 + }, + { + "epoch": 8.0224, + "grad_norm": 0.0, + "learning_rate": 0.0001539, + "loss": 2.7101, + "step": 5014 + }, + { + "epoch": 8.024, + "grad_norm": 0.0, + "learning_rate": 0.00015374999999999997, + "loss": 3.2882, + "step": 5015 + }, + { + "epoch": 8.0256, + "grad_norm": 0.0, + "learning_rate": 0.0001536, + "loss": 2.9917, + "step": 5016 + }, + { + "epoch": 8.0272, + "grad_norm": 0.0, + "learning_rate": 0.00015344999999999996, + "loss": 2.8916, + "step": 5017 + }, + { + "epoch": 8.0288, + "grad_norm": 0.0, + "learning_rate": 0.00015329999999999999, + "loss": 2.4952, + "step": 5018 + }, + { + "epoch": 8.0304, + "grad_norm": 0.0, + "learning_rate": 0.00015314999999999998, + "loss": 2.9952, + "step": 5019 + }, + { + "epoch": 8.032, + "grad_norm": 0.0, + "learning_rate": 0.00015299999999999998, + "loss": 2.9704, + "step": 5020 + }, + { + "epoch": 8.0336, + "grad_norm": 0.0, + "learning_rate": 0.00015284999999999997, + "loss": 2.6999, + "step": 5021 + }, + { + "epoch": 8.0352, + "grad_norm": 0.0, + "learning_rate": 0.0001527, + "loss": 2.2176, + "step": 5022 + }, + { + "epoch": 8.0368, + "grad_norm": 0.0, + "learning_rate": 0.00015254999999999997, + "loss": 2.576, + "step": 5023 + }, + { + "epoch": 8.0384, + "grad_norm": 0.0, + "learning_rate": 0.0001524, + "loss": 2.979, + "step": 5024 + }, + { + "epoch": 8.04, + "grad_norm": 0.0, + "learning_rate": 0.00015224999999999996, + "loss": 2.9045, + "step": 5025 + }, + { + "epoch": 8.0416, + "grad_norm": 0.0, + "learning_rate": 0.00015209999999999998, + "loss": 2.8305, + "step": 5026 + }, + { + "epoch": 8.0432, + "grad_norm": 0.0, + "learning_rate": 0.00015194999999999998, + "loss": 2.6715, + "step": 5027 + }, + { + "epoch": 8.0448, + "grad_norm": 0.0, + "learning_rate": 0.00015179999999999998, + "loss": 2.3777, + "step": 5028 + }, + { + "epoch": 8.0464, + "grad_norm": 0.0, + "learning_rate": 0.00015164999999999997, + "loss": 2.6145, + "step": 5029 + }, + { + "epoch": 8.048, + "grad_norm": 0.0, + "learning_rate": 0.0001515, + "loss": 2.9689, + "step": 5030 + }, + { + "epoch": 8.0496, + "grad_norm": 0.0, + "learning_rate": 0.00015134999999999997, + "loss": 2.6881, + "step": 5031 + }, + { + "epoch": 8.0512, + "grad_norm": 0.0, + "learning_rate": 0.0001512, + "loss": 2.4856, + "step": 5032 + }, + { + "epoch": 8.0528, + "grad_norm": 0.0, + "learning_rate": 0.00015104999999999996, + "loss": 2.4703, + "step": 5033 + }, + { + "epoch": 8.0544, + "grad_norm": 0.0, + "learning_rate": 0.00015089999999999998, + "loss": 3.364, + "step": 5034 + }, + { + "epoch": 8.056, + "grad_norm": 0.0, + "learning_rate": 0.00015074999999999998, + "loss": 2.7439, + "step": 5035 + }, + { + "epoch": 8.0576, + "grad_norm": 0.0, + "learning_rate": 0.00015059999999999997, + "loss": 2.6949, + "step": 5036 + }, + { + "epoch": 8.0592, + "grad_norm": 0.0, + "learning_rate": 0.00015044999999999997, + "loss": 3.5367, + "step": 5037 + }, + { + "epoch": 8.0608, + "grad_norm": 0.0, + "learning_rate": 0.0001503, + "loss": 2.4536, + "step": 5038 + }, + { + "epoch": 8.0624, + "grad_norm": 0.0, + "learning_rate": 0.00015014999999999996, + "loss": 2.1861, + "step": 5039 + }, + { + "epoch": 8.064, + "grad_norm": 0.0, + "learning_rate": 0.00015, + "loss": 3.0478, + "step": 5040 + }, + { + "epoch": 8.0656, + "grad_norm": 0.0, + "learning_rate": 0.00014984999999999998, + "loss": 2.8823, + "step": 5041 + }, + { + "epoch": 8.0672, + "grad_norm": 0.0, + "learning_rate": 0.00014969999999999998, + "loss": 3.1218, + "step": 5042 + }, + { + "epoch": 8.0688, + "grad_norm": 0.0, + "learning_rate": 0.00014954999999999998, + "loss": 2.5482, + "step": 5043 + }, + { + "epoch": 8.0704, + "grad_norm": 0.0, + "learning_rate": 0.0001494, + "loss": 3.969, + "step": 5044 + }, + { + "epoch": 8.072, + "grad_norm": 0.0, + "learning_rate": 0.00014925, + "loss": 2.7562, + "step": 5045 + }, + { + "epoch": 8.0736, + "grad_norm": 0.0, + "learning_rate": 0.0001491, + "loss": 3.4103, + "step": 5046 + }, + { + "epoch": 8.0752, + "grad_norm": 0.0, + "learning_rate": 0.00014895, + "loss": 4.3819, + "step": 5047 + }, + { + "epoch": 8.0768, + "grad_norm": 0.0, + "learning_rate": 0.00014879999999999998, + "loss": 2.7046, + "step": 5048 + }, + { + "epoch": 8.0784, + "grad_norm": 0.0, + "learning_rate": 0.00014864999999999998, + "loss": 4.2045, + "step": 5049 + }, + { + "epoch": 8.08, + "grad_norm": 0.0, + "learning_rate": 0.00014849999999999998, + "loss": 4.6815, + "step": 5050 + }, + { + "epoch": 8.0816, + "grad_norm": 0.0, + "learning_rate": 0.00014834999999999997, + "loss": 3.4338, + "step": 5051 + }, + { + "epoch": 8.0832, + "grad_norm": 0.0, + "learning_rate": 0.0001482, + "loss": 4.3434, + "step": 5052 + }, + { + "epoch": 8.0848, + "grad_norm": 0.0, + "learning_rate": 0.00014805, + "loss": 3.0588, + "step": 5053 + }, + { + "epoch": 8.0864, + "grad_norm": 0.0, + "learning_rate": 0.0001479, + "loss": 3.5971, + "step": 5054 + }, + { + "epoch": 8.088, + "grad_norm": 0.0, + "learning_rate": 0.00014774999999999999, + "loss": 3.661, + "step": 5055 + }, + { + "epoch": 8.0896, + "grad_norm": 0.0, + "learning_rate": 0.00014759999999999998, + "loss": 4.0386, + "step": 5056 + }, + { + "epoch": 8.0912, + "grad_norm": 0.0, + "learning_rate": 0.00014744999999999998, + "loss": 3.4621, + "step": 5057 + }, + { + "epoch": 8.0928, + "grad_norm": 0.0, + "learning_rate": 0.00014729999999999998, + "loss": 3.5794, + "step": 5058 + }, + { + "epoch": 8.0944, + "grad_norm": 0.0, + "learning_rate": 0.00014714999999999997, + "loss": 2.784, + "step": 5059 + }, + { + "epoch": 8.096, + "grad_norm": 0.0, + "learning_rate": 0.000147, + "loss": 2.583, + "step": 5060 + }, + { + "epoch": 8.0976, + "grad_norm": 0.0, + "learning_rate": 0.00014685, + "loss": 2.8943, + "step": 5061 + }, + { + "epoch": 8.0992, + "grad_norm": 0.0, + "learning_rate": 0.0001467, + "loss": 2.8541, + "step": 5062 + }, + { + "epoch": 8.1008, + "grad_norm": 0.0, + "learning_rate": 0.00014654999999999998, + "loss": 2.8582, + "step": 5063 + }, + { + "epoch": 8.1024, + "grad_norm": 0.0, + "learning_rate": 0.00014639999999999998, + "loss": 3.1044, + "step": 5064 + }, + { + "epoch": 8.104, + "grad_norm": 0.0, + "learning_rate": 0.00014624999999999998, + "loss": 3.0776, + "step": 5065 + }, + { + "epoch": 8.1056, + "grad_norm": 0.0, + "learning_rate": 0.00014609999999999997, + "loss": 3.434, + "step": 5066 + }, + { + "epoch": 8.1072, + "grad_norm": 0.0, + "learning_rate": 0.00014595, + "loss": 3.0661, + "step": 5067 + }, + { + "epoch": 8.1088, + "grad_norm": 0.0, + "learning_rate": 0.0001458, + "loss": 3.0772, + "step": 5068 + }, + { + "epoch": 8.1104, + "grad_norm": 0.0, + "learning_rate": 0.00014565, + "loss": 3.5264, + "step": 5069 + }, + { + "epoch": 8.112, + "grad_norm": 0.0, + "learning_rate": 0.00014549999999999999, + "loss": 2.9278, + "step": 5070 + }, + { + "epoch": 8.1136, + "grad_norm": 0.0, + "learning_rate": 0.00014534999999999998, + "loss": 2.7835, + "step": 5071 + }, + { + "epoch": 8.1152, + "grad_norm": 0.0, + "learning_rate": 0.00014519999999999998, + "loss": 2.7639, + "step": 5072 + }, + { + "epoch": 8.1168, + "grad_norm": 0.0, + "learning_rate": 0.00014504999999999997, + "loss": 2.7795, + "step": 5073 + }, + { + "epoch": 8.1184, + "grad_norm": 0.0, + "learning_rate": 0.00014489999999999997, + "loss": 2.9886, + "step": 5074 + }, + { + "epoch": 8.12, + "grad_norm": 0.0, + "learning_rate": 0.00014475, + "loss": 2.7629, + "step": 5075 + }, + { + "epoch": 8.1216, + "grad_norm": 0.0, + "learning_rate": 0.0001446, + "loss": 2.7788, + "step": 5076 + }, + { + "epoch": 8.1232, + "grad_norm": 0.0, + "learning_rate": 0.00014445, + "loss": 3.0377, + "step": 5077 + }, + { + "epoch": 8.1248, + "grad_norm": 0.0, + "learning_rate": 0.00014429999999999998, + "loss": 2.5952, + "step": 5078 + }, + { + "epoch": 8.1264, + "grad_norm": 0.0, + "learning_rate": 0.00014414999999999998, + "loss": 2.4114, + "step": 5079 + }, + { + "epoch": 8.128, + "grad_norm": 0.0, + "learning_rate": 0.00014399999999999998, + "loss": 2.8515, + "step": 5080 + }, + { + "epoch": 8.1296, + "grad_norm": 0.0, + "learning_rate": 0.00014384999999999997, + "loss": 2.9155, + "step": 5081 + }, + { + "epoch": 8.1312, + "grad_norm": 0.0, + "learning_rate": 0.00014369999999999997, + "loss": 3.2422, + "step": 5082 + }, + { + "epoch": 8.1328, + "grad_norm": 0.0, + "learning_rate": 0.00014355, + "loss": 2.7588, + "step": 5083 + }, + { + "epoch": 8.1344, + "grad_norm": 0.0, + "learning_rate": 0.0001434, + "loss": 2.8192, + "step": 5084 + }, + { + "epoch": 8.136, + "grad_norm": 0.0, + "learning_rate": 0.00014324999999999999, + "loss": 2.9131, + "step": 5085 + }, + { + "epoch": 8.1376, + "grad_norm": 0.0, + "learning_rate": 0.00014309999999999998, + "loss": 2.2079, + "step": 5086 + }, + { + "epoch": 8.1392, + "grad_norm": 0.0, + "learning_rate": 0.00014294999999999998, + "loss": 3.2466, + "step": 5087 + }, + { + "epoch": 8.1408, + "grad_norm": 0.0, + "learning_rate": 0.00014279999999999997, + "loss": 2.9861, + "step": 5088 + }, + { + "epoch": 8.1424, + "grad_norm": 0.0, + "learning_rate": 0.00014264999999999997, + "loss": 2.6862, + "step": 5089 + }, + { + "epoch": 8.144, + "grad_norm": 0.0, + "learning_rate": 0.0001425, + "loss": 2.6454, + "step": 5090 + }, + { + "epoch": 8.1456, + "grad_norm": 0.0, + "learning_rate": 0.00014235, + "loss": 2.9265, + "step": 5091 + }, + { + "epoch": 8.1472, + "grad_norm": 0.0, + "learning_rate": 0.0001422, + "loss": 3.5343, + "step": 5092 + }, + { + "epoch": 8.1488, + "grad_norm": 0.0, + "learning_rate": 0.00014204999999999998, + "loss": 2.8883, + "step": 5093 + }, + { + "epoch": 8.1504, + "grad_norm": 0.0, + "learning_rate": 0.00014189999999999998, + "loss": 3.3708, + "step": 5094 + }, + { + "epoch": 8.152, + "grad_norm": 0.0, + "learning_rate": 0.00014174999999999998, + "loss": 3.6141, + "step": 5095 + }, + { + "epoch": 8.1536, + "grad_norm": 0.0, + "learning_rate": 0.00014159999999999997, + "loss": 2.6007, + "step": 5096 + }, + { + "epoch": 8.1552, + "grad_norm": 0.0, + "learning_rate": 0.00014144999999999997, + "loss": 2.7028, + "step": 5097 + }, + { + "epoch": 8.1568, + "grad_norm": 0.0, + "learning_rate": 0.0001413, + "loss": 3.3034, + "step": 5098 + }, + { + "epoch": 8.1584, + "grad_norm": 0.0, + "learning_rate": 0.00014115, + "loss": 3.0746, + "step": 5099 + }, + { + "epoch": 8.16, + "grad_norm": 0.0, + "learning_rate": 0.00014099999999999998, + "loss": 3.4602, + "step": 5100 + }, + { + "epoch": 8.1616, + "grad_norm": 0.0, + "learning_rate": 0.00014084999999999998, + "loss": 4.0837, + "step": 5101 + }, + { + "epoch": 8.1632, + "grad_norm": 0.0, + "learning_rate": 0.00014069999999999998, + "loss": 3.8424, + "step": 5102 + }, + { + "epoch": 8.1648, + "grad_norm": 0.0, + "learning_rate": 0.00014055, + "loss": 5.0357, + "step": 5103 + }, + { + "epoch": 8.1664, + "grad_norm": 0.0, + "learning_rate": 0.0001404, + "loss": 3.3535, + "step": 5104 + }, + { + "epoch": 8.168, + "grad_norm": 0.0, + "learning_rate": 0.00014025, + "loss": 2.9491, + "step": 5105 + }, + { + "epoch": 8.1696, + "grad_norm": 0.0, + "learning_rate": 0.0001401, + "loss": 3.0972, + "step": 5106 + }, + { + "epoch": 8.1712, + "grad_norm": 0.0, + "learning_rate": 0.00013995, + "loss": 2.8251, + "step": 5107 + }, + { + "epoch": 8.1728, + "grad_norm": 0.0, + "learning_rate": 0.00013979999999999998, + "loss": 2.6972, + "step": 5108 + }, + { + "epoch": 8.1744, + "grad_norm": 0.0, + "learning_rate": 0.00013965, + "loss": 2.8575, + "step": 5109 + }, + { + "epoch": 8.176, + "grad_norm": 0.0, + "learning_rate": 0.0001395, + "loss": 2.9295, + "step": 5110 + }, + { + "epoch": 8.1776, + "grad_norm": 0.0, + "learning_rate": 0.00013935, + "loss": 2.8582, + "step": 5111 + }, + { + "epoch": 8.1792, + "grad_norm": 0.0, + "learning_rate": 0.0001392, + "loss": 3.0131, + "step": 5112 + }, + { + "epoch": 8.1808, + "grad_norm": 0.0, + "learning_rate": 0.00013905, + "loss": 2.8202, + "step": 5113 + }, + { + "epoch": 8.1824, + "grad_norm": 0.0, + "learning_rate": 0.0001389, + "loss": 2.2841, + "step": 5114 + }, + { + "epoch": 8.184, + "grad_norm": 0.0, + "learning_rate": 0.00013874999999999998, + "loss": 2.8865, + "step": 5115 + }, + { + "epoch": 8.1856, + "grad_norm": 0.0, + "learning_rate": 0.0001386, + "loss": 3.2878, + "step": 5116 + }, + { + "epoch": 8.1872, + "grad_norm": 0.0, + "learning_rate": 0.00013845, + "loss": 2.5579, + "step": 5117 + }, + { + "epoch": 8.1888, + "grad_norm": 0.0, + "learning_rate": 0.0001383, + "loss": 3.4639, + "step": 5118 + }, + { + "epoch": 8.1904, + "grad_norm": 0.0, + "learning_rate": 0.00013815, + "loss": 2.1206, + "step": 5119 + }, + { + "epoch": 8.192, + "grad_norm": 0.0, + "learning_rate": 0.000138, + "loss": 3.2176, + "step": 5120 + }, + { + "epoch": 8.1936, + "grad_norm": 0.0, + "learning_rate": 0.00013785, + "loss": 3.531, + "step": 5121 + }, + { + "epoch": 8.1952, + "grad_norm": 0.0, + "learning_rate": 0.00013769999999999999, + "loss": 2.5251, + "step": 5122 + }, + { + "epoch": 8.1968, + "grad_norm": 0.0, + "learning_rate": 0.00013754999999999998, + "loss": 2.7042, + "step": 5123 + }, + { + "epoch": 8.1984, + "grad_norm": 0.0, + "learning_rate": 0.0001374, + "loss": 2.6401, + "step": 5124 + }, + { + "epoch": 8.2, + "grad_norm": 0.0, + "learning_rate": 0.00013725, + "loss": 2.8847, + "step": 5125 + }, + { + "epoch": 8.2016, + "grad_norm": 0.0, + "learning_rate": 0.0001371, + "loss": 2.831, + "step": 5126 + }, + { + "epoch": 8.2032, + "grad_norm": 0.0, + "learning_rate": 0.00013695, + "loss": 3.003, + "step": 5127 + }, + { + "epoch": 8.2048, + "grad_norm": 0.0, + "learning_rate": 0.0001368, + "loss": 3.5552, + "step": 5128 + }, + { + "epoch": 8.2064, + "grad_norm": 0.0, + "learning_rate": 0.00013665, + "loss": 3.0249, + "step": 5129 + }, + { + "epoch": 8.208, + "grad_norm": 0.0, + "learning_rate": 0.00013649999999999998, + "loss": 2.6, + "step": 5130 + }, + { + "epoch": 8.2096, + "grad_norm": 0.0, + "learning_rate": 0.00013634999999999998, + "loss": 3.1531, + "step": 5131 + }, + { + "epoch": 8.2112, + "grad_norm": 0.0, + "learning_rate": 0.0001362, + "loss": 3.5822, + "step": 5132 + }, + { + "epoch": 8.2128, + "grad_norm": 0.0, + "learning_rate": 0.00013605, + "loss": 2.864, + "step": 5133 + }, + { + "epoch": 8.2144, + "grad_norm": 0.0, + "learning_rate": 0.0001359, + "loss": 3.009, + "step": 5134 + }, + { + "epoch": 8.216, + "grad_norm": 0.0, + "learning_rate": 0.00013575, + "loss": 2.92, + "step": 5135 + }, + { + "epoch": 8.2176, + "grad_norm": 0.0, + "learning_rate": 0.0001356, + "loss": 2.5759, + "step": 5136 + }, + { + "epoch": 8.2192, + "grad_norm": 0.0, + "learning_rate": 0.00013544999999999999, + "loss": 3.3534, + "step": 5137 + }, + { + "epoch": 8.2208, + "grad_norm": 0.0, + "learning_rate": 0.00013529999999999998, + "loss": 2.4112, + "step": 5138 + }, + { + "epoch": 8.2224, + "grad_norm": 0.0, + "learning_rate": 0.00013514999999999998, + "loss": 3.7695, + "step": 5139 + }, + { + "epoch": 8.224, + "grad_norm": 0.0, + "learning_rate": 0.000135, + "loss": 3.4061, + "step": 5140 + }, + { + "epoch": 8.2256, + "grad_norm": 0.0, + "learning_rate": 0.00013485, + "loss": 2.9928, + "step": 5141 + }, + { + "epoch": 8.2272, + "grad_norm": 0.0, + "learning_rate": 0.0001347, + "loss": 2.9372, + "step": 5142 + }, + { + "epoch": 8.2288, + "grad_norm": 0.0, + "learning_rate": 0.00013455, + "loss": 2.407, + "step": 5143 + }, + { + "epoch": 8.2304, + "grad_norm": 0.0, + "learning_rate": 0.0001344, + "loss": 2.7185, + "step": 5144 + }, + { + "epoch": 8.232, + "grad_norm": 0.0, + "learning_rate": 0.00013424999999999998, + "loss": 3.0936, + "step": 5145 + }, + { + "epoch": 8.2336, + "grad_norm": 0.0, + "learning_rate": 0.00013409999999999998, + "loss": 3.018, + "step": 5146 + }, + { + "epoch": 8.2352, + "grad_norm": 0.0, + "learning_rate": 0.00013395, + "loss": 2.9415, + "step": 5147 + }, + { + "epoch": 8.2368, + "grad_norm": 0.0, + "learning_rate": 0.0001338, + "loss": 3.6344, + "step": 5148 + }, + { + "epoch": 8.2384, + "grad_norm": 0.0, + "learning_rate": 0.00013365, + "loss": 3.3231, + "step": 5149 + }, + { + "epoch": 8.24, + "grad_norm": 0.0, + "learning_rate": 0.0001335, + "loss": 4.426, + "step": 5150 + }, + { + "epoch": 8.2416, + "grad_norm": 0.0, + "learning_rate": 0.00013335, + "loss": 4.2659, + "step": 5151 + }, + { + "epoch": 8.2432, + "grad_norm": 0.0, + "learning_rate": 0.00013319999999999999, + "loss": 4.1716, + "step": 5152 + }, + { + "epoch": 8.2448, + "grad_norm": 0.0, + "learning_rate": 0.00013304999999999998, + "loss": 3.4874, + "step": 5153 + }, + { + "epoch": 8.2464, + "grad_norm": 0.0, + "learning_rate": 0.00013289999999999998, + "loss": 3.6386, + "step": 5154 + }, + { + "epoch": 8.248, + "grad_norm": 0.0, + "learning_rate": 0.00013275, + "loss": 3.1903, + "step": 5155 + }, + { + "epoch": 8.2496, + "grad_norm": 0.0, + "learning_rate": 0.0001326, + "loss": 2.9343, + "step": 5156 + }, + { + "epoch": 8.2512, + "grad_norm": 0.0, + "learning_rate": 0.00013245, + "loss": 3.3521, + "step": 5157 + }, + { + "epoch": 8.2528, + "grad_norm": 0.0, + "learning_rate": 0.0001323, + "loss": 3.9208, + "step": 5158 + }, + { + "epoch": 8.2544, + "grad_norm": 0.0, + "learning_rate": 0.00013215, + "loss": 3.5965, + "step": 5159 + }, + { + "epoch": 8.256, + "grad_norm": 0.0, + "learning_rate": 0.00013199999999999998, + "loss": 3.196, + "step": 5160 + }, + { + "epoch": 8.2576, + "grad_norm": 0.0, + "learning_rate": 0.00013184999999999998, + "loss": 2.6157, + "step": 5161 + }, + { + "epoch": 8.2592, + "grad_norm": 0.0, + "learning_rate": 0.00013169999999999998, + "loss": 3.261, + "step": 5162 + }, + { + "epoch": 8.2608, + "grad_norm": 0.0, + "learning_rate": 0.00013155, + "loss": 2.9177, + "step": 5163 + }, + { + "epoch": 8.2624, + "grad_norm": 0.0, + "learning_rate": 0.0001314, + "loss": 2.8547, + "step": 5164 + }, + { + "epoch": 8.264, + "grad_norm": 0.0, + "learning_rate": 0.00013125, + "loss": 2.9475, + "step": 5165 + }, + { + "epoch": 8.2656, + "grad_norm": 0.0, + "learning_rate": 0.0001311, + "loss": 2.9541, + "step": 5166 + }, + { + "epoch": 8.2672, + "grad_norm": 0.0, + "learning_rate": 0.00013094999999999998, + "loss": 3.7245, + "step": 5167 + }, + { + "epoch": 8.2688, + "grad_norm": 0.0, + "learning_rate": 0.00013079999999999998, + "loss": 2.5799, + "step": 5168 + }, + { + "epoch": 8.2704, + "grad_norm": 0.0, + "learning_rate": 0.00013064999999999998, + "loss": 3.0271, + "step": 5169 + }, + { + "epoch": 8.272, + "grad_norm": 0.0, + "learning_rate": 0.0001305, + "loss": 3.2876, + "step": 5170 + }, + { + "epoch": 8.2736, + "grad_norm": 0.0, + "learning_rate": 0.00013035, + "loss": 3.2062, + "step": 5171 + }, + { + "epoch": 8.2752, + "grad_norm": 0.0, + "learning_rate": 0.0001302, + "loss": 2.895, + "step": 5172 + }, + { + "epoch": 8.2768, + "grad_norm": 0.0, + "learning_rate": 0.00013005, + "loss": 2.8953, + "step": 5173 + }, + { + "epoch": 8.2784, + "grad_norm": 0.0, + "learning_rate": 0.00012989999999999999, + "loss": 2.7017, + "step": 5174 + }, + { + "epoch": 8.28, + "grad_norm": 0.0, + "learning_rate": 0.00012974999999999998, + "loss": 2.5641, + "step": 5175 + }, + { + "epoch": 8.2816, + "grad_norm": 0.0, + "learning_rate": 0.00012959999999999998, + "loss": 2.3601, + "step": 5176 + }, + { + "epoch": 8.2832, + "grad_norm": 0.0, + "learning_rate": 0.00012944999999999998, + "loss": 2.7067, + "step": 5177 + }, + { + "epoch": 8.2848, + "grad_norm": 0.0, + "learning_rate": 0.0001293, + "loss": 3.2428, + "step": 5178 + }, + { + "epoch": 8.2864, + "grad_norm": 0.0, + "learning_rate": 0.00012915, + "loss": 2.6421, + "step": 5179 + }, + { + "epoch": 8.288, + "grad_norm": 0.0, + "learning_rate": 0.000129, + "loss": 2.8029, + "step": 5180 + }, + { + "epoch": 8.2896, + "grad_norm": 0.0, + "learning_rate": 0.00012885, + "loss": 2.2514, + "step": 5181 + }, + { + "epoch": 8.2912, + "grad_norm": 0.0, + "learning_rate": 0.00012869999999999998, + "loss": 2.3215, + "step": 5182 + }, + { + "epoch": 8.2928, + "grad_norm": 0.0, + "learning_rate": 0.00012854999999999998, + "loss": 2.6813, + "step": 5183 + }, + { + "epoch": 8.2944, + "grad_norm": 0.0, + "learning_rate": 0.00012839999999999998, + "loss": 3.8293, + "step": 5184 + }, + { + "epoch": 8.296, + "grad_norm": 0.0, + "learning_rate": 0.00012824999999999997, + "loss": 4.1989, + "step": 5185 + }, + { + "epoch": 8.2976, + "grad_norm": 0.0, + "learning_rate": 0.0001281, + "loss": 2.7688, + "step": 5186 + }, + { + "epoch": 8.2992, + "grad_norm": 0.0, + "learning_rate": 0.00012795, + "loss": 3.1238, + "step": 5187 + }, + { + "epoch": 8.3008, + "grad_norm": 0.0, + "learning_rate": 0.0001278, + "loss": 3.05, + "step": 5188 + }, + { + "epoch": 8.3024, + "grad_norm": 0.0, + "learning_rate": 0.00012764999999999999, + "loss": 2.8148, + "step": 5189 + }, + { + "epoch": 8.304, + "grad_norm": 0.0, + "learning_rate": 0.00012749999999999998, + "loss": 2.9786, + "step": 5190 + }, + { + "epoch": 8.3056, + "grad_norm": 0.0, + "learning_rate": 0.00012734999999999998, + "loss": 3.1833, + "step": 5191 + }, + { + "epoch": 8.3072, + "grad_norm": NaN, + "learning_rate": 0.00012734999999999998, + "loss": 3.6253, + "step": 5192 + }, + { + "epoch": 8.3088, + "grad_norm": 0.0, + "learning_rate": 0.00012719999999999997, + "loss": 2.4153, + "step": 5193 + }, + { + "epoch": 8.3104, + "grad_norm": 0.0, + "learning_rate": 0.00012705, + "loss": 3.1281, + "step": 5194 + }, + { + "epoch": 8.312, + "grad_norm": 0.0, + "learning_rate": 0.0001269, + "loss": 2.9595, + "step": 5195 + }, + { + "epoch": 8.3136, + "grad_norm": 0.0, + "learning_rate": 0.00012675, + "loss": 3.495, + "step": 5196 + }, + { + "epoch": 8.3152, + "grad_norm": 0.0, + "learning_rate": 0.0001266, + "loss": 3.3463, + "step": 5197 + }, + { + "epoch": 8.3168, + "grad_norm": 0.0, + "learning_rate": 0.00012644999999999998, + "loss": 3.6449, + "step": 5198 + }, + { + "epoch": 8.3184, + "grad_norm": 0.0, + "learning_rate": 0.00012629999999999998, + "loss": 2.7316, + "step": 5199 + }, + { + "epoch": 8.32, + "grad_norm": NaN, + "learning_rate": 0.00012629999999999998, + "loss": 3.3002, + "step": 5200 + }, + { + "epoch": 8.3216, + "grad_norm": 0.0, + "learning_rate": 0.00012614999999999998, + "loss": 4.1094, + "step": 5201 + }, + { + "epoch": 8.3232, + "grad_norm": 0.0, + "learning_rate": 0.00012599999999999997, + "loss": 3.6867, + "step": 5202 + }, + { + "epoch": 8.3248, + "grad_norm": 0.0, + "learning_rate": 0.00012585, + "loss": 3.3427, + "step": 5203 + }, + { + "epoch": 8.3264, + "grad_norm": 0.0, + "learning_rate": 0.0001257, + "loss": 4.1188, + "step": 5204 + }, + { + "epoch": 8.328, + "grad_norm": 0.0, + "learning_rate": 0.00012555, + "loss": 3.5926, + "step": 5205 + }, + { + "epoch": 8.3296, + "grad_norm": 0.0, + "learning_rate": 0.00012539999999999999, + "loss": 4.8951, + "step": 5206 + }, + { + "epoch": 8.3312, + "grad_norm": 0.0, + "learning_rate": 0.00012524999999999998, + "loss": 3.4692, + "step": 5207 + }, + { + "epoch": 8.3328, + "grad_norm": 0.0, + "learning_rate": 0.00012509999999999998, + "loss": 3.5879, + "step": 5208 + }, + { + "epoch": 8.3344, + "grad_norm": 0.0, + "learning_rate": 0.00012494999999999997, + "loss": 3.5962, + "step": 5209 + }, + { + "epoch": 8.336, + "grad_norm": 0.0, + "learning_rate": 0.00012479999999999997, + "loss": 3.2599, + "step": 5210 + }, + { + "epoch": 8.3376, + "grad_norm": 0.0, + "learning_rate": 0.00012465, + "loss": 3.4111, + "step": 5211 + }, + { + "epoch": 8.3392, + "grad_norm": 0.0, + "learning_rate": 0.0001245, + "loss": 2.5443, + "step": 5212 + }, + { + "epoch": 8.3408, + "grad_norm": 0.0, + "learning_rate": 0.00012435, + "loss": 2.949, + "step": 5213 + }, + { + "epoch": 8.3424, + "grad_norm": 0.0, + "learning_rate": 0.00012419999999999998, + "loss": 2.4854, + "step": 5214 + }, + { + "epoch": 8.344, + "grad_norm": 0.0, + "learning_rate": 0.00012404999999999998, + "loss": 2.4217, + "step": 5215 + }, + { + "epoch": 8.3456, + "grad_norm": 0.0, + "learning_rate": 0.00012389999999999998, + "loss": 2.8774, + "step": 5216 + }, + { + "epoch": 8.3472, + "grad_norm": 0.0, + "learning_rate": 0.00012374999999999997, + "loss": 2.7447, + "step": 5217 + }, + { + "epoch": 8.3488, + "grad_norm": 0.0, + "learning_rate": 0.0001236, + "loss": 2.4299, + "step": 5218 + }, + { + "epoch": 8.3504, + "grad_norm": 0.0, + "learning_rate": 0.00012345, + "loss": 3.1645, + "step": 5219 + }, + { + "epoch": 8.352, + "grad_norm": 0.0, + "learning_rate": 0.0001233, + "loss": 3.019, + "step": 5220 + }, + { + "epoch": 8.3536, + "grad_norm": 0.0, + "learning_rate": 0.00012314999999999998, + "loss": 3.0103, + "step": 5221 + }, + { + "epoch": 8.3552, + "grad_norm": 0.0, + "learning_rate": 0.00012299999999999998, + "loss": 2.2844, + "step": 5222 + }, + { + "epoch": 8.3568, + "grad_norm": 0.0, + "learning_rate": 0.00012284999999999998, + "loss": 3.2206, + "step": 5223 + }, + { + "epoch": 8.3584, + "grad_norm": 0.0, + "learning_rate": 0.00012269999999999997, + "loss": 2.6892, + "step": 5224 + }, + { + "epoch": 8.36, + "grad_norm": 0.0, + "learning_rate": 0.00012254999999999997, + "loss": 2.8515, + "step": 5225 + }, + { + "epoch": 8.3616, + "grad_norm": 0.0, + "learning_rate": 0.0001224, + "loss": 3.5186, + "step": 5226 + }, + { + "epoch": 8.3632, + "grad_norm": 0.0, + "learning_rate": 0.00012225, + "loss": 2.4005, + "step": 5227 + }, + { + "epoch": 8.3648, + "grad_norm": 0.0, + "learning_rate": 0.00012209999999999999, + "loss": 3.0731, + "step": 5228 + }, + { + "epoch": 8.3664, + "grad_norm": 0.0, + "learning_rate": 0.00012194999999999998, + "loss": 2.9212, + "step": 5229 + }, + { + "epoch": 8.368, + "grad_norm": 0.0, + "learning_rate": 0.00012179999999999999, + "loss": 3.3993, + "step": 5230 + }, + { + "epoch": 8.3696, + "grad_norm": 0.0, + "learning_rate": 0.00012165, + "loss": 2.7038, + "step": 5231 + }, + { + "epoch": 8.3712, + "grad_norm": 0.0, + "learning_rate": 0.0001215, + "loss": 2.5096, + "step": 5232 + }, + { + "epoch": 8.3728, + "grad_norm": 0.0, + "learning_rate": 0.00012135, + "loss": 2.288, + "step": 5233 + }, + { + "epoch": 8.3744, + "grad_norm": 0.0, + "learning_rate": 0.00012119999999999999, + "loss": 2.904, + "step": 5234 + }, + { + "epoch": 8.376, + "grad_norm": 0.0, + "learning_rate": 0.00012105, + "loss": 2.927, + "step": 5235 + }, + { + "epoch": 8.3776, + "grad_norm": 0.0, + "learning_rate": 0.0001209, + "loss": 3.1311, + "step": 5236 + }, + { + "epoch": 8.3792, + "grad_norm": 0.0, + "learning_rate": 0.00012075, + "loss": 2.6616, + "step": 5237 + }, + { + "epoch": 8.3808, + "grad_norm": 0.0, + "learning_rate": 0.00012059999999999999, + "loss": 3.3794, + "step": 5238 + }, + { + "epoch": 8.3824, + "grad_norm": 0.0, + "learning_rate": 0.00012045, + "loss": 2.8368, + "step": 5239 + }, + { + "epoch": 8.384, + "grad_norm": 0.0, + "learning_rate": 0.0001203, + "loss": 3.2011, + "step": 5240 + }, + { + "epoch": 8.3856, + "grad_norm": 0.0, + "learning_rate": 0.00012014999999999999, + "loss": 2.7275, + "step": 5241 + }, + { + "epoch": 8.3872, + "grad_norm": 0.0, + "learning_rate": 0.00011999999999999999, + "loss": 2.8589, + "step": 5242 + }, + { + "epoch": 8.3888, + "grad_norm": 0.0, + "learning_rate": 0.00011985, + "loss": 3.5764, + "step": 5243 + }, + { + "epoch": 8.3904, + "grad_norm": 0.0, + "learning_rate": 0.0001197, + "loss": 2.4109, + "step": 5244 + }, + { + "epoch": 8.392, + "grad_norm": 0.0, + "learning_rate": 0.00011954999999999999, + "loss": 2.4054, + "step": 5245 + }, + { + "epoch": 8.3936, + "grad_norm": 0.0, + "learning_rate": 0.0001194, + "loss": 2.9462, + "step": 5246 + }, + { + "epoch": 8.395199999999999, + "grad_norm": 0.0, + "learning_rate": 0.00011925, + "loss": 3.132, + "step": 5247 + }, + { + "epoch": 8.3968, + "grad_norm": 0.0, + "learning_rate": 0.0001191, + "loss": 3.2663, + "step": 5248 + }, + { + "epoch": 8.3984, + "grad_norm": 0.0, + "learning_rate": 0.00011894999999999999, + "loss": 3.5332, + "step": 5249 + }, + { + "epoch": 8.4, + "grad_norm": NaN, + "learning_rate": 0.00011894999999999999, + "loss": 2.3482, + "step": 5250 + }, + { + "epoch": 8.4016, + "grad_norm": 0.0, + "learning_rate": 0.0001188, + "loss": 4.2295, + "step": 5251 + }, + { + "epoch": 8.4032, + "grad_norm": 0.0, + "learning_rate": 0.00011865, + "loss": 3.4233, + "step": 5252 + }, + { + "epoch": 8.4048, + "grad_norm": 0.0, + "learning_rate": 0.0001185, + "loss": 3.2728, + "step": 5253 + }, + { + "epoch": 8.4064, + "grad_norm": 0.0, + "learning_rate": 0.00011834999999999999, + "loss": 3.4883, + "step": 5254 + }, + { + "epoch": 8.408, + "grad_norm": 0.0, + "learning_rate": 0.0001182, + "loss": 3.0078, + "step": 5255 + }, + { + "epoch": 8.4096, + "grad_norm": 0.0, + "learning_rate": 0.00011805, + "loss": 3.2885, + "step": 5256 + }, + { + "epoch": 8.411200000000001, + "grad_norm": 0.0, + "learning_rate": 0.00011789999999999999, + "loss": 2.7993, + "step": 5257 + }, + { + "epoch": 8.4128, + "grad_norm": 0.0, + "learning_rate": 0.00011774999999999999, + "loss": 3.1993, + "step": 5258 + }, + { + "epoch": 8.4144, + "grad_norm": 0.0, + "learning_rate": 0.0001176, + "loss": 3.2918, + "step": 5259 + }, + { + "epoch": 8.416, + "grad_norm": 0.0, + "learning_rate": 0.00011745, + "loss": 3.0677, + "step": 5260 + }, + { + "epoch": 8.4176, + "grad_norm": 0.0, + "learning_rate": 0.00011729999999999999, + "loss": 2.7578, + "step": 5261 + }, + { + "epoch": 8.4192, + "grad_norm": 0.0, + "learning_rate": 0.00011714999999999999, + "loss": 2.841, + "step": 5262 + }, + { + "epoch": 8.4208, + "grad_norm": 0.0, + "learning_rate": 0.000117, + "loss": 3.4409, + "step": 5263 + }, + { + "epoch": 8.4224, + "grad_norm": 0.0, + "learning_rate": 0.00011685, + "loss": 2.457, + "step": 5264 + }, + { + "epoch": 8.424, + "grad_norm": 0.0, + "learning_rate": 0.00011669999999999999, + "loss": 2.8304, + "step": 5265 + }, + { + "epoch": 8.4256, + "grad_norm": 0.0, + "learning_rate": 0.00011654999999999999, + "loss": 2.9242, + "step": 5266 + }, + { + "epoch": 8.4272, + "grad_norm": 0.0, + "learning_rate": 0.0001164, + "loss": 3.1701, + "step": 5267 + }, + { + "epoch": 8.4288, + "grad_norm": 0.0, + "learning_rate": 0.00011624999999999999, + "loss": 2.2805, + "step": 5268 + }, + { + "epoch": 8.4304, + "grad_norm": 0.0, + "learning_rate": 0.00011609999999999999, + "loss": 3.0014, + "step": 5269 + }, + { + "epoch": 8.432, + "grad_norm": 0.0, + "learning_rate": 0.00011595, + "loss": 3.1696, + "step": 5270 + }, + { + "epoch": 8.4336, + "grad_norm": 0.0, + "learning_rate": 0.0001158, + "loss": 2.6706, + "step": 5271 + }, + { + "epoch": 8.4352, + "grad_norm": 0.0, + "learning_rate": 0.00011564999999999999, + "loss": 2.7588, + "step": 5272 + }, + { + "epoch": 8.4368, + "grad_norm": 0.0, + "learning_rate": 0.00011549999999999999, + "loss": 2.5778, + "step": 5273 + }, + { + "epoch": 8.4384, + "grad_norm": 0.0, + "learning_rate": 0.00011535, + "loss": 2.6264, + "step": 5274 + }, + { + "epoch": 8.44, + "grad_norm": 0.0, + "learning_rate": 0.0001152, + "loss": 3.1378, + "step": 5275 + }, + { + "epoch": 8.4416, + "grad_norm": 0.0, + "learning_rate": 0.00011504999999999999, + "loss": 2.9581, + "step": 5276 + }, + { + "epoch": 8.4432, + "grad_norm": 0.0, + "learning_rate": 0.00011489999999999999, + "loss": 2.5971, + "step": 5277 + }, + { + "epoch": 8.4448, + "grad_norm": 0.0, + "learning_rate": 0.00011475, + "loss": 2.7245, + "step": 5278 + }, + { + "epoch": 8.4464, + "grad_norm": 0.0, + "learning_rate": 0.0001146, + "loss": 3.5557, + "step": 5279 + }, + { + "epoch": 8.448, + "grad_norm": 0.0, + "learning_rate": 0.00011444999999999999, + "loss": 2.7126, + "step": 5280 + }, + { + "epoch": 8.4496, + "grad_norm": 0.0, + "learning_rate": 0.00011429999999999999, + "loss": 2.6388, + "step": 5281 + }, + { + "epoch": 8.4512, + "grad_norm": 0.0, + "learning_rate": 0.00011415, + "loss": 3.2968, + "step": 5282 + }, + { + "epoch": 8.4528, + "grad_norm": 0.0, + "learning_rate": 0.00011399999999999999, + "loss": 2.8465, + "step": 5283 + }, + { + "epoch": 8.4544, + "grad_norm": 0.0, + "learning_rate": 0.00011384999999999999, + "loss": 3.0418, + "step": 5284 + }, + { + "epoch": 8.456, + "grad_norm": 0.0, + "learning_rate": 0.00011369999999999999, + "loss": 4.1577, + "step": 5285 + }, + { + "epoch": 8.4576, + "grad_norm": 0.0, + "learning_rate": 0.00011355, + "loss": 2.5981, + "step": 5286 + }, + { + "epoch": 8.4592, + "grad_norm": 0.0, + "learning_rate": 0.00011339999999999999, + "loss": 2.8671, + "step": 5287 + }, + { + "epoch": 8.4608, + "grad_norm": 0.0, + "learning_rate": 0.00011324999999999999, + "loss": 3.0664, + "step": 5288 + }, + { + "epoch": 8.4624, + "grad_norm": 0.0, + "learning_rate": 0.00011309999999999998, + "loss": 2.6248, + "step": 5289 + }, + { + "epoch": 8.464, + "grad_norm": 0.0, + "learning_rate": 0.00011295, + "loss": 3.144, + "step": 5290 + }, + { + "epoch": 8.4656, + "grad_norm": 0.0, + "learning_rate": 0.00011279999999999999, + "loss": 2.5095, + "step": 5291 + }, + { + "epoch": 8.4672, + "grad_norm": 0.0, + "learning_rate": 0.00011264999999999999, + "loss": 2.9168, + "step": 5292 + }, + { + "epoch": 8.4688, + "grad_norm": 0.0, + "learning_rate": 0.0001125, + "loss": 3.5242, + "step": 5293 + }, + { + "epoch": 8.4704, + "grad_norm": 0.0, + "learning_rate": 0.00011235, + "loss": 3.1218, + "step": 5294 + }, + { + "epoch": 8.472, + "grad_norm": 0.0, + "learning_rate": 0.00011219999999999999, + "loss": 3.3528, + "step": 5295 + }, + { + "epoch": 8.4736, + "grad_norm": 0.0, + "learning_rate": 0.00011204999999999999, + "loss": 3.208, + "step": 5296 + }, + { + "epoch": 8.4752, + "grad_norm": 0.0, + "learning_rate": 0.0001119, + "loss": 3.2412, + "step": 5297 + }, + { + "epoch": 8.4768, + "grad_norm": 0.0, + "learning_rate": 0.00011174999999999999, + "loss": 3.0845, + "step": 5298 + }, + { + "epoch": 8.4784, + "grad_norm": 0.0, + "learning_rate": 0.00011159999999999999, + "loss": 2.8777, + "step": 5299 + }, + { + "epoch": 8.48, + "grad_norm": 0.0, + "learning_rate": 0.00011144999999999998, + "loss": 3.3584, + "step": 5300 + }, + { + "epoch": 8.4816, + "grad_norm": 0.0, + "learning_rate": 0.0001113, + "loss": 5.1918, + "step": 5301 + }, + { + "epoch": 8.4832, + "grad_norm": 0.0, + "learning_rate": 0.00011114999999999999, + "loss": 3.8634, + "step": 5302 + }, + { + "epoch": 8.4848, + "grad_norm": 0.0, + "learning_rate": 0.00011099999999999999, + "loss": 3.2812, + "step": 5303 + }, + { + "epoch": 8.4864, + "grad_norm": 0.0, + "learning_rate": 0.00011084999999999998, + "loss": 3.6451, + "step": 5304 + }, + { + "epoch": 8.488, + "grad_norm": 0.0, + "learning_rate": 0.0001107, + "loss": 4.7517, + "step": 5305 + }, + { + "epoch": 8.4896, + "grad_norm": 0.0, + "learning_rate": 0.00011054999999999999, + "loss": 4.3051, + "step": 5306 + }, + { + "epoch": 8.4912, + "grad_norm": 0.0, + "learning_rate": 0.00011039999999999999, + "loss": 3.1874, + "step": 5307 + }, + { + "epoch": 8.4928, + "grad_norm": 0.0, + "learning_rate": 0.00011024999999999998, + "loss": 3.5162, + "step": 5308 + }, + { + "epoch": 8.4944, + "grad_norm": 0.0, + "learning_rate": 0.00011009999999999999, + "loss": 3.6533, + "step": 5309 + }, + { + "epoch": 8.496, + "grad_norm": 0.0, + "learning_rate": 0.00010994999999999999, + "loss": 3.2079, + "step": 5310 + }, + { + "epoch": 8.4976, + "grad_norm": 0.0, + "learning_rate": 0.00010979999999999999, + "loss": 3.0578, + "step": 5311 + }, + { + "epoch": 8.4992, + "grad_norm": 0.0, + "learning_rate": 0.00010964999999999998, + "loss": 2.8665, + "step": 5312 + }, + { + "epoch": 8.5008, + "grad_norm": 0.0, + "learning_rate": 0.00010949999999999999, + "loss": 3.2601, + "step": 5313 + }, + { + "epoch": 8.5024, + "grad_norm": 0.0, + "learning_rate": 0.00010934999999999999, + "loss": 3.4261, + "step": 5314 + }, + { + "epoch": 8.504, + "grad_norm": 0.0, + "learning_rate": 0.00010919999999999998, + "loss": 2.7026, + "step": 5315 + }, + { + "epoch": 8.5056, + "grad_norm": 0.0, + "learning_rate": 0.00010904999999999998, + "loss": 2.4031, + "step": 5316 + }, + { + "epoch": 8.5072, + "grad_norm": 0.0, + "learning_rate": 0.00010889999999999999, + "loss": 3.1147, + "step": 5317 + }, + { + "epoch": 8.5088, + "grad_norm": 0.0, + "learning_rate": 0.00010874999999999999, + "loss": 3.1283, + "step": 5318 + }, + { + "epoch": 8.5104, + "grad_norm": 0.0, + "learning_rate": 0.00010859999999999998, + "loss": 2.8211, + "step": 5319 + }, + { + "epoch": 8.512, + "grad_norm": 0.0, + "learning_rate": 0.00010845, + "loss": 3.076, + "step": 5320 + }, + { + "epoch": 8.5136, + "grad_norm": 0.0, + "learning_rate": 0.00010829999999999999, + "loss": 3.2892, + "step": 5321 + }, + { + "epoch": 8.5152, + "grad_norm": 0.0, + "learning_rate": 0.00010814999999999999, + "loss": 3.1918, + "step": 5322 + }, + { + "epoch": 8.5168, + "grad_norm": 0.0, + "learning_rate": 0.00010799999999999998, + "loss": 2.7595, + "step": 5323 + }, + { + "epoch": 8.5184, + "grad_norm": 0.0, + "learning_rate": 0.00010784999999999999, + "loss": 2.7904, + "step": 5324 + }, + { + "epoch": 8.52, + "grad_norm": 0.0, + "learning_rate": 0.00010769999999999999, + "loss": 3.2722, + "step": 5325 + }, + { + "epoch": 8.5216, + "grad_norm": 0.0, + "learning_rate": 0.00010754999999999999, + "loss": 2.5779, + "step": 5326 + }, + { + "epoch": 8.5232, + "grad_norm": 0.0, + "learning_rate": 0.00010739999999999998, + "loss": 2.7775, + "step": 5327 + }, + { + "epoch": 8.5248, + "grad_norm": 0.0, + "learning_rate": 0.00010724999999999999, + "loss": 3.1146, + "step": 5328 + }, + { + "epoch": 8.5264, + "grad_norm": 0.0, + "learning_rate": 0.00010709999999999999, + "loss": 2.633, + "step": 5329 + }, + { + "epoch": 8.528, + "grad_norm": 0.0, + "learning_rate": 0.00010694999999999998, + "loss": 2.5443, + "step": 5330 + }, + { + "epoch": 8.5296, + "grad_norm": 0.0, + "learning_rate": 0.00010679999999999998, + "loss": 3.069, + "step": 5331 + }, + { + "epoch": 8.5312, + "grad_norm": 0.0, + "learning_rate": 0.00010664999999999999, + "loss": 3.0671, + "step": 5332 + }, + { + "epoch": 8.5328, + "grad_norm": 0.0, + "learning_rate": 0.00010649999999999999, + "loss": 2.842, + "step": 5333 + }, + { + "epoch": 8.5344, + "grad_norm": 0.0, + "learning_rate": 0.00010634999999999998, + "loss": 3.2067, + "step": 5334 + }, + { + "epoch": 8.536, + "grad_norm": 0.0, + "learning_rate": 0.00010619999999999998, + "loss": 2.835, + "step": 5335 + }, + { + "epoch": 8.5376, + "grad_norm": 0.0, + "learning_rate": 0.00010604999999999999, + "loss": 2.7643, + "step": 5336 + }, + { + "epoch": 8.5392, + "grad_norm": 0.0, + "learning_rate": 0.00010589999999999999, + "loss": 2.9063, + "step": 5337 + }, + { + "epoch": 8.5408, + "grad_norm": 0.0, + "learning_rate": 0.00010574999999999998, + "loss": 3.4485, + "step": 5338 + }, + { + "epoch": 8.5424, + "grad_norm": 0.0, + "learning_rate": 0.00010559999999999998, + "loss": 2.6321, + "step": 5339 + }, + { + "epoch": 8.544, + "grad_norm": 0.0, + "learning_rate": 0.00010544999999999999, + "loss": 2.9262, + "step": 5340 + }, + { + "epoch": 8.5456, + "grad_norm": 0.0, + "learning_rate": 0.00010529999999999998, + "loss": 3.3372, + "step": 5341 + }, + { + "epoch": 8.5472, + "grad_norm": 0.0, + "learning_rate": 0.00010514999999999998, + "loss": 3.9157, + "step": 5342 + }, + { + "epoch": 8.5488, + "grad_norm": 0.0, + "learning_rate": 0.00010499999999999999, + "loss": 2.9866, + "step": 5343 + }, + { + "epoch": 8.5504, + "grad_norm": 0.0, + "learning_rate": 0.00010484999999999999, + "loss": 1.9647, + "step": 5344 + }, + { + "epoch": 8.552, + "grad_norm": 0.0, + "learning_rate": 0.00010469999999999998, + "loss": 2.4543, + "step": 5345 + }, + { + "epoch": 8.5536, + "grad_norm": 0.0, + "learning_rate": 0.00010454999999999998, + "loss": 3.2684, + "step": 5346 + }, + { + "epoch": 8.5552, + "grad_norm": 0.0, + "learning_rate": 0.00010439999999999999, + "loss": 2.7297, + "step": 5347 + }, + { + "epoch": 8.556799999999999, + "grad_norm": 0.0, + "learning_rate": 0.00010424999999999999, + "loss": 4.141, + "step": 5348 + }, + { + "epoch": 8.5584, + "grad_norm": 0.0, + "learning_rate": 0.00010409999999999998, + "loss": 2.9914, + "step": 5349 + }, + { + "epoch": 8.56, + "grad_norm": 0.0, + "learning_rate": 0.00010394999999999998, + "loss": 4.2832, + "step": 5350 + }, + { + "epoch": 8.5616, + "grad_norm": 0.0, + "learning_rate": 0.00010379999999999999, + "loss": 4.2738, + "step": 5351 + }, + { + "epoch": 8.5632, + "grad_norm": 0.0, + "learning_rate": 0.00010364999999999999, + "loss": 3.7024, + "step": 5352 + }, + { + "epoch": 8.5648, + "grad_norm": 0.0, + "learning_rate": 0.00010349999999999998, + "loss": 4.0658, + "step": 5353 + }, + { + "epoch": 8.5664, + "grad_norm": 0.0, + "learning_rate": 0.00010334999999999998, + "loss": 4.0144, + "step": 5354 + }, + { + "epoch": 8.568, + "grad_norm": 0.0, + "learning_rate": 0.00010319999999999999, + "loss": 3.2457, + "step": 5355 + }, + { + "epoch": 8.5696, + "grad_norm": 0.0, + "learning_rate": 0.00010305, + "loss": 3.4408, + "step": 5356 + }, + { + "epoch": 8.5712, + "grad_norm": 0.0, + "learning_rate": 0.0001029, + "loss": 3.0264, + "step": 5357 + }, + { + "epoch": 8.5728, + "grad_norm": 0.0, + "learning_rate": 0.00010275, + "loss": 3.8298, + "step": 5358 + }, + { + "epoch": 8.5744, + "grad_norm": 0.0, + "learning_rate": 0.0001026, + "loss": 3.779, + "step": 5359 + }, + { + "epoch": 8.576, + "grad_norm": 0.0, + "learning_rate": 0.00010245, + "loss": 3.0959, + "step": 5360 + }, + { + "epoch": 8.5776, + "grad_norm": 0.0, + "learning_rate": 0.00010229999999999999, + "loss": 3.6852, + "step": 5361 + }, + { + "epoch": 8.5792, + "grad_norm": 0.0, + "learning_rate": 0.00010215, + "loss": 3.4055, + "step": 5362 + }, + { + "epoch": 8.5808, + "grad_norm": 0.0, + "learning_rate": 0.000102, + "loss": 3.4038, + "step": 5363 + }, + { + "epoch": 8.5824, + "grad_norm": 0.0, + "learning_rate": 0.00010185, + "loss": 3.2311, + "step": 5364 + }, + { + "epoch": 8.584, + "grad_norm": 0.0, + "learning_rate": 0.00010169999999999999, + "loss": 2.9334, + "step": 5365 + }, + { + "epoch": 8.5856, + "grad_norm": 0.0, + "learning_rate": 0.00010155, + "loss": 2.3327, + "step": 5366 + }, + { + "epoch": 8.5872, + "grad_norm": 0.0, + "learning_rate": 0.0001014, + "loss": 3.4392, + "step": 5367 + }, + { + "epoch": 8.588799999999999, + "grad_norm": 0.0, + "learning_rate": 0.00010125, + "loss": 2.688, + "step": 5368 + }, + { + "epoch": 8.5904, + "grad_norm": 0.0, + "learning_rate": 0.0001011, + "loss": 3.0311, + "step": 5369 + }, + { + "epoch": 8.592, + "grad_norm": 0.0, + "learning_rate": 0.00010095, + "loss": 3.853, + "step": 5370 + }, + { + "epoch": 8.5936, + "grad_norm": 0.0, + "learning_rate": 0.0001008, + "loss": 3.2355, + "step": 5371 + }, + { + "epoch": 8.5952, + "grad_norm": 0.0, + "learning_rate": 0.00010065, + "loss": 2.6658, + "step": 5372 + }, + { + "epoch": 8.5968, + "grad_norm": 0.0, + "learning_rate": 0.0001005, + "loss": 2.8656, + "step": 5373 + }, + { + "epoch": 8.5984, + "grad_norm": 0.0, + "learning_rate": 0.00010035, + "loss": 2.7324, + "step": 5374 + }, + { + "epoch": 8.6, + "grad_norm": 0.0, + "learning_rate": 0.0001002, + "loss": 2.9669, + "step": 5375 + }, + { + "epoch": 8.6016, + "grad_norm": 0.0, + "learning_rate": 0.00010004999999999999, + "loss": 2.1973, + "step": 5376 + }, + { + "epoch": 8.6032, + "grad_norm": 0.0, + "learning_rate": 9.99e-05, + "loss": 2.7826, + "step": 5377 + }, + { + "epoch": 8.604800000000001, + "grad_norm": 0.0, + "learning_rate": 9.975e-05, + "loss": 2.4138, + "step": 5378 + }, + { + "epoch": 8.6064, + "grad_norm": 0.0, + "learning_rate": 9.96e-05, + "loss": 2.999, + "step": 5379 + }, + { + "epoch": 8.608, + "grad_norm": 0.0, + "learning_rate": 9.944999999999999e-05, + "loss": 3.0998, + "step": 5380 + }, + { + "epoch": 8.6096, + "grad_norm": 0.0, + "learning_rate": 9.93e-05, + "loss": 2.8706, + "step": 5381 + }, + { + "epoch": 8.6112, + "grad_norm": 0.0, + "learning_rate": 9.915e-05, + "loss": 2.7169, + "step": 5382 + }, + { + "epoch": 8.6128, + "grad_norm": 0.0, + "learning_rate": 9.9e-05, + "loss": 3.1331, + "step": 5383 + }, + { + "epoch": 8.6144, + "grad_norm": 0.0, + "learning_rate": 9.884999999999999e-05, + "loss": 2.4797, + "step": 5384 + }, + { + "epoch": 8.616, + "grad_norm": 0.0, + "learning_rate": 9.87e-05, + "loss": 2.5391, + "step": 5385 + }, + { + "epoch": 8.6176, + "grad_norm": 0.0, + "learning_rate": 9.855e-05, + "loss": 2.4524, + "step": 5386 + }, + { + "epoch": 8.6192, + "grad_norm": 0.0, + "learning_rate": 9.839999999999999e-05, + "loss": 2.9776, + "step": 5387 + }, + { + "epoch": 8.6208, + "grad_norm": 0.0, + "learning_rate": 9.824999999999999e-05, + "loss": 3.2171, + "step": 5388 + }, + { + "epoch": 8.6224, + "grad_norm": 0.0, + "learning_rate": 9.81e-05, + "loss": 3.4589, + "step": 5389 + }, + { + "epoch": 8.624, + "grad_norm": 0.0, + "learning_rate": 9.795e-05, + "loss": 2.4906, + "step": 5390 + }, + { + "epoch": 8.6256, + "grad_norm": 0.0, + "learning_rate": 9.779999999999999e-05, + "loss": 3.3645, + "step": 5391 + }, + { + "epoch": 8.6272, + "grad_norm": 0.0, + "learning_rate": 9.764999999999999e-05, + "loss": 2.7121, + "step": 5392 + }, + { + "epoch": 8.6288, + "grad_norm": 0.0, + "learning_rate": 9.75e-05, + "loss": 3.3566, + "step": 5393 + }, + { + "epoch": 8.6304, + "grad_norm": 0.0, + "learning_rate": 9.735e-05, + "loss": 3.0868, + "step": 5394 + }, + { + "epoch": 8.632, + "grad_norm": 0.0, + "learning_rate": 9.719999999999999e-05, + "loss": 2.3634, + "step": 5395 + }, + { + "epoch": 8.6336, + "grad_norm": 0.0, + "learning_rate": 9.705e-05, + "loss": 2.4537, + "step": 5396 + }, + { + "epoch": 8.6352, + "grad_norm": 0.0, + "learning_rate": 9.69e-05, + "loss": 2.7268, + "step": 5397 + }, + { + "epoch": 8.636800000000001, + "grad_norm": 0.0, + "learning_rate": 9.675e-05, + "loss": 2.4014, + "step": 5398 + }, + { + "epoch": 8.6384, + "grad_norm": 0.0, + "learning_rate": 9.659999999999999e-05, + "loss": 3.4346, + "step": 5399 + }, + { + "epoch": 8.64, + "grad_norm": 0.0, + "learning_rate": 9.645e-05, + "loss": 3.1193, + "step": 5400 + }, + { + "epoch": 8.6416, + "grad_norm": 0.0, + "learning_rate": 9.63e-05, + "loss": 4.077, + "step": 5401 + }, + { + "epoch": 8.6432, + "grad_norm": 0.0, + "learning_rate": 9.614999999999999e-05, + "loss": 4.2156, + "step": 5402 + }, + { + "epoch": 8.6448, + "grad_norm": 0.0, + "learning_rate": 9.599999999999999e-05, + "loss": 3.1341, + "step": 5403 + }, + { + "epoch": 8.6464, + "grad_norm": 0.0, + "learning_rate": 9.585e-05, + "loss": 4.1184, + "step": 5404 + }, + { + "epoch": 8.648, + "grad_norm": 0.0, + "learning_rate": 9.57e-05, + "loss": 3.6441, + "step": 5405 + }, + { + "epoch": 8.6496, + "grad_norm": 0.0, + "learning_rate": 9.554999999999999e-05, + "loss": 3.4632, + "step": 5406 + }, + { + "epoch": 8.6512, + "grad_norm": 0.0, + "learning_rate": 9.539999999999999e-05, + "loss": 3.1794, + "step": 5407 + }, + { + "epoch": 8.6528, + "grad_norm": 0.0, + "learning_rate": 9.525e-05, + "loss": 4.5431, + "step": 5408 + }, + { + "epoch": 8.6544, + "grad_norm": 0.0, + "learning_rate": 9.51e-05, + "loss": 3.7904, + "step": 5409 + }, + { + "epoch": 8.656, + "grad_norm": 0.0, + "learning_rate": 9.494999999999999e-05, + "loss": 2.7677, + "step": 5410 + }, + { + "epoch": 8.6576, + "grad_norm": 0.0, + "learning_rate": 9.479999999999999e-05, + "loss": 2.5789, + "step": 5411 + }, + { + "epoch": 8.6592, + "grad_norm": 0.0, + "learning_rate": 9.465e-05, + "loss": 2.8011, + "step": 5412 + }, + { + "epoch": 8.6608, + "grad_norm": 0.0, + "learning_rate": 9.449999999999999e-05, + "loss": 3.3363, + "step": 5413 + }, + { + "epoch": 8.6624, + "grad_norm": 0.0, + "learning_rate": 9.434999999999999e-05, + "loss": 2.5781, + "step": 5414 + }, + { + "epoch": 8.664, + "grad_norm": 0.0, + "learning_rate": 9.419999999999999e-05, + "loss": 3.5032, + "step": 5415 + }, + { + "epoch": 8.6656, + "grad_norm": 0.0, + "learning_rate": 9.405e-05, + "loss": 2.6167, + "step": 5416 + }, + { + "epoch": 8.6672, + "grad_norm": 0.0, + "learning_rate": 9.389999999999999e-05, + "loss": 2.2998, + "step": 5417 + }, + { + "epoch": 8.6688, + "grad_norm": 0.0, + "learning_rate": 9.374999999999999e-05, + "loss": 2.5932, + "step": 5418 + }, + { + "epoch": 8.6704, + "grad_norm": 0.0, + "learning_rate": 9.36e-05, + "loss": 2.5544, + "step": 5419 + }, + { + "epoch": 8.672, + "grad_norm": 0.0, + "learning_rate": 9.345e-05, + "loss": 3.0412, + "step": 5420 + }, + { + "epoch": 8.6736, + "grad_norm": 0.0, + "learning_rate": 9.329999999999999e-05, + "loss": 3.6085, + "step": 5421 + }, + { + "epoch": 8.6752, + "grad_norm": 0.0, + "learning_rate": 9.314999999999999e-05, + "loss": 3.2467, + "step": 5422 + }, + { + "epoch": 8.6768, + "grad_norm": 0.0, + "learning_rate": 9.3e-05, + "loss": 3.0468, + "step": 5423 + }, + { + "epoch": 8.6784, + "grad_norm": 0.0, + "learning_rate": 9.285e-05, + "loss": 2.4345, + "step": 5424 + }, + { + "epoch": 8.68, + "grad_norm": 0.0, + "learning_rate": 9.269999999999999e-05, + "loss": 2.8213, + "step": 5425 + }, + { + "epoch": 8.6816, + "grad_norm": 0.0, + "learning_rate": 9.254999999999999e-05, + "loss": 2.8409, + "step": 5426 + }, + { + "epoch": 8.6832, + "grad_norm": 0.0, + "learning_rate": 9.24e-05, + "loss": 2.8714, + "step": 5427 + }, + { + "epoch": 8.6848, + "grad_norm": 0.0, + "learning_rate": 9.224999999999999e-05, + "loss": 2.918, + "step": 5428 + }, + { + "epoch": 8.6864, + "grad_norm": 0.0, + "learning_rate": 9.209999999999999e-05, + "loss": 3.2587, + "step": 5429 + }, + { + "epoch": 8.688, + "grad_norm": 0.0, + "learning_rate": 9.194999999999999e-05, + "loss": 3.1399, + "step": 5430 + }, + { + "epoch": 8.6896, + "grad_norm": 0.0, + "learning_rate": 9.18e-05, + "loss": 2.755, + "step": 5431 + }, + { + "epoch": 8.6912, + "grad_norm": 0.0, + "learning_rate": 9.164999999999999e-05, + "loss": 2.5296, + "step": 5432 + }, + { + "epoch": 8.6928, + "grad_norm": 0.0, + "learning_rate": 9.149999999999999e-05, + "loss": 2.8267, + "step": 5433 + }, + { + "epoch": 8.6944, + "grad_norm": 0.0, + "learning_rate": 9.134999999999998e-05, + "loss": 2.5156, + "step": 5434 + }, + { + "epoch": 8.696, + "grad_norm": 0.0, + "learning_rate": 9.12e-05, + "loss": 2.5486, + "step": 5435 + }, + { + "epoch": 8.6976, + "grad_norm": 0.0, + "learning_rate": 9.104999999999999e-05, + "loss": 2.533, + "step": 5436 + }, + { + "epoch": 8.6992, + "grad_norm": 0.0, + "learning_rate": 9.089999999999999e-05, + "loss": 2.7671, + "step": 5437 + }, + { + "epoch": 8.7008, + "grad_norm": 0.0, + "learning_rate": 9.074999999999998e-05, + "loss": 2.9547, + "step": 5438 + }, + { + "epoch": 8.7024, + "grad_norm": 0.0, + "learning_rate": 9.059999999999999e-05, + "loss": 2.9633, + "step": 5439 + }, + { + "epoch": 8.704, + "grad_norm": 0.0, + "learning_rate": 9.044999999999999e-05, + "loss": 3.3505, + "step": 5440 + }, + { + "epoch": 8.7056, + "grad_norm": 0.0, + "learning_rate": 9.029999999999999e-05, + "loss": 3.1255, + "step": 5441 + }, + { + "epoch": 8.7072, + "grad_norm": 0.0, + "learning_rate": 9.014999999999998e-05, + "loss": 3.4915, + "step": 5442 + }, + { + "epoch": 8.7088, + "grad_norm": 0.0, + "learning_rate": 8.999999999999999e-05, + "loss": 2.9845, + "step": 5443 + }, + { + "epoch": 8.7104, + "grad_norm": 0.0, + "learning_rate": 8.984999999999999e-05, + "loss": 3.0208, + "step": 5444 + }, + { + "epoch": 8.712, + "grad_norm": 0.0, + "learning_rate": 8.969999999999998e-05, + "loss": 2.9816, + "step": 5445 + }, + { + "epoch": 8.7136, + "grad_norm": 0.0, + "learning_rate": 8.955e-05, + "loss": 3.0994, + "step": 5446 + }, + { + "epoch": 8.7152, + "grad_norm": 0.0, + "learning_rate": 8.939999999999999e-05, + "loss": 2.7481, + "step": 5447 + }, + { + "epoch": 8.7168, + "grad_norm": 0.0, + "learning_rate": 8.924999999999999e-05, + "loss": 2.8729, + "step": 5448 + }, + { + "epoch": 8.7184, + "grad_norm": 0.0, + "learning_rate": 8.909999999999998e-05, + "loss": 3.4677, + "step": 5449 + }, + { + "epoch": 8.72, + "grad_norm": 0.0, + "learning_rate": 8.895e-05, + "loss": 2.262, + "step": 5450 + }, + { + "epoch": 8.7216, + "grad_norm": 0.0, + "learning_rate": 8.879999999999999e-05, + "loss": 4.1577, + "step": 5451 + }, + { + "epoch": 8.7232, + "grad_norm": 0.0, + "learning_rate": 8.864999999999999e-05, + "loss": 4.2487, + "step": 5452 + }, + { + "epoch": 8.7248, + "grad_norm": 0.0, + "learning_rate": 8.849999999999998e-05, + "loss": 3.2539, + "step": 5453 + }, + { + "epoch": 8.7264, + "grad_norm": 0.0, + "learning_rate": 8.834999999999999e-05, + "loss": 3.7699, + "step": 5454 + }, + { + "epoch": 8.728, + "grad_norm": 0.0, + "learning_rate": 8.819999999999999e-05, + "loss": 2.6604, + "step": 5455 + }, + { + "epoch": 8.7296, + "grad_norm": 0.0, + "learning_rate": 8.804999999999999e-05, + "loss": 4.3472, + "step": 5456 + }, + { + "epoch": 8.7312, + "grad_norm": 0.0, + "learning_rate": 8.789999999999998e-05, + "loss": 2.8844, + "step": 5457 + }, + { + "epoch": 8.7328, + "grad_norm": 0.0, + "learning_rate": 8.774999999999999e-05, + "loss": 3.1081, + "step": 5458 + }, + { + "epoch": 8.7344, + "grad_norm": 0.0, + "learning_rate": 8.759999999999999e-05, + "loss": 2.8376, + "step": 5459 + }, + { + "epoch": 8.736, + "grad_norm": 0.0, + "learning_rate": 8.744999999999998e-05, + "loss": 3.3077, + "step": 5460 + }, + { + "epoch": 8.7376, + "grad_norm": 0.0, + "learning_rate": 8.729999999999998e-05, + "loss": 2.8664, + "step": 5461 + }, + { + "epoch": 8.7392, + "grad_norm": 0.0, + "learning_rate": 8.714999999999999e-05, + "loss": 3.8246, + "step": 5462 + }, + { + "epoch": 8.7408, + "grad_norm": 0.0, + "learning_rate": 8.699999999999999e-05, + "loss": 2.7271, + "step": 5463 + }, + { + "epoch": 8.7424, + "grad_norm": 0.0, + "learning_rate": 8.684999999999998e-05, + "loss": 3.0011, + "step": 5464 + }, + { + "epoch": 8.744, + "grad_norm": 0.0, + "learning_rate": 8.669999999999998e-05, + "loss": 2.5552, + "step": 5465 + }, + { + "epoch": 8.7456, + "grad_norm": 0.0, + "learning_rate": 8.654999999999999e-05, + "loss": 2.7657, + "step": 5466 + }, + { + "epoch": 8.7472, + "grad_norm": 0.0, + "learning_rate": 8.639999999999999e-05, + "loss": 2.8866, + "step": 5467 + }, + { + "epoch": 8.7488, + "grad_norm": 0.0, + "learning_rate": 8.624999999999998e-05, + "loss": 3.3402, + "step": 5468 + }, + { + "epoch": 8.750399999999999, + "grad_norm": 0.0, + "learning_rate": 8.609999999999999e-05, + "loss": 2.9473, + "step": 5469 + }, + { + "epoch": 8.752, + "grad_norm": 0.0, + "learning_rate": 8.594999999999999e-05, + "loss": 2.8834, + "step": 5470 + }, + { + "epoch": 8.7536, + "grad_norm": 0.0, + "learning_rate": 8.579999999999998e-05, + "loss": 2.3812, + "step": 5471 + }, + { + "epoch": 8.7552, + "grad_norm": 0.0, + "learning_rate": 8.564999999999998e-05, + "loss": 3.8734, + "step": 5472 + }, + { + "epoch": 8.7568, + "grad_norm": 0.0, + "learning_rate": 8.549999999999999e-05, + "loss": 2.6016, + "step": 5473 + }, + { + "epoch": 8.7584, + "grad_norm": 0.0, + "learning_rate": 8.534999999999999e-05, + "loss": 2.5534, + "step": 5474 + }, + { + "epoch": 8.76, + "grad_norm": 0.0, + "learning_rate": 8.519999999999998e-05, + "loss": 2.5744, + "step": 5475 + }, + { + "epoch": 8.7616, + "grad_norm": 0.0, + "learning_rate": 8.504999999999998e-05, + "loss": 2.7575, + "step": 5476 + }, + { + "epoch": 8.7632, + "grad_norm": 0.0, + "learning_rate": 8.489999999999999e-05, + "loss": 2.1876, + "step": 5477 + }, + { + "epoch": 8.7648, + "grad_norm": 0.0, + "learning_rate": 8.474999999999999e-05, + "loss": 2.4229, + "step": 5478 + }, + { + "epoch": 8.7664, + "grad_norm": 0.0, + "learning_rate": 8.459999999999998e-05, + "loss": 3.0801, + "step": 5479 + }, + { + "epoch": 8.768, + "grad_norm": 0.0, + "learning_rate": 8.444999999999998e-05, + "loss": 2.9251, + "step": 5480 + }, + { + "epoch": 8.7696, + "grad_norm": 0.0, + "learning_rate": 8.43e-05, + "loss": 2.48, + "step": 5481 + }, + { + "epoch": 8.7712, + "grad_norm": 0.0, + "learning_rate": 8.415e-05, + "loss": 2.5742, + "step": 5482 + }, + { + "epoch": 8.7728, + "grad_norm": 0.0, + "learning_rate": 8.4e-05, + "loss": 2.8933, + "step": 5483 + }, + { + "epoch": 8.7744, + "grad_norm": 0.0, + "learning_rate": 8.385e-05, + "loss": 2.7115, + "step": 5484 + }, + { + "epoch": 8.776, + "grad_norm": 0.0, + "learning_rate": 8.37e-05, + "loss": 2.6023, + "step": 5485 + }, + { + "epoch": 8.7776, + "grad_norm": 0.0, + "learning_rate": 8.355e-05, + "loss": 3.4378, + "step": 5486 + }, + { + "epoch": 8.7792, + "grad_norm": 0.0, + "learning_rate": 8.34e-05, + "loss": 3.2154, + "step": 5487 + }, + { + "epoch": 8.7808, + "grad_norm": 0.0, + "learning_rate": 8.325e-05, + "loss": 3.0487, + "step": 5488 + }, + { + "epoch": 8.782399999999999, + "grad_norm": 0.0, + "learning_rate": 8.31e-05, + "loss": 2.8643, + "step": 5489 + }, + { + "epoch": 8.784, + "grad_norm": 0.0, + "learning_rate": 8.295e-05, + "loss": 2.6776, + "step": 5490 + }, + { + "epoch": 8.7856, + "grad_norm": 0.0, + "learning_rate": 8.28e-05, + "loss": 2.92, + "step": 5491 + }, + { + "epoch": 8.7872, + "grad_norm": 0.0, + "learning_rate": 8.265e-05, + "loss": 3.2343, + "step": 5492 + }, + { + "epoch": 8.7888, + "grad_norm": 0.0, + "learning_rate": 8.25e-05, + "loss": 2.5515, + "step": 5493 + }, + { + "epoch": 8.7904, + "grad_norm": 0.0, + "learning_rate": 8.235e-05, + "loss": 2.9681, + "step": 5494 + }, + { + "epoch": 8.792, + "grad_norm": 0.0, + "learning_rate": 8.22e-05, + "loss": 2.7059, + "step": 5495 + }, + { + "epoch": 8.7936, + "grad_norm": 0.0, + "learning_rate": 8.205e-05, + "loss": 2.2784, + "step": 5496 + }, + { + "epoch": 8.7952, + "grad_norm": 0.0, + "learning_rate": 8.19e-05, + "loss": 3.4781, + "step": 5497 + }, + { + "epoch": 8.7968, + "grad_norm": 0.0, + "learning_rate": 8.175e-05, + "loss": 3.1366, + "step": 5498 + }, + { + "epoch": 8.7984, + "grad_norm": 0.0, + "learning_rate": 8.16e-05, + "loss": 3.1494, + "step": 5499 + }, + { + "epoch": 8.8, + "grad_norm": NaN, + "learning_rate": 8.16e-05, + "loss": 3.1003, + "step": 5500 + }, + { + "epoch": 8.8016, + "grad_norm": 0.0, + "learning_rate": 8.145e-05, + "loss": 3.3448, + "step": 5501 + }, + { + "epoch": 8.8032, + "grad_norm": 0.0, + "learning_rate": 8.13e-05, + "loss": 4.0856, + "step": 5502 + }, + { + "epoch": 8.8048, + "grad_norm": 0.0, + "learning_rate": 8.115e-05, + "loss": 3.9545, + "step": 5503 + }, + { + "epoch": 8.8064, + "grad_norm": 0.0, + "learning_rate": 8.1e-05, + "loss": 3.7369, + "step": 5504 + }, + { + "epoch": 8.808, + "grad_norm": 0.0, + "learning_rate": 8.085e-05, + "loss": 2.977, + "step": 5505 + }, + { + "epoch": 8.8096, + "grad_norm": 0.0, + "learning_rate": 8.07e-05, + "loss": 3.2168, + "step": 5506 + }, + { + "epoch": 8.8112, + "grad_norm": 0.0, + "learning_rate": 8.054999999999999e-05, + "loss": 3.0844, + "step": 5507 + }, + { + "epoch": 8.8128, + "grad_norm": 0.0, + "learning_rate": 8.04e-05, + "loss": 2.8623, + "step": 5508 + }, + { + "epoch": 8.8144, + "grad_norm": 0.0, + "learning_rate": 8.025e-05, + "loss": 3.3578, + "step": 5509 + }, + { + "epoch": 8.816, + "grad_norm": 0.0, + "learning_rate": 8.01e-05, + "loss": 2.7042, + "step": 5510 + }, + { + "epoch": 8.8176, + "grad_norm": 0.0, + "learning_rate": 7.994999999999999e-05, + "loss": 3.3687, + "step": 5511 + }, + { + "epoch": 8.8192, + "grad_norm": 0.0, + "learning_rate": 7.98e-05, + "loss": 3.6682, + "step": 5512 + }, + { + "epoch": 8.8208, + "grad_norm": 0.0, + "learning_rate": 7.965e-05, + "loss": 2.6471, + "step": 5513 + }, + { + "epoch": 8.8224, + "grad_norm": 0.0, + "learning_rate": 7.95e-05, + "loss": 2.5197, + "step": 5514 + }, + { + "epoch": 8.824, + "grad_norm": 0.0, + "learning_rate": 7.934999999999999e-05, + "loss": 2.7002, + "step": 5515 + }, + { + "epoch": 8.8256, + "grad_norm": 0.0, + "learning_rate": 7.92e-05, + "loss": 2.9525, + "step": 5516 + }, + { + "epoch": 8.8272, + "grad_norm": 0.0, + "learning_rate": 7.905e-05, + "loss": 3.031, + "step": 5517 + }, + { + "epoch": 8.8288, + "grad_norm": 0.0, + "learning_rate": 7.89e-05, + "loss": 2.4588, + "step": 5518 + }, + { + "epoch": 8.830400000000001, + "grad_norm": 0.0, + "learning_rate": 7.874999999999999e-05, + "loss": 3.0163, + "step": 5519 + }, + { + "epoch": 8.832, + "grad_norm": 0.0, + "learning_rate": 7.86e-05, + "loss": 2.6547, + "step": 5520 + }, + { + "epoch": 8.8336, + "grad_norm": 0.0, + "learning_rate": 7.845e-05, + "loss": 2.6753, + "step": 5521 + }, + { + "epoch": 8.8352, + "grad_norm": 0.0, + "learning_rate": 7.829999999999999e-05, + "loss": 2.4913, + "step": 5522 + }, + { + "epoch": 8.8368, + "grad_norm": 0.0, + "learning_rate": 7.815e-05, + "loss": 2.6377, + "step": 5523 + }, + { + "epoch": 8.8384, + "grad_norm": 0.0, + "learning_rate": 7.8e-05, + "loss": 2.8134, + "step": 5524 + }, + { + "epoch": 8.84, + "grad_norm": 0.0, + "learning_rate": 7.785e-05, + "loss": 2.7685, + "step": 5525 + }, + { + "epoch": 8.8416, + "grad_norm": 0.0, + "learning_rate": 7.769999999999999e-05, + "loss": 2.4115, + "step": 5526 + }, + { + "epoch": 8.8432, + "grad_norm": 0.0, + "learning_rate": 7.755e-05, + "loss": 3.4808, + "step": 5527 + }, + { + "epoch": 8.8448, + "grad_norm": 0.0, + "learning_rate": 7.74e-05, + "loss": 2.845, + "step": 5528 + }, + { + "epoch": 8.8464, + "grad_norm": 0.0, + "learning_rate": 7.725e-05, + "loss": 3.4166, + "step": 5529 + }, + { + "epoch": 8.848, + "grad_norm": 0.0, + "learning_rate": 7.709999999999999e-05, + "loss": 3.1478, + "step": 5530 + }, + { + "epoch": 8.8496, + "grad_norm": 0.0, + "learning_rate": 7.695e-05, + "loss": 3.3035, + "step": 5531 + }, + { + "epoch": 8.8512, + "grad_norm": 0.0, + "learning_rate": 7.68e-05, + "loss": 3.0173, + "step": 5532 + }, + { + "epoch": 8.8528, + "grad_norm": 0.0, + "learning_rate": 7.664999999999999e-05, + "loss": 3.3192, + "step": 5533 + }, + { + "epoch": 8.8544, + "grad_norm": 0.0, + "learning_rate": 7.649999999999999e-05, + "loss": 2.6861, + "step": 5534 + }, + { + "epoch": 8.856, + "grad_norm": 0.0, + "learning_rate": 7.635e-05, + "loss": 2.8469, + "step": 5535 + }, + { + "epoch": 8.8576, + "grad_norm": 0.0, + "learning_rate": 7.62e-05, + "loss": 2.5906, + "step": 5536 + }, + { + "epoch": 8.8592, + "grad_norm": 0.0, + "learning_rate": 7.604999999999999e-05, + "loss": 3.5129, + "step": 5537 + }, + { + "epoch": 8.8608, + "grad_norm": 0.0, + "learning_rate": 7.589999999999999e-05, + "loss": 2.6874, + "step": 5538 + }, + { + "epoch": 8.862400000000001, + "grad_norm": 0.0, + "learning_rate": 7.575e-05, + "loss": 2.8809, + "step": 5539 + }, + { + "epoch": 8.864, + "grad_norm": 0.0, + "learning_rate": 7.56e-05, + "loss": 2.9368, + "step": 5540 + }, + { + "epoch": 8.8656, + "grad_norm": 0.0, + "learning_rate": 7.544999999999999e-05, + "loss": 3.2077, + "step": 5541 + }, + { + "epoch": 8.8672, + "grad_norm": 0.0, + "learning_rate": 7.529999999999999e-05, + "loss": 2.355, + "step": 5542 + }, + { + "epoch": 8.8688, + "grad_norm": 0.0, + "learning_rate": 7.515e-05, + "loss": 2.6583, + "step": 5543 + }, + { + "epoch": 8.8704, + "grad_norm": 0.0, + "learning_rate": 7.5e-05, + "loss": 2.9874, + "step": 5544 + }, + { + "epoch": 8.872, + "grad_norm": 0.0, + "learning_rate": 7.484999999999999e-05, + "loss": 3.6936, + "step": 5545 + }, + { + "epoch": 8.8736, + "grad_norm": 0.0, + "learning_rate": 7.47e-05, + "loss": 3.0383, + "step": 5546 + }, + { + "epoch": 8.8752, + "grad_norm": 0.0, + "learning_rate": 7.455e-05, + "loss": 2.476, + "step": 5547 + }, + { + "epoch": 8.8768, + "grad_norm": 0.0, + "learning_rate": 7.439999999999999e-05, + "loss": 2.962, + "step": 5548 + }, + { + "epoch": 8.8784, + "grad_norm": 0.0, + "learning_rate": 7.424999999999999e-05, + "loss": 2.8282, + "step": 5549 + }, + { + "epoch": 8.88, + "grad_norm": 0.0, + "learning_rate": 7.41e-05, + "loss": 4.6808, + "step": 5550 + }, + { + "epoch": 8.8816, + "grad_norm": 0.0, + "learning_rate": 7.395e-05, + "loss": 5.0056, + "step": 5551 + }, + { + "epoch": 8.8832, + "grad_norm": 0.0, + "learning_rate": 7.379999999999999e-05, + "loss": 3.5208, + "step": 5552 + }, + { + "epoch": 8.8848, + "grad_norm": 0.0, + "learning_rate": 7.364999999999999e-05, + "loss": 3.9444, + "step": 5553 + }, + { + "epoch": 8.8864, + "grad_norm": 0.0, + "learning_rate": 7.35e-05, + "loss": 3.2421, + "step": 5554 + }, + { + "epoch": 8.888, + "grad_norm": 0.0, + "learning_rate": 7.335e-05, + "loss": 3.4752, + "step": 5555 + }, + { + "epoch": 8.8896, + "grad_norm": 0.0, + "learning_rate": 7.319999999999999e-05, + "loss": 2.616, + "step": 5556 + }, + { + "epoch": 8.8912, + "grad_norm": 0.0, + "learning_rate": 7.304999999999999e-05, + "loss": 2.9658, + "step": 5557 + }, + { + "epoch": 8.8928, + "grad_norm": 0.0, + "learning_rate": 7.29e-05, + "loss": 2.8145, + "step": 5558 + }, + { + "epoch": 8.8944, + "grad_norm": 0.0, + "learning_rate": 7.274999999999999e-05, + "loss": 2.8018, + "step": 5559 + }, + { + "epoch": 8.896, + "grad_norm": 0.0, + "learning_rate": 7.259999999999999e-05, + "loss": 3.2103, + "step": 5560 + }, + { + "epoch": 8.8976, + "grad_norm": 0.0, + "learning_rate": 7.244999999999999e-05, + "loss": 3.0906, + "step": 5561 + }, + { + "epoch": 8.8992, + "grad_norm": 0.0, + "learning_rate": 7.23e-05, + "loss": 2.5659, + "step": 5562 + }, + { + "epoch": 8.9008, + "grad_norm": 0.0, + "learning_rate": 7.214999999999999e-05, + "loss": 3.2138, + "step": 5563 + }, + { + "epoch": 8.9024, + "grad_norm": 0.0, + "learning_rate": 7.199999999999999e-05, + "loss": 2.8943, + "step": 5564 + }, + { + "epoch": 8.904, + "grad_norm": 0.0, + "learning_rate": 7.184999999999998e-05, + "loss": 2.9345, + "step": 5565 + }, + { + "epoch": 8.9056, + "grad_norm": 0.0, + "learning_rate": 7.17e-05, + "loss": 2.9651, + "step": 5566 + }, + { + "epoch": 8.9072, + "grad_norm": 0.0, + "learning_rate": 7.154999999999999e-05, + "loss": 3.0088, + "step": 5567 + }, + { + "epoch": 8.9088, + "grad_norm": 0.0, + "learning_rate": 7.139999999999999e-05, + "loss": 2.5948, + "step": 5568 + }, + { + "epoch": 8.9104, + "grad_norm": 0.0, + "learning_rate": 7.125e-05, + "loss": 2.509, + "step": 5569 + }, + { + "epoch": 8.912, + "grad_norm": 0.0, + "learning_rate": 7.11e-05, + "loss": 2.5297, + "step": 5570 + }, + { + "epoch": 8.9136, + "grad_norm": 0.0, + "learning_rate": 7.094999999999999e-05, + "loss": 2.6333, + "step": 5571 + }, + { + "epoch": 8.9152, + "grad_norm": 0.0, + "learning_rate": 7.079999999999999e-05, + "loss": 3.1458, + "step": 5572 + }, + { + "epoch": 8.9168, + "grad_norm": 0.0, + "learning_rate": 7.065e-05, + "loss": 2.8393, + "step": 5573 + }, + { + "epoch": 8.9184, + "grad_norm": 0.0, + "learning_rate": 7.049999999999999e-05, + "loss": 2.3593, + "step": 5574 + }, + { + "epoch": 8.92, + "grad_norm": 0.0, + "learning_rate": 7.034999999999999e-05, + "loss": 2.2004, + "step": 5575 + }, + { + "epoch": 8.9216, + "grad_norm": 0.0, + "learning_rate": 7.02e-05, + "loss": 3.1726, + "step": 5576 + }, + { + "epoch": 8.9232, + "grad_norm": 0.0, + "learning_rate": 7.005e-05, + "loss": 3.0657, + "step": 5577 + }, + { + "epoch": 8.9248, + "grad_norm": 0.0, + "learning_rate": 6.989999999999999e-05, + "loss": 2.9242, + "step": 5578 + }, + { + "epoch": 8.9264, + "grad_norm": 0.0, + "learning_rate": 6.975e-05, + "loss": 2.8571, + "step": 5579 + }, + { + "epoch": 8.928, + "grad_norm": 0.0, + "learning_rate": 6.96e-05, + "loss": 3.4707, + "step": 5580 + }, + { + "epoch": 8.9296, + "grad_norm": 0.0, + "learning_rate": 6.945e-05, + "loss": 2.372, + "step": 5581 + }, + { + "epoch": 8.9312, + "grad_norm": 0.0, + "learning_rate": 6.93e-05, + "loss": 2.6497, + "step": 5582 + }, + { + "epoch": 8.9328, + "grad_norm": 0.0, + "learning_rate": 6.915e-05, + "loss": 3.2633, + "step": 5583 + }, + { + "epoch": 8.9344, + "grad_norm": 0.0, + "learning_rate": 6.9e-05, + "loss": 2.8765, + "step": 5584 + }, + { + "epoch": 8.936, + "grad_norm": 0.0, + "learning_rate": 6.884999999999999e-05, + "loss": 3.5971, + "step": 5585 + }, + { + "epoch": 8.9376, + "grad_norm": 0.0, + "learning_rate": 6.87e-05, + "loss": 3.4084, + "step": 5586 + }, + { + "epoch": 8.9392, + "grad_norm": 0.0, + "learning_rate": 6.855e-05, + "loss": 2.6225, + "step": 5587 + }, + { + "epoch": 8.9408, + "grad_norm": 0.0, + "learning_rate": 6.84e-05, + "loss": 2.7682, + "step": 5588 + }, + { + "epoch": 8.9424, + "grad_norm": 0.0, + "learning_rate": 6.824999999999999e-05, + "loss": 2.3212, + "step": 5589 + }, + { + "epoch": 8.943999999999999, + "grad_norm": 0.0, + "learning_rate": 6.81e-05, + "loss": 2.308, + "step": 5590 + }, + { + "epoch": 8.9456, + "grad_norm": 0.0, + "learning_rate": 6.795e-05, + "loss": 3.1101, + "step": 5591 + }, + { + "epoch": 8.9472, + "grad_norm": 0.0, + "learning_rate": 6.78e-05, + "loss": 2.9434, + "step": 5592 + }, + { + "epoch": 8.9488, + "grad_norm": 0.0, + "learning_rate": 6.764999999999999e-05, + "loss": 4.3318, + "step": 5593 + }, + { + "epoch": 8.9504, + "grad_norm": 0.0, + "learning_rate": 6.75e-05, + "loss": 2.77, + "step": 5594 + }, + { + "epoch": 8.952, + "grad_norm": 0.0, + "learning_rate": 6.735e-05, + "loss": 2.4421, + "step": 5595 + }, + { + "epoch": 8.9536, + "grad_norm": 0.0, + "learning_rate": 6.72e-05, + "loss": 3.7881, + "step": 5596 + }, + { + "epoch": 8.9552, + "grad_norm": 0.0, + "learning_rate": 6.704999999999999e-05, + "loss": 2.5785, + "step": 5597 + }, + { + "epoch": 8.9568, + "grad_norm": 0.0, + "learning_rate": 6.69e-05, + "loss": 3.3298, + "step": 5598 + }, + { + "epoch": 8.9584, + "grad_norm": 0.0, + "learning_rate": 6.675e-05, + "loss": 3.5415, + "step": 5599 + }, + { + "epoch": 8.96, + "grad_norm": 0.0, + "learning_rate": 6.659999999999999e-05, + "loss": 4.0338, + "step": 5600 + }, + { + "epoch": 8.9616, + "grad_norm": 0.0, + "learning_rate": 6.644999999999999e-05, + "loss": 4.8242, + "step": 5601 + }, + { + "epoch": 8.9632, + "grad_norm": 0.0, + "learning_rate": 6.63e-05, + "loss": 4.5149, + "step": 5602 + }, + { + "epoch": 8.9648, + "grad_norm": 0.0, + "learning_rate": 6.615e-05, + "loss": 3.0374, + "step": 5603 + }, + { + "epoch": 8.9664, + "grad_norm": 0.0, + "learning_rate": 6.599999999999999e-05, + "loss": 3.8995, + "step": 5604 + }, + { + "epoch": 8.968, + "grad_norm": 0.0, + "learning_rate": 6.584999999999999e-05, + "loss": 2.7828, + "step": 5605 + }, + { + "epoch": 8.9696, + "grad_norm": 0.0, + "learning_rate": 6.57e-05, + "loss": 2.8965, + "step": 5606 + }, + { + "epoch": 8.9712, + "grad_norm": 0.0, + "learning_rate": 6.555e-05, + "loss": 2.3991, + "step": 5607 + }, + { + "epoch": 8.9728, + "grad_norm": 0.0, + "learning_rate": 6.539999999999999e-05, + "loss": 2.7713, + "step": 5608 + }, + { + "epoch": 8.9744, + "grad_norm": 0.0, + "learning_rate": 6.525e-05, + "loss": 4.3079, + "step": 5609 + }, + { + "epoch": 8.975999999999999, + "grad_norm": 0.0, + "learning_rate": 6.51e-05, + "loss": 2.7852, + "step": 5610 + }, + { + "epoch": 8.9776, + "grad_norm": 0.0, + "learning_rate": 6.494999999999999e-05, + "loss": 2.3143, + "step": 5611 + }, + { + "epoch": 8.9792, + "grad_norm": 0.0, + "learning_rate": 6.479999999999999e-05, + "loss": 2.4507, + "step": 5612 + }, + { + "epoch": 8.9808, + "grad_norm": 0.0, + "learning_rate": 6.465e-05, + "loss": 2.6195, + "step": 5613 + }, + { + "epoch": 8.9824, + "grad_norm": 0.0, + "learning_rate": 6.45e-05, + "loss": 2.269, + "step": 5614 + }, + { + "epoch": 8.984, + "grad_norm": 0.0, + "learning_rate": 6.434999999999999e-05, + "loss": 3.2257, + "step": 5615 + }, + { + "epoch": 8.9856, + "grad_norm": 0.0, + "learning_rate": 6.419999999999999e-05, + "loss": 2.6905, + "step": 5616 + }, + { + "epoch": 8.9872, + "grad_norm": 0.0, + "learning_rate": 6.405e-05, + "loss": 2.8038, + "step": 5617 + }, + { + "epoch": 8.9888, + "grad_norm": 0.0, + "learning_rate": 6.39e-05, + "loss": 2.8442, + "step": 5618 + }, + { + "epoch": 8.9904, + "grad_norm": 0.0, + "learning_rate": 6.374999999999999e-05, + "loss": 2.72, + "step": 5619 + }, + { + "epoch": 8.992, + "grad_norm": 0.0, + "learning_rate": 6.359999999999999e-05, + "loss": 2.7504, + "step": 5620 + }, + { + "epoch": 8.9936, + "grad_norm": 0.0, + "learning_rate": 6.345e-05, + "loss": 3.0555, + "step": 5621 + }, + { + "epoch": 8.9952, + "grad_norm": 0.0, + "learning_rate": 6.33e-05, + "loss": 2.982, + "step": 5622 + }, + { + "epoch": 8.9968, + "grad_norm": 0.0, + "learning_rate": 6.314999999999999e-05, + "loss": 2.7094, + "step": 5623 + }, + { + "epoch": 8.9984, + "grad_norm": 0.0, + "learning_rate": 6.299999999999999e-05, + "loss": 3.1366, + "step": 5624 + }, + { + "epoch": 9.0, + "grad_norm": 0.0, + "learning_rate": 6.285e-05, + "loss": 3.7605, + "step": 5625 + }, + { + "epoch": 9.0016, + "grad_norm": 0.0, + "learning_rate": 6.269999999999999e-05, + "loss": 3.9877, + "step": 5626 + }, + { + "epoch": 9.0032, + "grad_norm": 0.0, + "learning_rate": 6.254999999999999e-05, + "loss": 4.1149, + "step": 5627 + }, + { + "epoch": 9.0048, + "grad_norm": 0.0, + "learning_rate": 6.239999999999999e-05, + "loss": 3.6217, + "step": 5628 + }, + { + "epoch": 9.0064, + "grad_norm": 0.0, + "learning_rate": 6.225e-05, + "loss": 4.6033, + "step": 5629 + }, + { + "epoch": 9.008, + "grad_norm": 0.0, + "learning_rate": 6.209999999999999e-05, + "loss": 3.7522, + "step": 5630 + }, + { + "epoch": 9.0096, + "grad_norm": 0.0, + "learning_rate": 6.194999999999999e-05, + "loss": 3.3814, + "step": 5631 + }, + { + "epoch": 9.0112, + "grad_norm": 0.0, + "learning_rate": 6.18e-05, + "loss": 2.7782, + "step": 5632 + }, + { + "epoch": 9.0128, + "grad_norm": 0.0, + "learning_rate": 6.165e-05, + "loss": 2.7994, + "step": 5633 + }, + { + "epoch": 9.0144, + "grad_norm": 0.0, + "learning_rate": 6.149999999999999e-05, + "loss": 2.7652, + "step": 5634 + }, + { + "epoch": 9.016, + "grad_norm": 0.0, + "learning_rate": 6.134999999999999e-05, + "loss": 2.9545, + "step": 5635 + }, + { + "epoch": 9.0176, + "grad_norm": 0.0, + "learning_rate": 6.12e-05, + "loss": 2.6188, + "step": 5636 + }, + { + "epoch": 9.0192, + "grad_norm": 0.0, + "learning_rate": 6.104999999999999e-05, + "loss": 3.047, + "step": 5637 + }, + { + "epoch": 9.0208, + "grad_norm": 0.0, + "learning_rate": 6.0899999999999996e-05, + "loss": 3.3657, + "step": 5638 + }, + { + "epoch": 9.0224, + "grad_norm": 0.0, + "learning_rate": 6.075e-05, + "loss": 3.0335, + "step": 5639 + }, + { + "epoch": 9.024, + "grad_norm": 0.0, + "learning_rate": 6.0599999999999996e-05, + "loss": 2.4625, + "step": 5640 + }, + { + "epoch": 9.0256, + "grad_norm": 0.0, + "learning_rate": 6.045e-05, + "loss": 2.4959, + "step": 5641 + }, + { + "epoch": 9.0272, + "grad_norm": 0.0, + "learning_rate": 6.0299999999999995e-05, + "loss": 3.5343, + "step": 5642 + }, + { + "epoch": 9.0288, + "grad_norm": 0.0, + "learning_rate": 6.015e-05, + "loss": 3.2499, + "step": 5643 + }, + { + "epoch": 9.0304, + "grad_norm": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.5015, + "step": 5644 + }, + { + "epoch": 9.032, + "grad_norm": 0.0, + "learning_rate": 5.985e-05, + "loss": 2.3126, + "step": 5645 + }, + { + "epoch": 9.0336, + "grad_norm": 0.0, + "learning_rate": 5.97e-05, + "loss": 2.6735, + "step": 5646 + }, + { + "epoch": 9.0352, + "grad_norm": 0.0, + "learning_rate": 5.955e-05, + "loss": 2.4169, + "step": 5647 + }, + { + "epoch": 9.0368, + "grad_norm": 0.0, + "learning_rate": 5.94e-05, + "loss": 3.1192, + "step": 5648 + }, + { + "epoch": 9.0384, + "grad_norm": 0.0, + "learning_rate": 5.925e-05, + "loss": 3.5278, + "step": 5649 + }, + { + "epoch": 9.04, + "grad_norm": 0.0, + "learning_rate": 5.91e-05, + "loss": 3.8993, + "step": 5650 + }, + { + "epoch": 9.0416, + "grad_norm": 0.0, + "learning_rate": 5.8949999999999996e-05, + "loss": 3.1883, + "step": 5651 + }, + { + "epoch": 9.0432, + "grad_norm": 0.0, + "learning_rate": 5.88e-05, + "loss": 2.9788, + "step": 5652 + }, + { + "epoch": 9.0448, + "grad_norm": 0.0, + "learning_rate": 5.8649999999999996e-05, + "loss": 2.6266, + "step": 5653 + }, + { + "epoch": 9.0464, + "grad_norm": 0.0, + "learning_rate": 5.85e-05, + "loss": 2.5842, + "step": 5654 + }, + { + "epoch": 9.048, + "grad_norm": 0.0, + "learning_rate": 5.8349999999999995e-05, + "loss": 2.5331, + "step": 5655 + }, + { + "epoch": 9.0496, + "grad_norm": 0.0, + "learning_rate": 5.82e-05, + "loss": 2.6838, + "step": 5656 + }, + { + "epoch": 9.0512, + "grad_norm": 0.0, + "learning_rate": 5.8049999999999995e-05, + "loss": 2.9824, + "step": 5657 + }, + { + "epoch": 9.0528, + "grad_norm": 0.0, + "learning_rate": 5.79e-05, + "loss": 2.8042, + "step": 5658 + }, + { + "epoch": 9.0544, + "grad_norm": 0.0, + "learning_rate": 5.7749999999999994e-05, + "loss": 2.4679, + "step": 5659 + }, + { + "epoch": 9.056, + "grad_norm": 0.0, + "learning_rate": 5.76e-05, + "loss": 2.2159, + "step": 5660 + }, + { + "epoch": 9.0576, + "grad_norm": 0.0, + "learning_rate": 5.7449999999999994e-05, + "loss": 2.9483, + "step": 5661 + }, + { + "epoch": 9.0592, + "grad_norm": 0.0, + "learning_rate": 5.73e-05, + "loss": 3.2915, + "step": 5662 + }, + { + "epoch": 9.0608, + "grad_norm": 0.0, + "learning_rate": 5.714999999999999e-05, + "loss": 2.5169, + "step": 5663 + }, + { + "epoch": 9.0624, + "grad_norm": 0.0, + "learning_rate": 5.6999999999999996e-05, + "loss": 3.3614, + "step": 5664 + }, + { + "epoch": 9.064, + "grad_norm": 0.0, + "learning_rate": 5.684999999999999e-05, + "loss": 2.4081, + "step": 5665 + }, + { + "epoch": 9.0656, + "grad_norm": 0.0, + "learning_rate": 5.6699999999999996e-05, + "loss": 3.4789, + "step": 5666 + }, + { + "epoch": 9.0672, + "grad_norm": 0.0, + "learning_rate": 5.654999999999999e-05, + "loss": 2.5808, + "step": 5667 + }, + { + "epoch": 9.0688, + "grad_norm": 0.0, + "learning_rate": 5.6399999999999995e-05, + "loss": 2.5403, + "step": 5668 + }, + { + "epoch": 9.0704, + "grad_norm": 0.0, + "learning_rate": 5.625e-05, + "loss": 2.92, + "step": 5669 + }, + { + "epoch": 9.072, + "grad_norm": 0.0, + "learning_rate": 5.6099999999999995e-05, + "loss": 2.9666, + "step": 5670 + }, + { + "epoch": 9.0736, + "grad_norm": 0.0, + "learning_rate": 5.595e-05, + "loss": 3.6537, + "step": 5671 + }, + { + "epoch": 9.0752, + "grad_norm": 0.0, + "learning_rate": 5.5799999999999994e-05, + "loss": 3.8247, + "step": 5672 + }, + { + "epoch": 9.0768, + "grad_norm": 0.0, + "learning_rate": 5.565e-05, + "loss": 3.4512, + "step": 5673 + }, + { + "epoch": 9.0784, + "grad_norm": 0.0, + "learning_rate": 5.5499999999999994e-05, + "loss": 2.5999, + "step": 5674 + }, + { + "epoch": 9.08, + "grad_norm": 0.0, + "learning_rate": 5.535e-05, + "loss": 3.7474, + "step": 5675 + }, + { + "epoch": 9.0816, + "grad_norm": 0.0, + "learning_rate": 5.519999999999999e-05, + "loss": 3.7495, + "step": 5676 + }, + { + "epoch": 9.0832, + "grad_norm": 0.0, + "learning_rate": 5.5049999999999996e-05, + "loss": 3.5728, + "step": 5677 + }, + { + "epoch": 9.0848, + "grad_norm": 0.0, + "learning_rate": 5.489999999999999e-05, + "loss": 3.3557, + "step": 5678 + }, + { + "epoch": 9.0864, + "grad_norm": 0.0, + "learning_rate": 5.4749999999999996e-05, + "loss": 3.2979, + "step": 5679 + }, + { + "epoch": 9.088, + "grad_norm": 0.0, + "learning_rate": 5.459999999999999e-05, + "loss": 3.5679, + "step": 5680 + }, + { + "epoch": 9.0896, + "grad_norm": 0.0, + "learning_rate": 5.4449999999999995e-05, + "loss": 2.8948, + "step": 5681 + }, + { + "epoch": 9.0912, + "grad_norm": 0.0, + "learning_rate": 5.429999999999999e-05, + "loss": 3.92, + "step": 5682 + }, + { + "epoch": 9.0928, + "grad_norm": 0.0, + "learning_rate": 5.4149999999999995e-05, + "loss": 3.2817, + "step": 5683 + }, + { + "epoch": 9.0944, + "grad_norm": 0.0, + "learning_rate": 5.399999999999999e-05, + "loss": 3.2031, + "step": 5684 + }, + { + "epoch": 9.096, + "grad_norm": 0.0, + "learning_rate": 5.3849999999999994e-05, + "loss": 2.8687, + "step": 5685 + }, + { + "epoch": 9.0976, + "grad_norm": 0.0, + "learning_rate": 5.369999999999999e-05, + "loss": 2.5963, + "step": 5686 + }, + { + "epoch": 9.0992, + "grad_norm": 0.0, + "learning_rate": 5.3549999999999994e-05, + "loss": 2.6347, + "step": 5687 + }, + { + "epoch": 9.1008, + "grad_norm": 0.0, + "learning_rate": 5.339999999999999e-05, + "loss": 3.149, + "step": 5688 + }, + { + "epoch": 9.1024, + "grad_norm": 0.0, + "learning_rate": 5.324999999999999e-05, + "loss": 2.6927, + "step": 5689 + }, + { + "epoch": 9.104, + "grad_norm": 0.0, + "learning_rate": 5.309999999999999e-05, + "loss": 2.7637, + "step": 5690 + }, + { + "epoch": 9.1056, + "grad_norm": 0.0, + "learning_rate": 5.294999999999999e-05, + "loss": 2.6973, + "step": 5691 + }, + { + "epoch": 9.1072, + "grad_norm": 0.0, + "learning_rate": 5.279999999999999e-05, + "loss": 3.2617, + "step": 5692 + }, + { + "epoch": 9.1088, + "grad_norm": 0.0, + "learning_rate": 5.264999999999999e-05, + "loss": 3.2056, + "step": 5693 + }, + { + "epoch": 9.1104, + "grad_norm": 0.0, + "learning_rate": 5.2499999999999995e-05, + "loss": 3.1844, + "step": 5694 + }, + { + "epoch": 9.112, + "grad_norm": 0.0, + "learning_rate": 5.234999999999999e-05, + "loss": 2.8712, + "step": 5695 + }, + { + "epoch": 9.1136, + "grad_norm": 0.0, + "learning_rate": 5.2199999999999995e-05, + "loss": 2.5566, + "step": 5696 + }, + { + "epoch": 9.1152, + "grad_norm": 0.0, + "learning_rate": 5.204999999999999e-05, + "loss": 3.2583, + "step": 5697 + }, + { + "epoch": 9.1168, + "grad_norm": 0.0, + "learning_rate": 5.1899999999999994e-05, + "loss": 3.2741, + "step": 5698 + }, + { + "epoch": 9.1184, + "grad_norm": 0.0, + "learning_rate": 5.174999999999999e-05, + "loss": 2.7779, + "step": 5699 + }, + { + "epoch": 9.12, + "grad_norm": 0.0, + "learning_rate": 5.1599999999999994e-05, + "loss": 4.0854, + "step": 5700 + }, + { + "epoch": 9.1216, + "grad_norm": 0.0, + "learning_rate": 5.145e-05, + "loss": 3.047, + "step": 5701 + }, + { + "epoch": 9.1232, + "grad_norm": 0.0, + "learning_rate": 5.13e-05, + "loss": 2.4914, + "step": 5702 + }, + { + "epoch": 9.1248, + "grad_norm": 0.0, + "learning_rate": 5.1149999999999996e-05, + "loss": 2.7614, + "step": 5703 + }, + { + "epoch": 9.1264, + "grad_norm": 0.0, + "learning_rate": 5.1e-05, + "loss": 3.0712, + "step": 5704 + }, + { + "epoch": 9.128, + "grad_norm": 0.0, + "learning_rate": 5.0849999999999996e-05, + "loss": 2.5899, + "step": 5705 + }, + { + "epoch": 9.1296, + "grad_norm": 0.0, + "learning_rate": 5.07e-05, + "loss": 3.2392, + "step": 5706 + }, + { + "epoch": 9.1312, + "grad_norm": 0.0, + "learning_rate": 5.055e-05, + "loss": 2.7351, + "step": 5707 + }, + { + "epoch": 9.1328, + "grad_norm": 0.0, + "learning_rate": 5.04e-05, + "loss": 3.9668, + "step": 5708 + }, + { + "epoch": 9.1344, + "grad_norm": 0.0, + "learning_rate": 5.025e-05, + "loss": 2.4666, + "step": 5709 + }, + { + "epoch": 9.136, + "grad_norm": 0.0, + "learning_rate": 5.01e-05, + "loss": 2.7733, + "step": 5710 + }, + { + "epoch": 9.1376, + "grad_norm": 0.0, + "learning_rate": 4.995e-05, + "loss": 2.1372, + "step": 5711 + }, + { + "epoch": 9.1392, + "grad_norm": 0.0, + "learning_rate": 4.98e-05, + "loss": 2.7009, + "step": 5712 + }, + { + "epoch": 9.1408, + "grad_norm": 0.0, + "learning_rate": 4.965e-05, + "loss": 2.9297, + "step": 5713 + }, + { + "epoch": 9.1424, + "grad_norm": 0.0, + "learning_rate": 4.95e-05, + "loss": 2.505, + "step": 5714 + }, + { + "epoch": 9.144, + "grad_norm": 0.0, + "learning_rate": 4.935e-05, + "loss": 2.9298, + "step": 5715 + }, + { + "epoch": 9.1456, + "grad_norm": 0.0, + "learning_rate": 4.9199999999999997e-05, + "loss": 2.3865, + "step": 5716 + }, + { + "epoch": 9.1472, + "grad_norm": 0.0, + "learning_rate": 4.905e-05, + "loss": 2.2821, + "step": 5717 + }, + { + "epoch": 9.1488, + "grad_norm": 0.0, + "learning_rate": 4.8899999999999996e-05, + "loss": 3.4511, + "step": 5718 + }, + { + "epoch": 9.1504, + "grad_norm": 0.0, + "learning_rate": 4.875e-05, + "loss": 3.0349, + "step": 5719 + }, + { + "epoch": 9.152, + "grad_norm": 0.0, + "learning_rate": 4.8599999999999995e-05, + "loss": 3.9405, + "step": 5720 + }, + { + "epoch": 9.1536, + "grad_norm": 0.0, + "learning_rate": 4.845e-05, + "loss": 2.305, + "step": 5721 + }, + { + "epoch": 9.1552, + "grad_norm": 0.0, + "learning_rate": 4.8299999999999995e-05, + "loss": 3.6042, + "step": 5722 + }, + { + "epoch": 9.1568, + "grad_norm": 0.0, + "learning_rate": 4.815e-05, + "loss": 3.039, + "step": 5723 + }, + { + "epoch": 9.1584, + "grad_norm": 0.0, + "learning_rate": 4.7999999999999994e-05, + "loss": 2.7975, + "step": 5724 + }, + { + "epoch": 9.16, + "grad_norm": NaN, + "learning_rate": 4.7999999999999994e-05, + "loss": 2.1338, + "step": 5725 + }, + { + "epoch": 9.1616, + "grad_norm": 0.0, + "learning_rate": 4.785e-05, + "loss": 4.1658, + "step": 5726 + }, + { + "epoch": 9.1632, + "grad_norm": 0.0, + "learning_rate": 4.7699999999999994e-05, + "loss": 3.3, + "step": 5727 + }, + { + "epoch": 9.1648, + "grad_norm": 0.0, + "learning_rate": 4.755e-05, + "loss": 3.6535, + "step": 5728 + }, + { + "epoch": 9.1664, + "grad_norm": 0.0, + "learning_rate": 4.7399999999999993e-05, + "loss": 3.4712, + "step": 5729 + }, + { + "epoch": 9.168, + "grad_norm": 0.0, + "learning_rate": 4.7249999999999997e-05, + "loss": 3.1756, + "step": 5730 + }, + { + "epoch": 9.1696, + "grad_norm": 0.0, + "learning_rate": 4.709999999999999e-05, + "loss": 2.7233, + "step": 5731 + }, + { + "epoch": 9.1712, + "grad_norm": 0.0, + "learning_rate": 4.6949999999999996e-05, + "loss": 2.8601, + "step": 5732 + }, + { + "epoch": 9.1728, + "grad_norm": 0.0, + "learning_rate": 4.68e-05, + "loss": 4.1129, + "step": 5733 + }, + { + "epoch": 9.1744, + "grad_norm": 0.0, + "learning_rate": 4.6649999999999996e-05, + "loss": 3.6857, + "step": 5734 + }, + { + "epoch": 9.176, + "grad_norm": 0.0, + "learning_rate": 4.65e-05, + "loss": 2.8752, + "step": 5735 + }, + { + "epoch": 9.1776, + "grad_norm": 0.0, + "learning_rate": 4.6349999999999995e-05, + "loss": 2.7681, + "step": 5736 + }, + { + "epoch": 9.1792, + "grad_norm": 0.0, + "learning_rate": 4.62e-05, + "loss": 3.0197, + "step": 5737 + }, + { + "epoch": 9.1808, + "grad_norm": 0.0, + "learning_rate": 4.6049999999999994e-05, + "loss": 2.6093, + "step": 5738 + }, + { + "epoch": 9.1824, + "grad_norm": 0.0, + "learning_rate": 4.59e-05, + "loss": 3.1584, + "step": 5739 + }, + { + "epoch": 9.184, + "grad_norm": 0.0, + "learning_rate": 4.5749999999999994e-05, + "loss": 3.0305, + "step": 5740 + }, + { + "epoch": 9.1856, + "grad_norm": 0.0, + "learning_rate": 4.56e-05, + "loss": 2.4518, + "step": 5741 + }, + { + "epoch": 9.1872, + "grad_norm": 0.0, + "learning_rate": 4.5449999999999993e-05, + "loss": 2.417, + "step": 5742 + }, + { + "epoch": 9.1888, + "grad_norm": 0.0, + "learning_rate": 4.5299999999999997e-05, + "loss": 2.9896, + "step": 5743 + }, + { + "epoch": 9.1904, + "grad_norm": 0.0, + "learning_rate": 4.514999999999999e-05, + "loss": 3.043, + "step": 5744 + }, + { + "epoch": 9.192, + "grad_norm": 0.0, + "learning_rate": 4.4999999999999996e-05, + "loss": 2.8806, + "step": 5745 + }, + { + "epoch": 9.1936, + "grad_norm": 0.0, + "learning_rate": 4.484999999999999e-05, + "loss": 2.6109, + "step": 5746 + }, + { + "epoch": 9.1952, + "grad_norm": 0.0, + "learning_rate": 4.4699999999999996e-05, + "loss": 2.5296, + "step": 5747 + }, + { + "epoch": 9.1968, + "grad_norm": 0.0, + "learning_rate": 4.454999999999999e-05, + "loss": 3.3942, + "step": 5748 + }, + { + "epoch": 9.1984, + "grad_norm": 0.0, + "learning_rate": 4.4399999999999995e-05, + "loss": 2.9008, + "step": 5749 + }, + { + "epoch": 9.2, + "grad_norm": 0.0, + "learning_rate": 4.424999999999999e-05, + "loss": 3.6638, + "step": 5750 + }, + { + "epoch": 9.2016, + "grad_norm": 0.0, + "learning_rate": 4.4099999999999995e-05, + "loss": 2.678, + "step": 5751 + }, + { + "epoch": 9.2032, + "grad_norm": 0.0, + "learning_rate": 4.394999999999999e-05, + "loss": 3.0434, + "step": 5752 + }, + { + "epoch": 9.2048, + "grad_norm": 0.0, + "learning_rate": 4.3799999999999994e-05, + "loss": 3.5293, + "step": 5753 + }, + { + "epoch": 9.2064, + "grad_norm": 0.0, + "learning_rate": 4.364999999999999e-05, + "loss": 2.5569, + "step": 5754 + }, + { + "epoch": 9.208, + "grad_norm": 0.0, + "learning_rate": 4.3499999999999993e-05, + "loss": 3.0115, + "step": 5755 + }, + { + "epoch": 9.2096, + "grad_norm": 0.0, + "learning_rate": 4.334999999999999e-05, + "loss": 2.4495, + "step": 5756 + }, + { + "epoch": 9.2112, + "grad_norm": 0.0, + "learning_rate": 4.319999999999999e-05, + "loss": 2.937, + "step": 5757 + }, + { + "epoch": 9.2128, + "grad_norm": 0.0, + "learning_rate": 4.3049999999999996e-05, + "loss": 2.8026, + "step": 5758 + }, + { + "epoch": 9.2144, + "grad_norm": 0.0, + "learning_rate": 4.289999999999999e-05, + "loss": 2.8708, + "step": 5759 + }, + { + "epoch": 9.216, + "grad_norm": 0.0, + "learning_rate": 4.2749999999999996e-05, + "loss": 3.5378, + "step": 5760 + }, + { + "epoch": 9.2176, + "grad_norm": 0.0, + "learning_rate": 4.259999999999999e-05, + "loss": 2.7786, + "step": 5761 + }, + { + "epoch": 9.2192, + "grad_norm": 0.0, + "learning_rate": 4.2449999999999995e-05, + "loss": 2.6413, + "step": 5762 + }, + { + "epoch": 9.2208, + "grad_norm": 0.0, + "learning_rate": 4.229999999999999e-05, + "loss": 2.7484, + "step": 5763 + }, + { + "epoch": 9.2224, + "grad_norm": 0.0, + "learning_rate": 4.215e-05, + "loss": 2.9544, + "step": 5764 + }, + { + "epoch": 9.224, + "grad_norm": 0.0, + "learning_rate": 4.2e-05, + "loss": 3.135, + "step": 5765 + }, + { + "epoch": 9.2256, + "grad_norm": 0.0, + "learning_rate": 4.185e-05, + "loss": 2.823, + "step": 5766 + }, + { + "epoch": 9.2272, + "grad_norm": 0.0, + "learning_rate": 4.17e-05, + "loss": 3.5126, + "step": 5767 + }, + { + "epoch": 9.2288, + "grad_norm": 0.0, + "learning_rate": 4.155e-05, + "loss": 3.7753, + "step": 5768 + }, + { + "epoch": 9.2304, + "grad_norm": 0.0, + "learning_rate": 4.14e-05, + "loss": 2.8611, + "step": 5769 + }, + { + "epoch": 9.232, + "grad_norm": 0.0, + "learning_rate": 4.125e-05, + "loss": 2.6387, + "step": 5770 + }, + { + "epoch": 9.2336, + "grad_norm": 0.0, + "learning_rate": 4.11e-05, + "loss": 2.8033, + "step": 5771 + }, + { + "epoch": 9.2352, + "grad_norm": 0.0, + "learning_rate": 4.095e-05, + "loss": 2.6439, + "step": 5772 + }, + { + "epoch": 9.2368, + "grad_norm": 0.0, + "learning_rate": 4.08e-05, + "loss": 2.7774, + "step": 5773 + }, + { + "epoch": 9.2384, + "grad_norm": 0.0, + "learning_rate": 4.065e-05, + "loss": 2.9947, + "step": 5774 + }, + { + "epoch": 9.24, + "grad_norm": 0.0, + "learning_rate": 4.05e-05, + "loss": 3.3206, + "step": 5775 + }, + { + "epoch": 9.2416, + "grad_norm": 0.0, + "learning_rate": 4.035e-05, + "loss": 3.8351, + "step": 5776 + }, + { + "epoch": 9.2432, + "grad_norm": 0.0, + "learning_rate": 4.02e-05, + "loss": 4.5679, + "step": 5777 + }, + { + "epoch": 9.2448, + "grad_norm": 0.0, + "learning_rate": 4.005e-05, + "loss": 3.8772, + "step": 5778 + }, + { + "epoch": 9.2464, + "grad_norm": 0.0, + "learning_rate": 3.99e-05, + "loss": 3.8999, + "step": 5779 + }, + { + "epoch": 9.248, + "grad_norm": 0.0, + "learning_rate": 3.975e-05, + "loss": 3.509, + "step": 5780 + }, + { + "epoch": 9.2496, + "grad_norm": 0.0, + "learning_rate": 3.96e-05, + "loss": 3.024, + "step": 5781 + }, + { + "epoch": 9.2512, + "grad_norm": 0.0, + "learning_rate": 3.945e-05, + "loss": 3.4285, + "step": 5782 + }, + { + "epoch": 9.2528, + "grad_norm": 0.0, + "learning_rate": 3.93e-05, + "loss": 3.8175, + "step": 5783 + }, + { + "epoch": 9.2544, + "grad_norm": 0.0, + "learning_rate": 3.9149999999999996e-05, + "loss": 2.5193, + "step": 5784 + }, + { + "epoch": 9.256, + "grad_norm": 0.0, + "learning_rate": 3.9e-05, + "loss": 3.9423, + "step": 5785 + }, + { + "epoch": 9.2576, + "grad_norm": 0.0, + "learning_rate": 3.8849999999999996e-05, + "loss": 3.4668, + "step": 5786 + }, + { + "epoch": 9.2592, + "grad_norm": 0.0, + "learning_rate": 3.87e-05, + "loss": 3.8142, + "step": 5787 + }, + { + "epoch": 9.2608, + "grad_norm": 0.0, + "learning_rate": 3.8549999999999995e-05, + "loss": 3.697, + "step": 5788 + }, + { + "epoch": 9.2624, + "grad_norm": 0.0, + "learning_rate": 3.84e-05, + "loss": 2.7373, + "step": 5789 + }, + { + "epoch": 9.264, + "grad_norm": 0.0, + "learning_rate": 3.8249999999999995e-05, + "loss": 3.5196, + "step": 5790 + }, + { + "epoch": 9.2656, + "grad_norm": 0.0, + "learning_rate": 3.81e-05, + "loss": 2.7335, + "step": 5791 + }, + { + "epoch": 9.2672, + "grad_norm": 0.0, + "learning_rate": 3.7949999999999994e-05, + "loss": 2.546, + "step": 5792 + }, + { + "epoch": 9.2688, + "grad_norm": 0.0, + "learning_rate": 3.78e-05, + "loss": 3.3374, + "step": 5793 + }, + { + "epoch": 9.2704, + "grad_norm": 0.0, + "learning_rate": 3.7649999999999994e-05, + "loss": 2.9118, + "step": 5794 + }, + { + "epoch": 9.272, + "grad_norm": 0.0, + "learning_rate": 3.75e-05, + "loss": 3.0383, + "step": 5795 + }, + { + "epoch": 9.2736, + "grad_norm": 0.0, + "learning_rate": 3.735e-05, + "loss": 2.5258, + "step": 5796 + }, + { + "epoch": 9.2752, + "grad_norm": 0.0, + "learning_rate": 3.7199999999999996e-05, + "loss": 2.2185, + "step": 5797 + }, + { + "epoch": 9.2768, + "grad_norm": 0.0, + "learning_rate": 3.705e-05, + "loss": 2.511, + "step": 5798 + }, + { + "epoch": 9.2784, + "grad_norm": 0.0, + "learning_rate": 3.6899999999999996e-05, + "loss": 2.5834, + "step": 5799 + }, + { + "epoch": 9.28, + "grad_norm": 0.0, + "learning_rate": 3.675e-05, + "loss": 2.6032, + "step": 5800 + }, + { + "epoch": 9.2816, + "grad_norm": 0.0, + "learning_rate": 3.6599999999999995e-05, + "loss": 2.407, + "step": 5801 + }, + { + "epoch": 9.2832, + "grad_norm": 0.0, + "learning_rate": 3.645e-05, + "loss": 2.821, + "step": 5802 + }, + { + "epoch": 9.2848, + "grad_norm": 0.0, + "learning_rate": 3.6299999999999995e-05, + "loss": 2.4382, + "step": 5803 + }, + { + "epoch": 9.2864, + "grad_norm": 0.0, + "learning_rate": 3.615e-05, + "loss": 2.9931, + "step": 5804 + }, + { + "epoch": 9.288, + "grad_norm": 0.0, + "learning_rate": 3.5999999999999994e-05, + "loss": 2.8703, + "step": 5805 + }, + { + "epoch": 9.2896, + "grad_norm": 0.0, + "learning_rate": 3.585e-05, + "loss": 2.5434, + "step": 5806 + }, + { + "epoch": 9.2912, + "grad_norm": 0.0, + "learning_rate": 3.5699999999999994e-05, + "loss": 3.2047, + "step": 5807 + }, + { + "epoch": 9.2928, + "grad_norm": 0.0, + "learning_rate": 3.555e-05, + "loss": 2.3115, + "step": 5808 + }, + { + "epoch": 9.2944, + "grad_norm": 0.0, + "learning_rate": 3.539999999999999e-05, + "loss": 2.7968, + "step": 5809 + }, + { + "epoch": 9.296, + "grad_norm": 0.0, + "learning_rate": 3.5249999999999996e-05, + "loss": 3.1432, + "step": 5810 + }, + { + "epoch": 9.2976, + "grad_norm": 0.0, + "learning_rate": 3.51e-05, + "loss": 3.9086, + "step": 5811 + }, + { + "epoch": 9.2992, + "grad_norm": 0.0, + "learning_rate": 3.4949999999999996e-05, + "loss": 2.4662, + "step": 5812 + }, + { + "epoch": 9.3008, + "grad_norm": 0.0, + "learning_rate": 3.48e-05, + "loss": 2.2656, + "step": 5813 + }, + { + "epoch": 9.3024, + "grad_norm": 0.0, + "learning_rate": 3.465e-05, + "loss": 2.8611, + "step": 5814 + }, + { + "epoch": 9.304, + "grad_norm": 0.0, + "learning_rate": 3.45e-05, + "loss": 3.6223, + "step": 5815 + }, + { + "epoch": 9.3056, + "grad_norm": 0.0, + "learning_rate": 3.435e-05, + "loss": 2.6435, + "step": 5816 + }, + { + "epoch": 9.3072, + "grad_norm": 0.0, + "learning_rate": 3.42e-05, + "loss": 3.2736, + "step": 5817 + }, + { + "epoch": 9.3088, + "grad_norm": 0.0, + "learning_rate": 3.405e-05, + "loss": 2.9351, + "step": 5818 + }, + { + "epoch": 9.3104, + "grad_norm": 0.0, + "learning_rate": 3.39e-05, + "loss": 2.6843, + "step": 5819 + }, + { + "epoch": 9.312, + "grad_norm": 0.0, + "learning_rate": 3.375e-05, + "loss": 2.9742, + "step": 5820 + }, + { + "epoch": 9.3136, + "grad_norm": 0.0, + "learning_rate": 3.36e-05, + "loss": 2.4212, + "step": 5821 + }, + { + "epoch": 9.3152, + "grad_norm": 0.0, + "learning_rate": 3.345e-05, + "loss": 3.1968, + "step": 5822 + }, + { + "epoch": 9.3168, + "grad_norm": 0.0, + "learning_rate": 3.3299999999999996e-05, + "loss": 3.6791, + "step": 5823 + }, + { + "epoch": 9.3184, + "grad_norm": 0.0, + "learning_rate": 3.315e-05, + "loss": 2.5402, + "step": 5824 + }, + { + "epoch": 9.32, + "grad_norm": 0.0, + "learning_rate": 3.2999999999999996e-05, + "loss": 3.9026, + "step": 5825 + }, + { + "epoch": 9.3216, + "grad_norm": 0.0, + "learning_rate": 3.285e-05, + "loss": 3.9221, + "step": 5826 + }, + { + "epoch": 9.3232, + "grad_norm": 0.0, + "learning_rate": 3.2699999999999995e-05, + "loss": 4.0257, + "step": 5827 + }, + { + "epoch": 9.3248, + "grad_norm": 0.0, + "learning_rate": 3.255e-05, + "loss": 4.8515, + "step": 5828 + }, + { + "epoch": 9.3264, + "grad_norm": 0.0, + "learning_rate": 3.2399999999999995e-05, + "loss": 3.9743, + "step": 5829 + }, + { + "epoch": 9.328, + "grad_norm": 0.0, + "learning_rate": 3.225e-05, + "loss": 3.0707, + "step": 5830 + }, + { + "epoch": 9.3296, + "grad_norm": 0.0, + "learning_rate": 3.2099999999999994e-05, + "loss": 3.5754, + "step": 5831 + }, + { + "epoch": 9.3312, + "grad_norm": 0.0, + "learning_rate": 3.195e-05, + "loss": 4.3465, + "step": 5832 + }, + { + "epoch": 9.3328, + "grad_norm": 0.0, + "learning_rate": 3.1799999999999994e-05, + "loss": 3.3521, + "step": 5833 + }, + { + "epoch": 9.3344, + "grad_norm": 0.0, + "learning_rate": 3.165e-05, + "loss": 3.5843, + "step": 5834 + }, + { + "epoch": 9.336, + "grad_norm": 0.0, + "learning_rate": 3.149999999999999e-05, + "loss": 3.0415, + "step": 5835 + }, + { + "epoch": 9.3376, + "grad_norm": 0.0, + "learning_rate": 3.1349999999999996e-05, + "loss": 2.9932, + "step": 5836 + }, + { + "epoch": 9.3392, + "grad_norm": 0.0, + "learning_rate": 3.119999999999999e-05, + "loss": 3.667, + "step": 5837 + }, + { + "epoch": 9.3408, + "grad_norm": 0.0, + "learning_rate": 3.1049999999999996e-05, + "loss": 3.0368, + "step": 5838 + }, + { + "epoch": 9.3424, + "grad_norm": 0.0, + "learning_rate": 3.09e-05, + "loss": 3.2513, + "step": 5839 + }, + { + "epoch": 9.344, + "grad_norm": 0.0, + "learning_rate": 3.0749999999999995e-05, + "loss": 2.686, + "step": 5840 + }, + { + "epoch": 9.3456, + "grad_norm": 0.0, + "learning_rate": 3.06e-05, + "loss": 4.3405, + "step": 5841 + }, + { + "epoch": 9.3472, + "grad_norm": 0.0, + "learning_rate": 3.0449999999999998e-05, + "loss": 2.9989, + "step": 5842 + }, + { + "epoch": 9.3488, + "grad_norm": 0.0, + "learning_rate": 3.0299999999999998e-05, + "loss": 2.8285, + "step": 5843 + }, + { + "epoch": 9.3504, + "grad_norm": 0.0, + "learning_rate": 3.0149999999999998e-05, + "loss": 2.6507, + "step": 5844 + }, + { + "epoch": 9.352, + "grad_norm": 0.0, + "learning_rate": 2.9999999999999997e-05, + "loss": 2.6162, + "step": 5845 + }, + { + "epoch": 9.3536, + "grad_norm": 0.0, + "learning_rate": 2.985e-05, + "loss": 3.0373, + "step": 5846 + }, + { + "epoch": 9.3552, + "grad_norm": 0.0, + "learning_rate": 2.97e-05, + "loss": 2.9476, + "step": 5847 + }, + { + "epoch": 9.3568, + "grad_norm": 0.0, + "learning_rate": 2.955e-05, + "loss": 2.8315, + "step": 5848 + }, + { + "epoch": 9.3584, + "grad_norm": 0.0, + "learning_rate": 2.94e-05, + "loss": 2.36, + "step": 5849 + }, + { + "epoch": 9.36, + "grad_norm": 0.0, + "learning_rate": 2.925e-05, + "loss": 2.9085, + "step": 5850 + }, + { + "epoch": 9.3616, + "grad_norm": 0.0, + "learning_rate": 2.91e-05, + "loss": 2.8154, + "step": 5851 + }, + { + "epoch": 9.3632, + "grad_norm": 0.0, + "learning_rate": 2.895e-05, + "loss": 2.5466, + "step": 5852 + }, + { + "epoch": 9.3648, + "grad_norm": 0.0, + "learning_rate": 2.88e-05, + "loss": 2.9695, + "step": 5853 + }, + { + "epoch": 9.3664, + "grad_norm": 0.0, + "learning_rate": 2.865e-05, + "loss": 2.6365, + "step": 5854 + }, + { + "epoch": 9.368, + "grad_norm": 0.0, + "learning_rate": 2.8499999999999998e-05, + "loss": 2.7683, + "step": 5855 + }, + { + "epoch": 9.3696, + "grad_norm": 0.0, + "learning_rate": 2.8349999999999998e-05, + "loss": 2.5878, + "step": 5856 + }, + { + "epoch": 9.3712, + "grad_norm": 0.0, + "learning_rate": 2.8199999999999998e-05, + "loss": 2.3464, + "step": 5857 + }, + { + "epoch": 9.3728, + "grad_norm": 0.0, + "learning_rate": 2.8049999999999997e-05, + "loss": 2.5351, + "step": 5858 + }, + { + "epoch": 9.3744, + "grad_norm": 0.0, + "learning_rate": 2.7899999999999997e-05, + "loss": 2.4149, + "step": 5859 + }, + { + "epoch": 9.376, + "grad_norm": 0.0, + "learning_rate": 2.7749999999999997e-05, + "loss": 3.3166, + "step": 5860 + }, + { + "epoch": 9.3776, + "grad_norm": 0.0, + "learning_rate": 2.7599999999999997e-05, + "loss": 4.1168, + "step": 5861 + }, + { + "epoch": 9.3792, + "grad_norm": 0.0, + "learning_rate": 2.7449999999999996e-05, + "loss": 3.124, + "step": 5862 + }, + { + "epoch": 9.3808, + "grad_norm": 0.0, + "learning_rate": 2.7299999999999996e-05, + "loss": 3.1329, + "step": 5863 + }, + { + "epoch": 9.3824, + "grad_norm": 0.0, + "learning_rate": 2.7149999999999996e-05, + "loss": 3.0086, + "step": 5864 + }, + { + "epoch": 9.384, + "grad_norm": 0.0, + "learning_rate": 2.6999999999999996e-05, + "loss": 2.7413, + "step": 5865 + }, + { + "epoch": 9.3856, + "grad_norm": NaN, + "learning_rate": 2.6999999999999996e-05, + "loss": 1.6222, + "step": 5866 + }, + { + "epoch": 9.3872, + "grad_norm": 0.0, + "learning_rate": 2.6849999999999995e-05, + "loss": 2.4586, + "step": 5867 + }, + { + "epoch": 9.3888, + "grad_norm": 0.0, + "learning_rate": 2.6699999999999995e-05, + "loss": 3.3739, + "step": 5868 + }, + { + "epoch": 9.3904, + "grad_norm": 0.0, + "learning_rate": 2.6549999999999995e-05, + "loss": 2.6869, + "step": 5869 + }, + { + "epoch": 9.392, + "grad_norm": 0.0, + "learning_rate": 2.6399999999999995e-05, + "loss": 3.0628, + "step": 5870 + }, + { + "epoch": 9.3936, + "grad_norm": 0.0, + "learning_rate": 2.6249999999999998e-05, + "loss": 3.9845, + "step": 5871 + }, + { + "epoch": 9.395199999999999, + "grad_norm": 0.0, + "learning_rate": 2.6099999999999997e-05, + "loss": 2.5343, + "step": 5872 + }, + { + "epoch": 9.3968, + "grad_norm": 0.0, + "learning_rate": 2.5949999999999997e-05, + "loss": 3.7584, + "step": 5873 + }, + { + "epoch": 9.3984, + "grad_norm": 0.0, + "learning_rate": 2.5799999999999997e-05, + "loss": 2.9493, + "step": 5874 + }, + { + "epoch": 9.4, + "grad_norm": NaN, + "learning_rate": 2.5799999999999997e-05, + "loss": 3.2706, + "step": 5875 + }, + { + "epoch": 9.4016, + "grad_norm": 0.0, + "learning_rate": 2.565e-05, + "loss": 5.3678, + "step": 5876 + }, + { + "epoch": 9.4032, + "grad_norm": 0.0, + "learning_rate": 2.55e-05, + "loss": 3.8882, + "step": 5877 + }, + { + "epoch": 9.4048, + "grad_norm": 0.0, + "learning_rate": 2.535e-05, + "loss": 2.9439, + "step": 5878 + }, + { + "epoch": 9.4064, + "grad_norm": 0.0, + "learning_rate": 2.52e-05, + "loss": 2.6025, + "step": 5879 + }, + { + "epoch": 9.408, + "grad_norm": 0.0, + "learning_rate": 2.505e-05, + "loss": 3.2556, + "step": 5880 + }, + { + "epoch": 9.4096, + "grad_norm": 0.0, + "learning_rate": 2.49e-05, + "loss": 2.9982, + "step": 5881 + }, + { + "epoch": 9.411200000000001, + "grad_norm": 0.0, + "learning_rate": 2.475e-05, + "loss": 3.9662, + "step": 5882 + }, + { + "epoch": 9.4128, + "grad_norm": 0.0, + "learning_rate": 2.4599999999999998e-05, + "loss": 2.7577, + "step": 5883 + }, + { + "epoch": 9.4144, + "grad_norm": 0.0, + "learning_rate": 2.4449999999999998e-05, + "loss": 4.4033, + "step": 5884 + }, + { + "epoch": 9.416, + "grad_norm": 0.0, + "learning_rate": 2.4299999999999998e-05, + "loss": 2.8199, + "step": 5885 + }, + { + "epoch": 9.4176, + "grad_norm": 0.0, + "learning_rate": 2.4149999999999997e-05, + "loss": 4.025, + "step": 5886 + }, + { + "epoch": 9.4192, + "grad_norm": 0.0, + "learning_rate": 2.3999999999999997e-05, + "loss": 3.2348, + "step": 5887 + }, + { + "epoch": 9.4208, + "grad_norm": 0.0, + "learning_rate": 2.3849999999999997e-05, + "loss": 2.6776, + "step": 5888 + }, + { + "epoch": 9.4224, + "grad_norm": 0.0, + "learning_rate": 2.3699999999999997e-05, + "loss": 3.5189, + "step": 5889 + }, + { + "epoch": 9.424, + "grad_norm": 0.0, + "learning_rate": 2.3549999999999996e-05, + "loss": 2.7538, + "step": 5890 + }, + { + "epoch": 9.4256, + "grad_norm": 0.0, + "learning_rate": 2.34e-05, + "loss": 2.5793, + "step": 5891 + }, + { + "epoch": 9.4272, + "grad_norm": 0.0, + "learning_rate": 2.325e-05, + "loss": 3.0414, + "step": 5892 + }, + { + "epoch": 9.4288, + "grad_norm": 0.0, + "learning_rate": 2.31e-05, + "loss": 2.6425, + "step": 5893 + }, + { + "epoch": 9.4304, + "grad_norm": 0.0, + "learning_rate": 2.295e-05, + "loss": 2.6048, + "step": 5894 + }, + { + "epoch": 9.432, + "grad_norm": 0.0, + "learning_rate": 2.28e-05, + "loss": 2.5132, + "step": 5895 + }, + { + "epoch": 9.4336, + "grad_norm": 0.0, + "learning_rate": 2.2649999999999998e-05, + "loss": 2.2403, + "step": 5896 + }, + { + "epoch": 9.4352, + "grad_norm": 0.0, + "learning_rate": 2.2499999999999998e-05, + "loss": 3.1129, + "step": 5897 + }, + { + "epoch": 9.4368, + "grad_norm": 0.0, + "learning_rate": 2.2349999999999998e-05, + "loss": 2.899, + "step": 5898 + }, + { + "epoch": 9.4384, + "grad_norm": 0.0, + "learning_rate": 2.2199999999999998e-05, + "loss": 2.8279, + "step": 5899 + }, + { + "epoch": 9.44, + "grad_norm": 0.0, + "learning_rate": 2.2049999999999997e-05, + "loss": 2.9215, + "step": 5900 + }, + { + "epoch": 9.4416, + "grad_norm": 0.0, + "learning_rate": 2.1899999999999997e-05, + "loss": 2.9392, + "step": 5901 + }, + { + "epoch": 9.4432, + "grad_norm": 0.0, + "learning_rate": 2.1749999999999997e-05, + "loss": 3.105, + "step": 5902 + }, + { + "epoch": 9.4448, + "grad_norm": 0.0, + "learning_rate": 2.1599999999999996e-05, + "loss": 2.6916, + "step": 5903 + }, + { + "epoch": 9.4464, + "grad_norm": 0.0, + "learning_rate": 2.1449999999999996e-05, + "loss": 3.3384, + "step": 5904 + }, + { + "epoch": 9.448, + "grad_norm": 0.0, + "learning_rate": 2.1299999999999996e-05, + "loss": 2.9018, + "step": 5905 + }, + { + "epoch": 9.4496, + "grad_norm": 0.0, + "learning_rate": 2.1149999999999996e-05, + "loss": 2.5655, + "step": 5906 + }, + { + "epoch": 9.4512, + "grad_norm": 0.0, + "learning_rate": 2.1e-05, + "loss": 2.9827, + "step": 5907 + }, + { + "epoch": 9.4528, + "grad_norm": 0.0, + "learning_rate": 2.085e-05, + "loss": 3.165, + "step": 5908 + }, + { + "epoch": 9.4544, + "grad_norm": 0.0, + "learning_rate": 2.07e-05, + "loss": 2.5769, + "step": 5909 + }, + { + "epoch": 9.456, + "grad_norm": 0.0, + "learning_rate": 2.055e-05, + "loss": 3.0794, + "step": 5910 + }, + { + "epoch": 9.4576, + "grad_norm": 0.0, + "learning_rate": 2.04e-05, + "loss": 2.8733, + "step": 5911 + }, + { + "epoch": 9.4592, + "grad_norm": 0.0, + "learning_rate": 2.025e-05, + "loss": 2.8476, + "step": 5912 + }, + { + "epoch": 9.4608, + "grad_norm": 0.0, + "learning_rate": 2.01e-05, + "loss": 2.986, + "step": 5913 + }, + { + "epoch": 9.4624, + "grad_norm": 0.0, + "learning_rate": 1.995e-05, + "loss": 2.6161, + "step": 5914 + }, + { + "epoch": 9.464, + "grad_norm": 0.0, + "learning_rate": 1.98e-05, + "loss": 2.8138, + "step": 5915 + }, + { + "epoch": 9.4656, + "grad_norm": 0.0, + "learning_rate": 1.965e-05, + "loss": 2.5781, + "step": 5916 + }, + { + "epoch": 9.4672, + "grad_norm": 0.0, + "learning_rate": 1.95e-05, + "loss": 3.0813, + "step": 5917 + }, + { + "epoch": 9.4688, + "grad_norm": 0.0, + "learning_rate": 1.935e-05, + "loss": 2.7178, + "step": 5918 + }, + { + "epoch": 9.4704, + "grad_norm": 0.0, + "learning_rate": 1.92e-05, + "loss": 2.3634, + "step": 5919 + }, + { + "epoch": 9.472, + "grad_norm": 0.0, + "learning_rate": 1.905e-05, + "loss": 3.006, + "step": 5920 + }, + { + "epoch": 9.4736, + "grad_norm": 0.0, + "learning_rate": 1.89e-05, + "loss": 2.4351, + "step": 5921 + }, + { + "epoch": 9.4752, + "grad_norm": 0.0, + "learning_rate": 1.875e-05, + "loss": 2.6789, + "step": 5922 + }, + { + "epoch": 9.4768, + "grad_norm": 0.0, + "learning_rate": 1.8599999999999998e-05, + "loss": 3.3793, + "step": 5923 + }, + { + "epoch": 9.4784, + "grad_norm": 0.0, + "learning_rate": 1.8449999999999998e-05, + "loss": 3.5019, + "step": 5924 + }, + { + "epoch": 9.48, + "grad_norm": 0.0, + "learning_rate": 1.8299999999999998e-05, + "loss": 3.5246, + "step": 5925 + }, + { + "epoch": 9.4816, + "grad_norm": 0.0, + "learning_rate": 1.8149999999999997e-05, + "loss": 3.9433, + "step": 5926 + }, + { + "epoch": 9.4832, + "grad_norm": 0.0, + "learning_rate": 1.7999999999999997e-05, + "loss": 4.1305, + "step": 5927 + }, + { + "epoch": 9.4848, + "grad_norm": 0.0, + "learning_rate": 1.7849999999999997e-05, + "loss": 4.3702, + "step": 5928 + }, + { + "epoch": 9.4864, + "grad_norm": 0.0, + "learning_rate": 1.7699999999999997e-05, + "loss": 3.1913, + "step": 5929 + }, + { + "epoch": 9.488, + "grad_norm": 0.0, + "learning_rate": 1.755e-05, + "loss": 3.7054, + "step": 5930 + }, + { + "epoch": 9.4896, + "grad_norm": 0.0, + "learning_rate": 1.74e-05, + "loss": 3.1622, + "step": 5931 + }, + { + "epoch": 9.4912, + "grad_norm": 0.0, + "learning_rate": 1.725e-05, + "loss": 3.7106, + "step": 5932 + }, + { + "epoch": 9.4928, + "grad_norm": 0.0, + "learning_rate": 1.71e-05, + "loss": 3.2574, + "step": 5933 + }, + { + "epoch": 9.4944, + "grad_norm": 0.0, + "learning_rate": 1.695e-05, + "loss": 2.7068, + "step": 5934 + }, + { + "epoch": 9.496, + "grad_norm": 0.0, + "learning_rate": 1.68e-05, + "loss": 3.2067, + "step": 5935 + }, + { + "epoch": 9.4976, + "grad_norm": 0.0, + "learning_rate": 1.6649999999999998e-05, + "loss": 2.554, + "step": 5936 + }, + { + "epoch": 9.4992, + "grad_norm": 0.0, + "learning_rate": 1.6499999999999998e-05, + "loss": 2.9718, + "step": 5937 + }, + { + "epoch": 9.5008, + "grad_norm": 0.0, + "learning_rate": 1.6349999999999998e-05, + "loss": 2.844, + "step": 5938 + }, + { + "epoch": 9.5024, + "grad_norm": 0.0, + "learning_rate": 1.6199999999999997e-05, + "loss": 3.0721, + "step": 5939 + }, + { + "epoch": 9.504, + "grad_norm": 0.0, + "learning_rate": 1.6049999999999997e-05, + "loss": 2.6736, + "step": 5940 + }, + { + "epoch": 9.5056, + "grad_norm": 0.0, + "learning_rate": 1.5899999999999997e-05, + "loss": 2.76, + "step": 5941 + }, + { + "epoch": 9.5072, + "grad_norm": 0.0, + "learning_rate": 1.5749999999999997e-05, + "loss": 2.7879, + "step": 5942 + }, + { + "epoch": 9.5088, + "grad_norm": 0.0, + "learning_rate": 1.5599999999999996e-05, + "loss": 3.4875, + "step": 5943 + }, + { + "epoch": 9.5104, + "grad_norm": 0.0, + "learning_rate": 1.545e-05, + "loss": 2.5975, + "step": 5944 + }, + { + "epoch": 9.512, + "grad_norm": 0.0, + "learning_rate": 1.53e-05, + "loss": 2.9525, + "step": 5945 + }, + { + "epoch": 9.5136, + "grad_norm": 0.0, + "learning_rate": 1.5149999999999999e-05, + "loss": 3.0019, + "step": 5946 + }, + { + "epoch": 9.5152, + "grad_norm": 0.0, + "learning_rate": 1.4999999999999999e-05, + "loss": 2.6932, + "step": 5947 + }, + { + "epoch": 9.5168, + "grad_norm": 0.0, + "learning_rate": 1.485e-05, + "loss": 2.3499, + "step": 5948 + }, + { + "epoch": 9.5184, + "grad_norm": 0.0, + "learning_rate": 1.47e-05, + "loss": 2.7315, + "step": 5949 + }, + { + "epoch": 9.52, + "grad_norm": 0.0, + "learning_rate": 1.455e-05, + "loss": 2.4128, + "step": 5950 + }, + { + "epoch": 9.5216, + "grad_norm": 0.0, + "learning_rate": 1.44e-05, + "loss": 3.1493, + "step": 5951 + }, + { + "epoch": 9.5232, + "grad_norm": 0.0, + "learning_rate": 1.4249999999999999e-05, + "loss": 3.5148, + "step": 5952 + }, + { + "epoch": 9.5248, + "grad_norm": 0.0, + "learning_rate": 1.4099999999999999e-05, + "loss": 2.7583, + "step": 5953 + }, + { + "epoch": 9.5264, + "grad_norm": 0.0, + "learning_rate": 1.3949999999999999e-05, + "loss": 2.7706, + "step": 5954 + }, + { + "epoch": 9.528, + "grad_norm": 0.0, + "learning_rate": 1.3799999999999998e-05, + "loss": 3.7789, + "step": 5955 + }, + { + "epoch": 9.5296, + "grad_norm": 0.0, + "learning_rate": 1.3649999999999998e-05, + "loss": 3.1102, + "step": 5956 + }, + { + "epoch": 9.5312, + "grad_norm": 0.0, + "learning_rate": 1.3499999999999998e-05, + "loss": 3.0082, + "step": 5957 + }, + { + "epoch": 9.5328, + "grad_norm": 0.0, + "learning_rate": 1.3349999999999998e-05, + "loss": 2.8169, + "step": 5958 + }, + { + "epoch": 9.5344, + "grad_norm": 0.0, + "learning_rate": 1.3199999999999997e-05, + "loss": 3.482, + "step": 5959 + }, + { + "epoch": 9.536, + "grad_norm": 0.0, + "learning_rate": 1.3049999999999999e-05, + "loss": 3.1191, + "step": 5960 + }, + { + "epoch": 9.5376, + "grad_norm": 0.0, + "learning_rate": 1.2899999999999998e-05, + "loss": 3.3154, + "step": 5961 + }, + { + "epoch": 9.5392, + "grad_norm": 0.0, + "learning_rate": 1.275e-05, + "loss": 3.1597, + "step": 5962 + }, + { + "epoch": 9.5408, + "grad_norm": 0.0, + "learning_rate": 1.26e-05, + "loss": 2.663, + "step": 5963 + }, + { + "epoch": 9.5424, + "grad_norm": 0.0, + "learning_rate": 1.245e-05, + "loss": 3.9024, + "step": 5964 + }, + { + "epoch": 9.544, + "grad_norm": 0.0, + "learning_rate": 1.2299999999999999e-05, + "loss": 3.2608, + "step": 5965 + }, + { + "epoch": 9.5456, + "grad_norm": 0.0, + "learning_rate": 1.2149999999999999e-05, + "loss": 2.6814, + "step": 5966 + }, + { + "epoch": 9.5472, + "grad_norm": 0.0, + "learning_rate": 1.1999999999999999e-05, + "loss": 3.5144, + "step": 5967 + }, + { + "epoch": 9.5488, + "grad_norm": 0.0, + "learning_rate": 1.1849999999999998e-05, + "loss": 2.4768, + "step": 5968 + }, + { + "epoch": 9.5504, + "grad_norm": 0.0, + "learning_rate": 1.17e-05, + "loss": 2.67, + "step": 5969 + }, + { + "epoch": 9.552, + "grad_norm": 0.0, + "learning_rate": 1.155e-05, + "loss": 2.5965, + "step": 5970 + }, + { + "epoch": 9.5536, + "grad_norm": 0.0, + "learning_rate": 1.14e-05, + "loss": 2.9419, + "step": 5971 + }, + { + "epoch": 9.5552, + "grad_norm": 0.0, + "learning_rate": 1.1249999999999999e-05, + "loss": 2.772, + "step": 5972 + }, + { + "epoch": 9.556799999999999, + "grad_norm": 0.0, + "learning_rate": 1.1099999999999999e-05, + "loss": 3.3538, + "step": 5973 + }, + { + "epoch": 9.5584, + "grad_norm": 0.0, + "learning_rate": 1.0949999999999998e-05, + "loss": 3.2122, + "step": 5974 + }, + { + "epoch": 9.56, + "grad_norm": 0.0, + "learning_rate": 1.0799999999999998e-05, + "loss": 2.8816, + "step": 5975 + }, + { + "epoch": 9.5616, + "grad_norm": 0.0, + "learning_rate": 1.0649999999999998e-05, + "loss": 3.9349, + "step": 5976 + }, + { + "epoch": 9.5632, + "grad_norm": 0.0, + "learning_rate": 1.05e-05, + "loss": 3.7896, + "step": 5977 + }, + { + "epoch": 9.5648, + "grad_norm": 0.0, + "learning_rate": 1.035e-05, + "loss": 3.084, + "step": 5978 + }, + { + "epoch": 9.5664, + "grad_norm": 0.0, + "learning_rate": 1.02e-05, + "loss": 4.2844, + "step": 5979 + }, + { + "epoch": 9.568, + "grad_norm": 0.0, + "learning_rate": 1.005e-05, + "loss": 3.3977, + "step": 5980 + }, + { + "epoch": 9.5696, + "grad_norm": 0.0, + "learning_rate": 9.9e-06, + "loss": 3.6734, + "step": 5981 + }, + { + "epoch": 9.5712, + "grad_norm": 0.0, + "learning_rate": 9.75e-06, + "loss": 3.9535, + "step": 5982 + }, + { + "epoch": 9.5728, + "grad_norm": 0.0, + "learning_rate": 9.6e-06, + "loss": 2.9579, + "step": 5983 + }, + { + "epoch": 9.5744, + "grad_norm": 0.0, + "learning_rate": 9.45e-06, + "loss": 3.5256, + "step": 5984 + }, + { + "epoch": 9.576, + "grad_norm": 0.0, + "learning_rate": 9.299999999999999e-06, + "loss": 2.9872, + "step": 5985 + }, + { + "epoch": 9.5776, + "grad_norm": 0.0, + "learning_rate": 9.149999999999999e-06, + "loss": 2.7637, + "step": 5986 + }, + { + "epoch": 9.5792, + "grad_norm": 0.0, + "learning_rate": 8.999999999999999e-06, + "loss": 2.3799, + "step": 5987 + }, + { + "epoch": 9.5808, + "grad_norm": 0.0, + "learning_rate": 8.849999999999998e-06, + "loss": 2.8928, + "step": 5988 + }, + { + "epoch": 9.5824, + "grad_norm": 0.0, + "learning_rate": 8.7e-06, + "loss": 2.941, + "step": 5989 + }, + { + "epoch": 9.584, + "grad_norm": 0.0, + "learning_rate": 8.55e-06, + "loss": 2.6913, + "step": 5990 + }, + { + "epoch": 9.5856, + "grad_norm": 0.0, + "learning_rate": 8.4e-06, + "loss": 2.8565, + "step": 5991 + }, + { + "epoch": 9.5872, + "grad_norm": 0.0, + "learning_rate": 8.249999999999999e-06, + "loss": 2.6545, + "step": 5992 + }, + { + "epoch": 9.588799999999999, + "grad_norm": 0.0, + "learning_rate": 8.099999999999999e-06, + "loss": 2.7364, + "step": 5993 + }, + { + "epoch": 9.5904, + "grad_norm": 0.0, + "learning_rate": 7.949999999999998e-06, + "loss": 2.4076, + "step": 5994 + }, + { + "epoch": 9.592, + "grad_norm": 0.0, + "learning_rate": 7.799999999999998e-06, + "loss": 2.8067, + "step": 5995 + }, + { + "epoch": 9.5936, + "grad_norm": 0.0, + "learning_rate": 7.65e-06, + "loss": 2.4151, + "step": 5996 + }, + { + "epoch": 9.5952, + "grad_norm": 0.0, + "learning_rate": 7.499999999999999e-06, + "loss": 2.9682, + "step": 5997 + }, + { + "epoch": 9.5968, + "grad_norm": 0.0, + "learning_rate": 7.35e-06, + "loss": 2.9539, + "step": 5998 + }, + { + "epoch": 9.5984, + "grad_norm": 0.0, + "learning_rate": 7.2e-06, + "loss": 3.2376, + "step": 5999 + }, + { + "epoch": 9.6, + "grad_norm": 0.0, + "learning_rate": 7.049999999999999e-06, + "loss": 3.1155, + "step": 6000 + }, + { + "epoch": 9.6, + "eval_cer": 0.5195918204647826, + "eval_loss": 3.2885050773620605, + "eval_runtime": 157.437, + "eval_samples_per_second": 19.919, + "eval_steps_per_second": 1.245, + "eval_wer": 0.8050150243498083, + "step": 6000 + }, + { + "epoch": 9.6, + "step": 6000, + "total_flos": 3.700768773245485e+19, + "train_loss": 2.825458660195271, + "train_runtime": 12931.7591, + "train_samples_per_second": 14.847, + "train_steps_per_second": 0.464 } ], "logging_steps": 1.0, - "max_steps": 3500, + "max_steps": 6000, "num_input_tokens_seen": 0, - "num_train_epochs": 6, - "save_steps": 500, + "num_train_epochs": 10, + "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { @@ -24885,7 +42095,7 @@ "attributes": {} } }, - "total_flos": 2.158464150901847e+19, + "total_flos": 3.700768773245485e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null