{ "best_metric": 1.3649392127990723, "best_model_checkpoint": "./output/checkpoints/2024-06-11_11-02-23/checkpoint-50", "epoch": 1.0, "eval_steps": 1, "global_step": 97, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010309278350515464, "grad_norm": 2.784351348876953, "learning_rate": 4e-05, "loss": 5.0719, "step": 1 }, { "epoch": 0.010309278350515464, "eval_loss": 4.890735149383545, "eval_runtime": 11.998, "eval_samples_per_second": 11.252, "eval_steps_per_second": 0.75, "step": 1 }, { "epoch": 0.020618556701030927, "grad_norm": 2.8573310375213623, "learning_rate": 8e-05, "loss": 4.9508, "step": 2 }, { "epoch": 0.020618556701030927, "eval_loss": 4.775210857391357, "eval_runtime": 12.0698, "eval_samples_per_second": 11.185, "eval_steps_per_second": 0.746, "step": 2 }, { "epoch": 0.030927835051546393, "grad_norm": 2.920828104019165, "learning_rate": 0.00012, "loss": 4.973, "step": 3 }, { "epoch": 0.030927835051546393, "eval_loss": 4.2439093589782715, "eval_runtime": 12.1674, "eval_samples_per_second": 11.095, "eval_steps_per_second": 0.74, "step": 3 }, { "epoch": 0.041237113402061855, "grad_norm": 3.0682826042175293, "learning_rate": 0.00016, "loss": 4.3398, "step": 4 }, { "epoch": 0.041237113402061855, "eval_loss": 3.316483497619629, "eval_runtime": 12.1421, "eval_samples_per_second": 11.118, "eval_steps_per_second": 0.741, "step": 4 }, { "epoch": 0.05154639175257732, "grad_norm": 3.1937592029571533, "learning_rate": 0.0002, "loss": 3.253, "step": 5 }, { "epoch": 0.05154639175257732, "eval_loss": 2.4086239337921143, "eval_runtime": 12.0796, "eval_samples_per_second": 11.176, "eval_steps_per_second": 0.745, "step": 5 }, { "epoch": 0.061855670103092786, "grad_norm": 2.0619874000549316, "learning_rate": 0.00024, "loss": 2.3725, "step": 6 }, { "epoch": 0.061855670103092786, "eval_loss": 1.9427752494812012, "eval_runtime": 12.1816, "eval_samples_per_second": 11.082, "eval_steps_per_second": 0.739, "step": 6 }, { "epoch": 0.07216494845360824, "grad_norm": 1.4178358316421509, "learning_rate": 0.00028, "loss": 1.7391, "step": 7 }, { "epoch": 0.07216494845360824, "eval_loss": 1.6751385927200317, "eval_runtime": 12.2395, "eval_samples_per_second": 11.03, "eval_steps_per_second": 0.735, "step": 7 }, { "epoch": 0.08247422680412371, "grad_norm": 0.9345605373382568, "learning_rate": 0.00032, "loss": 1.4029, "step": 8 }, { "epoch": 0.08247422680412371, "eval_loss": 1.551221489906311, "eval_runtime": 12.2329, "eval_samples_per_second": 11.036, "eval_steps_per_second": 0.736, "step": 8 }, { "epoch": 0.09278350515463918, "grad_norm": 1.1335052251815796, "learning_rate": 0.00036, "loss": 1.4404, "step": 9 }, { "epoch": 0.09278350515463918, "eval_loss": 1.447738528251648, "eval_runtime": 12.2069, "eval_samples_per_second": 11.059, "eval_steps_per_second": 0.737, "step": 9 }, { "epoch": 0.10309278350515463, "grad_norm": 0.23779241740703583, "learning_rate": 0.0004, "loss": 1.3197, "step": 10 }, { "epoch": 0.10309278350515463, "eval_loss": 1.4138007164001465, "eval_runtime": 12.2609, "eval_samples_per_second": 11.011, "eval_steps_per_second": 0.734, "step": 10 }, { "epoch": 0.1134020618556701, "grad_norm": 0.19672752916812897, "learning_rate": 0.00039540229885057476, "loss": 1.2882, "step": 11 }, { "epoch": 0.1134020618556701, "eval_loss": 1.393249750137329, "eval_runtime": 12.1836, "eval_samples_per_second": 11.08, "eval_steps_per_second": 0.739, "step": 11 }, { "epoch": 0.12371134020618557, "grad_norm": 0.2525981366634369, "learning_rate": 0.00039080459770114945, "loss": 1.2831, "step": 12 }, { "epoch": 0.12371134020618557, "eval_loss": 1.4042422771453857, "eval_runtime": 12.232, "eval_samples_per_second": 11.037, "eval_steps_per_second": 0.736, "step": 12 }, { "epoch": 0.13402061855670103, "grad_norm": 0.23460708558559418, "learning_rate": 0.0003862068965517242, "loss": 1.1615, "step": 13 }, { "epoch": 0.13402061855670103, "eval_loss": 1.4705734252929688, "eval_runtime": 12.2648, "eval_samples_per_second": 11.007, "eval_steps_per_second": 0.734, "step": 13 }, { "epoch": 0.14432989690721648, "grad_norm": 0.21198733150959015, "learning_rate": 0.00038160919540229887, "loss": 1.0625, "step": 14 }, { "epoch": 0.14432989690721648, "eval_loss": 1.5055038928985596, "eval_runtime": 12.2044, "eval_samples_per_second": 11.062, "eval_steps_per_second": 0.737, "step": 14 }, { "epoch": 0.15463917525773196, "grad_norm": 0.3658374845981598, "learning_rate": 0.00037701149425287356, "loss": 1.1191, "step": 15 }, { "epoch": 0.15463917525773196, "eval_loss": 1.4498964548110962, "eval_runtime": 12.2004, "eval_samples_per_second": 11.065, "eval_steps_per_second": 0.738, "step": 15 }, { "epoch": 0.16494845360824742, "grad_norm": 0.222326397895813, "learning_rate": 0.0003724137931034483, "loss": 1.0887, "step": 16 }, { "epoch": 0.16494845360824742, "eval_loss": 1.3819501399993896, "eval_runtime": 12.1444, "eval_samples_per_second": 11.116, "eval_steps_per_second": 0.741, "step": 16 }, { "epoch": 0.17525773195876287, "grad_norm": 0.1682404726743698, "learning_rate": 0.000367816091954023, "loss": 1.0915, "step": 17 }, { "epoch": 0.17525773195876287, "eval_loss": 1.3552738428115845, "eval_runtime": 12.2121, "eval_samples_per_second": 11.055, "eval_steps_per_second": 0.737, "step": 17 }, { "epoch": 0.18556701030927836, "grad_norm": 0.15567483007907867, "learning_rate": 0.0003632183908045977, "loss": 1.0509, "step": 18 }, { "epoch": 0.18556701030927836, "eval_loss": 1.3488575220108032, "eval_runtime": 12.2988, "eval_samples_per_second": 10.977, "eval_steps_per_second": 0.732, "step": 18 }, { "epoch": 0.1958762886597938, "grad_norm": 0.1600300371646881, "learning_rate": 0.0003586206896551724, "loss": 0.9982, "step": 19 }, { "epoch": 0.1958762886597938, "eval_loss": 1.3651177883148193, "eval_runtime": 12.1263, "eval_samples_per_second": 11.133, "eval_steps_per_second": 0.742, "step": 19 }, { "epoch": 0.20618556701030927, "grad_norm": 0.1044178307056427, "learning_rate": 0.00035402298850574715, "loss": 0.9809, "step": 20 }, { "epoch": 0.20618556701030927, "eval_loss": 1.401971697807312, "eval_runtime": 12.2092, "eval_samples_per_second": 11.057, "eval_steps_per_second": 0.737, "step": 20 }, { "epoch": 0.21649484536082475, "grad_norm": 0.12360141426324844, "learning_rate": 0.0003494252873563219, "loss": 1.0549, "step": 21 }, { "epoch": 0.21649484536082475, "eval_loss": 1.426545262336731, "eval_runtime": 12.1581, "eval_samples_per_second": 11.104, "eval_steps_per_second": 0.74, "step": 21 }, { "epoch": 0.2268041237113402, "grad_norm": 0.12509943544864655, "learning_rate": 0.0003448275862068965, "loss": 1.0323, "step": 22 }, { "epoch": 0.2268041237113402, "eval_loss": 1.4258630275726318, "eval_runtime": 12.201, "eval_samples_per_second": 11.065, "eval_steps_per_second": 0.738, "step": 22 }, { "epoch": 0.23711340206185566, "grad_norm": 0.13586747646331787, "learning_rate": 0.00034022988505747127, "loss": 1.0746, "step": 23 }, { "epoch": 0.23711340206185566, "eval_loss": 1.3952091932296753, "eval_runtime": 12.1932, "eval_samples_per_second": 11.072, "eval_steps_per_second": 0.738, "step": 23 }, { "epoch": 0.24742268041237114, "grad_norm": 0.08069202303886414, "learning_rate": 0.000335632183908046, "loss": 0.9645, "step": 24 }, { "epoch": 0.24742268041237114, "eval_loss": 1.376875638961792, "eval_runtime": 12.2172, "eval_samples_per_second": 11.05, "eval_steps_per_second": 0.737, "step": 24 }, { "epoch": 0.25773195876288657, "grad_norm": 0.09109444171190262, "learning_rate": 0.0003310344827586207, "loss": 0.9345, "step": 25 }, { "epoch": 0.25773195876288657, "eval_loss": 1.373625636100769, "eval_runtime": 12.2642, "eval_samples_per_second": 11.008, "eval_steps_per_second": 0.734, "step": 25 }, { "epoch": 0.26804123711340205, "grad_norm": 0.0649966150522232, "learning_rate": 0.00032643678160919543, "loss": 1.0266, "step": 26 }, { "epoch": 0.26804123711340205, "eval_loss": 1.3720897436141968, "eval_runtime": 12.1576, "eval_samples_per_second": 11.104, "eval_steps_per_second": 0.74, "step": 26 }, { "epoch": 0.27835051546391754, "grad_norm": 0.09308775514364243, "learning_rate": 0.0003218390804597701, "loss": 0.9797, "step": 27 }, { "epoch": 0.27835051546391754, "eval_loss": 1.3858685493469238, "eval_runtime": 12.1546, "eval_samples_per_second": 11.107, "eval_steps_per_second": 0.74, "step": 27 }, { "epoch": 0.28865979381443296, "grad_norm": 0.06654678285121918, "learning_rate": 0.00031724137931034486, "loss": 1.0072, "step": 28 }, { "epoch": 0.28865979381443296, "eval_loss": 1.4031970500946045, "eval_runtime": 12.1666, "eval_samples_per_second": 11.096, "eval_steps_per_second": 0.74, "step": 28 }, { "epoch": 0.29896907216494845, "grad_norm": 0.07720344513654709, "learning_rate": 0.0003126436781609196, "loss": 0.923, "step": 29 }, { "epoch": 0.29896907216494845, "eval_loss": 1.4118249416351318, "eval_runtime": 12.1572, "eval_samples_per_second": 11.105, "eval_steps_per_second": 0.74, "step": 29 }, { "epoch": 0.30927835051546393, "grad_norm": 0.10230278223752975, "learning_rate": 0.00030804597701149423, "loss": 0.9821, "step": 30 }, { "epoch": 0.30927835051546393, "eval_loss": 1.4045817852020264, "eval_runtime": 12.1974, "eval_samples_per_second": 11.068, "eval_steps_per_second": 0.738, "step": 30 }, { "epoch": 0.31958762886597936, "grad_norm": 0.07451125234365463, "learning_rate": 0.00030344827586206897, "loss": 1.0021, "step": 31 }, { "epoch": 0.31958762886597936, "eval_loss": 1.391209363937378, "eval_runtime": 12.2389, "eval_samples_per_second": 11.03, "eval_steps_per_second": 0.735, "step": 31 }, { "epoch": 0.32989690721649484, "grad_norm": 0.0714351087808609, "learning_rate": 0.00029885057471264366, "loss": 1.0071, "step": 32 }, { "epoch": 0.32989690721649484, "eval_loss": 1.3730576038360596, "eval_runtime": 12.2102, "eval_samples_per_second": 11.056, "eval_steps_per_second": 0.737, "step": 32 }, { "epoch": 0.3402061855670103, "grad_norm": 0.06839103996753693, "learning_rate": 0.0002942528735632184, "loss": 0.973, "step": 33 }, { "epoch": 0.3402061855670103, "eval_loss": 1.3659778833389282, "eval_runtime": 12.2323, "eval_samples_per_second": 11.036, "eval_steps_per_second": 0.736, "step": 33 }, { "epoch": 0.35051546391752575, "grad_norm": 0.08078178018331528, "learning_rate": 0.00028965517241379314, "loss": 0.964, "step": 34 }, { "epoch": 0.35051546391752575, "eval_loss": 1.3762452602386475, "eval_runtime": 12.2027, "eval_samples_per_second": 11.063, "eval_steps_per_second": 0.738, "step": 34 }, { "epoch": 0.36082474226804123, "grad_norm": 0.06870069354772568, "learning_rate": 0.0002850574712643678, "loss": 0.9148, "step": 35 }, { "epoch": 0.36082474226804123, "eval_loss": 1.3925738334655762, "eval_runtime": 12.2644, "eval_samples_per_second": 11.007, "eval_steps_per_second": 0.734, "step": 35 }, { "epoch": 0.3711340206185567, "grad_norm": 0.06974003463983536, "learning_rate": 0.00028045977011494257, "loss": 1.0128, "step": 36 }, { "epoch": 0.3711340206185567, "eval_loss": 1.4087179899215698, "eval_runtime": 12.2407, "eval_samples_per_second": 11.029, "eval_steps_per_second": 0.735, "step": 36 }, { "epoch": 0.38144329896907214, "grad_norm": 0.08603405207395554, "learning_rate": 0.00027586206896551725, "loss": 0.9776, "step": 37 }, { "epoch": 0.38144329896907214, "eval_loss": 1.4067703485488892, "eval_runtime": 12.173, "eval_samples_per_second": 11.09, "eval_steps_per_second": 0.739, "step": 37 }, { "epoch": 0.3917525773195876, "grad_norm": 0.07761300355195999, "learning_rate": 0.00027126436781609194, "loss": 0.9655, "step": 38 }, { "epoch": 0.3917525773195876, "eval_loss": 1.3932013511657715, "eval_runtime": 12.1941, "eval_samples_per_second": 11.071, "eval_steps_per_second": 0.738, "step": 38 }, { "epoch": 0.4020618556701031, "grad_norm": 0.06392566114664078, "learning_rate": 0.0002666666666666667, "loss": 0.974, "step": 39 }, { "epoch": 0.4020618556701031, "eval_loss": 1.3819767236709595, "eval_runtime": 12.1765, "eval_samples_per_second": 11.087, "eval_steps_per_second": 0.739, "step": 39 }, { "epoch": 0.41237113402061853, "grad_norm": 0.05517549812793732, "learning_rate": 0.00026206896551724137, "loss": 0.9793, "step": 40 }, { "epoch": 0.41237113402061853, "eval_loss": 1.3717000484466553, "eval_runtime": 12.2217, "eval_samples_per_second": 11.046, "eval_steps_per_second": 0.736, "step": 40 }, { "epoch": 0.422680412371134, "grad_norm": 0.0804053246974945, "learning_rate": 0.0002574712643678161, "loss": 0.9585, "step": 41 }, { "epoch": 0.422680412371134, "eval_loss": 1.3579998016357422, "eval_runtime": 12.182, "eval_samples_per_second": 11.082, "eval_steps_per_second": 0.739, "step": 41 }, { "epoch": 0.4329896907216495, "grad_norm": 0.07214821875095367, "learning_rate": 0.0002528735632183908, "loss": 0.9332, "step": 42 }, { "epoch": 0.4329896907216495, "eval_loss": 1.3583327531814575, "eval_runtime": 12.1257, "eval_samples_per_second": 11.133, "eval_steps_per_second": 0.742, "step": 42 }, { "epoch": 0.44329896907216493, "grad_norm": 0.07595060020685196, "learning_rate": 0.00024827586206896553, "loss": 0.8998, "step": 43 }, { "epoch": 0.44329896907216493, "eval_loss": 1.3721704483032227, "eval_runtime": 12.2745, "eval_samples_per_second": 10.998, "eval_steps_per_second": 0.733, "step": 43 }, { "epoch": 0.4536082474226804, "grad_norm": 0.07757716625928879, "learning_rate": 0.00024367816091954025, "loss": 0.9661, "step": 44 }, { "epoch": 0.4536082474226804, "eval_loss": 1.3984168767929077, "eval_runtime": 12.1398, "eval_samples_per_second": 11.12, "eval_steps_per_second": 0.741, "step": 44 }, { "epoch": 0.4639175257731959, "grad_norm": 0.053873661905527115, "learning_rate": 0.00023908045977011496, "loss": 0.9418, "step": 45 }, { "epoch": 0.4639175257731959, "eval_loss": 1.4266961812973022, "eval_runtime": 12.1955, "eval_samples_per_second": 11.07, "eval_steps_per_second": 0.738, "step": 45 }, { "epoch": 0.4742268041237113, "grad_norm": 0.08943776786327362, "learning_rate": 0.00023448275862068965, "loss": 0.9309, "step": 46 }, { "epoch": 0.4742268041237113, "eval_loss": 1.4349451065063477, "eval_runtime": 12.1983, "eval_samples_per_second": 11.067, "eval_steps_per_second": 0.738, "step": 46 }, { "epoch": 0.4845360824742268, "grad_norm": 0.0885058343410492, "learning_rate": 0.00022988505747126436, "loss": 1.0245, "step": 47 }, { "epoch": 0.4845360824742268, "eval_loss": 1.4226434230804443, "eval_runtime": 12.1753, "eval_samples_per_second": 11.088, "eval_steps_per_second": 0.739, "step": 47 }, { "epoch": 0.4948453608247423, "grad_norm": 0.058818139135837555, "learning_rate": 0.00022528735632183907, "loss": 0.9007, "step": 48 }, { "epoch": 0.4948453608247423, "eval_loss": 1.4033018350601196, "eval_runtime": 12.267, "eval_samples_per_second": 11.005, "eval_steps_per_second": 0.734, "step": 48 }, { "epoch": 0.5051546391752577, "grad_norm": 0.07104739546775818, "learning_rate": 0.0002206896551724138, "loss": 0.9469, "step": 49 }, { "epoch": 0.5051546391752577, "eval_loss": 1.3786002397537231, "eval_runtime": 12.2731, "eval_samples_per_second": 11.0, "eval_steps_per_second": 0.733, "step": 49 }, { "epoch": 0.5154639175257731, "grad_norm": 0.05872216075658798, "learning_rate": 0.00021609195402298853, "loss": 0.9671, "step": 50 }, { "epoch": 0.5154639175257731, "eval_loss": 1.3649392127990723, "eval_runtime": 12.1662, "eval_samples_per_second": 11.096, "eval_steps_per_second": 0.74, "step": 50 }, { "epoch": 0.5257731958762887, "grad_norm": 0.07843936234712601, "learning_rate": 0.00021149425287356324, "loss": 0.9052, "step": 51 }, { "epoch": 0.5257731958762887, "eval_loss": 1.3671882152557373, "eval_runtime": 12.1527, "eval_samples_per_second": 11.109, "eval_steps_per_second": 0.741, "step": 51 }, { "epoch": 0.5360824742268041, "grad_norm": 0.07407287508249283, "learning_rate": 0.00020689655172413795, "loss": 0.9221, "step": 52 }, { "epoch": 0.5360824742268041, "eval_loss": 1.3811315298080444, "eval_runtime": 12.1661, "eval_samples_per_second": 11.096, "eval_steps_per_second": 0.74, "step": 52 }, { "epoch": 0.5463917525773195, "grad_norm": 0.06168922409415245, "learning_rate": 0.00020229885057471267, "loss": 0.9809, "step": 53 }, { "epoch": 0.5463917525773195, "eval_loss": 1.3900806903839111, "eval_runtime": 12.1958, "eval_samples_per_second": 11.069, "eval_steps_per_second": 0.738, "step": 53 }, { "epoch": 0.5567010309278351, "grad_norm": 0.05980532988905907, "learning_rate": 0.00019770114942528738, "loss": 0.9492, "step": 54 }, { "epoch": 0.5567010309278351, "eval_loss": 1.4011154174804688, "eval_runtime": 12.1559, "eval_samples_per_second": 11.106, "eval_steps_per_second": 0.74, "step": 54 }, { "epoch": 0.5670103092783505, "grad_norm": 0.05770307034254074, "learning_rate": 0.0001931034482758621, "loss": 0.9377, "step": 55 }, { "epoch": 0.5670103092783505, "eval_loss": 1.4104657173156738, "eval_runtime": 12.2103, "eval_samples_per_second": 11.056, "eval_steps_per_second": 0.737, "step": 55 }, { "epoch": 0.5773195876288659, "grad_norm": 0.05812888965010643, "learning_rate": 0.00018850574712643678, "loss": 0.9139, "step": 56 }, { "epoch": 0.5773195876288659, "eval_loss": 1.4152581691741943, "eval_runtime": 12.1515, "eval_samples_per_second": 11.11, "eval_steps_per_second": 0.741, "step": 56 }, { "epoch": 0.5876288659793815, "grad_norm": 0.0625990554690361, "learning_rate": 0.0001839080459770115, "loss": 0.9111, "step": 57 }, { "epoch": 0.5876288659793815, "eval_loss": 1.4142401218414307, "eval_runtime": 12.208, "eval_samples_per_second": 11.058, "eval_steps_per_second": 0.737, "step": 57 }, { "epoch": 0.5979381443298969, "grad_norm": 0.06904823333024979, "learning_rate": 0.0001793103448275862, "loss": 0.9019, "step": 58 }, { "epoch": 0.5979381443298969, "eval_loss": 1.4036115407943726, "eval_runtime": 12.1626, "eval_samples_per_second": 11.1, "eval_steps_per_second": 0.74, "step": 58 }, { "epoch": 0.6082474226804123, "grad_norm": 0.06511174887418747, "learning_rate": 0.00017471264367816095, "loss": 0.9409, "step": 59 }, { "epoch": 0.6082474226804123, "eval_loss": 1.3875020742416382, "eval_runtime": 12.1594, "eval_samples_per_second": 11.103, "eval_steps_per_second": 0.74, "step": 59 }, { "epoch": 0.6185567010309279, "grad_norm": 0.06755220890045166, "learning_rate": 0.00017011494252873563, "loss": 0.8928, "step": 60 }, { "epoch": 0.6185567010309279, "eval_loss": 1.3814879655838013, "eval_runtime": 12.1832, "eval_samples_per_second": 11.081, "eval_steps_per_second": 0.739, "step": 60 }, { "epoch": 0.6288659793814433, "grad_norm": 0.057419709861278534, "learning_rate": 0.00016551724137931035, "loss": 0.8698, "step": 61 }, { "epoch": 0.6288659793814433, "eval_loss": 1.376993179321289, "eval_runtime": 12.1649, "eval_samples_per_second": 11.097, "eval_steps_per_second": 0.74, "step": 61 }, { "epoch": 0.6391752577319587, "grad_norm": 0.09423535317182541, "learning_rate": 0.00016091954022988506, "loss": 0.9605, "step": 62 }, { "epoch": 0.6391752577319587, "eval_loss": 1.3864612579345703, "eval_runtime": 12.1886, "eval_samples_per_second": 11.076, "eval_steps_per_second": 0.738, "step": 62 }, { "epoch": 0.6494845360824743, "grad_norm": 0.05667712539434433, "learning_rate": 0.0001563218390804598, "loss": 0.9863, "step": 63 }, { "epoch": 0.6494845360824743, "eval_loss": 1.3983639478683472, "eval_runtime": 12.1418, "eval_samples_per_second": 11.119, "eval_steps_per_second": 0.741, "step": 63 }, { "epoch": 0.6597938144329897, "grad_norm": 0.061302803456783295, "learning_rate": 0.00015172413793103449, "loss": 0.9454, "step": 64 }, { "epoch": 0.6597938144329897, "eval_loss": 1.406610369682312, "eval_runtime": 12.2718, "eval_samples_per_second": 11.001, "eval_steps_per_second": 0.733, "step": 64 }, { "epoch": 0.6701030927835051, "grad_norm": 0.06619007140398026, "learning_rate": 0.0001471264367816092, "loss": 0.9302, "step": 65 }, { "epoch": 0.6701030927835051, "eval_loss": 1.408695936203003, "eval_runtime": 12.1873, "eval_samples_per_second": 11.077, "eval_steps_per_second": 0.738, "step": 65 }, { "epoch": 0.6804123711340206, "grad_norm": 0.059212010353803635, "learning_rate": 0.0001425287356321839, "loss": 0.9409, "step": 66 }, { "epoch": 0.6804123711340206, "eval_loss": 1.4100947380065918, "eval_runtime": 12.2058, "eval_samples_per_second": 11.06, "eval_steps_per_second": 0.737, "step": 66 }, { "epoch": 0.6907216494845361, "grad_norm": 0.06854245811700821, "learning_rate": 0.00013793103448275863, "loss": 0.9408, "step": 67 }, { "epoch": 0.6907216494845361, "eval_loss": 1.4050439596176147, "eval_runtime": 12.1478, "eval_samples_per_second": 11.113, "eval_steps_per_second": 0.741, "step": 67 }, { "epoch": 0.7010309278350515, "grad_norm": 0.05722883343696594, "learning_rate": 0.00013333333333333334, "loss": 0.91, "step": 68 }, { "epoch": 0.7010309278350515, "eval_loss": 1.397505283355713, "eval_runtime": 12.1678, "eval_samples_per_second": 11.095, "eval_steps_per_second": 0.74, "step": 68 }, { "epoch": 0.711340206185567, "grad_norm": 0.07448893785476685, "learning_rate": 0.00012873563218390805, "loss": 0.9451, "step": 69 }, { "epoch": 0.711340206185567, "eval_loss": 1.3847121000289917, "eval_runtime": 12.1444, "eval_samples_per_second": 11.116, "eval_steps_per_second": 0.741, "step": 69 }, { "epoch": 0.7216494845360825, "grad_norm": 0.055692195892333984, "learning_rate": 0.00012413793103448277, "loss": 0.8967, "step": 70 }, { "epoch": 0.7216494845360825, "eval_loss": 1.376731038093567, "eval_runtime": 12.1911, "eval_samples_per_second": 11.074, "eval_steps_per_second": 0.738, "step": 70 }, { "epoch": 0.7319587628865979, "grad_norm": 0.06589022278785706, "learning_rate": 0.00011954022988505748, "loss": 0.8795, "step": 71 }, { "epoch": 0.7319587628865979, "eval_loss": 1.3753036260604858, "eval_runtime": 12.1728, "eval_samples_per_second": 11.09, "eval_steps_per_second": 0.739, "step": 71 }, { "epoch": 0.7422680412371134, "grad_norm": 0.12176728248596191, "learning_rate": 0.00011494252873563218, "loss": 0.9092, "step": 72 }, { "epoch": 0.7422680412371134, "eval_loss": 1.3911007642745972, "eval_runtime": 12.1946, "eval_samples_per_second": 11.07, "eval_steps_per_second": 0.738, "step": 72 }, { "epoch": 0.7525773195876289, "grad_norm": 0.05275936424732208, "learning_rate": 0.0001103448275862069, "loss": 0.9621, "step": 73 }, { "epoch": 0.7525773195876289, "eval_loss": 1.407221794128418, "eval_runtime": 12.2388, "eval_samples_per_second": 11.031, "eval_steps_per_second": 0.735, "step": 73 }, { "epoch": 0.7628865979381443, "grad_norm": 0.06748662143945694, "learning_rate": 0.00010574712643678162, "loss": 0.9154, "step": 74 }, { "epoch": 0.7628865979381443, "eval_loss": 1.4170591831207275, "eval_runtime": 12.1869, "eval_samples_per_second": 11.078, "eval_steps_per_second": 0.739, "step": 74 }, { "epoch": 0.7731958762886598, "grad_norm": 0.0736880972981453, "learning_rate": 0.00010114942528735633, "loss": 0.911, "step": 75 }, { "epoch": 0.7731958762886598, "eval_loss": 1.4201780557632446, "eval_runtime": 12.2248, "eval_samples_per_second": 11.043, "eval_steps_per_second": 0.736, "step": 75 }, { "epoch": 0.7835051546391752, "grad_norm": 0.05896177887916565, "learning_rate": 9.655172413793105e-05, "loss": 0.9412, "step": 76 }, { "epoch": 0.7835051546391752, "eval_loss": 1.4200078248977661, "eval_runtime": 12.1599, "eval_samples_per_second": 11.102, "eval_steps_per_second": 0.74, "step": 76 }, { "epoch": 0.7938144329896907, "grad_norm": 0.06385839730501175, "learning_rate": 9.195402298850575e-05, "loss": 0.8999, "step": 77 }, { "epoch": 0.7938144329896907, "eval_loss": 1.4164679050445557, "eval_runtime": 12.1435, "eval_samples_per_second": 11.117, "eval_steps_per_second": 0.741, "step": 77 }, { "epoch": 0.8041237113402062, "grad_norm": 0.0656963661313057, "learning_rate": 8.735632183908047e-05, "loss": 0.8924, "step": 78 }, { "epoch": 0.8041237113402062, "eval_loss": 1.4131464958190918, "eval_runtime": 12.147, "eval_samples_per_second": 11.114, "eval_steps_per_second": 0.741, "step": 78 }, { "epoch": 0.8144329896907216, "grad_norm": 0.07376889884471893, "learning_rate": 8.275862068965517e-05, "loss": 0.9304, "step": 79 }, { "epoch": 0.8144329896907216, "eval_loss": 1.4098708629608154, "eval_runtime": 12.1509, "eval_samples_per_second": 11.11, "eval_steps_per_second": 0.741, "step": 79 }, { "epoch": 0.8247422680412371, "grad_norm": 0.06411939859390259, "learning_rate": 7.81609195402299e-05, "loss": 0.9216, "step": 80 }, { "epoch": 0.8247422680412371, "eval_loss": 1.4088366031646729, "eval_runtime": 12.1696, "eval_samples_per_second": 11.093, "eval_steps_per_second": 0.74, "step": 80 }, { "epoch": 0.8350515463917526, "grad_norm": 0.06034550443291664, "learning_rate": 7.35632183908046e-05, "loss": 0.8914, "step": 81 }, { "epoch": 0.8350515463917526, "eval_loss": 1.4062119722366333, "eval_runtime": 12.1085, "eval_samples_per_second": 11.149, "eval_steps_per_second": 0.743, "step": 81 }, { "epoch": 0.845360824742268, "grad_norm": 0.06504890322685242, "learning_rate": 6.896551724137931e-05, "loss": 0.9608, "step": 82 }, { "epoch": 0.845360824742268, "eval_loss": 1.4031813144683838, "eval_runtime": 12.1849, "eval_samples_per_second": 11.079, "eval_steps_per_second": 0.739, "step": 82 }, { "epoch": 0.8556701030927835, "grad_norm": 0.05961364135146141, "learning_rate": 6.436781609195403e-05, "loss": 0.8992, "step": 83 }, { "epoch": 0.8556701030927835, "eval_loss": 1.4021321535110474, "eval_runtime": 12.2033, "eval_samples_per_second": 11.063, "eval_steps_per_second": 0.738, "step": 83 }, { "epoch": 0.865979381443299, "grad_norm": 0.06472059339284897, "learning_rate": 5.977011494252874e-05, "loss": 0.9458, "step": 84 }, { "epoch": 0.865979381443299, "eval_loss": 1.3988347053527832, "eval_runtime": 12.169, "eval_samples_per_second": 11.094, "eval_steps_per_second": 0.74, "step": 84 }, { "epoch": 0.8762886597938144, "grad_norm": 0.05986656993627548, "learning_rate": 5.517241379310345e-05, "loss": 0.8628, "step": 85 }, { "epoch": 0.8762886597938144, "eval_loss": 1.3969188928604126, "eval_runtime": 12.2292, "eval_samples_per_second": 11.039, "eval_steps_per_second": 0.736, "step": 85 }, { "epoch": 0.8865979381443299, "grad_norm": 0.062148451805114746, "learning_rate": 5.057471264367817e-05, "loss": 0.8841, "step": 86 }, { "epoch": 0.8865979381443299, "eval_loss": 1.3965938091278076, "eval_runtime": 12.1883, "eval_samples_per_second": 11.076, "eval_steps_per_second": 0.738, "step": 86 }, { "epoch": 0.8969072164948454, "grad_norm": 0.05559258908033371, "learning_rate": 4.597701149425287e-05, "loss": 0.8883, "step": 87 }, { "epoch": 0.8969072164948454, "eval_loss": 1.3965078592300415, "eval_runtime": 12.1185, "eval_samples_per_second": 11.14, "eval_steps_per_second": 0.743, "step": 87 }, { "epoch": 0.9072164948453608, "grad_norm": 0.05684094876050949, "learning_rate": 4.1379310344827587e-05, "loss": 0.8765, "step": 88 }, { "epoch": 0.9072164948453608, "eval_loss": 1.3967227935791016, "eval_runtime": 12.1899, "eval_samples_per_second": 11.075, "eval_steps_per_second": 0.738, "step": 88 }, { "epoch": 0.9175257731958762, "grad_norm": 0.05952519550919533, "learning_rate": 3.67816091954023e-05, "loss": 0.8598, "step": 89 }, { "epoch": 0.9175257731958762, "eval_loss": 1.3951915502548218, "eval_runtime": 12.1745, "eval_samples_per_second": 11.089, "eval_steps_per_second": 0.739, "step": 89 }, { "epoch": 0.9278350515463918, "grad_norm": 0.06364478170871735, "learning_rate": 3.218390804597701e-05, "loss": 0.9653, "step": 90 }, { "epoch": 0.9278350515463918, "eval_loss": 1.394257664680481, "eval_runtime": 12.2896, "eval_samples_per_second": 10.985, "eval_steps_per_second": 0.732, "step": 90 }, { "epoch": 0.9381443298969072, "grad_norm": 0.06441052258014679, "learning_rate": 2.7586206896551727e-05, "loss": 0.9397, "step": 91 }, { "epoch": 0.9381443298969072, "eval_loss": 1.392314076423645, "eval_runtime": 12.2278, "eval_samples_per_second": 11.04, "eval_steps_per_second": 0.736, "step": 91 }, { "epoch": 0.9484536082474226, "grad_norm": 0.06320352107286453, "learning_rate": 2.2988505747126437e-05, "loss": 0.8635, "step": 92 }, { "epoch": 0.9484536082474226, "eval_loss": 1.3919230699539185, "eval_runtime": 12.1765, "eval_samples_per_second": 11.087, "eval_steps_per_second": 0.739, "step": 92 }, { "epoch": 0.9587628865979382, "grad_norm": 0.062386397272348404, "learning_rate": 1.839080459770115e-05, "loss": 0.9257, "step": 93 }, { "epoch": 0.9587628865979382, "eval_loss": 1.3923670053482056, "eval_runtime": 12.2566, "eval_samples_per_second": 11.014, "eval_steps_per_second": 0.734, "step": 93 }, { "epoch": 0.9690721649484536, "grad_norm": 0.05672856792807579, "learning_rate": 1.3793103448275863e-05, "loss": 0.8754, "step": 94 }, { "epoch": 0.9690721649484536, "eval_loss": 1.393159031867981, "eval_runtime": 12.1859, "eval_samples_per_second": 11.078, "eval_steps_per_second": 0.739, "step": 94 }, { "epoch": 0.979381443298969, "grad_norm": 0.06344141811132431, "learning_rate": 9.195402298850575e-06, "loss": 0.9454, "step": 95 }, { "epoch": 0.979381443298969, "eval_loss": 1.3938028812408447, "eval_runtime": 12.1382, "eval_samples_per_second": 11.122, "eval_steps_per_second": 0.741, "step": 95 }, { "epoch": 0.9896907216494846, "grad_norm": 0.06258992105722427, "learning_rate": 4.5977011494252875e-06, "loss": 0.9112, "step": 96 }, { "epoch": 0.9896907216494846, "eval_loss": 1.394579529762268, "eval_runtime": 12.1768, "eval_samples_per_second": 11.087, "eval_steps_per_second": 0.739, "step": 96 }, { "epoch": 1.0, "grad_norm": 0.08749664574861526, "learning_rate": 0.0, "loss": 0.8526, "step": 97 }, { "epoch": 1.0, "eval_loss": 1.3950037956237793, "eval_runtime": 12.1878, "eval_samples_per_second": 11.077, "eval_steps_per_second": 0.738, "step": 97 }, { "epoch": 1.0, "step": 97, "total_flos": 3.573314566697779e+16, "train_loss": 1.1783372968742527, "train_runtime": 2093.8635, "train_samples_per_second": 1.472, "train_steps_per_second": 0.046 } ], "logging_steps": 1, "max_steps": 97, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.573314566697779e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }