|
{ |
|
"best_metric": 1.3649392127990723, |
|
"best_model_checkpoint": "./output/checkpoints/2024-06-11_11-02-23/checkpoint-50", |
|
"epoch": 1.0, |
|
"eval_steps": 1, |
|
"global_step": 97, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010309278350515464, |
|
"grad_norm": 2.784351348876953, |
|
"learning_rate": 4e-05, |
|
"loss": 5.0719, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010309278350515464, |
|
"eval_loss": 4.890735149383545, |
|
"eval_runtime": 11.998, |
|
"eval_samples_per_second": 11.252, |
|
"eval_steps_per_second": 0.75, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.020618556701030927, |
|
"grad_norm": 2.8573310375213623, |
|
"learning_rate": 8e-05, |
|
"loss": 4.9508, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.020618556701030927, |
|
"eval_loss": 4.775210857391357, |
|
"eval_runtime": 12.0698, |
|
"eval_samples_per_second": 11.185, |
|
"eval_steps_per_second": 0.746, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.030927835051546393, |
|
"grad_norm": 2.920828104019165, |
|
"learning_rate": 0.00012, |
|
"loss": 4.973, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.030927835051546393, |
|
"eval_loss": 4.2439093589782715, |
|
"eval_runtime": 12.1674, |
|
"eval_samples_per_second": 11.095, |
|
"eval_steps_per_second": 0.74, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.041237113402061855, |
|
"grad_norm": 3.0682826042175293, |
|
"learning_rate": 0.00016, |
|
"loss": 4.3398, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.041237113402061855, |
|
"eval_loss": 3.316483497619629, |
|
"eval_runtime": 12.1421, |
|
"eval_samples_per_second": 11.118, |
|
"eval_steps_per_second": 0.741, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05154639175257732, |
|
"grad_norm": 3.1937592029571533, |
|
"learning_rate": 0.0002, |
|
"loss": 3.253, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05154639175257732, |
|
"eval_loss": 2.4086239337921143, |
|
"eval_runtime": 12.0796, |
|
"eval_samples_per_second": 11.176, |
|
"eval_steps_per_second": 0.745, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.061855670103092786, |
|
"grad_norm": 2.0619874000549316, |
|
"learning_rate": 0.00024, |
|
"loss": 2.3725, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.061855670103092786, |
|
"eval_loss": 1.9427752494812012, |
|
"eval_runtime": 12.1816, |
|
"eval_samples_per_second": 11.082, |
|
"eval_steps_per_second": 0.739, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07216494845360824, |
|
"grad_norm": 1.4178358316421509, |
|
"learning_rate": 0.00028, |
|
"loss": 1.7391, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07216494845360824, |
|
"eval_loss": 1.6751385927200317, |
|
"eval_runtime": 12.2395, |
|
"eval_samples_per_second": 11.03, |
|
"eval_steps_per_second": 0.735, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08247422680412371, |
|
"grad_norm": 0.9345605373382568, |
|
"learning_rate": 0.00032, |
|
"loss": 1.4029, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08247422680412371, |
|
"eval_loss": 1.551221489906311, |
|
"eval_runtime": 12.2329, |
|
"eval_samples_per_second": 11.036, |
|
"eval_steps_per_second": 0.736, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09278350515463918, |
|
"grad_norm": 1.1335052251815796, |
|
"learning_rate": 0.00036, |
|
"loss": 1.4404, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09278350515463918, |
|
"eval_loss": 1.447738528251648, |
|
"eval_runtime": 12.2069, |
|
"eval_samples_per_second": 11.059, |
|
"eval_steps_per_second": 0.737, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.10309278350515463, |
|
"grad_norm": 0.23779241740703583, |
|
"learning_rate": 0.0004, |
|
"loss": 1.3197, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10309278350515463, |
|
"eval_loss": 1.4138007164001465, |
|
"eval_runtime": 12.2609, |
|
"eval_samples_per_second": 11.011, |
|
"eval_steps_per_second": 0.734, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1134020618556701, |
|
"grad_norm": 0.19672752916812897, |
|
"learning_rate": 0.00039540229885057476, |
|
"loss": 1.2882, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1134020618556701, |
|
"eval_loss": 1.393249750137329, |
|
"eval_runtime": 12.1836, |
|
"eval_samples_per_second": 11.08, |
|
"eval_steps_per_second": 0.739, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12371134020618557, |
|
"grad_norm": 0.2525981366634369, |
|
"learning_rate": 0.00039080459770114945, |
|
"loss": 1.2831, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.12371134020618557, |
|
"eval_loss": 1.4042422771453857, |
|
"eval_runtime": 12.232, |
|
"eval_samples_per_second": 11.037, |
|
"eval_steps_per_second": 0.736, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13402061855670103, |
|
"grad_norm": 0.23460708558559418, |
|
"learning_rate": 0.0003862068965517242, |
|
"loss": 1.1615, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.13402061855670103, |
|
"eval_loss": 1.4705734252929688, |
|
"eval_runtime": 12.2648, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 0.734, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.14432989690721648, |
|
"grad_norm": 0.21198733150959015, |
|
"learning_rate": 0.00038160919540229887, |
|
"loss": 1.0625, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.14432989690721648, |
|
"eval_loss": 1.5055038928985596, |
|
"eval_runtime": 12.2044, |
|
"eval_samples_per_second": 11.062, |
|
"eval_steps_per_second": 0.737, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.15463917525773196, |
|
"grad_norm": 0.3658374845981598, |
|
"learning_rate": 0.00037701149425287356, |
|
"loss": 1.1191, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.15463917525773196, |
|
"eval_loss": 1.4498964548110962, |
|
"eval_runtime": 12.2004, |
|
"eval_samples_per_second": 11.065, |
|
"eval_steps_per_second": 0.738, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16494845360824742, |
|
"grad_norm": 0.222326397895813, |
|
"learning_rate": 0.0003724137931034483, |
|
"loss": 1.0887, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.16494845360824742, |
|
"eval_loss": 1.3819501399993896, |
|
"eval_runtime": 12.1444, |
|
"eval_samples_per_second": 11.116, |
|
"eval_steps_per_second": 0.741, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.17525773195876287, |
|
"grad_norm": 0.1682404726743698, |
|
"learning_rate": 0.000367816091954023, |
|
"loss": 1.0915, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.17525773195876287, |
|
"eval_loss": 1.3552738428115845, |
|
"eval_runtime": 12.2121, |
|
"eval_samples_per_second": 11.055, |
|
"eval_steps_per_second": 0.737, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.18556701030927836, |
|
"grad_norm": 0.15567483007907867, |
|
"learning_rate": 0.0003632183908045977, |
|
"loss": 1.0509, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.18556701030927836, |
|
"eval_loss": 1.3488575220108032, |
|
"eval_runtime": 12.2988, |
|
"eval_samples_per_second": 10.977, |
|
"eval_steps_per_second": 0.732, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1958762886597938, |
|
"grad_norm": 0.1600300371646881, |
|
"learning_rate": 0.0003586206896551724, |
|
"loss": 0.9982, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1958762886597938, |
|
"eval_loss": 1.3651177883148193, |
|
"eval_runtime": 12.1263, |
|
"eval_samples_per_second": 11.133, |
|
"eval_steps_per_second": 0.742, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.1044178307056427, |
|
"learning_rate": 0.00035402298850574715, |
|
"loss": 0.9809, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"eval_loss": 1.401971697807312, |
|
"eval_runtime": 12.2092, |
|
"eval_samples_per_second": 11.057, |
|
"eval_steps_per_second": 0.737, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21649484536082475, |
|
"grad_norm": 0.12360141426324844, |
|
"learning_rate": 0.0003494252873563219, |
|
"loss": 1.0549, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.21649484536082475, |
|
"eval_loss": 1.426545262336731, |
|
"eval_runtime": 12.1581, |
|
"eval_samples_per_second": 11.104, |
|
"eval_steps_per_second": 0.74, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2268041237113402, |
|
"grad_norm": 0.12509943544864655, |
|
"learning_rate": 0.0003448275862068965, |
|
"loss": 1.0323, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2268041237113402, |
|
"eval_loss": 1.4258630275726318, |
|
"eval_runtime": 12.201, |
|
"eval_samples_per_second": 11.065, |
|
"eval_steps_per_second": 0.738, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.23711340206185566, |
|
"grad_norm": 0.13586747646331787, |
|
"learning_rate": 0.00034022988505747127, |
|
"loss": 1.0746, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.23711340206185566, |
|
"eval_loss": 1.3952091932296753, |
|
"eval_runtime": 12.1932, |
|
"eval_samples_per_second": 11.072, |
|
"eval_steps_per_second": 0.738, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.24742268041237114, |
|
"grad_norm": 0.08069202303886414, |
|
"learning_rate": 0.000335632183908046, |
|
"loss": 0.9645, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.24742268041237114, |
|
"eval_loss": 1.376875638961792, |
|
"eval_runtime": 12.2172, |
|
"eval_samples_per_second": 11.05, |
|
"eval_steps_per_second": 0.737, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.25773195876288657, |
|
"grad_norm": 0.09109444171190262, |
|
"learning_rate": 0.0003310344827586207, |
|
"loss": 0.9345, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.25773195876288657, |
|
"eval_loss": 1.373625636100769, |
|
"eval_runtime": 12.2642, |
|
"eval_samples_per_second": 11.008, |
|
"eval_steps_per_second": 0.734, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.26804123711340205, |
|
"grad_norm": 0.0649966150522232, |
|
"learning_rate": 0.00032643678160919543, |
|
"loss": 1.0266, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.26804123711340205, |
|
"eval_loss": 1.3720897436141968, |
|
"eval_runtime": 12.1576, |
|
"eval_samples_per_second": 11.104, |
|
"eval_steps_per_second": 0.74, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.27835051546391754, |
|
"grad_norm": 0.09308775514364243, |
|
"learning_rate": 0.0003218390804597701, |
|
"loss": 0.9797, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.27835051546391754, |
|
"eval_loss": 1.3858685493469238, |
|
"eval_runtime": 12.1546, |
|
"eval_samples_per_second": 11.107, |
|
"eval_steps_per_second": 0.74, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.28865979381443296, |
|
"grad_norm": 0.06654678285121918, |
|
"learning_rate": 0.00031724137931034486, |
|
"loss": 1.0072, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.28865979381443296, |
|
"eval_loss": 1.4031970500946045, |
|
"eval_runtime": 12.1666, |
|
"eval_samples_per_second": 11.096, |
|
"eval_steps_per_second": 0.74, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.29896907216494845, |
|
"grad_norm": 0.07720344513654709, |
|
"learning_rate": 0.0003126436781609196, |
|
"loss": 0.923, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.29896907216494845, |
|
"eval_loss": 1.4118249416351318, |
|
"eval_runtime": 12.1572, |
|
"eval_samples_per_second": 11.105, |
|
"eval_steps_per_second": 0.74, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.30927835051546393, |
|
"grad_norm": 0.10230278223752975, |
|
"learning_rate": 0.00030804597701149423, |
|
"loss": 0.9821, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.30927835051546393, |
|
"eval_loss": 1.4045817852020264, |
|
"eval_runtime": 12.1974, |
|
"eval_samples_per_second": 11.068, |
|
"eval_steps_per_second": 0.738, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.31958762886597936, |
|
"grad_norm": 0.07451125234365463, |
|
"learning_rate": 0.00030344827586206897, |
|
"loss": 1.0021, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.31958762886597936, |
|
"eval_loss": 1.391209363937378, |
|
"eval_runtime": 12.2389, |
|
"eval_samples_per_second": 11.03, |
|
"eval_steps_per_second": 0.735, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.32989690721649484, |
|
"grad_norm": 0.0714351087808609, |
|
"learning_rate": 0.00029885057471264366, |
|
"loss": 1.0071, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.32989690721649484, |
|
"eval_loss": 1.3730576038360596, |
|
"eval_runtime": 12.2102, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 0.737, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3402061855670103, |
|
"grad_norm": 0.06839103996753693, |
|
"learning_rate": 0.0002942528735632184, |
|
"loss": 0.973, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3402061855670103, |
|
"eval_loss": 1.3659778833389282, |
|
"eval_runtime": 12.2323, |
|
"eval_samples_per_second": 11.036, |
|
"eval_steps_per_second": 0.736, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.35051546391752575, |
|
"grad_norm": 0.08078178018331528, |
|
"learning_rate": 0.00028965517241379314, |
|
"loss": 0.964, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.35051546391752575, |
|
"eval_loss": 1.3762452602386475, |
|
"eval_runtime": 12.2027, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 0.738, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.36082474226804123, |
|
"grad_norm": 0.06870069354772568, |
|
"learning_rate": 0.0002850574712643678, |
|
"loss": 0.9148, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.36082474226804123, |
|
"eval_loss": 1.3925738334655762, |
|
"eval_runtime": 12.2644, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 0.734, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3711340206185567, |
|
"grad_norm": 0.06974003463983536, |
|
"learning_rate": 0.00028045977011494257, |
|
"loss": 1.0128, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3711340206185567, |
|
"eval_loss": 1.4087179899215698, |
|
"eval_runtime": 12.2407, |
|
"eval_samples_per_second": 11.029, |
|
"eval_steps_per_second": 0.735, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.38144329896907214, |
|
"grad_norm": 0.08603405207395554, |
|
"learning_rate": 0.00027586206896551725, |
|
"loss": 0.9776, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.38144329896907214, |
|
"eval_loss": 1.4067703485488892, |
|
"eval_runtime": 12.173, |
|
"eval_samples_per_second": 11.09, |
|
"eval_steps_per_second": 0.739, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.3917525773195876, |
|
"grad_norm": 0.07761300355195999, |
|
"learning_rate": 0.00027126436781609194, |
|
"loss": 0.9655, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3917525773195876, |
|
"eval_loss": 1.3932013511657715, |
|
"eval_runtime": 12.1941, |
|
"eval_samples_per_second": 11.071, |
|
"eval_steps_per_second": 0.738, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.4020618556701031, |
|
"grad_norm": 0.06392566114664078, |
|
"learning_rate": 0.0002666666666666667, |
|
"loss": 0.974, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4020618556701031, |
|
"eval_loss": 1.3819767236709595, |
|
"eval_runtime": 12.1765, |
|
"eval_samples_per_second": 11.087, |
|
"eval_steps_per_second": 0.739, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 0.05517549812793732, |
|
"learning_rate": 0.00026206896551724137, |
|
"loss": 0.9793, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"eval_loss": 1.3717000484466553, |
|
"eval_runtime": 12.2217, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 0.736, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.422680412371134, |
|
"grad_norm": 0.0804053246974945, |
|
"learning_rate": 0.0002574712643678161, |
|
"loss": 0.9585, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.422680412371134, |
|
"eval_loss": 1.3579998016357422, |
|
"eval_runtime": 12.182, |
|
"eval_samples_per_second": 11.082, |
|
"eval_steps_per_second": 0.739, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4329896907216495, |
|
"grad_norm": 0.07214821875095367, |
|
"learning_rate": 0.0002528735632183908, |
|
"loss": 0.9332, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.4329896907216495, |
|
"eval_loss": 1.3583327531814575, |
|
"eval_runtime": 12.1257, |
|
"eval_samples_per_second": 11.133, |
|
"eval_steps_per_second": 0.742, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.44329896907216493, |
|
"grad_norm": 0.07595060020685196, |
|
"learning_rate": 0.00024827586206896553, |
|
"loss": 0.8998, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.44329896907216493, |
|
"eval_loss": 1.3721704483032227, |
|
"eval_runtime": 12.2745, |
|
"eval_samples_per_second": 10.998, |
|
"eval_steps_per_second": 0.733, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4536082474226804, |
|
"grad_norm": 0.07757716625928879, |
|
"learning_rate": 0.00024367816091954025, |
|
"loss": 0.9661, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4536082474226804, |
|
"eval_loss": 1.3984168767929077, |
|
"eval_runtime": 12.1398, |
|
"eval_samples_per_second": 11.12, |
|
"eval_steps_per_second": 0.741, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4639175257731959, |
|
"grad_norm": 0.053873661905527115, |
|
"learning_rate": 0.00023908045977011496, |
|
"loss": 0.9418, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4639175257731959, |
|
"eval_loss": 1.4266961812973022, |
|
"eval_runtime": 12.1955, |
|
"eval_samples_per_second": 11.07, |
|
"eval_steps_per_second": 0.738, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4742268041237113, |
|
"grad_norm": 0.08943776786327362, |
|
"learning_rate": 0.00023448275862068965, |
|
"loss": 0.9309, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4742268041237113, |
|
"eval_loss": 1.4349451065063477, |
|
"eval_runtime": 12.1983, |
|
"eval_samples_per_second": 11.067, |
|
"eval_steps_per_second": 0.738, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4845360824742268, |
|
"grad_norm": 0.0885058343410492, |
|
"learning_rate": 0.00022988505747126436, |
|
"loss": 1.0245, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4845360824742268, |
|
"eval_loss": 1.4226434230804443, |
|
"eval_runtime": 12.1753, |
|
"eval_samples_per_second": 11.088, |
|
"eval_steps_per_second": 0.739, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4948453608247423, |
|
"grad_norm": 0.058818139135837555, |
|
"learning_rate": 0.00022528735632183907, |
|
"loss": 0.9007, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.4948453608247423, |
|
"eval_loss": 1.4033018350601196, |
|
"eval_runtime": 12.267, |
|
"eval_samples_per_second": 11.005, |
|
"eval_steps_per_second": 0.734, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5051546391752577, |
|
"grad_norm": 0.07104739546775818, |
|
"learning_rate": 0.0002206896551724138, |
|
"loss": 0.9469, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5051546391752577, |
|
"eval_loss": 1.3786002397537231, |
|
"eval_runtime": 12.2731, |
|
"eval_samples_per_second": 11.0, |
|
"eval_steps_per_second": 0.733, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5154639175257731, |
|
"grad_norm": 0.05872216075658798, |
|
"learning_rate": 0.00021609195402298853, |
|
"loss": 0.9671, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5154639175257731, |
|
"eval_loss": 1.3649392127990723, |
|
"eval_runtime": 12.1662, |
|
"eval_samples_per_second": 11.096, |
|
"eval_steps_per_second": 0.74, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5257731958762887, |
|
"grad_norm": 0.07843936234712601, |
|
"learning_rate": 0.00021149425287356324, |
|
"loss": 0.9052, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5257731958762887, |
|
"eval_loss": 1.3671882152557373, |
|
"eval_runtime": 12.1527, |
|
"eval_samples_per_second": 11.109, |
|
"eval_steps_per_second": 0.741, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5360824742268041, |
|
"grad_norm": 0.07407287508249283, |
|
"learning_rate": 0.00020689655172413795, |
|
"loss": 0.9221, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5360824742268041, |
|
"eval_loss": 1.3811315298080444, |
|
"eval_runtime": 12.1661, |
|
"eval_samples_per_second": 11.096, |
|
"eval_steps_per_second": 0.74, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5463917525773195, |
|
"grad_norm": 0.06168922409415245, |
|
"learning_rate": 0.00020229885057471267, |
|
"loss": 0.9809, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5463917525773195, |
|
"eval_loss": 1.3900806903839111, |
|
"eval_runtime": 12.1958, |
|
"eval_samples_per_second": 11.069, |
|
"eval_steps_per_second": 0.738, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5567010309278351, |
|
"grad_norm": 0.05980532988905907, |
|
"learning_rate": 0.00019770114942528738, |
|
"loss": 0.9492, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5567010309278351, |
|
"eval_loss": 1.4011154174804688, |
|
"eval_runtime": 12.1559, |
|
"eval_samples_per_second": 11.106, |
|
"eval_steps_per_second": 0.74, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5670103092783505, |
|
"grad_norm": 0.05770307034254074, |
|
"learning_rate": 0.0001931034482758621, |
|
"loss": 0.9377, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5670103092783505, |
|
"eval_loss": 1.4104657173156738, |
|
"eval_runtime": 12.2103, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 0.737, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5773195876288659, |
|
"grad_norm": 0.05812888965010643, |
|
"learning_rate": 0.00018850574712643678, |
|
"loss": 0.9139, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5773195876288659, |
|
"eval_loss": 1.4152581691741943, |
|
"eval_runtime": 12.1515, |
|
"eval_samples_per_second": 11.11, |
|
"eval_steps_per_second": 0.741, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5876288659793815, |
|
"grad_norm": 0.0625990554690361, |
|
"learning_rate": 0.0001839080459770115, |
|
"loss": 0.9111, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5876288659793815, |
|
"eval_loss": 1.4142401218414307, |
|
"eval_runtime": 12.208, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 0.737, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5979381443298969, |
|
"grad_norm": 0.06904823333024979, |
|
"learning_rate": 0.0001793103448275862, |
|
"loss": 0.9019, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5979381443298969, |
|
"eval_loss": 1.4036115407943726, |
|
"eval_runtime": 12.1626, |
|
"eval_samples_per_second": 11.1, |
|
"eval_steps_per_second": 0.74, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.6082474226804123, |
|
"grad_norm": 0.06511174887418747, |
|
"learning_rate": 0.00017471264367816095, |
|
"loss": 0.9409, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6082474226804123, |
|
"eval_loss": 1.3875020742416382, |
|
"eval_runtime": 12.1594, |
|
"eval_samples_per_second": 11.103, |
|
"eval_steps_per_second": 0.74, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"grad_norm": 0.06755220890045166, |
|
"learning_rate": 0.00017011494252873563, |
|
"loss": 0.8928, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"eval_loss": 1.3814879655838013, |
|
"eval_runtime": 12.1832, |
|
"eval_samples_per_second": 11.081, |
|
"eval_steps_per_second": 0.739, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6288659793814433, |
|
"grad_norm": 0.057419709861278534, |
|
"learning_rate": 0.00016551724137931035, |
|
"loss": 0.8698, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6288659793814433, |
|
"eval_loss": 1.376993179321289, |
|
"eval_runtime": 12.1649, |
|
"eval_samples_per_second": 11.097, |
|
"eval_steps_per_second": 0.74, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6391752577319587, |
|
"grad_norm": 0.09423535317182541, |
|
"learning_rate": 0.00016091954022988506, |
|
"loss": 0.9605, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6391752577319587, |
|
"eval_loss": 1.3864612579345703, |
|
"eval_runtime": 12.1886, |
|
"eval_samples_per_second": 11.076, |
|
"eval_steps_per_second": 0.738, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6494845360824743, |
|
"grad_norm": 0.05667712539434433, |
|
"learning_rate": 0.0001563218390804598, |
|
"loss": 0.9863, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6494845360824743, |
|
"eval_loss": 1.3983639478683472, |
|
"eval_runtime": 12.1418, |
|
"eval_samples_per_second": 11.119, |
|
"eval_steps_per_second": 0.741, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6597938144329897, |
|
"grad_norm": 0.061302803456783295, |
|
"learning_rate": 0.00015172413793103449, |
|
"loss": 0.9454, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6597938144329897, |
|
"eval_loss": 1.406610369682312, |
|
"eval_runtime": 12.2718, |
|
"eval_samples_per_second": 11.001, |
|
"eval_steps_per_second": 0.733, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6701030927835051, |
|
"grad_norm": 0.06619007140398026, |
|
"learning_rate": 0.0001471264367816092, |
|
"loss": 0.9302, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6701030927835051, |
|
"eval_loss": 1.408695936203003, |
|
"eval_runtime": 12.1873, |
|
"eval_samples_per_second": 11.077, |
|
"eval_steps_per_second": 0.738, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6804123711340206, |
|
"grad_norm": 0.059212010353803635, |
|
"learning_rate": 0.0001425287356321839, |
|
"loss": 0.9409, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6804123711340206, |
|
"eval_loss": 1.4100947380065918, |
|
"eval_runtime": 12.2058, |
|
"eval_samples_per_second": 11.06, |
|
"eval_steps_per_second": 0.737, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6907216494845361, |
|
"grad_norm": 0.06854245811700821, |
|
"learning_rate": 0.00013793103448275863, |
|
"loss": 0.9408, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6907216494845361, |
|
"eval_loss": 1.4050439596176147, |
|
"eval_runtime": 12.1478, |
|
"eval_samples_per_second": 11.113, |
|
"eval_steps_per_second": 0.741, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7010309278350515, |
|
"grad_norm": 0.05722883343696594, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.91, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.7010309278350515, |
|
"eval_loss": 1.397505283355713, |
|
"eval_runtime": 12.1678, |
|
"eval_samples_per_second": 11.095, |
|
"eval_steps_per_second": 0.74, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.711340206185567, |
|
"grad_norm": 0.07448893785476685, |
|
"learning_rate": 0.00012873563218390805, |
|
"loss": 0.9451, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.711340206185567, |
|
"eval_loss": 1.3847121000289917, |
|
"eval_runtime": 12.1444, |
|
"eval_samples_per_second": 11.116, |
|
"eval_steps_per_second": 0.741, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7216494845360825, |
|
"grad_norm": 0.055692195892333984, |
|
"learning_rate": 0.00012413793103448277, |
|
"loss": 0.8967, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7216494845360825, |
|
"eval_loss": 1.376731038093567, |
|
"eval_runtime": 12.1911, |
|
"eval_samples_per_second": 11.074, |
|
"eval_steps_per_second": 0.738, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7319587628865979, |
|
"grad_norm": 0.06589022278785706, |
|
"learning_rate": 0.00011954022988505748, |
|
"loss": 0.8795, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7319587628865979, |
|
"eval_loss": 1.3753036260604858, |
|
"eval_runtime": 12.1728, |
|
"eval_samples_per_second": 11.09, |
|
"eval_steps_per_second": 0.739, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7422680412371134, |
|
"grad_norm": 0.12176728248596191, |
|
"learning_rate": 0.00011494252873563218, |
|
"loss": 0.9092, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7422680412371134, |
|
"eval_loss": 1.3911007642745972, |
|
"eval_runtime": 12.1946, |
|
"eval_samples_per_second": 11.07, |
|
"eval_steps_per_second": 0.738, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7525773195876289, |
|
"grad_norm": 0.05275936424732208, |
|
"learning_rate": 0.0001103448275862069, |
|
"loss": 0.9621, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7525773195876289, |
|
"eval_loss": 1.407221794128418, |
|
"eval_runtime": 12.2388, |
|
"eval_samples_per_second": 11.031, |
|
"eval_steps_per_second": 0.735, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7628865979381443, |
|
"grad_norm": 0.06748662143945694, |
|
"learning_rate": 0.00010574712643678162, |
|
"loss": 0.9154, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7628865979381443, |
|
"eval_loss": 1.4170591831207275, |
|
"eval_runtime": 12.1869, |
|
"eval_samples_per_second": 11.078, |
|
"eval_steps_per_second": 0.739, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7731958762886598, |
|
"grad_norm": 0.0736880972981453, |
|
"learning_rate": 0.00010114942528735633, |
|
"loss": 0.911, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7731958762886598, |
|
"eval_loss": 1.4201780557632446, |
|
"eval_runtime": 12.2248, |
|
"eval_samples_per_second": 11.043, |
|
"eval_steps_per_second": 0.736, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7835051546391752, |
|
"grad_norm": 0.05896177887916565, |
|
"learning_rate": 9.655172413793105e-05, |
|
"loss": 0.9412, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7835051546391752, |
|
"eval_loss": 1.4200078248977661, |
|
"eval_runtime": 12.1599, |
|
"eval_samples_per_second": 11.102, |
|
"eval_steps_per_second": 0.74, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7938144329896907, |
|
"grad_norm": 0.06385839730501175, |
|
"learning_rate": 9.195402298850575e-05, |
|
"loss": 0.8999, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7938144329896907, |
|
"eval_loss": 1.4164679050445557, |
|
"eval_runtime": 12.1435, |
|
"eval_samples_per_second": 11.117, |
|
"eval_steps_per_second": 0.741, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.8041237113402062, |
|
"grad_norm": 0.0656963661313057, |
|
"learning_rate": 8.735632183908047e-05, |
|
"loss": 0.8924, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8041237113402062, |
|
"eval_loss": 1.4131464958190918, |
|
"eval_runtime": 12.147, |
|
"eval_samples_per_second": 11.114, |
|
"eval_steps_per_second": 0.741, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8144329896907216, |
|
"grad_norm": 0.07376889884471893, |
|
"learning_rate": 8.275862068965517e-05, |
|
"loss": 0.9304, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8144329896907216, |
|
"eval_loss": 1.4098708629608154, |
|
"eval_runtime": 12.1509, |
|
"eval_samples_per_second": 11.11, |
|
"eval_steps_per_second": 0.741, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 0.06411939859390259, |
|
"learning_rate": 7.81609195402299e-05, |
|
"loss": 0.9216, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"eval_loss": 1.4088366031646729, |
|
"eval_runtime": 12.1696, |
|
"eval_samples_per_second": 11.093, |
|
"eval_steps_per_second": 0.74, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8350515463917526, |
|
"grad_norm": 0.06034550443291664, |
|
"learning_rate": 7.35632183908046e-05, |
|
"loss": 0.8914, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8350515463917526, |
|
"eval_loss": 1.4062119722366333, |
|
"eval_runtime": 12.1085, |
|
"eval_samples_per_second": 11.149, |
|
"eval_steps_per_second": 0.743, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.845360824742268, |
|
"grad_norm": 0.06504890322685242, |
|
"learning_rate": 6.896551724137931e-05, |
|
"loss": 0.9608, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.845360824742268, |
|
"eval_loss": 1.4031813144683838, |
|
"eval_runtime": 12.1849, |
|
"eval_samples_per_second": 11.079, |
|
"eval_steps_per_second": 0.739, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8556701030927835, |
|
"grad_norm": 0.05961364135146141, |
|
"learning_rate": 6.436781609195403e-05, |
|
"loss": 0.8992, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8556701030927835, |
|
"eval_loss": 1.4021321535110474, |
|
"eval_runtime": 12.2033, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 0.738, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.865979381443299, |
|
"grad_norm": 0.06472059339284897, |
|
"learning_rate": 5.977011494252874e-05, |
|
"loss": 0.9458, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.865979381443299, |
|
"eval_loss": 1.3988347053527832, |
|
"eval_runtime": 12.169, |
|
"eval_samples_per_second": 11.094, |
|
"eval_steps_per_second": 0.74, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8762886597938144, |
|
"grad_norm": 0.05986656993627548, |
|
"learning_rate": 5.517241379310345e-05, |
|
"loss": 0.8628, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8762886597938144, |
|
"eval_loss": 1.3969188928604126, |
|
"eval_runtime": 12.2292, |
|
"eval_samples_per_second": 11.039, |
|
"eval_steps_per_second": 0.736, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8865979381443299, |
|
"grad_norm": 0.062148451805114746, |
|
"learning_rate": 5.057471264367817e-05, |
|
"loss": 0.8841, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8865979381443299, |
|
"eval_loss": 1.3965938091278076, |
|
"eval_runtime": 12.1883, |
|
"eval_samples_per_second": 11.076, |
|
"eval_steps_per_second": 0.738, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8969072164948454, |
|
"grad_norm": 0.05559258908033371, |
|
"learning_rate": 4.597701149425287e-05, |
|
"loss": 0.8883, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8969072164948454, |
|
"eval_loss": 1.3965078592300415, |
|
"eval_runtime": 12.1185, |
|
"eval_samples_per_second": 11.14, |
|
"eval_steps_per_second": 0.743, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.9072164948453608, |
|
"grad_norm": 0.05684094876050949, |
|
"learning_rate": 4.1379310344827587e-05, |
|
"loss": 0.8765, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9072164948453608, |
|
"eval_loss": 1.3967227935791016, |
|
"eval_runtime": 12.1899, |
|
"eval_samples_per_second": 11.075, |
|
"eval_steps_per_second": 0.738, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9175257731958762, |
|
"grad_norm": 0.05952519550919533, |
|
"learning_rate": 3.67816091954023e-05, |
|
"loss": 0.8598, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9175257731958762, |
|
"eval_loss": 1.3951915502548218, |
|
"eval_runtime": 12.1745, |
|
"eval_samples_per_second": 11.089, |
|
"eval_steps_per_second": 0.739, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9278350515463918, |
|
"grad_norm": 0.06364478170871735, |
|
"learning_rate": 3.218390804597701e-05, |
|
"loss": 0.9653, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9278350515463918, |
|
"eval_loss": 1.394257664680481, |
|
"eval_runtime": 12.2896, |
|
"eval_samples_per_second": 10.985, |
|
"eval_steps_per_second": 0.732, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9381443298969072, |
|
"grad_norm": 0.06441052258014679, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 0.9397, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9381443298969072, |
|
"eval_loss": 1.392314076423645, |
|
"eval_runtime": 12.2278, |
|
"eval_samples_per_second": 11.04, |
|
"eval_steps_per_second": 0.736, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9484536082474226, |
|
"grad_norm": 0.06320352107286453, |
|
"learning_rate": 2.2988505747126437e-05, |
|
"loss": 0.8635, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9484536082474226, |
|
"eval_loss": 1.3919230699539185, |
|
"eval_runtime": 12.1765, |
|
"eval_samples_per_second": 11.087, |
|
"eval_steps_per_second": 0.739, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9587628865979382, |
|
"grad_norm": 0.062386397272348404, |
|
"learning_rate": 1.839080459770115e-05, |
|
"loss": 0.9257, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9587628865979382, |
|
"eval_loss": 1.3923670053482056, |
|
"eval_runtime": 12.2566, |
|
"eval_samples_per_second": 11.014, |
|
"eval_steps_per_second": 0.734, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9690721649484536, |
|
"grad_norm": 0.05672856792807579, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 0.8754, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9690721649484536, |
|
"eval_loss": 1.393159031867981, |
|
"eval_runtime": 12.1859, |
|
"eval_samples_per_second": 11.078, |
|
"eval_steps_per_second": 0.739, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.979381443298969, |
|
"grad_norm": 0.06344141811132431, |
|
"learning_rate": 9.195402298850575e-06, |
|
"loss": 0.9454, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.979381443298969, |
|
"eval_loss": 1.3938028812408447, |
|
"eval_runtime": 12.1382, |
|
"eval_samples_per_second": 11.122, |
|
"eval_steps_per_second": 0.741, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9896907216494846, |
|
"grad_norm": 0.06258992105722427, |
|
"learning_rate": 4.5977011494252875e-06, |
|
"loss": 0.9112, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9896907216494846, |
|
"eval_loss": 1.394579529762268, |
|
"eval_runtime": 12.1768, |
|
"eval_samples_per_second": 11.087, |
|
"eval_steps_per_second": 0.739, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.08749664574861526, |
|
"learning_rate": 0.0, |
|
"loss": 0.8526, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.3950037956237793, |
|
"eval_runtime": 12.1878, |
|
"eval_samples_per_second": 11.077, |
|
"eval_steps_per_second": 0.738, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 97, |
|
"total_flos": 3.573314566697779e+16, |
|
"train_loss": 1.1783372968742527, |
|
"train_runtime": 2093.8635, |
|
"train_samples_per_second": 1.472, |
|
"train_steps_per_second": 0.046 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 97, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.573314566697779e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|