|
{ |
|
"best_metric": 0.23812708258628845, |
|
"best_model_checkpoint": "./model_outputs/checkpoint-9800", |
|
"epoch": 1.9338753280580934, |
|
"eval_steps": 100, |
|
"global_step": 9800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.019732817649032106, |
|
"grad_norm": 0.579439103603363, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.8467, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.019732817649032106, |
|
"eval_loss": 1.6029163599014282, |
|
"eval_runtime": 0.1393, |
|
"eval_samples_per_second": 35.888, |
|
"eval_steps_per_second": 7.178, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03946563529806421, |
|
"grad_norm": 1.7087633609771729, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.4062, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03946563529806421, |
|
"eval_loss": 1.2955422401428223, |
|
"eval_runtime": 0.0895, |
|
"eval_samples_per_second": 55.895, |
|
"eval_steps_per_second": 11.179, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05919845294709632, |
|
"grad_norm": 2.1446690559387207, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2561, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05919845294709632, |
|
"eval_loss": 1.14933443069458, |
|
"eval_runtime": 0.0896, |
|
"eval_samples_per_second": 55.787, |
|
"eval_steps_per_second": 11.157, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07893127059612842, |
|
"grad_norm": 1.9442692995071411, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.1758, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07893127059612842, |
|
"eval_loss": 1.070350170135498, |
|
"eval_runtime": 0.0899, |
|
"eval_samples_per_second": 55.607, |
|
"eval_steps_per_second": 11.121, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09866408824516053, |
|
"grad_norm": 1.6005765199661255, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.1199, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09866408824516053, |
|
"eval_loss": 1.0037094354629517, |
|
"eval_runtime": 0.0918, |
|
"eval_samples_per_second": 54.446, |
|
"eval_steps_per_second": 10.889, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.11839690589419263, |
|
"grad_norm": 1.4335505962371826, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0773, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.11839690589419263, |
|
"eval_loss": 0.9590722918510437, |
|
"eval_runtime": 0.0897, |
|
"eval_samples_per_second": 55.743, |
|
"eval_steps_per_second": 11.149, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.13812972354322473, |
|
"grad_norm": 1.535965085029602, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 1.0446, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13812972354322473, |
|
"eval_loss": 0.9230692982673645, |
|
"eval_runtime": 0.0893, |
|
"eval_samples_per_second": 55.988, |
|
"eval_steps_per_second": 11.198, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.15786254119225684, |
|
"grad_norm": 1.1867926120758057, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 1.0234, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15786254119225684, |
|
"eval_loss": 0.8973449468612671, |
|
"eval_runtime": 0.0898, |
|
"eval_samples_per_second": 55.673, |
|
"eval_steps_per_second": 11.135, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.17759535884128894, |
|
"grad_norm": 0.9903791546821594, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.0057, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.17759535884128894, |
|
"eval_loss": 0.8900602459907532, |
|
"eval_runtime": 0.0911, |
|
"eval_samples_per_second": 54.888, |
|
"eval_steps_per_second": 10.978, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.19732817649032106, |
|
"grad_norm": 1.0672552585601807, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.9416, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19732817649032106, |
|
"eval_loss": 0.7900083065032959, |
|
"eval_runtime": 0.0909, |
|
"eval_samples_per_second": 54.981, |
|
"eval_steps_per_second": 10.996, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.21706099413935315, |
|
"grad_norm": 1.1501110792160034, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.7217, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.21706099413935315, |
|
"eval_loss": 0.5064653754234314, |
|
"eval_runtime": 0.089, |
|
"eval_samples_per_second": 56.189, |
|
"eval_steps_per_second": 11.238, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.23679381178838527, |
|
"grad_norm": 0.7229278683662415, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5166, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23679381178838527, |
|
"eval_loss": 0.42082151770591736, |
|
"eval_runtime": 0.0889, |
|
"eval_samples_per_second": 56.242, |
|
"eval_steps_per_second": 11.248, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2565266294374174, |
|
"grad_norm": 0.753103494644165, |
|
"learning_rate": 0.00019993817941631932, |
|
"loss": 0.4558, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2565266294374174, |
|
"eval_loss": 0.39703065156936646, |
|
"eval_runtime": 0.09, |
|
"eval_samples_per_second": 55.574, |
|
"eval_steps_per_second": 11.115, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.27625944708644945, |
|
"grad_norm": 0.4141121506690979, |
|
"learning_rate": 0.00019975279410096856, |
|
"loss": 0.4347, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27625944708644945, |
|
"eval_loss": 0.3718082904815674, |
|
"eval_runtime": 0.0891, |
|
"eval_samples_per_second": 56.096, |
|
"eval_steps_per_second": 11.219, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.29599226473548157, |
|
"grad_norm": 0.6031398773193359, |
|
"learning_rate": 0.00019944407326651575, |
|
"loss": 0.4217, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.29599226473548157, |
|
"eval_loss": 0.3526321351528168, |
|
"eval_runtime": 0.0904, |
|
"eval_samples_per_second": 55.331, |
|
"eval_steps_per_second": 11.066, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3157250823845137, |
|
"grad_norm": 0.5605940222740173, |
|
"learning_rate": 0.0001990123986190045, |
|
"loss": 0.4128, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3157250823845137, |
|
"eval_loss": 0.35304129123687744, |
|
"eval_runtime": 0.0901, |
|
"eval_samples_per_second": 55.473, |
|
"eval_steps_per_second": 11.095, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3354579000335458, |
|
"grad_norm": 0.5206743478775024, |
|
"learning_rate": 0.00019845830388600822, |
|
"loss": 0.4054, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3354579000335458, |
|
"eval_loss": 0.34606409072875977, |
|
"eval_runtime": 0.0913, |
|
"eval_samples_per_second": 54.794, |
|
"eval_steps_per_second": 10.959, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3551907176825779, |
|
"grad_norm": 0.46870893239974976, |
|
"learning_rate": 0.000197782474156723, |
|
"loss": 0.4004, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3551907176825779, |
|
"eval_loss": 0.33528828620910645, |
|
"eval_runtime": 0.0919, |
|
"eval_samples_per_second": 54.394, |
|
"eval_steps_per_second": 10.879, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.37492353533161, |
|
"grad_norm": 0.4280480444431305, |
|
"learning_rate": 0.0001969857450349156, |
|
"loss": 0.398, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.37492353533161, |
|
"eval_loss": 0.3284319043159485, |
|
"eval_runtime": 0.089, |
|
"eval_samples_per_second": 56.157, |
|
"eval_steps_per_second": 11.231, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3946563529806421, |
|
"grad_norm": 0.37423691153526306, |
|
"learning_rate": 0.00019606910160577286, |
|
"loss": 0.3932, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3946563529806421, |
|
"eval_loss": 0.32890015840530396, |
|
"eval_runtime": 0.0894, |
|
"eval_samples_per_second": 55.933, |
|
"eval_steps_per_second": 11.187, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.41438917062967423, |
|
"grad_norm": 0.5279616117477417, |
|
"learning_rate": 0.00019503367721793112, |
|
"loss": 0.3902, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.41438917062967423, |
|
"eval_loss": 0.3163706958293915, |
|
"eval_runtime": 0.0913, |
|
"eval_samples_per_second": 54.779, |
|
"eval_steps_per_second": 10.956, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.4341219882787063, |
|
"grad_norm": 0.5170876383781433, |
|
"learning_rate": 0.00019388075208219072, |
|
"loss": 0.3854, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4341219882787063, |
|
"eval_loss": 0.3095114827156067, |
|
"eval_runtime": 0.0905, |
|
"eval_samples_per_second": 55.269, |
|
"eval_steps_per_second": 11.054, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4538548059277384, |
|
"grad_norm": 0.5783583521842957, |
|
"learning_rate": 0.00019261175168864823, |
|
"loss": 0.3845, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.4538548059277384, |
|
"eval_loss": 0.31021368503570557, |
|
"eval_runtime": 0.0899, |
|
"eval_samples_per_second": 55.639, |
|
"eval_steps_per_second": 11.128, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.47358762357677053, |
|
"grad_norm": 0.3570936322212219, |
|
"learning_rate": 0.00019122824504420402, |
|
"loss": 0.3796, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.47358762357677053, |
|
"eval_loss": 0.30086052417755127, |
|
"eval_runtime": 0.0901, |
|
"eval_samples_per_second": 55.499, |
|
"eval_steps_per_second": 11.1, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.49332044122580265, |
|
"grad_norm": 0.3303937613964081, |
|
"learning_rate": 0.0001897319427326239, |
|
"loss": 0.381, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.49332044122580265, |
|
"eval_loss": 0.3088415861129761, |
|
"eval_runtime": 0.0896, |
|
"eval_samples_per_second": 55.778, |
|
"eval_steps_per_second": 11.156, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.5130532588748348, |
|
"grad_norm": 0.39968788623809814, |
|
"learning_rate": 0.00018812469479955306, |
|
"loss": 0.3758, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5130532588748348, |
|
"eval_loss": 0.3069296181201935, |
|
"eval_runtime": 0.0906, |
|
"eval_samples_per_second": 55.215, |
|
"eval_steps_per_second": 11.043, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5327860765238669, |
|
"grad_norm": 0.4014946520328522, |
|
"learning_rate": 0.00018640848846509836, |
|
"loss": 0.3728, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5327860765238669, |
|
"eval_loss": 0.302837997674942, |
|
"eval_runtime": 0.0907, |
|
"eval_samples_per_second": 55.152, |
|
"eval_steps_per_second": 11.03, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5525188941728989, |
|
"grad_norm": 0.3048448860645294, |
|
"learning_rate": 0.00018458544566680613, |
|
"loss": 0.374, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5525188941728989, |
|
"eval_loss": 0.30044040083885193, |
|
"eval_runtime": 0.0918, |
|
"eval_samples_per_second": 54.44, |
|
"eval_steps_per_second": 10.888, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.572251711821931, |
|
"grad_norm": 0.36768218874931335, |
|
"learning_rate": 0.00018265782043607362, |
|
"loss": 0.3694, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.572251711821931, |
|
"eval_loss": 0.2935519516468048, |
|
"eval_runtime": 0.0918, |
|
"eval_samples_per_second": 54.473, |
|
"eval_steps_per_second": 10.895, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5919845294709631, |
|
"grad_norm": 0.3077068030834198, |
|
"learning_rate": 0.00018062799611123843, |
|
"loss": 0.3649, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5919845294709631, |
|
"eval_loss": 0.2956543564796448, |
|
"eval_runtime": 0.0897, |
|
"eval_samples_per_second": 55.755, |
|
"eval_steps_per_second": 11.151, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.6117173471199953, |
|
"grad_norm": 0.3608091175556183, |
|
"learning_rate": 0.00017849848239079126, |
|
"loss": 0.3613, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6117173471199953, |
|
"eval_loss": 0.29713118076324463, |
|
"eval_runtime": 0.0892, |
|
"eval_samples_per_second": 56.045, |
|
"eval_steps_per_second": 11.209, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6314501647690274, |
|
"grad_norm": 0.35845324397087097, |
|
"learning_rate": 0.00017627191223035512, |
|
"loss": 0.3646, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6314501647690274, |
|
"eval_loss": 0.28599464893341064, |
|
"eval_runtime": 0.0897, |
|
"eval_samples_per_second": 55.735, |
|
"eval_steps_per_second": 11.147, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6511829824180595, |
|
"grad_norm": 0.3350367546081543, |
|
"learning_rate": 0.00017395103858726846, |
|
"loss": 0.3619, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6511829824180595, |
|
"eval_loss": 0.2815980315208435, |
|
"eval_runtime": 0.0892, |
|
"eval_samples_per_second": 56.042, |
|
"eval_steps_per_second": 11.208, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6709158000670916, |
|
"grad_norm": 0.30465880036354065, |
|
"learning_rate": 0.00017153873101679668, |
|
"loss": 0.3625, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6709158000670916, |
|
"eval_loss": 0.2854335308074951, |
|
"eval_runtime": 0.09, |
|
"eval_samples_per_second": 55.528, |
|
"eval_steps_per_second": 11.106, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6906486177161237, |
|
"grad_norm": 0.31288811564445496, |
|
"learning_rate": 0.00016903797212418015, |
|
"loss": 0.3552, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6906486177161237, |
|
"eval_loss": 0.28378957509994507, |
|
"eval_runtime": 0.0889, |
|
"eval_samples_per_second": 56.248, |
|
"eval_steps_per_second": 11.25, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.7103814353651557, |
|
"grad_norm": 0.49065399169921875, |
|
"learning_rate": 0.0001664518538769067, |
|
"loss": 0.3545, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7103814353651557, |
|
"eval_loss": 0.28242945671081543, |
|
"eval_runtime": 0.0893, |
|
"eval_samples_per_second": 56.004, |
|
"eval_steps_per_second": 11.201, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7301142530141879, |
|
"grad_norm": 0.34333252906799316, |
|
"learning_rate": 0.00016378357378176654, |
|
"loss": 0.3531, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7301142530141879, |
|
"eval_loss": 0.28695201873779297, |
|
"eval_runtime": 0.0901, |
|
"eval_samples_per_second": 55.475, |
|
"eval_steps_per_second": 11.095, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.74984707066322, |
|
"grad_norm": 0.42468976974487305, |
|
"learning_rate": 0.0001610364309314178, |
|
"loss": 0.3528, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.74984707066322, |
|
"eval_loss": 0.282599538564682, |
|
"eval_runtime": 0.0888, |
|
"eval_samples_per_second": 56.288, |
|
"eval_steps_per_second": 11.258, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7695798883122521, |
|
"grad_norm": 0.28823015093803406, |
|
"learning_rate": 0.00015821382192534968, |
|
"loss": 0.3515, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7695798883122521, |
|
"eval_loss": 0.28318002820014954, |
|
"eval_runtime": 0.089, |
|
"eval_samples_per_second": 56.211, |
|
"eval_steps_per_second": 11.242, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7893127059612842, |
|
"grad_norm": 0.3027402460575104, |
|
"learning_rate": 0.0001553192366702874, |
|
"loss": 0.3515, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7893127059612842, |
|
"eval_loss": 0.2733287513256073, |
|
"eval_runtime": 0.0893, |
|
"eval_samples_per_second": 55.972, |
|
"eval_steps_per_second": 11.194, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.8090455236103163, |
|
"grad_norm": 0.29556697607040405, |
|
"learning_rate": 0.00015235625406523058, |
|
"loss": 0.3485, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8090455236103163, |
|
"eval_loss": 0.2786538004875183, |
|
"eval_runtime": 0.0905, |
|
"eval_samples_per_second": 55.253, |
|
"eval_steps_per_second": 11.051, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8287783412593485, |
|
"grad_norm": 0.36687153577804565, |
|
"learning_rate": 0.0001493285375764608, |
|
"loss": 0.3486, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8287783412593485, |
|
"eval_loss": 0.281529039144516, |
|
"eval_runtime": 0.089, |
|
"eval_samples_per_second": 56.211, |
|
"eval_steps_per_second": 11.242, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8485111589083806, |
|
"grad_norm": 0.3069268763065338, |
|
"learning_rate": 0.00014623983070798918, |
|
"loss": 0.3466, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8485111589083806, |
|
"eval_loss": 0.2738308012485504, |
|
"eval_runtime": 0.0905, |
|
"eval_samples_per_second": 55.242, |
|
"eval_steps_per_second": 11.048, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8682439765574126, |
|
"grad_norm": 0.362088680267334, |
|
"learning_rate": 0.00014309395237304426, |
|
"loss": 0.3441, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8682439765574126, |
|
"eval_loss": 0.2713426649570465, |
|
"eval_runtime": 0.09, |
|
"eval_samples_per_second": 55.556, |
|
"eval_steps_per_second": 11.111, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8879767942064447, |
|
"grad_norm": 0.35860228538513184, |
|
"learning_rate": 0.00013989479217232315, |
|
"loss": 0.3459, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8879767942064447, |
|
"eval_loss": 0.27472037076950073, |
|
"eval_runtime": 0.0888, |
|
"eval_samples_per_second": 56.322, |
|
"eval_steps_per_second": 11.264, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.9077096118554768, |
|
"grad_norm": 0.39065611362457275, |
|
"learning_rate": 0.00013664630558484379, |
|
"loss": 0.3365, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9077096118554768, |
|
"eval_loss": 0.2625993490219116, |
|
"eval_runtime": 0.0905, |
|
"eval_samples_per_second": 55.261, |
|
"eval_steps_per_second": 11.052, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.927442429504509, |
|
"grad_norm": 0.3311096131801605, |
|
"learning_rate": 0.00013335250907734448, |
|
"loss": 0.3433, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.927442429504509, |
|
"eval_loss": 0.26789966225624084, |
|
"eval_runtime": 0.0901, |
|
"eval_samples_per_second": 55.469, |
|
"eval_steps_per_second": 11.094, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.9471752471535411, |
|
"grad_norm": 0.29878920316696167, |
|
"learning_rate": 0.00013001747513827764, |
|
"loss": 0.3421, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9471752471535411, |
|
"eval_loss": 0.26495981216430664, |
|
"eval_runtime": 0.0901, |
|
"eval_samples_per_second": 55.48, |
|
"eval_steps_per_second": 11.096, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9669080648025732, |
|
"grad_norm": 0.27687743306159973, |
|
"learning_rate": 0.00012664532724253745, |
|
"loss": 0.3412, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9669080648025732, |
|
"eval_loss": 0.26512324810028076, |
|
"eval_runtime": 0.0891, |
|
"eval_samples_per_second": 56.121, |
|
"eval_steps_per_second": 11.224, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9866408824516053, |
|
"grad_norm": 0.2745589017868042, |
|
"learning_rate": 0.00012324023475314725, |
|
"loss": 0.3389, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9866408824516053, |
|
"eval_loss": 0.2671201825141907, |
|
"eval_runtime": 0.0893, |
|
"eval_samples_per_second": 55.975, |
|
"eval_steps_per_second": 11.195, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.0064328985535844, |
|
"grad_norm": 0.3213329017162323, |
|
"learning_rate": 0.00011980640776621077, |
|
"loss": 0.3342, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.0064328985535844, |
|
"eval_loss": 0.26046106219291687, |
|
"eval_runtime": 0.0901, |
|
"eval_samples_per_second": 55.499, |
|
"eval_steps_per_second": 11.1, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.0261657162026165, |
|
"grad_norm": 0.27474504709243774, |
|
"learning_rate": 0.0001163480919054998, |
|
"loss": 0.3256, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.0261657162026165, |
|
"eval_loss": 0.25896698236465454, |
|
"eval_runtime": 0.0888, |
|
"eval_samples_per_second": 56.29, |
|
"eval_steps_per_second": 11.258, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.0458985338516487, |
|
"grad_norm": 0.30879899859428406, |
|
"learning_rate": 0.00011286956307311555, |
|
"loss": 0.3221, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.0458985338516487, |
|
"eval_loss": 0.26479166746139526, |
|
"eval_runtime": 0.0899, |
|
"eval_samples_per_second": 55.608, |
|
"eval_steps_per_second": 11.122, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.0656313515006808, |
|
"grad_norm": 0.31644341349601746, |
|
"learning_rate": 0.00010937512216271338, |
|
"loss": 0.3213, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.0656313515006808, |
|
"eval_loss": 0.2630908489227295, |
|
"eval_runtime": 0.0912, |
|
"eval_samples_per_second": 54.85, |
|
"eval_steps_per_second": 10.97, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.085364169149713, |
|
"grad_norm": 0.27920085191726685, |
|
"learning_rate": 0.00010586908974182767, |
|
"loss": 0.3236, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.085364169149713, |
|
"eval_loss": 0.25873327255249023, |
|
"eval_runtime": 0.0918, |
|
"eval_samples_per_second": 54.458, |
|
"eval_steps_per_second": 10.892, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.105096986798745, |
|
"grad_norm": 0.41224974393844604, |
|
"learning_rate": 0.0001023558007098717, |
|
"loss": 0.3201, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.105096986798745, |
|
"eval_loss": 0.2597818672657013, |
|
"eval_runtime": 0.0924, |
|
"eval_samples_per_second": 54.094, |
|
"eval_steps_per_second": 10.819, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.1248298044477771, |
|
"grad_norm": 0.28382521867752075, |
|
"learning_rate": 9.88395989384173e-05, |
|
"loss": 0.3227, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.1248298044477771, |
|
"eval_loss": 0.2594955563545227, |
|
"eval_runtime": 0.0907, |
|
"eval_samples_per_second": 55.14, |
|
"eval_steps_per_second": 11.028, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.1445626220968093, |
|
"grad_norm": 0.258071631193161, |
|
"learning_rate": 9.532483190038153e-05, |
|
"loss": 0.3188, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.1445626220968093, |
|
"eval_loss": 0.2607273459434509, |
|
"eval_runtime": 0.0902, |
|
"eval_samples_per_second": 55.429, |
|
"eval_steps_per_second": 11.086, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.1642954397458414, |
|
"grad_norm": 0.4713532030582428, |
|
"learning_rate": 9.181584529476025e-05, |
|
"loss": 0.3199, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.1642954397458414, |
|
"eval_loss": 0.2583589255809784, |
|
"eval_runtime": 0.0897, |
|
"eval_samples_per_second": 55.713, |
|
"eval_steps_per_second": 11.143, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.1840282573948735, |
|
"grad_norm": 0.2917826771736145, |
|
"learning_rate": 8.831697767355519e-05, |
|
"loss": 0.3185, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.1840282573948735, |
|
"eval_loss": 0.256054550409317, |
|
"eval_runtime": 0.091, |
|
"eval_samples_per_second": 54.935, |
|
"eval_steps_per_second": 10.987, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.2037610750439056, |
|
"grad_norm": 0.27514272928237915, |
|
"learning_rate": 8.483255507753762e-05, |
|
"loss": 0.3167, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.2037610750439056, |
|
"eval_loss": 0.2567542791366577, |
|
"eval_runtime": 0.0902, |
|
"eval_samples_per_second": 55.458, |
|
"eval_steps_per_second": 11.092, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.2234938926929377, |
|
"grad_norm": 0.28672000765800476, |
|
"learning_rate": 8.136688568748113e-05, |
|
"loss": 0.3131, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.2234938926929377, |
|
"eval_loss": 0.2553618848323822, |
|
"eval_runtime": 0.0906, |
|
"eval_samples_per_second": 55.177, |
|
"eval_steps_per_second": 11.035, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.2432267103419696, |
|
"grad_norm": 0.30310142040252686, |
|
"learning_rate": 7.792425449747635e-05, |
|
"loss": 0.3189, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.2432267103419696, |
|
"eval_loss": 0.2524791657924652, |
|
"eval_runtime": 0.0904, |
|
"eval_samples_per_second": 55.281, |
|
"eval_steps_per_second": 11.056, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.262959527991002, |
|
"grad_norm": 0.2990928888320923, |
|
"learning_rate": 7.450891801691468e-05, |
|
"loss": 0.3163, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.262959527991002, |
|
"eval_loss": 0.25296735763549805, |
|
"eval_runtime": 0.0903, |
|
"eval_samples_per_second": 55.377, |
|
"eval_steps_per_second": 11.075, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.2826923456400339, |
|
"grad_norm": 0.28423842787742615, |
|
"learning_rate": 7.112509900768989e-05, |
|
"loss": 0.3171, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.2826923456400339, |
|
"eval_loss": 0.25522494316101074, |
|
"eval_runtime": 0.09, |
|
"eval_samples_per_second": 55.554, |
|
"eval_steps_per_second": 11.111, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.302425163289066, |
|
"grad_norm": 0.2940766513347626, |
|
"learning_rate": 6.777698126312647e-05, |
|
"loss": 0.3122, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.302425163289066, |
|
"eval_loss": 0.2504900395870209, |
|
"eval_runtime": 0.0898, |
|
"eval_samples_per_second": 55.653, |
|
"eval_steps_per_second": 11.131, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.322157980938098, |
|
"grad_norm": 0.29685819149017334, |
|
"learning_rate": 6.446870443508839e-05, |
|
"loss": 0.3094, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.322157980938098, |
|
"eval_loss": 0.2528066039085388, |
|
"eval_runtime": 0.0914, |
|
"eval_samples_per_second": 54.709, |
|
"eval_steps_per_second": 10.942, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.3418907985871302, |
|
"grad_norm": 0.28402870893478394, |
|
"learning_rate": 6.120435891566542e-05, |
|
"loss": 0.3143, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.3418907985871302, |
|
"eval_loss": 0.2532269358634949, |
|
"eval_runtime": 0.0909, |
|
"eval_samples_per_second": 55.009, |
|
"eval_steps_per_second": 11.002, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.3616236162361623, |
|
"grad_norm": 0.29260462522506714, |
|
"learning_rate": 5.7987980779764463e-05, |
|
"loss": 0.3106, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.3616236162361623, |
|
"eval_loss": 0.24992766976356506, |
|
"eval_runtime": 0.0908, |
|
"eval_samples_per_second": 55.057, |
|
"eval_steps_per_second": 11.011, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.3813564338851945, |
|
"grad_norm": 0.27314430475234985, |
|
"learning_rate": 5.482354679485948e-05, |
|
"loss": 0.3162, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.3813564338851945, |
|
"eval_loss": 0.24991869926452637, |
|
"eval_runtime": 0.091, |
|
"eval_samples_per_second": 54.919, |
|
"eval_steps_per_second": 10.984, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.4010892515342266, |
|
"grad_norm": 0.28783899545669556, |
|
"learning_rate": 5.17149695040698e-05, |
|
"loss": 0.3093, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.4010892515342266, |
|
"eval_loss": 0.2506985068321228, |
|
"eval_runtime": 0.0908, |
|
"eval_samples_per_second": 55.089, |
|
"eval_steps_per_second": 11.018, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.4208220691832587, |
|
"grad_norm": 0.3105429708957672, |
|
"learning_rate": 4.866609238864609e-05, |
|
"loss": 0.3105, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.4208220691832587, |
|
"eval_loss": 0.2487274706363678, |
|
"eval_runtime": 0.0911, |
|
"eval_samples_per_second": 54.856, |
|
"eval_steps_per_second": 10.971, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.4405548868322908, |
|
"grad_norm": 0.2590714693069458, |
|
"learning_rate": 4.568068511584529e-05, |
|
"loss": 0.3092, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.4405548868322908, |
|
"eval_loss": 0.2481408566236496, |
|
"eval_runtime": 0.0906, |
|
"eval_samples_per_second": 55.214, |
|
"eval_steps_per_second": 11.043, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.460287704481323, |
|
"grad_norm": 0.27170825004577637, |
|
"learning_rate": 4.2762438878069955e-05, |
|
"loss": 0.3113, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.460287704481323, |
|
"eval_loss": 0.24591541290283203, |
|
"eval_runtime": 0.0904, |
|
"eval_samples_per_second": 55.31, |
|
"eval_steps_per_second": 11.062, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.480020522130355, |
|
"grad_norm": 0.2719477117061615, |
|
"learning_rate": 3.991496182903498e-05, |
|
"loss": 0.3077, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.480020522130355, |
|
"eval_loss": 0.24259333312511444, |
|
"eval_runtime": 0.091, |
|
"eval_samples_per_second": 54.966, |
|
"eval_steps_per_second": 10.993, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.499753339779387, |
|
"grad_norm": 0.28030896186828613, |
|
"learning_rate": 3.714177462260412e-05, |
|
"loss": 0.3073, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.499753339779387, |
|
"eval_loss": 0.2429008185863495, |
|
"eval_runtime": 0.093, |
|
"eval_samples_per_second": 53.774, |
|
"eval_steps_per_second": 10.755, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.5194861574284193, |
|
"grad_norm": 0.2803615629673004, |
|
"learning_rate": 3.444630605981256e-05, |
|
"loss": 0.3042, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.5194861574284193, |
|
"eval_loss": 0.2448611706495285, |
|
"eval_runtime": 0.0905, |
|
"eval_samples_per_second": 55.251, |
|
"eval_steps_per_second": 11.05, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.5392189750774512, |
|
"grad_norm": 0.326019823551178, |
|
"learning_rate": 3.183188884945714e-05, |
|
"loss": 0.3075, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.5392189750774512, |
|
"eval_loss": 0.24277858436107635, |
|
"eval_runtime": 0.0897, |
|
"eval_samples_per_second": 55.757, |
|
"eval_steps_per_second": 11.151, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.5589517927264835, |
|
"grad_norm": 0.3696662187576294, |
|
"learning_rate": 2.930175548749645e-05, |
|
"loss": 0.3047, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.5589517927264835, |
|
"eval_loss": 0.2429245412349701, |
|
"eval_runtime": 0.0909, |
|
"eval_samples_per_second": 55.022, |
|
"eval_steps_per_second": 11.004, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.5786846103755154, |
|
"grad_norm": 0.2868829667568207, |
|
"learning_rate": 2.6859034260355042e-05, |
|
"loss": 0.307, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.5786846103755154, |
|
"eval_loss": 0.2427954375743866, |
|
"eval_runtime": 0.0901, |
|
"eval_samples_per_second": 55.51, |
|
"eval_steps_per_second": 11.102, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.5984174280245478, |
|
"grad_norm": 0.28634509444236755, |
|
"learning_rate": 2.4506745377073535e-05, |
|
"loss": 0.303, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.5984174280245478, |
|
"eval_loss": 0.24240228533744812, |
|
"eval_runtime": 0.0911, |
|
"eval_samples_per_second": 54.88, |
|
"eval_steps_per_second": 10.976, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.6181502456735797, |
|
"grad_norm": 0.2582091987133026, |
|
"learning_rate": 2.224779723508692e-05, |
|
"loss": 0.3063, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.6181502456735797, |
|
"eval_loss": 0.24210545420646667, |
|
"eval_runtime": 0.0895, |
|
"eval_samples_per_second": 55.845, |
|
"eval_steps_per_second": 11.169, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.6378830633226118, |
|
"grad_norm": 0.29064637422561646, |
|
"learning_rate": 2.0084982824248034e-05, |
|
"loss": 0.3041, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.6378830633226118, |
|
"eval_loss": 0.24005956947803497, |
|
"eval_runtime": 0.0916, |
|
"eval_samples_per_second": 54.615, |
|
"eval_steps_per_second": 10.923, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.657615880971644, |
|
"grad_norm": 0.29442399740219116, |
|
"learning_rate": 1.802097627354231e-05, |
|
"loss": 0.3046, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.657615880971644, |
|
"eval_loss": 0.238916277885437, |
|
"eval_runtime": 0.0907, |
|
"eval_samples_per_second": 55.107, |
|
"eval_steps_per_second": 11.021, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.677348698620676, |
|
"grad_norm": 0.2660856246948242, |
|
"learning_rate": 1.605832954476346e-05, |
|
"loss": 0.3054, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.677348698620676, |
|
"eval_loss": 0.2391015589237213, |
|
"eval_runtime": 0.0907, |
|
"eval_samples_per_second": 55.141, |
|
"eval_steps_per_second": 11.028, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.6970815162697082, |
|
"grad_norm": 0.29835525155067444, |
|
"learning_rate": 1.4199469277238143e-05, |
|
"loss": 0.3043, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.6970815162697082, |
|
"eval_loss": 0.23956787586212158, |
|
"eval_runtime": 0.0899, |
|
"eval_samples_per_second": 55.608, |
|
"eval_steps_per_second": 11.122, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.7168143339187403, |
|
"grad_norm": 0.33219653367996216, |
|
"learning_rate": 1.2446693787500697e-05, |
|
"loss": 0.3033, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.7168143339187403, |
|
"eval_loss": 0.23964008688926697, |
|
"eval_runtime": 0.0905, |
|
"eval_samples_per_second": 55.26, |
|
"eval_steps_per_second": 11.052, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.7365471515677724, |
|
"grad_norm": 0.26195675134658813, |
|
"learning_rate": 1.0802170227627873e-05, |
|
"loss": 0.3062, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.7365471515677724, |
|
"eval_loss": 0.2384120523929596, |
|
"eval_runtime": 0.0895, |
|
"eval_samples_per_second": 55.837, |
|
"eval_steps_per_second": 11.167, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.7562799692168045, |
|
"grad_norm": 0.2874230146408081, |
|
"learning_rate": 9.26793190574664e-06, |
|
"loss": 0.3079, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.7562799692168045, |
|
"eval_loss": 0.23803596198558807, |
|
"eval_runtime": 0.0892, |
|
"eval_samples_per_second": 56.06, |
|
"eval_steps_per_second": 11.212, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.7760127868658366, |
|
"grad_norm": 0.31188592314720154, |
|
"learning_rate": 7.845875772028289e-06, |
|
"loss": 0.3026, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.7760127868658366, |
|
"eval_loss": 0.23819437623023987, |
|
"eval_runtime": 0.0916, |
|
"eval_samples_per_second": 54.586, |
|
"eval_steps_per_second": 10.917, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.7957456045148685, |
|
"grad_norm": 0.25863921642303467, |
|
"learning_rate": 6.537760073277066e-06, |
|
"loss": 0.3028, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.7957456045148685, |
|
"eval_loss": 0.23811522126197815, |
|
"eval_runtime": 0.0907, |
|
"eval_samples_per_second": 55.122, |
|
"eval_steps_per_second": 11.024, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.8154784221639009, |
|
"grad_norm": 0.2967861592769623, |
|
"learning_rate": 5.345202179013353e-06, |
|
"loss": 0.3036, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.8154784221639009, |
|
"eval_loss": 0.23839135468006134, |
|
"eval_runtime": 0.0891, |
|
"eval_samples_per_second": 56.087, |
|
"eval_steps_per_second": 11.217, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.8352112398129328, |
|
"grad_norm": 0.26816633343696594, |
|
"learning_rate": 4.269676581739079e-06, |
|
"loss": 0.3033, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.8352112398129328, |
|
"eval_loss": 0.23855826258659363, |
|
"eval_runtime": 0.0915, |
|
"eval_samples_per_second": 54.629, |
|
"eval_steps_per_second": 10.926, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.8549440574619651, |
|
"grad_norm": 0.27961990237236023, |
|
"learning_rate": 3.3125130738579922e-06, |
|
"loss": 0.3059, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.8549440574619651, |
|
"eval_loss": 0.2385035753250122, |
|
"eval_runtime": 0.0918, |
|
"eval_samples_per_second": 54.481, |
|
"eval_steps_per_second": 10.896, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.874676875110997, |
|
"grad_norm": 0.28482890129089355, |
|
"learning_rate": 2.4748951035047596e-06, |
|
"loss": 0.3047, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.874676875110997, |
|
"eval_loss": 0.23791757225990295, |
|
"eval_runtime": 0.0933, |
|
"eval_samples_per_second": 53.597, |
|
"eval_steps_per_second": 10.719, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.8944096927600294, |
|
"grad_norm": 0.299679160118103, |
|
"learning_rate": 1.7578583113159962e-06, |
|
"loss": 0.3036, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.8944096927600294, |
|
"eval_loss": 0.2383710891008377, |
|
"eval_runtime": 0.0895, |
|
"eval_samples_per_second": 55.854, |
|
"eval_steps_per_second": 11.171, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.9141425104090612, |
|
"grad_norm": 0.27456724643707275, |
|
"learning_rate": 1.1622892499519421e-06, |
|
"loss": 0.3032, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.9141425104090612, |
|
"eval_loss": 0.2380281388759613, |
|
"eval_runtime": 0.0895, |
|
"eval_samples_per_second": 55.876, |
|
"eval_steps_per_second": 11.175, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.9338753280580934, |
|
"grad_norm": 0.27245351672172546, |
|
"learning_rate": 6.889242879525415e-07, |
|
"loss": 0.2994, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.9338753280580934, |
|
"eval_loss": 0.23812708258628845, |
|
"eval_runtime": 0.09, |
|
"eval_samples_per_second": 55.528, |
|
"eval_steps_per_second": 11.106, |
|
"step": 9800 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10134, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.236547392844431e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|