|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.487562189054726, |
|
"eval_steps": 25, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2.43993993993994e-05, |
|
"loss": 0.8106, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.2529616057872772, |
|
"eval_runtime": 9.2377, |
|
"eval_samples_per_second": 9.634, |
|
"eval_steps_per_second": 1.299, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 2.3773773773773775e-05, |
|
"loss": 0.1936, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 0.1928737312555313, |
|
"eval_runtime": 9.3374, |
|
"eval_samples_per_second": 9.532, |
|
"eval_steps_per_second": 1.285, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 2.314814814814815e-05, |
|
"loss": 0.1423, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 0.18249212205410004, |
|
"eval_runtime": 9.3738, |
|
"eval_samples_per_second": 9.495, |
|
"eval_steps_per_second": 1.28, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 2.2522522522522523e-05, |
|
"loss": 0.1499, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.17995011806488037, |
|
"eval_runtime": 9.4057, |
|
"eval_samples_per_second": 9.462, |
|
"eval_steps_per_second": 1.276, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 2.18968968968969e-05, |
|
"loss": 0.1177, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 0.174322709441185, |
|
"eval_runtime": 9.4044, |
|
"eval_samples_per_second": 9.464, |
|
"eval_steps_per_second": 1.276, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 2.1271271271271275e-05, |
|
"loss": 0.128, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 0.1779523640871048, |
|
"eval_runtime": 9.4208, |
|
"eval_samples_per_second": 9.447, |
|
"eval_steps_per_second": 1.274, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 2.0645645645645647e-05, |
|
"loss": 0.1023, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 0.1758423000574112, |
|
"eval_runtime": 9.3522, |
|
"eval_samples_per_second": 9.517, |
|
"eval_steps_per_second": 1.283, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 2.0020020020020023e-05, |
|
"loss": 0.1145, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.17488867044448853, |
|
"eval_runtime": 9.397, |
|
"eval_samples_per_second": 9.471, |
|
"eval_steps_per_second": 1.277, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 1.9394394394394395e-05, |
|
"loss": 0.0945, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.1771743893623352, |
|
"eval_runtime": 9.2878, |
|
"eval_samples_per_second": 9.583, |
|
"eval_steps_per_second": 1.292, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 1.8768768768768768e-05, |
|
"loss": 0.0813, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 0.21932578086853027, |
|
"eval_runtime": 9.3376, |
|
"eval_samples_per_second": 9.531, |
|
"eval_steps_per_second": 1.285, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 1.8143143143143144e-05, |
|
"loss": 0.0898, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 0.17662294209003448, |
|
"eval_runtime": 9.3638, |
|
"eval_samples_per_second": 9.505, |
|
"eval_steps_per_second": 1.282, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 1.7517517517517516e-05, |
|
"loss": 0.0846, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.1968599408864975, |
|
"eval_runtime": 9.3136, |
|
"eval_samples_per_second": 9.556, |
|
"eval_steps_per_second": 1.288, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.6891891891891892e-05, |
|
"loss": 0.0743, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 0.19715578854084015, |
|
"eval_runtime": 9.4533, |
|
"eval_samples_per_second": 9.415, |
|
"eval_steps_per_second": 1.269, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 1.6266266266266268e-05, |
|
"loss": 0.0763, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 0.20438912510871887, |
|
"eval_runtime": 9.3395, |
|
"eval_samples_per_second": 9.529, |
|
"eval_steps_per_second": 1.285, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 1.564064064064064e-05, |
|
"loss": 0.0742, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 0.19478727877140045, |
|
"eval_runtime": 9.3251, |
|
"eval_samples_per_second": 9.544, |
|
"eval_steps_per_second": 1.287, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 1.5015015015015016e-05, |
|
"loss": 0.0642, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.21925467252731323, |
|
"eval_runtime": 9.3222, |
|
"eval_samples_per_second": 9.547, |
|
"eval_steps_per_second": 1.287, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 1.438938938938939e-05, |
|
"loss": 0.0595, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_loss": 0.2161593735218048, |
|
"eval_runtime": 9.317, |
|
"eval_samples_per_second": 9.552, |
|
"eval_steps_per_second": 1.288, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 1.3763763763763765e-05, |
|
"loss": 0.0549, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.216683030128479, |
|
"eval_runtime": 9.247, |
|
"eval_samples_per_second": 9.625, |
|
"eval_steps_per_second": 1.298, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 1.3138138138138139e-05, |
|
"loss": 0.051, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"eval_loss": 0.2373329997062683, |
|
"eval_runtime": 8.9608, |
|
"eval_samples_per_second": 9.932, |
|
"eval_steps_per_second": 1.339, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 1.2512512512512515e-05, |
|
"loss": 0.0552, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_loss": 0.22366206347942352, |
|
"eval_runtime": 9.3515, |
|
"eval_samples_per_second": 9.517, |
|
"eval_steps_per_second": 1.283, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"learning_rate": 1.1886886886886887e-05, |
|
"loss": 0.0551, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"eval_loss": 0.20705640316009521, |
|
"eval_runtime": 9.3684, |
|
"eval_samples_per_second": 9.5, |
|
"eval_steps_per_second": 1.281, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"learning_rate": 1.1261261261261261e-05, |
|
"loss": 0.0558, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"eval_loss": 0.22326058149337769, |
|
"eval_runtime": 9.355, |
|
"eval_samples_per_second": 9.514, |
|
"eval_steps_per_second": 1.283, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 1.0635635635635637e-05, |
|
"loss": 0.0576, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"eval_loss": 0.21783779561519623, |
|
"eval_runtime": 9.1966, |
|
"eval_samples_per_second": 9.677, |
|
"eval_steps_per_second": 1.305, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"learning_rate": 1.0010010010010011e-05, |
|
"loss": 0.0534, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"eval_loss": 0.2279473841190338, |
|
"eval_runtime": 9.1641, |
|
"eval_samples_per_second": 9.712, |
|
"eval_steps_per_second": 1.309, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"learning_rate": 9.384384384384384e-06, |
|
"loss": 0.0531, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"eval_loss": 0.230553537607193, |
|
"eval_runtime": 9.2393, |
|
"eval_samples_per_second": 9.633, |
|
"eval_steps_per_second": 1.299, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"learning_rate": 8.758758758758758e-06, |
|
"loss": 0.0507, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"eval_loss": 0.2528133690357208, |
|
"eval_runtime": 9.2368, |
|
"eval_samples_per_second": 9.635, |
|
"eval_steps_per_second": 1.299, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"learning_rate": 8.133133133133134e-06, |
|
"loss": 0.0496, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_loss": 0.27711576223373413, |
|
"eval_runtime": 9.2578, |
|
"eval_samples_per_second": 9.614, |
|
"eval_steps_per_second": 1.296, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"learning_rate": 7.507507507507508e-06, |
|
"loss": 0.0525, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"eval_loss": 0.24864882230758667, |
|
"eval_runtime": 9.3261, |
|
"eval_samples_per_second": 9.543, |
|
"eval_steps_per_second": 1.287, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"learning_rate": 6.881881881881882e-06, |
|
"loss": 0.0477, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.26756495237350464, |
|
"eval_runtime": 9.1971, |
|
"eval_samples_per_second": 9.677, |
|
"eval_steps_per_second": 1.305, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 6.256256256256257e-06, |
|
"loss": 0.0505, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"eval_loss": 0.2570458650588989, |
|
"eval_runtime": 9.2232, |
|
"eval_samples_per_second": 9.65, |
|
"eval_steps_per_second": 1.301, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"learning_rate": 5.630630630630631e-06, |
|
"loss": 0.0483, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"eval_loss": 0.25569072365760803, |
|
"eval_runtime": 9.2668, |
|
"eval_samples_per_second": 9.604, |
|
"eval_steps_per_second": 1.295, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 5.005005005005006e-06, |
|
"loss": 0.0499, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"eval_loss": 0.26184016466140747, |
|
"eval_runtime": 9.3556, |
|
"eval_samples_per_second": 9.513, |
|
"eval_steps_per_second": 1.283, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"learning_rate": 4.379379379379379e-06, |
|
"loss": 0.0438, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_loss": 0.2792048454284668, |
|
"eval_runtime": 9.3539, |
|
"eval_samples_per_second": 9.515, |
|
"eval_steps_per_second": 1.283, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 3.753753753753754e-06, |
|
"loss": 0.0433, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"eval_loss": 0.2763405740261078, |
|
"eval_runtime": 9.3987, |
|
"eval_samples_per_second": 9.469, |
|
"eval_steps_per_second": 1.277, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"learning_rate": 3.1281281281281287e-06, |
|
"loss": 0.0438, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"eval_loss": 0.28726398944854736, |
|
"eval_runtime": 9.3415, |
|
"eval_samples_per_second": 9.527, |
|
"eval_steps_per_second": 1.285, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 2.502502502502503e-06, |
|
"loss": 0.0439, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_loss": 0.29283300042152405, |
|
"eval_runtime": 9.3228, |
|
"eval_samples_per_second": 9.547, |
|
"eval_steps_per_second": 1.287, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"learning_rate": 1.876876876876877e-06, |
|
"loss": 0.0443, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"eval_loss": 0.28899624943733215, |
|
"eval_runtime": 9.3231, |
|
"eval_samples_per_second": 9.546, |
|
"eval_steps_per_second": 1.287, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"learning_rate": 1.2512512512512514e-06, |
|
"loss": 0.0458, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"eval_loss": 0.28502869606018066, |
|
"eval_runtime": 9.3924, |
|
"eval_samples_per_second": 9.476, |
|
"eval_steps_per_second": 1.278, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"learning_rate": 6.256256256256257e-07, |
|
"loss": 0.0431, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"eval_loss": 0.2905334532260895, |
|
"eval_runtime": 9.3998, |
|
"eval_samples_per_second": 9.468, |
|
"eval_steps_per_second": 1.277, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"learning_rate": 0.0, |
|
"loss": 0.0427, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"eval_loss": 0.2926941215991974, |
|
"eval_runtime": 9.3458, |
|
"eval_samples_per_second": 9.523, |
|
"eval_steps_per_second": 1.284, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 25, |
|
"total_flos": 2.1565516640256e+16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|