{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 20320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.984251968503937,
      "grad_norm": 9809.25390625,
      "learning_rate": 0.00029969999999999997,
      "loss": 1.3222,
      "step": 1000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.2888088452521089,
      "eval_loss": 5.052667617797852,
      "eval_runtime": 867.6167,
      "eval_samples_per_second": 38.736,
      "eval_steps_per_second": 1.211,
      "step": 1016
    },
    {
      "epoch": 1.968503937007874,
      "grad_norm": 7169.68408203125,
      "learning_rate": 0.0005997,
      "loss": 1.1829,
      "step": 2000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.29311490927283645,
      "eval_loss": 4.973076343536377,
      "eval_runtime": 866.7489,
      "eval_samples_per_second": 38.775,
      "eval_steps_per_second": 1.213,
      "step": 2032
    },
    {
      "epoch": 2.952755905511811,
      "grad_norm": 5481.02392578125,
      "learning_rate": 0.0005672816593886462,
      "loss": 1.1528,
      "step": 3000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.29985522038131823,
      "eval_loss": 4.848499298095703,
      "eval_runtime": 870.5632,
      "eval_samples_per_second": 38.605,
      "eval_steps_per_second": 1.207,
      "step": 3048
    },
    {
      "epoch": 3.937007874015748,
      "grad_norm": 5024.47705078125,
      "learning_rate": 0.0005345305676855895,
      "loss": 1.128,
      "step": 4000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.3019899977220967,
      "eval_loss": 4.790306568145752,
      "eval_runtime": 870.3918,
      "eval_samples_per_second": 38.612,
      "eval_steps_per_second": 1.208,
      "step": 4064
    },
    {
      "epoch": 4.921259842519685,
      "grad_norm": 4731.38330078125,
      "learning_rate": 0.0005017794759825327,
      "loss": 1.1098,
      "step": 5000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.3039251673839655,
      "eval_loss": 4.753741264343262,
      "eval_runtime": 866.2087,
      "eval_samples_per_second": 38.799,
      "eval_steps_per_second": 1.213,
      "step": 5080
    },
    {
      "epoch": 5.905511811023622,
      "grad_norm": 4894.84375,
      "learning_rate": 0.00046902838427947594,
      "loss": 1.0951,
      "step": 6000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.30580944523971787,
      "eval_loss": 4.716702938079834,
      "eval_runtime": 865.218,
      "eval_samples_per_second": 38.843,
      "eval_steps_per_second": 1.215,
      "step": 6096
    },
    {
      "epoch": 6.889763779527559,
      "grad_norm": 5185.75244140625,
      "learning_rate": 0.00043627729257641914,
      "loss": 1.0824,
      "step": 7000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.30677394395426305,
      "eval_loss": 4.699747562408447,
      "eval_runtime": 865.4872,
      "eval_samples_per_second": 38.831,
      "eval_steps_per_second": 1.214,
      "step": 7112
    },
    {
      "epoch": 7.874015748031496,
      "grad_norm": 5380.1005859375,
      "learning_rate": 0.0004035262008733624,
      "loss": 1.0724,
      "step": 8000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.3080556721421747,
      "eval_loss": 4.674106597900391,
      "eval_runtime": 865.0937,
      "eval_samples_per_second": 38.849,
      "eval_steps_per_second": 1.215,
      "step": 8128
    },
    {
      "epoch": 8.858267716535433,
      "grad_norm": 5194.1533203125,
      "learning_rate": 0.00037077510917030566,
      "loss": 1.0626,
      "step": 9000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.30815867855524104,
      "eval_loss": 4.669048309326172,
      "eval_runtime": 865.3252,
      "eval_samples_per_second": 38.839,
      "eval_steps_per_second": 1.215,
      "step": 9144
    },
    {
      "epoch": 9.84251968503937,
      "grad_norm": 5698.17236328125,
      "learning_rate": 0.00033802401746724887,
      "loss": 1.0532,
      "step": 10000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.3097009797779021,
      "eval_loss": 4.659518241882324,
      "eval_runtime": 864.3515,
      "eval_samples_per_second": 38.882,
      "eval_steps_per_second": 1.216,
      "step": 10160
    },
    {
      "epoch": 10.826771653543307,
      "grad_norm": 5876.86181640625,
      "learning_rate": 0.0003052729257641921,
      "loss": 1.0428,
      "step": 11000
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.3101439248226706,
      "eval_loss": 4.65607213973999,
      "eval_runtime": 865.0882,
      "eval_samples_per_second": 38.849,
      "eval_steps_per_second": 1.215,
      "step": 11176
    },
    {
      "epoch": 11.811023622047244,
      "grad_norm": 6474.33642578125,
      "learning_rate": 0.00027252183406113533,
      "loss": 1.0351,
      "step": 12000
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.31076691273301343,
      "eval_loss": 4.64418363571167,
      "eval_runtime": 867.9075,
      "eval_samples_per_second": 38.723,
      "eval_steps_per_second": 1.211,
      "step": 12192
    },
    {
      "epoch": 12.795275590551181,
      "grad_norm": 7132.021484375,
      "learning_rate": 0.0002397707423580786,
      "loss": 1.0274,
      "step": 13000
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.3109399099366426,
      "eval_loss": 4.64435338973999,
      "eval_runtime": 867.2849,
      "eval_samples_per_second": 38.751,
      "eval_steps_per_second": 1.212,
      "step": 13208
    },
    {
      "epoch": 13.779527559055119,
      "grad_norm": 6837.5517578125,
      "learning_rate": 0.00020701965065502182,
      "loss": 1.0206,
      "step": 14000
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.3116507648211613,
      "eval_loss": 4.642159461975098,
      "eval_runtime": 866.026,
      "eval_samples_per_second": 38.807,
      "eval_steps_per_second": 1.214,
      "step": 14224
    },
    {
      "epoch": 14.763779527559056,
      "grad_norm": 7605.75244140625,
      "learning_rate": 0.00017426855895196505,
      "loss": 1.0125,
      "step": 15000
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.3118397748928477,
      "eval_loss": 4.639294624328613,
      "eval_runtime": 867.3792,
      "eval_samples_per_second": 38.747,
      "eval_steps_per_second": 1.212,
      "step": 15240
    },
    {
      "epoch": 15.748031496062993,
      "grad_norm": 7835.32275390625,
      "learning_rate": 0.0001415174672489083,
      "loss": 1.0029,
      "step": 16000
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.3124415675887439,
      "eval_loss": 4.64095401763916,
      "eval_runtime": 867.1892,
      "eval_samples_per_second": 38.755,
      "eval_steps_per_second": 1.212,
      "step": 16256
    },
    {
      "epoch": 16.73228346456693,
      "grad_norm": 8777.490234375,
      "learning_rate": 0.00010876637554585152,
      "loss": 0.9949,
      "step": 17000
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.3122564006053912,
      "eval_loss": 4.6424150466918945,
      "eval_runtime": 865.8304,
      "eval_samples_per_second": 38.816,
      "eval_steps_per_second": 1.214,
      "step": 17272
    },
    {
      "epoch": 17.716535433070867,
      "grad_norm": 7878.4384765625,
      "learning_rate": 7.601528384279475e-05,
      "loss": 0.9891,
      "step": 18000
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.31283891963100763,
      "eval_loss": 4.6433796882629395,
      "eval_runtime": 865.5459,
      "eval_samples_per_second": 38.829,
      "eval_steps_per_second": 1.214,
      "step": 18288
    },
    {
      "epoch": 18.700787401574804,
      "grad_norm": 8385.39453125,
      "learning_rate": 4.326419213973799e-05,
      "loss": 0.9799,
      "step": 19000
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.3129960786524129,
      "eval_loss": 4.641451358795166,
      "eval_runtime": 865.77,
      "eval_samples_per_second": 38.819,
      "eval_steps_per_second": 1.214,
      "step": 19304
    },
    {
      "epoch": 19.68503937007874,
      "grad_norm": 8282.5029296875,
      "learning_rate": 1.0513100436681222e-05,
      "loss": 0.977,
      "step": 20000
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.3132122232568799,
      "eval_loss": 4.643510341644287,
      "eval_runtime": 865.9594,
      "eval_samples_per_second": 38.81,
      "eval_steps_per_second": 1.214,
      "step": 20320
    },
    {
      "epoch": 20.0,
      "step": 20320,
      "total_flos": 1.6974575566848e+17,
      "train_loss": 1.065762345621905,
      "train_runtime": 40277.3058,
      "train_samples_per_second": 16.129,
      "train_steps_per_second": 0.505
    }
  ],
  "logging_steps": 1000,
  "max_steps": 20320,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6974575566848e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}