GPT2_10MRR_50000 / trainer_state.json
xiulinyang's picture
Add checkpoint
7fab909
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 20320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.984251968503937,
"grad_norm": 9809.25390625,
"learning_rate": 0.00029969999999999997,
"loss": 1.3222,
"step": 1000
},
{
"epoch": 1.0,
"eval_accuracy": 0.2888088452521089,
"eval_loss": 5.052667617797852,
"eval_runtime": 867.6167,
"eval_samples_per_second": 38.736,
"eval_steps_per_second": 1.211,
"step": 1016
},
{
"epoch": 1.968503937007874,
"grad_norm": 7169.68408203125,
"learning_rate": 0.0005997,
"loss": 1.1829,
"step": 2000
},
{
"epoch": 2.0,
"eval_accuracy": 0.29311490927283645,
"eval_loss": 4.973076343536377,
"eval_runtime": 866.7489,
"eval_samples_per_second": 38.775,
"eval_steps_per_second": 1.213,
"step": 2032
},
{
"epoch": 2.952755905511811,
"grad_norm": 5481.02392578125,
"learning_rate": 0.0005672816593886462,
"loss": 1.1528,
"step": 3000
},
{
"epoch": 3.0,
"eval_accuracy": 0.29985522038131823,
"eval_loss": 4.848499298095703,
"eval_runtime": 870.5632,
"eval_samples_per_second": 38.605,
"eval_steps_per_second": 1.207,
"step": 3048
},
{
"epoch": 3.937007874015748,
"grad_norm": 5024.47705078125,
"learning_rate": 0.0005345305676855895,
"loss": 1.128,
"step": 4000
},
{
"epoch": 4.0,
"eval_accuracy": 0.3019899977220967,
"eval_loss": 4.790306568145752,
"eval_runtime": 870.3918,
"eval_samples_per_second": 38.612,
"eval_steps_per_second": 1.208,
"step": 4064
},
{
"epoch": 4.921259842519685,
"grad_norm": 4731.38330078125,
"learning_rate": 0.0005017794759825327,
"loss": 1.1098,
"step": 5000
},
{
"epoch": 5.0,
"eval_accuracy": 0.3039251673839655,
"eval_loss": 4.753741264343262,
"eval_runtime": 866.2087,
"eval_samples_per_second": 38.799,
"eval_steps_per_second": 1.213,
"step": 5080
},
{
"epoch": 5.905511811023622,
"grad_norm": 4894.84375,
"learning_rate": 0.00046902838427947594,
"loss": 1.0951,
"step": 6000
},
{
"epoch": 6.0,
"eval_accuracy": 0.30580944523971787,
"eval_loss": 4.716702938079834,
"eval_runtime": 865.218,
"eval_samples_per_second": 38.843,
"eval_steps_per_second": 1.215,
"step": 6096
},
{
"epoch": 6.889763779527559,
"grad_norm": 5185.75244140625,
"learning_rate": 0.00043627729257641914,
"loss": 1.0824,
"step": 7000
},
{
"epoch": 7.0,
"eval_accuracy": 0.30677394395426305,
"eval_loss": 4.699747562408447,
"eval_runtime": 865.4872,
"eval_samples_per_second": 38.831,
"eval_steps_per_second": 1.214,
"step": 7112
},
{
"epoch": 7.874015748031496,
"grad_norm": 5380.1005859375,
"learning_rate": 0.0004035262008733624,
"loss": 1.0724,
"step": 8000
},
{
"epoch": 8.0,
"eval_accuracy": 0.3080556721421747,
"eval_loss": 4.674106597900391,
"eval_runtime": 865.0937,
"eval_samples_per_second": 38.849,
"eval_steps_per_second": 1.215,
"step": 8128
},
{
"epoch": 8.858267716535433,
"grad_norm": 5194.1533203125,
"learning_rate": 0.00037077510917030566,
"loss": 1.0626,
"step": 9000
},
{
"epoch": 9.0,
"eval_accuracy": 0.30815867855524104,
"eval_loss": 4.669048309326172,
"eval_runtime": 865.3252,
"eval_samples_per_second": 38.839,
"eval_steps_per_second": 1.215,
"step": 9144
},
{
"epoch": 9.84251968503937,
"grad_norm": 5698.17236328125,
"learning_rate": 0.00033802401746724887,
"loss": 1.0532,
"step": 10000
},
{
"epoch": 10.0,
"eval_accuracy": 0.3097009797779021,
"eval_loss": 4.659518241882324,
"eval_runtime": 864.3515,
"eval_samples_per_second": 38.882,
"eval_steps_per_second": 1.216,
"step": 10160
},
{
"epoch": 10.826771653543307,
"grad_norm": 5876.86181640625,
"learning_rate": 0.0003052729257641921,
"loss": 1.0428,
"step": 11000
},
{
"epoch": 11.0,
"eval_accuracy": 0.3101439248226706,
"eval_loss": 4.65607213973999,
"eval_runtime": 865.0882,
"eval_samples_per_second": 38.849,
"eval_steps_per_second": 1.215,
"step": 11176
},
{
"epoch": 11.811023622047244,
"grad_norm": 6474.33642578125,
"learning_rate": 0.00027252183406113533,
"loss": 1.0351,
"step": 12000
},
{
"epoch": 12.0,
"eval_accuracy": 0.31076691273301343,
"eval_loss": 4.64418363571167,
"eval_runtime": 867.9075,
"eval_samples_per_second": 38.723,
"eval_steps_per_second": 1.211,
"step": 12192
},
{
"epoch": 12.795275590551181,
"grad_norm": 7132.021484375,
"learning_rate": 0.0002397707423580786,
"loss": 1.0274,
"step": 13000
},
{
"epoch": 13.0,
"eval_accuracy": 0.3109399099366426,
"eval_loss": 4.64435338973999,
"eval_runtime": 867.2849,
"eval_samples_per_second": 38.751,
"eval_steps_per_second": 1.212,
"step": 13208
},
{
"epoch": 13.779527559055119,
"grad_norm": 6837.5517578125,
"learning_rate": 0.00020701965065502182,
"loss": 1.0206,
"step": 14000
},
{
"epoch": 14.0,
"eval_accuracy": 0.3116507648211613,
"eval_loss": 4.642159461975098,
"eval_runtime": 866.026,
"eval_samples_per_second": 38.807,
"eval_steps_per_second": 1.214,
"step": 14224
},
{
"epoch": 14.763779527559056,
"grad_norm": 7605.75244140625,
"learning_rate": 0.00017426855895196505,
"loss": 1.0125,
"step": 15000
},
{
"epoch": 15.0,
"eval_accuracy": 0.3118397748928477,
"eval_loss": 4.639294624328613,
"eval_runtime": 867.3792,
"eval_samples_per_second": 38.747,
"eval_steps_per_second": 1.212,
"step": 15240
},
{
"epoch": 15.748031496062993,
"grad_norm": 7835.32275390625,
"learning_rate": 0.0001415174672489083,
"loss": 1.0029,
"step": 16000
},
{
"epoch": 16.0,
"eval_accuracy": 0.3124415675887439,
"eval_loss": 4.64095401763916,
"eval_runtime": 867.1892,
"eval_samples_per_second": 38.755,
"eval_steps_per_second": 1.212,
"step": 16256
},
{
"epoch": 16.73228346456693,
"grad_norm": 8777.490234375,
"learning_rate": 0.00010876637554585152,
"loss": 0.9949,
"step": 17000
},
{
"epoch": 17.0,
"eval_accuracy": 0.3122564006053912,
"eval_loss": 4.6424150466918945,
"eval_runtime": 865.8304,
"eval_samples_per_second": 38.816,
"eval_steps_per_second": 1.214,
"step": 17272
},
{
"epoch": 17.716535433070867,
"grad_norm": 7878.4384765625,
"learning_rate": 7.601528384279475e-05,
"loss": 0.9891,
"step": 18000
},
{
"epoch": 18.0,
"eval_accuracy": 0.31283891963100763,
"eval_loss": 4.6433796882629395,
"eval_runtime": 865.5459,
"eval_samples_per_second": 38.829,
"eval_steps_per_second": 1.214,
"step": 18288
},
{
"epoch": 18.700787401574804,
"grad_norm": 8385.39453125,
"learning_rate": 4.326419213973799e-05,
"loss": 0.9799,
"step": 19000
},
{
"epoch": 19.0,
"eval_accuracy": 0.3129960786524129,
"eval_loss": 4.641451358795166,
"eval_runtime": 865.77,
"eval_samples_per_second": 38.819,
"eval_steps_per_second": 1.214,
"step": 19304
},
{
"epoch": 19.68503937007874,
"grad_norm": 8282.5029296875,
"learning_rate": 1.0513100436681222e-05,
"loss": 0.977,
"step": 20000
},
{
"epoch": 20.0,
"eval_accuracy": 0.3132122232568799,
"eval_loss": 4.643510341644287,
"eval_runtime": 865.9594,
"eval_samples_per_second": 38.81,
"eval_steps_per_second": 1.214,
"step": 20320
},
{
"epoch": 20.0,
"step": 20320,
"total_flos": 1.6974575566848e+17,
"train_loss": 1.065762345621905,
"train_runtime": 40277.3058,
"train_samples_per_second": 16.129,
"train_steps_per_second": 0.505
}
],
"logging_steps": 1000,
"max_steps": 20320,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6974575566848e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}