mms-1b-all-bemgen-combined-fusion / trainer_state.json
csikasote's picture
End of training
8142e5a verified
{
"best_global_step": 1900,
"best_metric": 0.21757769584655762,
"best_model_checkpoint": "/scratch/skscla001/experiments/datasets/results/mms-1b-all-bemgen-combined-fusion/checkpoint-1600",
"epoch": 5.0,
"eval_steps": 100,
"global_step": 1970,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.25380710659898476,
"grad_norm": 29.435340881347656,
"learning_rate": 0.00029099999999999997,
"loss": 7.7297,
"step": 100
},
{
"epoch": 0.25380710659898476,
"eval_loss": 5.600469589233398,
"eval_runtime": 68.6711,
"eval_samples_per_second": 24.406,
"eval_steps_per_second": 6.102,
"eval_wer": 0.9995995194233079,
"step": 100
},
{
"epoch": 0.5076142131979695,
"grad_norm": 12.57145881652832,
"learning_rate": 0.0002845989304812834,
"loss": 4.8876,
"step": 200
},
{
"epoch": 0.5076142131979695,
"eval_loss": 5.189175605773926,
"eval_runtime": 69.1166,
"eval_samples_per_second": 24.249,
"eval_steps_per_second": 6.062,
"eval_wer": 1.1228140435188894,
"step": 200
},
{
"epoch": 0.7614213197969543,
"grad_norm": 5.573763847351074,
"learning_rate": 0.00026855614973262027,
"loss": 4.4438,
"step": 300
},
{
"epoch": 0.7614213197969543,
"eval_loss": 4.348100662231445,
"eval_runtime": 68.2882,
"eval_samples_per_second": 24.543,
"eval_steps_per_second": 6.136,
"eval_wer": 1.1233480176211454,
"step": 300
},
{
"epoch": 1.015228426395939,
"grad_norm": 2.898040294647217,
"learning_rate": 0.00025267379679144383,
"loss": 3.7194,
"step": 400
},
{
"epoch": 1.015228426395939,
"eval_loss": 3.3745944499969482,
"eval_runtime": 68.4183,
"eval_samples_per_second": 24.496,
"eval_steps_per_second": 6.124,
"eval_wer": 1.0120144173007608,
"step": 400
},
{
"epoch": 1.2690355329949239,
"grad_norm": 2.0177152156829834,
"learning_rate": 0.00023663101604278074,
"loss": 3.3173,
"step": 500
},
{
"epoch": 1.2690355329949239,
"eval_loss": 3.256007671356201,
"eval_runtime": 68.7372,
"eval_samples_per_second": 24.383,
"eval_steps_per_second": 6.096,
"eval_wer": 1.0060072086503804,
"step": 500
},
{
"epoch": 1.5228426395939088,
"grad_norm": 3.2718145847320557,
"learning_rate": 0.00022058823529411765,
"loss": 3.1776,
"step": 600
},
{
"epoch": 1.5228426395939088,
"eval_loss": 2.9593822956085205,
"eval_runtime": 67.7629,
"eval_samples_per_second": 24.733,
"eval_steps_per_second": 6.183,
"eval_wer": 1.0037378187157924,
"step": 600
},
{
"epoch": 1.7766497461928934,
"grad_norm": 1.4685205221176147,
"learning_rate": 0.0002045454545454545,
"loss": 1.1178,
"step": 700
},
{
"epoch": 1.7766497461928934,
"eval_loss": 0.2583008110523224,
"eval_runtime": 68.257,
"eval_samples_per_second": 24.554,
"eval_steps_per_second": 6.139,
"eval_wer": 0.48344680283006275,
"step": 700
},
{
"epoch": 2.030456852791878,
"grad_norm": 0.8484176993370056,
"learning_rate": 0.00018850267379679142,
"loss": 0.4382,
"step": 800
},
{
"epoch": 2.030456852791878,
"eval_loss": 0.23566803336143494,
"eval_runtime": 68.4145,
"eval_samples_per_second": 24.498,
"eval_steps_per_second": 6.124,
"eval_wer": 0.42964891202776667,
"step": 800
},
{
"epoch": 2.284263959390863,
"grad_norm": 0.4717855155467987,
"learning_rate": 0.0001724598930481283,
"loss": 0.422,
"step": 900
},
{
"epoch": 2.284263959390863,
"eval_loss": 0.23215773701667786,
"eval_runtime": 69.371,
"eval_samples_per_second": 24.16,
"eval_steps_per_second": 6.04,
"eval_wer": 0.43038312641836873,
"step": 900
},
{
"epoch": 2.5380710659898478,
"grad_norm": 1.4339512586593628,
"learning_rate": 0.00015641711229946522,
"loss": 0.4101,
"step": 1000
},
{
"epoch": 2.5380710659898478,
"eval_loss": 0.23107033967971802,
"eval_runtime": 68.1286,
"eval_samples_per_second": 24.601,
"eval_steps_per_second": 6.15,
"eval_wer": 0.4306501134694967,
"step": 1000
},
{
"epoch": 2.7918781725888326,
"grad_norm": 0.7119982838630676,
"learning_rate": 0.00014037433155080213,
"loss": 0.3923,
"step": 1100
},
{
"epoch": 2.7918781725888326,
"eval_loss": 0.23034636676311493,
"eval_runtime": 67.7918,
"eval_samples_per_second": 24.723,
"eval_steps_per_second": 6.181,
"eval_wer": 0.4159658256574556,
"step": 1100
},
{
"epoch": 3.045685279187817,
"grad_norm": 0.410969614982605,
"learning_rate": 0.00012433155080213902,
"loss": 0.382,
"step": 1200
},
{
"epoch": 3.045685279187817,
"eval_loss": 0.22485551238059998,
"eval_runtime": 67.7668,
"eval_samples_per_second": 24.732,
"eval_steps_per_second": 6.183,
"eval_wer": 0.40608730476571886,
"step": 1200
},
{
"epoch": 3.299492385786802,
"grad_norm": 0.44749805331230164,
"learning_rate": 0.00010828877005347593,
"loss": 0.3799,
"step": 1300
},
{
"epoch": 3.299492385786802,
"eval_loss": 0.22400638461112976,
"eval_runtime": 69.3331,
"eval_samples_per_second": 24.173,
"eval_steps_per_second": 6.043,
"eval_wer": 0.40174876518488856,
"step": 1300
},
{
"epoch": 3.553299492385787,
"grad_norm": 0.48391538858413696,
"learning_rate": 9.224598930481283e-05,
"loss": 0.3733,
"step": 1400
},
{
"epoch": 3.553299492385787,
"eval_loss": 0.2204192578792572,
"eval_runtime": 68.018,
"eval_samples_per_second": 24.641,
"eval_steps_per_second": 6.16,
"eval_wer": 0.3992123881991723,
"step": 1400
},
{
"epoch": 3.8071065989847717,
"grad_norm": 0.7022409439086914,
"learning_rate": 7.620320855614973e-05,
"loss": 0.3757,
"step": 1500
},
{
"epoch": 3.8071065989847717,
"eval_loss": 0.22058387100696564,
"eval_runtime": 68.8893,
"eval_samples_per_second": 24.329,
"eval_steps_per_second": 6.082,
"eval_wer": 0.40308370044052866,
"step": 1500
},
{
"epoch": 4.060913705583756,
"grad_norm": 0.5409083366394043,
"learning_rate": 6.016042780748663e-05,
"loss": 0.3789,
"step": 1600
},
{
"epoch": 4.060913705583756,
"eval_loss": 0.2191571146249771,
"eval_runtime": 68.0976,
"eval_samples_per_second": 24.612,
"eval_steps_per_second": 6.153,
"eval_wer": 0.4031504472033106,
"step": 1600
},
{
"epoch": 4.314720812182741,
"grad_norm": 0.6483948230743408,
"learning_rate": 4.4117647058823526e-05,
"loss": 0.3635,
"step": 1700
},
{
"epoch": 4.314720812182741,
"eval_loss": 0.21890130639076233,
"eval_runtime": 68.9885,
"eval_samples_per_second": 24.294,
"eval_steps_per_second": 6.073,
"eval_wer": 0.40515285008677077,
"step": 1700
},
{
"epoch": 4.568527918781726,
"grad_norm": 0.45765629410743713,
"learning_rate": 2.8074866310160424e-05,
"loss": 0.3583,
"step": 1800
},
{
"epoch": 4.568527918781726,
"eval_loss": 0.21793432533740997,
"eval_runtime": 69.0917,
"eval_samples_per_second": 24.258,
"eval_steps_per_second": 6.064,
"eval_wer": 0.3988786543852623,
"step": 1800
},
{
"epoch": 4.822335025380711,
"grad_norm": 1.3359663486480713,
"learning_rate": 1.2032085561497326e-05,
"loss": 0.3733,
"step": 1900
},
{
"epoch": 4.822335025380711,
"eval_loss": 0.21757769584655762,
"eval_runtime": 68.6773,
"eval_samples_per_second": 24.404,
"eval_steps_per_second": 6.101,
"eval_wer": 0.39680950473902016,
"step": 1900
},
{
"epoch": 5.0,
"step": 1970,
"total_flos": 1.9392924694598373e+19,
"train_loss": 1.6902474505042062,
"train_runtime": 4165.473,
"train_samples_per_second": 7.561,
"train_steps_per_second": 0.473
}
],
"logging_steps": 100,
"max_steps": 1970,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 400,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 4,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9392924694598373e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}