diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..cd5b477a9075c49d99de65622db37bb06a251985 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "": 80, + "": 79 +} diff --git a/checkpoint-1000/added_tokens.json b/checkpoint-1000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..cd5b477a9075c49d99de65622db37bb06a251985 --- /dev/null +++ b/checkpoint-1000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "": 80, + "": 79 +} diff --git a/checkpoint-1000/config.json b/checkpoint-1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1197099a882c66f2bbab2210646af21b05d05ee4 --- /dev/null +++ b/checkpoint-1000/config.json @@ -0,0 +1,91 @@ +{ + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "SpeechT5ForTextToSpeech" + ], + "attention_dropout": 0.1, + "bos_token_id": 0, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "decoder_attention_heads": 12, + "decoder_ffn_dim": 3072, + "decoder_layerdrop": 0.1, + "decoder_layers": 6, + "decoder_start_token_id": 2, + "encoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "encoder_layerdrop": 0.1, + "encoder_layers": 12, + "encoder_max_relative_position": 160, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "guided_attention_loss_num_heads": 2, + "guided_attention_loss_scale": 10.0, + "guided_attention_loss_sigma": 0.4, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "layer_norm_eps": 1e-05, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "max_length": null, + "max_speech_positions": 1876, + "max_text_positions": 600, + "model_type": "speecht5", + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_mel_bins": 80, + "pad_token_id": 1, + "positional_dropout": 0.1, + "reduction_factor": 2, + "scale_embedding": false, + "speaker_embedding_dim": 512, + "speech_decoder_postnet_dropout": 0.5, + "speech_decoder_postnet_kernel": 5, + "speech_decoder_postnet_layers": 5, + "speech_decoder_postnet_units": 256, + "speech_decoder_prenet_dropout": 0.5, + "speech_decoder_prenet_layers": 2, + "speech_decoder_prenet_units": 256, + "torch_dtype": "float32", + "transformers_version": "4.55.4", + "use_cache": false, + "use_guided_attention_loss": true, + "vocab_size": 81 +} diff --git a/checkpoint-1000/generation_config.json b/checkpoint-1000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1748094b7e4b8c1fb66bf2357c44c5f625a36179 --- /dev/null +++ b/checkpoint-1000/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "decoder_start_token_id": 2, + "eos_token_id": 2, + "max_length": 1876, + "pad_token_id": 1, + "transformers_version": "4.55.4" +} diff --git a/checkpoint-1000/model.safetensors b/checkpoint-1000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..745e068dcd86ebc82a19284e74a088cbf4e4f1fc --- /dev/null +++ b/checkpoint-1000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb5d6fe49ff85411787439f9ad2e6bfa7affebb9cb657848d6ca12433db4e10a +size 577789320 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f952bd8430e9be7c3e3db31d7ff28a775d8c7f6 --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a801f2d4ec47bf11dfadfa6c068daebd7c9d851603bd0a0eef429e5a22f6bb2e +size 1155777946 diff --git a/checkpoint-1000/preprocessor_config.json b/checkpoint-1000/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..98a41e66e16f3225a15cb2b51c963fe2213aa273 --- /dev/null +++ b/checkpoint-1000/preprocessor_config.json @@ -0,0 +1,19 @@ +{ + "do_normalize": false, + "feature_extractor_type": "SpeechT5FeatureExtractor", + "feature_size": 1, + "fmax": 7600, + "fmin": 80, + "frame_signal_scale": 1.0, + "hop_length": 16, + "mel_floor": 1e-10, + "num_mel_bins": 80, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "SpeechT5Processor", + "reduction_factor": 2, + "return_attention_mask": true, + "sampling_rate": 16000, + "win_function": "hann_window", + "win_length": 64 +} diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..637854d725562baa5365c45dbb6e3e5ac76a576a --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f27257904c7decb41a03da01a49d9f6fdf1f1b8f5e5d56fe64ef4572336d6eb +size 14645 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9744b455ac183d662ae7cb381d958022106980d8 --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5186565a906d7db433e54fbfdb3d62aa206e2cb82464d6a3316608741a692047 +size 1465 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee24ec69861cfc94abbe2c8c934aa0744aa623c --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,13 @@ +{ + "bos_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": "" +} diff --git a/checkpoint-1000/spm_char.model b/checkpoint-1000/spm_char.model new file mode 100644 index 0000000000000000000000000000000000000000..8fb73691942626fa75df80b61aab0e9b9340d8e2 --- /dev/null +++ b/checkpoint-1000/spm_char.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560 +size 238473 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e88d44ba3be31ac8f53461ae7c1b02b4c5c830ab --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "80": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 600, + "normalize": false, + "pad_token": "", + "processor_class": "SpeechT5Processor", + "sp_model_kwargs": {}, + "tokenizer_class": "SpeechT5Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e6571fd31c1f1cd220e5e4d8213a56f10851444 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,322 @@ +{ + "best_global_step": 1000, + "best_metric": 0.9205830097198486, + "best_model_checkpoint": "runs/emotts_ravdess\\checkpoint-1000", + "epoch": 24.395061728395063, + "eval_steps": 1000, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.6172839506172839, + "grad_norm": 46.678199768066406, + "learning_rate": 4.800000000000001e-07, + "loss": 3.4472, + "step": 25 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 26.903335571289062, + "learning_rate": 9.800000000000001e-07, + "loss": 2.9051, + "step": 50 + }, + { + "epoch": 1.8395061728395061, + "grad_norm": 16.712799072265625, + "learning_rate": 1.48e-06, + "loss": 2.2302, + "step": 75 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 11.607951164245605, + "learning_rate": 1.98e-06, + "loss": 1.7683, + "step": 100 + }, + { + "epoch": 3.049382716049383, + "grad_norm": 7.216983318328857, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.5434, + "step": 125 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 10.899630546569824, + "learning_rate": 2.9800000000000003e-06, + "loss": 1.4385, + "step": 150 + }, + { + "epoch": 4.271604938271605, + "grad_norm": 6.701765537261963, + "learning_rate": 3.48e-06, + "loss": 1.3262, + "step": 175 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 9.419053077697754, + "learning_rate": 3.980000000000001e-06, + "loss": 1.285, + "step": 200 + }, + { + "epoch": 5.493827160493828, + "grad_norm": 5.913278579711914, + "learning_rate": 4.48e-06, + "loss": 1.2503, + "step": 225 + }, + { + "epoch": 6.098765432098766, + "grad_norm": 8.171669006347656, + "learning_rate": 4.980000000000001e-06, + "loss": 1.1868, + "step": 250 + }, + { + "epoch": 6.716049382716049, + "grad_norm": 5.54558801651001, + "learning_rate": 5.480000000000001e-06, + "loss": 1.1478, + "step": 275 + }, + { + "epoch": 7.320987654320987, + "grad_norm": 5.325434684753418, + "learning_rate": 5.98e-06, + "loss": 1.1245, + "step": 300 + }, + { + "epoch": 7.938271604938271, + "grad_norm": 5.406148433685303, + "learning_rate": 6.480000000000001e-06, + "loss": 1.1145, + "step": 325 + }, + { + "epoch": 8.54320987654321, + "grad_norm": 8.461536407470703, + "learning_rate": 6.98e-06, + "loss": 1.0641, + "step": 350 + }, + { + "epoch": 9.148148148148149, + "grad_norm": 3.8533031940460205, + "learning_rate": 7.48e-06, + "loss": 1.0573, + "step": 375 + }, + { + "epoch": 9.765432098765432, + "grad_norm": 7.569976806640625, + "learning_rate": 7.980000000000002e-06, + "loss": 1.061, + "step": 400 + }, + { + "epoch": 10.37037037037037, + "grad_norm": 10.156228065490723, + "learning_rate": 8.48e-06, + "loss": 1.0485, + "step": 425 + }, + { + "epoch": 10.987654320987655, + "grad_norm": 4.668756484985352, + "learning_rate": 8.98e-06, + "loss": 1.0216, + "step": 450 + }, + { + "epoch": 11.592592592592592, + "grad_norm": 5.087125301361084, + "learning_rate": 9.48e-06, + "loss": 1.0319, + "step": 475 + }, + { + "epoch": 12.197530864197532, + "grad_norm": 7.943349361419678, + "learning_rate": 9.980000000000001e-06, + "loss": 1.0, + "step": 500 + }, + { + "epoch": 12.814814814814815, + "grad_norm": 7.655898571014404, + "learning_rate": 9.931428571428571e-06, + "loss": 1.0052, + "step": 525 + }, + { + "epoch": 13.419753086419753, + "grad_norm": 4.458106994628906, + "learning_rate": 9.86e-06, + "loss": 1.0001, + "step": 550 + }, + { + "epoch": 14.024691358024691, + "grad_norm": 9.058222770690918, + "learning_rate": 9.78857142857143e-06, + "loss": 1.0015, + "step": 575 + }, + { + "epoch": 14.641975308641975, + "grad_norm": 4.795205593109131, + "learning_rate": 9.717142857142858e-06, + "loss": 0.9836, + "step": 600 + }, + { + "epoch": 15.246913580246913, + "grad_norm": 10.566876411437988, + "learning_rate": 9.645714285714286e-06, + "loss": 1.0019, + "step": 625 + }, + { + "epoch": 15.864197530864198, + "grad_norm": 7.610626220703125, + "learning_rate": 9.574285714285715e-06, + "loss": 0.9779, + "step": 650 + }, + { + "epoch": 16.469135802469136, + "grad_norm": 6.008159637451172, + "learning_rate": 9.502857142857144e-06, + "loss": 0.9798, + "step": 675 + }, + { + "epoch": 17.074074074074073, + "grad_norm": 6.685286521911621, + "learning_rate": 9.431428571428573e-06, + "loss": 0.9753, + "step": 700 + }, + { + "epoch": 17.691358024691358, + "grad_norm": 2.7540247440338135, + "learning_rate": 9.360000000000002e-06, + "loss": 0.967, + "step": 725 + }, + { + "epoch": 18.296296296296298, + "grad_norm": 4.825072288513184, + "learning_rate": 9.28857142857143e-06, + "loss": 0.9575, + "step": 750 + }, + { + "epoch": 18.91358024691358, + "grad_norm": 6.618119716644287, + "learning_rate": 9.217142857142858e-06, + "loss": 0.9675, + "step": 775 + }, + { + "epoch": 19.51851851851852, + "grad_norm": 5.465808391571045, + "learning_rate": 9.145714285714287e-06, + "loss": 0.9626, + "step": 800 + }, + { + "epoch": 20.123456790123456, + "grad_norm": 4.9501051902771, + "learning_rate": 9.074285714285716e-06, + "loss": 0.9638, + "step": 825 + }, + { + "epoch": 20.74074074074074, + "grad_norm": 4.926831245422363, + "learning_rate": 9.002857142857144e-06, + "loss": 0.9582, + "step": 850 + }, + { + "epoch": 21.34567901234568, + "grad_norm": 6.605464458465576, + "learning_rate": 8.931428571428573e-06, + "loss": 0.9551, + "step": 875 + }, + { + "epoch": 21.962962962962962, + "grad_norm": 5.774538040161133, + "learning_rate": 8.860000000000002e-06, + "loss": 0.9596, + "step": 900 + }, + { + "epoch": 22.567901234567902, + "grad_norm": 4.304802417755127, + "learning_rate": 8.788571428571429e-06, + "loss": 0.9489, + "step": 925 + }, + { + "epoch": 23.17283950617284, + "grad_norm": 5.171604633331299, + "learning_rate": 8.717142857142858e-06, + "loss": 0.953, + "step": 950 + }, + { + "epoch": 23.790123456790123, + "grad_norm": 7.152281761169434, + "learning_rate": 8.645714285714287e-06, + "loss": 0.9604, + "step": 975 + }, + { + "epoch": 24.395061728395063, + "grad_norm": 4.954558849334717, + "learning_rate": 8.574285714285714e-06, + "loss": 0.9489, + "step": 1000 + }, + { + "epoch": 24.395061728395063, + "eval_loss": 0.9205830097198486, + "eval_runtime": 2.2708, + "eval_samples_per_second": 63.413, + "eval_steps_per_second": 31.707, + "step": 1000 + } + ], + "logging_steps": 25, + "max_steps": 4000, + "num_input_tokens_seen": 0, + "num_train_epochs": 98, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 821472814356480.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf416a3be0d19e0ce5aadbb31f093c5d913fee53 --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d367dc04409d63341644c780ebdb997e8756f9aa9f6d110afc5d9ab8de84be +size 5905 diff --git a/checkpoint-2000/added_tokens.json b/checkpoint-2000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..cd5b477a9075c49d99de65622db37bb06a251985 --- /dev/null +++ b/checkpoint-2000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "": 80, + "": 79 +} diff --git a/checkpoint-2000/config.json b/checkpoint-2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1197099a882c66f2bbab2210646af21b05d05ee4 --- /dev/null +++ b/checkpoint-2000/config.json @@ -0,0 +1,91 @@ +{ + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "SpeechT5ForTextToSpeech" + ], + "attention_dropout": 0.1, + "bos_token_id": 0, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "decoder_attention_heads": 12, + "decoder_ffn_dim": 3072, + "decoder_layerdrop": 0.1, + "decoder_layers": 6, + "decoder_start_token_id": 2, + "encoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "encoder_layerdrop": 0.1, + "encoder_layers": 12, + "encoder_max_relative_position": 160, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "guided_attention_loss_num_heads": 2, + "guided_attention_loss_scale": 10.0, + "guided_attention_loss_sigma": 0.4, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "layer_norm_eps": 1e-05, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "max_length": null, + "max_speech_positions": 1876, + "max_text_positions": 600, + "model_type": "speecht5", + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_mel_bins": 80, + "pad_token_id": 1, + "positional_dropout": 0.1, + "reduction_factor": 2, + "scale_embedding": false, + "speaker_embedding_dim": 512, + "speech_decoder_postnet_dropout": 0.5, + "speech_decoder_postnet_kernel": 5, + "speech_decoder_postnet_layers": 5, + "speech_decoder_postnet_units": 256, + "speech_decoder_prenet_dropout": 0.5, + "speech_decoder_prenet_layers": 2, + "speech_decoder_prenet_units": 256, + "torch_dtype": "float32", + "transformers_version": "4.55.4", + "use_cache": false, + "use_guided_attention_loss": true, + "vocab_size": 81 +} diff --git a/checkpoint-2000/generation_config.json b/checkpoint-2000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1748094b7e4b8c1fb66bf2357c44c5f625a36179 --- /dev/null +++ b/checkpoint-2000/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "decoder_start_token_id": 2, + "eos_token_id": 2, + "max_length": 1876, + "pad_token_id": 1, + "transformers_version": "4.55.4" +} diff --git a/checkpoint-2000/model.safetensors b/checkpoint-2000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..36558fd470cb122c7d43ecb4ac16913b68df0eca --- /dev/null +++ b/checkpoint-2000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:295a846f5d0ead4e65b737b369b8205cd013a02d08d0220b3caa7e8e4b777b77 +size 577789320 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0ac5d0cb882ff013f3e14e6e98b9a98efe46965 --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2388ca0f503df54eb4d30573ff0fc9814dd98cc0759ae40bf1b7438f984e1ab6 +size 1155777946 diff --git a/checkpoint-2000/preprocessor_config.json b/checkpoint-2000/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..98a41e66e16f3225a15cb2b51c963fe2213aa273 --- /dev/null +++ b/checkpoint-2000/preprocessor_config.json @@ -0,0 +1,19 @@ +{ + "do_normalize": false, + "feature_extractor_type": "SpeechT5FeatureExtractor", + "feature_size": 1, + "fmax": 7600, + "fmin": 80, + "frame_signal_scale": 1.0, + "hop_length": 16, + "mel_floor": 1e-10, + "num_mel_bins": 80, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "SpeechT5Processor", + "reduction_factor": 2, + "return_attention_mask": true, + "sampling_rate": 16000, + "win_function": "hann_window", + "win_length": 64 +} diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1164add54cac59c217275dd520cddcd43877c7f --- /dev/null +++ b/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:396a8cc8a565882c2cc697e78085381bcb24a262358918ccaa5445eb5232e231 +size 14645 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3ce841469cd05c58a347033a571c80874dc9dc9 --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e92941487269a9e704ed42d0796c2eb3245e8d6d83c68a723be04187c99b397 +size 1465 diff --git a/checkpoint-2000/special_tokens_map.json b/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee24ec69861cfc94abbe2c8c934aa0744aa623c --- /dev/null +++ b/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,13 @@ +{ + "bos_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": "" +} diff --git a/checkpoint-2000/spm_char.model b/checkpoint-2000/spm_char.model new file mode 100644 index 0000000000000000000000000000000000000000..8fb73691942626fa75df80b61aab0e9b9340d8e2 --- /dev/null +++ b/checkpoint-2000/spm_char.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560 +size 238473 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e88d44ba3be31ac8f53461ae7c1b02b4c5c830ab --- /dev/null +++ b/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "80": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 600, + "normalize": false, + "pad_token": "", + "processor_class": "SpeechT5Processor", + "sp_model_kwargs": {}, + "tokenizer_class": "SpeechT5Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..744adfe7d511646d459a96a97c16626e0e168006 --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,610 @@ +{ + "best_global_step": 2000, + "best_metric": 0.8953001499176025, + "best_model_checkpoint": "runs/emotts_ravdess\\checkpoint-2000", + "epoch": 48.79012345679013, + "eval_steps": 1000, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.6172839506172839, + "grad_norm": 46.678199768066406, + "learning_rate": 4.800000000000001e-07, + "loss": 3.4472, + "step": 25 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 26.903335571289062, + "learning_rate": 9.800000000000001e-07, + "loss": 2.9051, + "step": 50 + }, + { + "epoch": 1.8395061728395061, + "grad_norm": 16.712799072265625, + "learning_rate": 1.48e-06, + "loss": 2.2302, + "step": 75 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 11.607951164245605, + "learning_rate": 1.98e-06, + "loss": 1.7683, + "step": 100 + }, + { + "epoch": 3.049382716049383, + "grad_norm": 7.216983318328857, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.5434, + "step": 125 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 10.899630546569824, + "learning_rate": 2.9800000000000003e-06, + "loss": 1.4385, + "step": 150 + }, + { + "epoch": 4.271604938271605, + "grad_norm": 6.701765537261963, + "learning_rate": 3.48e-06, + "loss": 1.3262, + "step": 175 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 9.419053077697754, + "learning_rate": 3.980000000000001e-06, + "loss": 1.285, + "step": 200 + }, + { + "epoch": 5.493827160493828, + "grad_norm": 5.913278579711914, + "learning_rate": 4.48e-06, + "loss": 1.2503, + "step": 225 + }, + { + "epoch": 6.098765432098766, + "grad_norm": 8.171669006347656, + "learning_rate": 4.980000000000001e-06, + "loss": 1.1868, + "step": 250 + }, + { + "epoch": 6.716049382716049, + "grad_norm": 5.54558801651001, + "learning_rate": 5.480000000000001e-06, + "loss": 1.1478, + "step": 275 + }, + { + "epoch": 7.320987654320987, + "grad_norm": 5.325434684753418, + "learning_rate": 5.98e-06, + "loss": 1.1245, + "step": 300 + }, + { + "epoch": 7.938271604938271, + "grad_norm": 5.406148433685303, + "learning_rate": 6.480000000000001e-06, + "loss": 1.1145, + "step": 325 + }, + { + "epoch": 8.54320987654321, + "grad_norm": 8.461536407470703, + "learning_rate": 6.98e-06, + "loss": 1.0641, + "step": 350 + }, + { + "epoch": 9.148148148148149, + "grad_norm": 3.8533031940460205, + "learning_rate": 7.48e-06, + "loss": 1.0573, + "step": 375 + }, + { + "epoch": 9.765432098765432, + "grad_norm": 7.569976806640625, + "learning_rate": 7.980000000000002e-06, + "loss": 1.061, + "step": 400 + }, + { + "epoch": 10.37037037037037, + "grad_norm": 10.156228065490723, + "learning_rate": 8.48e-06, + "loss": 1.0485, + "step": 425 + }, + { + "epoch": 10.987654320987655, + "grad_norm": 4.668756484985352, + "learning_rate": 8.98e-06, + "loss": 1.0216, + "step": 450 + }, + { + "epoch": 11.592592592592592, + "grad_norm": 5.087125301361084, + "learning_rate": 9.48e-06, + "loss": 1.0319, + "step": 475 + }, + { + "epoch": 12.197530864197532, + "grad_norm": 7.943349361419678, + "learning_rate": 9.980000000000001e-06, + "loss": 1.0, + "step": 500 + }, + { + "epoch": 12.814814814814815, + "grad_norm": 7.655898571014404, + "learning_rate": 9.931428571428571e-06, + "loss": 1.0052, + "step": 525 + }, + { + "epoch": 13.419753086419753, + "grad_norm": 4.458106994628906, + "learning_rate": 9.86e-06, + "loss": 1.0001, + "step": 550 + }, + { + "epoch": 14.024691358024691, + "grad_norm": 9.058222770690918, + "learning_rate": 9.78857142857143e-06, + "loss": 1.0015, + "step": 575 + }, + { + "epoch": 14.641975308641975, + "grad_norm": 4.795205593109131, + "learning_rate": 9.717142857142858e-06, + "loss": 0.9836, + "step": 600 + }, + { + "epoch": 15.246913580246913, + "grad_norm": 10.566876411437988, + "learning_rate": 9.645714285714286e-06, + "loss": 1.0019, + "step": 625 + }, + { + "epoch": 15.864197530864198, + "grad_norm": 7.610626220703125, + "learning_rate": 9.574285714285715e-06, + "loss": 0.9779, + "step": 650 + }, + { + "epoch": 16.469135802469136, + "grad_norm": 6.008159637451172, + "learning_rate": 9.502857142857144e-06, + "loss": 0.9798, + "step": 675 + }, + { + "epoch": 17.074074074074073, + "grad_norm": 6.685286521911621, + "learning_rate": 9.431428571428573e-06, + "loss": 0.9753, + "step": 700 + }, + { + "epoch": 17.691358024691358, + "grad_norm": 2.7540247440338135, + "learning_rate": 9.360000000000002e-06, + "loss": 0.967, + "step": 725 + }, + { + "epoch": 18.296296296296298, + "grad_norm": 4.825072288513184, + "learning_rate": 9.28857142857143e-06, + "loss": 0.9575, + "step": 750 + }, + { + "epoch": 18.91358024691358, + "grad_norm": 6.618119716644287, + "learning_rate": 9.217142857142858e-06, + "loss": 0.9675, + "step": 775 + }, + { + "epoch": 19.51851851851852, + "grad_norm": 5.465808391571045, + "learning_rate": 9.145714285714287e-06, + "loss": 0.9626, + "step": 800 + }, + { + "epoch": 20.123456790123456, + "grad_norm": 4.9501051902771, + "learning_rate": 9.074285714285716e-06, + "loss": 0.9638, + "step": 825 + }, + { + "epoch": 20.74074074074074, + "grad_norm": 4.926831245422363, + "learning_rate": 9.002857142857144e-06, + "loss": 0.9582, + "step": 850 + }, + { + "epoch": 21.34567901234568, + "grad_norm": 6.605464458465576, + "learning_rate": 8.931428571428573e-06, + "loss": 0.9551, + "step": 875 + }, + { + "epoch": 21.962962962962962, + "grad_norm": 5.774538040161133, + "learning_rate": 8.860000000000002e-06, + "loss": 0.9596, + "step": 900 + }, + { + "epoch": 22.567901234567902, + "grad_norm": 4.304802417755127, + "learning_rate": 8.788571428571429e-06, + "loss": 0.9489, + "step": 925 + }, + { + "epoch": 23.17283950617284, + "grad_norm": 5.171604633331299, + "learning_rate": 8.717142857142858e-06, + "loss": 0.953, + "step": 950 + }, + { + "epoch": 23.790123456790123, + "grad_norm": 7.152281761169434, + "learning_rate": 8.645714285714287e-06, + "loss": 0.9604, + "step": 975 + }, + { + "epoch": 24.395061728395063, + "grad_norm": 4.954558849334717, + "learning_rate": 8.574285714285714e-06, + "loss": 0.9489, + "step": 1000 + }, + { + "epoch": 24.395061728395063, + "eval_loss": 0.9205830097198486, + "eval_runtime": 2.2708, + "eval_samples_per_second": 63.413, + "eval_steps_per_second": 31.707, + "step": 1000 + }, + { + "epoch": 25.0, + "grad_norm": 10.266937255859375, + "learning_rate": 8.502857142857143e-06, + "loss": 0.9541, + "step": 1025 + }, + { + "epoch": 25.617283950617285, + "grad_norm": 3.225881814956665, + "learning_rate": 8.431428571428572e-06, + "loss": 0.9451, + "step": 1050 + }, + { + "epoch": 26.22222222222222, + "grad_norm": 4.001440048217773, + "learning_rate": 8.36e-06, + "loss": 0.9422, + "step": 1075 + }, + { + "epoch": 26.839506172839506, + "grad_norm": 5.347984313964844, + "learning_rate": 8.288571428571429e-06, + "loss": 0.9434, + "step": 1100 + }, + { + "epoch": 27.444444444444443, + "grad_norm": 4.1566901206970215, + "learning_rate": 8.217142857142858e-06, + "loss": 0.942, + "step": 1125 + }, + { + "epoch": 28.049382716049383, + "grad_norm": 3.2101686000823975, + "learning_rate": 8.145714285714287e-06, + "loss": 0.9365, + "step": 1150 + }, + { + "epoch": 28.666666666666668, + "grad_norm": 5.183631896972656, + "learning_rate": 8.074285714285714e-06, + "loss": 0.941, + "step": 1175 + }, + { + "epoch": 29.271604938271604, + "grad_norm": 4.704529285430908, + "learning_rate": 8.002857142857143e-06, + "loss": 0.9374, + "step": 1200 + }, + { + "epoch": 29.88888888888889, + "grad_norm": 4.460058689117432, + "learning_rate": 7.931428571428572e-06, + "loss": 0.9383, + "step": 1225 + }, + { + "epoch": 30.493827160493826, + "grad_norm": 3.616530418395996, + "learning_rate": 7.860000000000001e-06, + "loss": 0.9321, + "step": 1250 + }, + { + "epoch": 31.098765432098766, + "grad_norm": 3.92207932472229, + "learning_rate": 7.788571428571428e-06, + "loss": 0.9347, + "step": 1275 + }, + { + "epoch": 31.71604938271605, + "grad_norm": 3.6962461471557617, + "learning_rate": 7.717142857142857e-06, + "loss": 0.9305, + "step": 1300 + }, + { + "epoch": 32.32098765432099, + "grad_norm": 4.276056289672852, + "learning_rate": 7.645714285714286e-06, + "loss": 0.9336, + "step": 1325 + }, + { + "epoch": 32.93827160493827, + "grad_norm": 5.176277160644531, + "learning_rate": 7.574285714285715e-06, + "loss": 0.9351, + "step": 1350 + }, + { + "epoch": 33.54320987654321, + "grad_norm": 7.2538347244262695, + "learning_rate": 7.502857142857144e-06, + "loss": 0.9241, + "step": 1375 + }, + { + "epoch": 34.148148148148145, + "grad_norm": 4.3576273918151855, + "learning_rate": 7.431428571428572e-06, + "loss": 0.9316, + "step": 1400 + }, + { + "epoch": 34.76543209876543, + "grad_norm": 9.138855934143066, + "learning_rate": 7.360000000000001e-06, + "loss": 0.9277, + "step": 1425 + }, + { + "epoch": 35.370370370370374, + "grad_norm": 4.475003719329834, + "learning_rate": 7.28857142857143e-06, + "loss": 0.9245, + "step": 1450 + }, + { + "epoch": 35.98765432098765, + "grad_norm": 7.28753137588501, + "learning_rate": 7.217142857142858e-06, + "loss": 0.9266, + "step": 1475 + }, + { + "epoch": 36.592592592592595, + "grad_norm": 5.1342949867248535, + "learning_rate": 7.145714285714286e-06, + "loss": 0.9297, + "step": 1500 + }, + { + "epoch": 37.19753086419753, + "grad_norm": 2.7765142917633057, + "learning_rate": 7.074285714285715e-06, + "loss": 0.9253, + "step": 1525 + }, + { + "epoch": 37.81481481481482, + "grad_norm": 3.8011326789855957, + "learning_rate": 7.002857142857143e-06, + "loss": 0.9203, + "step": 1550 + }, + { + "epoch": 38.41975308641975, + "grad_norm": 7.432782173156738, + "learning_rate": 6.931428571428572e-06, + "loss": 0.9196, + "step": 1575 + }, + { + "epoch": 39.02469135802469, + "grad_norm": 4.179474830627441, + "learning_rate": 6.860000000000001e-06, + "loss": 0.9188, + "step": 1600 + }, + { + "epoch": 39.641975308641975, + "grad_norm": 8.513073921203613, + "learning_rate": 6.7885714285714286e-06, + "loss": 0.9268, + "step": 1625 + }, + { + "epoch": 40.24691358024691, + "grad_norm": 3.699882984161377, + "learning_rate": 6.7171428571428576e-06, + "loss": 0.9216, + "step": 1650 + }, + { + "epoch": 40.864197530864196, + "grad_norm": 3.949507713317871, + "learning_rate": 6.645714285714287e-06, + "loss": 0.9238, + "step": 1675 + }, + { + "epoch": 41.46913580246913, + "grad_norm": 3.7951810359954834, + "learning_rate": 6.574285714285716e-06, + "loss": 0.9198, + "step": 1700 + }, + { + "epoch": 42.074074074074076, + "grad_norm": 5.373620986938477, + "learning_rate": 6.502857142857143e-06, + "loss": 0.9135, + "step": 1725 + }, + { + "epoch": 42.69135802469136, + "grad_norm": 6.875067234039307, + "learning_rate": 6.431428571428572e-06, + "loss": 0.918, + "step": 1750 + }, + { + "epoch": 43.2962962962963, + "grad_norm": 7.167726039886475, + "learning_rate": 6.360000000000001e-06, + "loss": 0.9276, + "step": 1775 + }, + { + "epoch": 43.91358024691358, + "grad_norm": 3.7067105770111084, + "learning_rate": 6.288571428571429e-06, + "loss": 0.9169, + "step": 1800 + }, + { + "epoch": 44.51851851851852, + "grad_norm": 4.474793434143066, + "learning_rate": 6.217142857142857e-06, + "loss": 0.9191, + "step": 1825 + }, + { + "epoch": 45.123456790123456, + "grad_norm": 5.386421203613281, + "learning_rate": 6.145714285714286e-06, + "loss": 0.9145, + "step": 1850 + }, + { + "epoch": 45.74074074074074, + "grad_norm": 3.068861246109009, + "learning_rate": 6.0742857142857145e-06, + "loss": 0.9095, + "step": 1875 + }, + { + "epoch": 46.34567901234568, + "grad_norm": 3.804973840713501, + "learning_rate": 6.0028571428571435e-06, + "loss": 0.912, + "step": 1900 + }, + { + "epoch": 46.96296296296296, + "grad_norm": 2.9225473403930664, + "learning_rate": 5.9314285714285725e-06, + "loss": 0.9049, + "step": 1925 + }, + { + "epoch": 47.5679012345679, + "grad_norm": 4.022708892822266, + "learning_rate": 5.86e-06, + "loss": 0.9049, + "step": 1950 + }, + { + "epoch": 48.17283950617284, + "grad_norm": 3.421691417694092, + "learning_rate": 5.788571428571429e-06, + "loss": 0.9101, + "step": 1975 + }, + { + "epoch": 48.79012345679013, + "grad_norm": 6.732350826263428, + "learning_rate": 5.717142857142858e-06, + "loss": 0.9105, + "step": 2000 + }, + { + "epoch": 48.79012345679013, + "eval_loss": 0.8953001499176025, + "eval_runtime": 2.1587, + "eval_samples_per_second": 66.707, + "eval_steps_per_second": 33.353, + "step": 2000 + } + ], + "logging_steps": 25, + "max_steps": 4000, + "num_input_tokens_seen": 0, + "num_train_epochs": 98, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1642945628712960.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf416a3be0d19e0ce5aadbb31f093c5d913fee53 --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d367dc04409d63341644c780ebdb997e8756f9aa9f6d110afc5d9ab8de84be +size 5905 diff --git a/checkpoint-3000/added_tokens.json b/checkpoint-3000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..cd5b477a9075c49d99de65622db37bb06a251985 --- /dev/null +++ b/checkpoint-3000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "": 80, + "": 79 +} diff --git a/checkpoint-3000/config.json b/checkpoint-3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1197099a882c66f2bbab2210646af21b05d05ee4 --- /dev/null +++ b/checkpoint-3000/config.json @@ -0,0 +1,91 @@ +{ + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "SpeechT5ForTextToSpeech" + ], + "attention_dropout": 0.1, + "bos_token_id": 0, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "decoder_attention_heads": 12, + "decoder_ffn_dim": 3072, + "decoder_layerdrop": 0.1, + "decoder_layers": 6, + "decoder_start_token_id": 2, + "encoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "encoder_layerdrop": 0.1, + "encoder_layers": 12, + "encoder_max_relative_position": 160, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "guided_attention_loss_num_heads": 2, + "guided_attention_loss_scale": 10.0, + "guided_attention_loss_sigma": 0.4, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "layer_norm_eps": 1e-05, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "max_length": null, + "max_speech_positions": 1876, + "max_text_positions": 600, + "model_type": "speecht5", + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_mel_bins": 80, + "pad_token_id": 1, + "positional_dropout": 0.1, + "reduction_factor": 2, + "scale_embedding": false, + "speaker_embedding_dim": 512, + "speech_decoder_postnet_dropout": 0.5, + "speech_decoder_postnet_kernel": 5, + "speech_decoder_postnet_layers": 5, + "speech_decoder_postnet_units": 256, + "speech_decoder_prenet_dropout": 0.5, + "speech_decoder_prenet_layers": 2, + "speech_decoder_prenet_units": 256, + "torch_dtype": "float32", + "transformers_version": "4.55.4", + "use_cache": false, + "use_guided_attention_loss": true, + "vocab_size": 81 +} diff --git a/checkpoint-3000/generation_config.json b/checkpoint-3000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1748094b7e4b8c1fb66bf2357c44c5f625a36179 --- /dev/null +++ b/checkpoint-3000/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "decoder_start_token_id": 2, + "eos_token_id": 2, + "max_length": 1876, + "pad_token_id": 1, + "transformers_version": "4.55.4" +} diff --git a/checkpoint-3000/model.safetensors b/checkpoint-3000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..46193a605c14d097bb3430d4d15e1f9cdf6f04fd --- /dev/null +++ b/checkpoint-3000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6eca9575648e4c7d7eb1ea916fee7b23eafefa0db8bf09a04bd46beac454f2 +size 577789320 diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a2ccc8147a8c1625dea169568604855600b971e --- /dev/null +++ b/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8c7d0e8b916fd9a744e0e04850570d8a6297e6bac0767ebd63b53e0cefe4057 +size 1155777946 diff --git a/checkpoint-3000/preprocessor_config.json b/checkpoint-3000/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..98a41e66e16f3225a15cb2b51c963fe2213aa273 --- /dev/null +++ b/checkpoint-3000/preprocessor_config.json @@ -0,0 +1,19 @@ +{ + "do_normalize": false, + "feature_extractor_type": "SpeechT5FeatureExtractor", + "feature_size": 1, + "fmax": 7600, + "fmin": 80, + "frame_signal_scale": 1.0, + "hop_length": 16, + "mel_floor": 1e-10, + "num_mel_bins": 80, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "SpeechT5Processor", + "reduction_factor": 2, + "return_attention_mask": true, + "sampling_rate": 16000, + "win_function": "hann_window", + "win_length": 64 +} diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..56c94236a64a8f2889669e23bcaf8a2665536af9 --- /dev/null +++ b/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bd7be6ad18d8737c21def51bc146679a3086895043a68047db9ee35a01b64e8 +size 14645 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b03ecef0b0c05f5110fd89496cd4723d841ada76 --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32cb3a0b1d61782860d37955716f6b5e952b190320ed6c3b93171c974f9325c9 +size 1465 diff --git a/checkpoint-3000/special_tokens_map.json b/checkpoint-3000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee24ec69861cfc94abbe2c8c934aa0744aa623c --- /dev/null +++ b/checkpoint-3000/special_tokens_map.json @@ -0,0 +1,13 @@ +{ + "bos_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": "" +} diff --git a/checkpoint-3000/spm_char.model b/checkpoint-3000/spm_char.model new file mode 100644 index 0000000000000000000000000000000000000000..8fb73691942626fa75df80b61aab0e9b9340d8e2 --- /dev/null +++ b/checkpoint-3000/spm_char.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560 +size 238473 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e88d44ba3be31ac8f53461ae7c1b02b4c5c830ab --- /dev/null +++ b/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "80": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 600, + "normalize": false, + "pad_token": "", + "processor_class": "SpeechT5Processor", + "sp_model_kwargs": {}, + "tokenizer_class": "SpeechT5Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..55636baf3a1f510489b13f061959f3bbd8e03803 --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,898 @@ +{ + "best_global_step": 3000, + "best_metric": 0.8869494795799255, + "best_model_checkpoint": "runs/emotts_ravdess\\checkpoint-3000", + "epoch": 73.17283950617283, + "eval_steps": 1000, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.6172839506172839, + "grad_norm": 46.678199768066406, + "learning_rate": 4.800000000000001e-07, + "loss": 3.4472, + "step": 25 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 26.903335571289062, + "learning_rate": 9.800000000000001e-07, + "loss": 2.9051, + "step": 50 + }, + { + "epoch": 1.8395061728395061, + "grad_norm": 16.712799072265625, + "learning_rate": 1.48e-06, + "loss": 2.2302, + "step": 75 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 11.607951164245605, + "learning_rate": 1.98e-06, + "loss": 1.7683, + "step": 100 + }, + { + "epoch": 3.049382716049383, + "grad_norm": 7.216983318328857, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.5434, + "step": 125 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 10.899630546569824, + "learning_rate": 2.9800000000000003e-06, + "loss": 1.4385, + "step": 150 + }, + { + "epoch": 4.271604938271605, + "grad_norm": 6.701765537261963, + "learning_rate": 3.48e-06, + "loss": 1.3262, + "step": 175 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 9.419053077697754, + "learning_rate": 3.980000000000001e-06, + "loss": 1.285, + "step": 200 + }, + { + "epoch": 5.493827160493828, + "grad_norm": 5.913278579711914, + "learning_rate": 4.48e-06, + "loss": 1.2503, + "step": 225 + }, + { + "epoch": 6.098765432098766, + "grad_norm": 8.171669006347656, + "learning_rate": 4.980000000000001e-06, + "loss": 1.1868, + "step": 250 + }, + { + "epoch": 6.716049382716049, + "grad_norm": 5.54558801651001, + "learning_rate": 5.480000000000001e-06, + "loss": 1.1478, + "step": 275 + }, + { + "epoch": 7.320987654320987, + "grad_norm": 5.325434684753418, + "learning_rate": 5.98e-06, + "loss": 1.1245, + "step": 300 + }, + { + "epoch": 7.938271604938271, + "grad_norm": 5.406148433685303, + "learning_rate": 6.480000000000001e-06, + "loss": 1.1145, + "step": 325 + }, + { + "epoch": 8.54320987654321, + "grad_norm": 8.461536407470703, + "learning_rate": 6.98e-06, + "loss": 1.0641, + "step": 350 + }, + { + "epoch": 9.148148148148149, + "grad_norm": 3.8533031940460205, + "learning_rate": 7.48e-06, + "loss": 1.0573, + "step": 375 + }, + { + "epoch": 9.765432098765432, + "grad_norm": 7.569976806640625, + "learning_rate": 7.980000000000002e-06, + "loss": 1.061, + "step": 400 + }, + { + "epoch": 10.37037037037037, + "grad_norm": 10.156228065490723, + "learning_rate": 8.48e-06, + "loss": 1.0485, + "step": 425 + }, + { + "epoch": 10.987654320987655, + "grad_norm": 4.668756484985352, + "learning_rate": 8.98e-06, + "loss": 1.0216, + "step": 450 + }, + { + "epoch": 11.592592592592592, + "grad_norm": 5.087125301361084, + "learning_rate": 9.48e-06, + "loss": 1.0319, + "step": 475 + }, + { + "epoch": 12.197530864197532, + "grad_norm": 7.943349361419678, + "learning_rate": 9.980000000000001e-06, + "loss": 1.0, + "step": 500 + }, + { + "epoch": 12.814814814814815, + "grad_norm": 7.655898571014404, + "learning_rate": 9.931428571428571e-06, + "loss": 1.0052, + "step": 525 + }, + { + "epoch": 13.419753086419753, + "grad_norm": 4.458106994628906, + "learning_rate": 9.86e-06, + "loss": 1.0001, + "step": 550 + }, + { + "epoch": 14.024691358024691, + "grad_norm": 9.058222770690918, + "learning_rate": 9.78857142857143e-06, + "loss": 1.0015, + "step": 575 + }, + { + "epoch": 14.641975308641975, + "grad_norm": 4.795205593109131, + "learning_rate": 9.717142857142858e-06, + "loss": 0.9836, + "step": 600 + }, + { + "epoch": 15.246913580246913, + "grad_norm": 10.566876411437988, + "learning_rate": 9.645714285714286e-06, + "loss": 1.0019, + "step": 625 + }, + { + "epoch": 15.864197530864198, + "grad_norm": 7.610626220703125, + "learning_rate": 9.574285714285715e-06, + "loss": 0.9779, + "step": 650 + }, + { + "epoch": 16.469135802469136, + "grad_norm": 6.008159637451172, + "learning_rate": 9.502857142857144e-06, + "loss": 0.9798, + "step": 675 + }, + { + "epoch": 17.074074074074073, + "grad_norm": 6.685286521911621, + "learning_rate": 9.431428571428573e-06, + "loss": 0.9753, + "step": 700 + }, + { + "epoch": 17.691358024691358, + "grad_norm": 2.7540247440338135, + "learning_rate": 9.360000000000002e-06, + "loss": 0.967, + "step": 725 + }, + { + "epoch": 18.296296296296298, + "grad_norm": 4.825072288513184, + "learning_rate": 9.28857142857143e-06, + "loss": 0.9575, + "step": 750 + }, + { + "epoch": 18.91358024691358, + "grad_norm": 6.618119716644287, + "learning_rate": 9.217142857142858e-06, + "loss": 0.9675, + "step": 775 + }, + { + "epoch": 19.51851851851852, + "grad_norm": 5.465808391571045, + "learning_rate": 9.145714285714287e-06, + "loss": 0.9626, + "step": 800 + }, + { + "epoch": 20.123456790123456, + "grad_norm": 4.9501051902771, + "learning_rate": 9.074285714285716e-06, + "loss": 0.9638, + "step": 825 + }, + { + "epoch": 20.74074074074074, + "grad_norm": 4.926831245422363, + "learning_rate": 9.002857142857144e-06, + "loss": 0.9582, + "step": 850 + }, + { + "epoch": 21.34567901234568, + "grad_norm": 6.605464458465576, + "learning_rate": 8.931428571428573e-06, + "loss": 0.9551, + "step": 875 + }, + { + "epoch": 21.962962962962962, + "grad_norm": 5.774538040161133, + "learning_rate": 8.860000000000002e-06, + "loss": 0.9596, + "step": 900 + }, + { + "epoch": 22.567901234567902, + "grad_norm": 4.304802417755127, + "learning_rate": 8.788571428571429e-06, + "loss": 0.9489, + "step": 925 + }, + { + "epoch": 23.17283950617284, + "grad_norm": 5.171604633331299, + "learning_rate": 8.717142857142858e-06, + "loss": 0.953, + "step": 950 + }, + { + "epoch": 23.790123456790123, + "grad_norm": 7.152281761169434, + "learning_rate": 8.645714285714287e-06, + "loss": 0.9604, + "step": 975 + }, + { + "epoch": 24.395061728395063, + "grad_norm": 4.954558849334717, + "learning_rate": 8.574285714285714e-06, + "loss": 0.9489, + "step": 1000 + }, + { + "epoch": 24.395061728395063, + "eval_loss": 0.9205830097198486, + "eval_runtime": 2.2708, + "eval_samples_per_second": 63.413, + "eval_steps_per_second": 31.707, + "step": 1000 + }, + { + "epoch": 25.0, + "grad_norm": 10.266937255859375, + "learning_rate": 8.502857142857143e-06, + "loss": 0.9541, + "step": 1025 + }, + { + "epoch": 25.617283950617285, + "grad_norm": 3.225881814956665, + "learning_rate": 8.431428571428572e-06, + "loss": 0.9451, + "step": 1050 + }, + { + "epoch": 26.22222222222222, + "grad_norm": 4.001440048217773, + "learning_rate": 8.36e-06, + "loss": 0.9422, + "step": 1075 + }, + { + "epoch": 26.839506172839506, + "grad_norm": 5.347984313964844, + "learning_rate": 8.288571428571429e-06, + "loss": 0.9434, + "step": 1100 + }, + { + "epoch": 27.444444444444443, + "grad_norm": 4.1566901206970215, + "learning_rate": 8.217142857142858e-06, + "loss": 0.942, + "step": 1125 + }, + { + "epoch": 28.049382716049383, + "grad_norm": 3.2101686000823975, + "learning_rate": 8.145714285714287e-06, + "loss": 0.9365, + "step": 1150 + }, + { + "epoch": 28.666666666666668, + "grad_norm": 5.183631896972656, + "learning_rate": 8.074285714285714e-06, + "loss": 0.941, + "step": 1175 + }, + { + "epoch": 29.271604938271604, + "grad_norm": 4.704529285430908, + "learning_rate": 8.002857142857143e-06, + "loss": 0.9374, + "step": 1200 + }, + { + "epoch": 29.88888888888889, + "grad_norm": 4.460058689117432, + "learning_rate": 7.931428571428572e-06, + "loss": 0.9383, + "step": 1225 + }, + { + "epoch": 30.493827160493826, + "grad_norm": 3.616530418395996, + "learning_rate": 7.860000000000001e-06, + "loss": 0.9321, + "step": 1250 + }, + { + "epoch": 31.098765432098766, + "grad_norm": 3.92207932472229, + "learning_rate": 7.788571428571428e-06, + "loss": 0.9347, + "step": 1275 + }, + { + "epoch": 31.71604938271605, + "grad_norm": 3.6962461471557617, + "learning_rate": 7.717142857142857e-06, + "loss": 0.9305, + "step": 1300 + }, + { + "epoch": 32.32098765432099, + "grad_norm": 4.276056289672852, + "learning_rate": 7.645714285714286e-06, + "loss": 0.9336, + "step": 1325 + }, + { + "epoch": 32.93827160493827, + "grad_norm": 5.176277160644531, + "learning_rate": 7.574285714285715e-06, + "loss": 0.9351, + "step": 1350 + }, + { + "epoch": 33.54320987654321, + "grad_norm": 7.2538347244262695, + "learning_rate": 7.502857142857144e-06, + "loss": 0.9241, + "step": 1375 + }, + { + "epoch": 34.148148148148145, + "grad_norm": 4.3576273918151855, + "learning_rate": 7.431428571428572e-06, + "loss": 0.9316, + "step": 1400 + }, + { + "epoch": 34.76543209876543, + "grad_norm": 9.138855934143066, + "learning_rate": 7.360000000000001e-06, + "loss": 0.9277, + "step": 1425 + }, + { + "epoch": 35.370370370370374, + "grad_norm": 4.475003719329834, + "learning_rate": 7.28857142857143e-06, + "loss": 0.9245, + "step": 1450 + }, + { + "epoch": 35.98765432098765, + "grad_norm": 7.28753137588501, + "learning_rate": 7.217142857142858e-06, + "loss": 0.9266, + "step": 1475 + }, + { + "epoch": 36.592592592592595, + "grad_norm": 5.1342949867248535, + "learning_rate": 7.145714285714286e-06, + "loss": 0.9297, + "step": 1500 + }, + { + "epoch": 37.19753086419753, + "grad_norm": 2.7765142917633057, + "learning_rate": 7.074285714285715e-06, + "loss": 0.9253, + "step": 1525 + }, + { + "epoch": 37.81481481481482, + "grad_norm": 3.8011326789855957, + "learning_rate": 7.002857142857143e-06, + "loss": 0.9203, + "step": 1550 + }, + { + "epoch": 38.41975308641975, + "grad_norm": 7.432782173156738, + "learning_rate": 6.931428571428572e-06, + "loss": 0.9196, + "step": 1575 + }, + { + "epoch": 39.02469135802469, + "grad_norm": 4.179474830627441, + "learning_rate": 6.860000000000001e-06, + "loss": 0.9188, + "step": 1600 + }, + { + "epoch": 39.641975308641975, + "grad_norm": 8.513073921203613, + "learning_rate": 6.7885714285714286e-06, + "loss": 0.9268, + "step": 1625 + }, + { + "epoch": 40.24691358024691, + "grad_norm": 3.699882984161377, + "learning_rate": 6.7171428571428576e-06, + "loss": 0.9216, + "step": 1650 + }, + { + "epoch": 40.864197530864196, + "grad_norm": 3.949507713317871, + "learning_rate": 6.645714285714287e-06, + "loss": 0.9238, + "step": 1675 + }, + { + "epoch": 41.46913580246913, + "grad_norm": 3.7951810359954834, + "learning_rate": 6.574285714285716e-06, + "loss": 0.9198, + "step": 1700 + }, + { + "epoch": 42.074074074074076, + "grad_norm": 5.373620986938477, + "learning_rate": 6.502857142857143e-06, + "loss": 0.9135, + "step": 1725 + }, + { + "epoch": 42.69135802469136, + "grad_norm": 6.875067234039307, + "learning_rate": 6.431428571428572e-06, + "loss": 0.918, + "step": 1750 + }, + { + "epoch": 43.2962962962963, + "grad_norm": 7.167726039886475, + "learning_rate": 6.360000000000001e-06, + "loss": 0.9276, + "step": 1775 + }, + { + "epoch": 43.91358024691358, + "grad_norm": 3.7067105770111084, + "learning_rate": 6.288571428571429e-06, + "loss": 0.9169, + "step": 1800 + }, + { + "epoch": 44.51851851851852, + "grad_norm": 4.474793434143066, + "learning_rate": 6.217142857142857e-06, + "loss": 0.9191, + "step": 1825 + }, + { + "epoch": 45.123456790123456, + "grad_norm": 5.386421203613281, + "learning_rate": 6.145714285714286e-06, + "loss": 0.9145, + "step": 1850 + }, + { + "epoch": 45.74074074074074, + "grad_norm": 3.068861246109009, + "learning_rate": 6.0742857142857145e-06, + "loss": 0.9095, + "step": 1875 + }, + { + "epoch": 46.34567901234568, + "grad_norm": 3.804973840713501, + "learning_rate": 6.0028571428571435e-06, + "loss": 0.912, + "step": 1900 + }, + { + "epoch": 46.96296296296296, + "grad_norm": 2.9225473403930664, + "learning_rate": 5.9314285714285725e-06, + "loss": 0.9049, + "step": 1925 + }, + { + "epoch": 47.5679012345679, + "grad_norm": 4.022708892822266, + "learning_rate": 5.86e-06, + "loss": 0.9049, + "step": 1950 + }, + { + "epoch": 48.17283950617284, + "grad_norm": 3.421691417694092, + "learning_rate": 5.788571428571429e-06, + "loss": 0.9101, + "step": 1975 + }, + { + "epoch": 48.79012345679013, + "grad_norm": 6.732350826263428, + "learning_rate": 5.717142857142858e-06, + "loss": 0.9105, + "step": 2000 + }, + { + "epoch": 48.79012345679013, + "eval_loss": 0.8953001499176025, + "eval_runtime": 2.1587, + "eval_samples_per_second": 66.707, + "eval_steps_per_second": 33.353, + "step": 2000 + }, + { + "epoch": 49.39506172839506, + "grad_norm": 5.506401538848877, + "learning_rate": 5.645714285714287e-06, + "loss": 0.9036, + "step": 2025 + }, + { + "epoch": 50.0, + "grad_norm": 9.19892406463623, + "learning_rate": 5.574285714285714e-06, + "loss": 0.9107, + "step": 2050 + }, + { + "epoch": 50.617283950617285, + "grad_norm": 3.324119806289673, + "learning_rate": 5.502857142857143e-06, + "loss": 0.9118, + "step": 2075 + }, + { + "epoch": 51.22222222222222, + "grad_norm": 5.142299652099609, + "learning_rate": 5.431428571428572e-06, + "loss": 0.9098, + "step": 2100 + }, + { + "epoch": 51.839506172839506, + "grad_norm": 2.8806934356689453, + "learning_rate": 5.36e-06, + "loss": 0.9013, + "step": 2125 + }, + { + "epoch": 52.44444444444444, + "grad_norm": 4.728231430053711, + "learning_rate": 5.2885714285714285e-06, + "loss": 0.9049, + "step": 2150 + }, + { + "epoch": 53.04938271604938, + "grad_norm": 4.9596991539001465, + "learning_rate": 5.2171428571428575e-06, + "loss": 0.9128, + "step": 2175 + }, + { + "epoch": 53.666666666666664, + "grad_norm": 3.160998821258545, + "learning_rate": 5.145714285714286e-06, + "loss": 0.9003, + "step": 2200 + }, + { + "epoch": 54.27160493827161, + "grad_norm": 3.833195924758911, + "learning_rate": 5.074285714285715e-06, + "loss": 0.9088, + "step": 2225 + }, + { + "epoch": 54.888888888888886, + "grad_norm": 5.242589950561523, + "learning_rate": 5.002857142857144e-06, + "loss": 0.9005, + "step": 2250 + }, + { + "epoch": 55.49382716049383, + "grad_norm": 3.781388759613037, + "learning_rate": 4.931428571428572e-06, + "loss": 0.9028, + "step": 2275 + }, + { + "epoch": 56.098765432098766, + "grad_norm": 6.0595574378967285, + "learning_rate": 4.86e-06, + "loss": 0.9124, + "step": 2300 + }, + { + "epoch": 56.71604938271605, + "grad_norm": 2.7515597343444824, + "learning_rate": 4.788571428571429e-06, + "loss": 0.9025, + "step": 2325 + }, + { + "epoch": 57.32098765432099, + "grad_norm": 6.520521640777588, + "learning_rate": 4.717142857142857e-06, + "loss": 0.9065, + "step": 2350 + }, + { + "epoch": 57.93827160493827, + "grad_norm": 3.289445638656616, + "learning_rate": 4.645714285714286e-06, + "loss": 0.9004, + "step": 2375 + }, + { + "epoch": 58.54320987654321, + "grad_norm": 3.6132805347442627, + "learning_rate": 4.574285714285714e-06, + "loss": 0.9021, + "step": 2400 + }, + { + "epoch": 59.148148148148145, + "grad_norm": 5.021145343780518, + "learning_rate": 4.5028571428571434e-06, + "loss": 0.8957, + "step": 2425 + }, + { + "epoch": 59.76543209876543, + "grad_norm": 5.366466522216797, + "learning_rate": 4.431428571428572e-06, + "loss": 0.8986, + "step": 2450 + }, + { + "epoch": 60.370370370370374, + "grad_norm": 5.833218574523926, + "learning_rate": 4.360000000000001e-06, + "loss": 0.9045, + "step": 2475 + }, + { + "epoch": 60.98765432098765, + "grad_norm": 5.301181793212891, + "learning_rate": 4.288571428571429e-06, + "loss": 0.8975, + "step": 2500 + }, + { + "epoch": 61.592592592592595, + "grad_norm": 3.989539861679077, + "learning_rate": 4.217142857142858e-06, + "loss": 0.9021, + "step": 2525 + }, + { + "epoch": 62.19753086419753, + "grad_norm": 13.111737251281738, + "learning_rate": 4.145714285714286e-06, + "loss": 0.9043, + "step": 2550 + }, + { + "epoch": 62.81481481481482, + "grad_norm": 3.4066903591156006, + "learning_rate": 4.074285714285714e-06, + "loss": 0.8929, + "step": 2575 + }, + { + "epoch": 63.41975308641975, + "grad_norm": 3.9170608520507812, + "learning_rate": 4.002857142857143e-06, + "loss": 0.8998, + "step": 2600 + }, + { + "epoch": 64.0246913580247, + "grad_norm": 3.5934042930603027, + "learning_rate": 3.931428571428571e-06, + "loss": 0.898, + "step": 2625 + }, + { + "epoch": 64.64197530864197, + "grad_norm": 3.3771822452545166, + "learning_rate": 3.86e-06, + "loss": 0.901, + "step": 2650 + }, + { + "epoch": 65.24691358024691, + "grad_norm": 3.5741279125213623, + "learning_rate": 3.7885714285714285e-06, + "loss": 0.903, + "step": 2675 + }, + { + "epoch": 65.8641975308642, + "grad_norm": 4.369333267211914, + "learning_rate": 3.7171428571428575e-06, + "loss": 0.8907, + "step": 2700 + }, + { + "epoch": 66.46913580246914, + "grad_norm": 2.9996423721313477, + "learning_rate": 3.6457142857142857e-06, + "loss": 0.9008, + "step": 2725 + }, + { + "epoch": 67.07407407407408, + "grad_norm": 5.098217487335205, + "learning_rate": 3.5742857142857147e-06, + "loss": 0.8979, + "step": 2750 + }, + { + "epoch": 67.69135802469135, + "grad_norm": 3.8548665046691895, + "learning_rate": 3.5028571428571433e-06, + "loss": 0.8906, + "step": 2775 + }, + { + "epoch": 68.29629629629629, + "grad_norm": 4.787322521209717, + "learning_rate": 3.431428571428572e-06, + "loss": 0.8949, + "step": 2800 + }, + { + "epoch": 68.91358024691358, + "grad_norm": 2.8501498699188232, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.8932, + "step": 2825 + }, + { + "epoch": 69.51851851851852, + "grad_norm": 7.697382926940918, + "learning_rate": 3.2885714285714286e-06, + "loss": 0.8961, + "step": 2850 + }, + { + "epoch": 70.12345679012346, + "grad_norm": 3.5617403984069824, + "learning_rate": 3.2171428571428576e-06, + "loss": 0.8975, + "step": 2875 + }, + { + "epoch": 70.74074074074075, + "grad_norm": 4.286247253417969, + "learning_rate": 3.1457142857142858e-06, + "loss": 0.8988, + "step": 2900 + }, + { + "epoch": 71.34567901234568, + "grad_norm": 3.0174379348754883, + "learning_rate": 3.074285714285715e-06, + "loss": 0.8986, + "step": 2925 + }, + { + "epoch": 71.96296296296296, + "grad_norm": 5.708584308624268, + "learning_rate": 3.002857142857143e-06, + "loss": 0.8888, + "step": 2950 + }, + { + "epoch": 72.5679012345679, + "grad_norm": 7.933815956115723, + "learning_rate": 2.9314285714285716e-06, + "loss": 0.9, + "step": 2975 + }, + { + "epoch": 73.17283950617283, + "grad_norm": 3.4261972904205322, + "learning_rate": 2.86e-06, + "loss": 0.8951, + "step": 3000 + }, + { + "epoch": 73.17283950617283, + "eval_loss": 0.8869494795799255, + "eval_runtime": 2.1798, + "eval_samples_per_second": 66.061, + "eval_steps_per_second": 33.03, + "step": 3000 + } + ], + "logging_steps": 25, + "max_steps": 4000, + "num_input_tokens_seen": 0, + "num_train_epochs": 98, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2464002717960960.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf416a3be0d19e0ce5aadbb31f093c5d913fee53 --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d367dc04409d63341644c780ebdb997e8756f9aa9f6d110afc5d9ab8de84be +size 5905 diff --git a/checkpoint-4000/added_tokens.json b/checkpoint-4000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..cd5b477a9075c49d99de65622db37bb06a251985 --- /dev/null +++ b/checkpoint-4000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "": 80, + "": 79 +} diff --git a/checkpoint-4000/config.json b/checkpoint-4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1197099a882c66f2bbab2210646af21b05d05ee4 --- /dev/null +++ b/checkpoint-4000/config.json @@ -0,0 +1,91 @@ +{ + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "SpeechT5ForTextToSpeech" + ], + "attention_dropout": 0.1, + "bos_token_id": 0, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "decoder_attention_heads": 12, + "decoder_ffn_dim": 3072, + "decoder_layerdrop": 0.1, + "decoder_layers": 6, + "decoder_start_token_id": 2, + "encoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "encoder_layerdrop": 0.1, + "encoder_layers": 12, + "encoder_max_relative_position": 160, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "guided_attention_loss_num_heads": 2, + "guided_attention_loss_scale": 10.0, + "guided_attention_loss_sigma": 0.4, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "layer_norm_eps": 1e-05, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "max_length": null, + "max_speech_positions": 1876, + "max_text_positions": 600, + "model_type": "speecht5", + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_mel_bins": 80, + "pad_token_id": 1, + "positional_dropout": 0.1, + "reduction_factor": 2, + "scale_embedding": false, + "speaker_embedding_dim": 512, + "speech_decoder_postnet_dropout": 0.5, + "speech_decoder_postnet_kernel": 5, + "speech_decoder_postnet_layers": 5, + "speech_decoder_postnet_units": 256, + "speech_decoder_prenet_dropout": 0.5, + "speech_decoder_prenet_layers": 2, + "speech_decoder_prenet_units": 256, + "torch_dtype": "float32", + "transformers_version": "4.55.4", + "use_cache": false, + "use_guided_attention_loss": true, + "vocab_size": 81 +} diff --git a/checkpoint-4000/generation_config.json b/checkpoint-4000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1748094b7e4b8c1fb66bf2357c44c5f625a36179 --- /dev/null +++ b/checkpoint-4000/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "decoder_start_token_id": 2, + "eos_token_id": 2, + "max_length": 1876, + "pad_token_id": 1, + "transformers_version": "4.55.4" +} diff --git a/checkpoint-4000/model.safetensors b/checkpoint-4000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..063a7debbe0b96bed7d11b8b1e6151197d55864b --- /dev/null +++ b/checkpoint-4000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b10dd87b217ab2fc492088d02d67c7955fbbff9f22b6fda9133dfa1744e6d9d +size 577789320 diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c03c363ec916b474511c91b3bc2c682ab09127f8 --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3c148661528c4aa2cc3b96d89de7440a524fdfc4c68416d7a8438ea0d22f51 +size 1155777946 diff --git a/checkpoint-4000/preprocessor_config.json b/checkpoint-4000/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..98a41e66e16f3225a15cb2b51c963fe2213aa273 --- /dev/null +++ b/checkpoint-4000/preprocessor_config.json @@ -0,0 +1,19 @@ +{ + "do_normalize": false, + "feature_extractor_type": "SpeechT5FeatureExtractor", + "feature_size": 1, + "fmax": 7600, + "fmin": 80, + "frame_signal_scale": 1.0, + "hop_length": 16, + "mel_floor": 1e-10, + "num_mel_bins": 80, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "SpeechT5Processor", + "reduction_factor": 2, + "return_attention_mask": true, + "sampling_rate": 16000, + "win_function": "hann_window", + "win_length": 64 +} diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4508811a95c544c9c4f67e30fd978a5256727bac --- /dev/null +++ b/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aba3f2e2e55ab9cb538d7b0b1066ff8ea9c9ba098fb7f0715213c6343cb11c11 +size 14645 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..36df701f2ca9347d59a8d9d660998e8bb8e1c34e --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:700b408dba7ef9825c572f76cd9846e502c0ecd58f44e9e252d68786437bee70 +size 1465 diff --git a/checkpoint-4000/special_tokens_map.json b/checkpoint-4000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee24ec69861cfc94abbe2c8c934aa0744aa623c --- /dev/null +++ b/checkpoint-4000/special_tokens_map.json @@ -0,0 +1,13 @@ +{ + "bos_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": "" +} diff --git a/checkpoint-4000/spm_char.model b/checkpoint-4000/spm_char.model new file mode 100644 index 0000000000000000000000000000000000000000..8fb73691942626fa75df80b61aab0e9b9340d8e2 --- /dev/null +++ b/checkpoint-4000/spm_char.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560 +size 238473 diff --git a/checkpoint-4000/tokenizer_config.json b/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e88d44ba3be31ac8f53461ae7c1b02b4c5c830ab --- /dev/null +++ b/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "80": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 600, + "normalize": false, + "pad_token": "", + "processor_class": "SpeechT5Processor", + "sp_model_kwargs": {}, + "tokenizer_class": "SpeechT5Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cd5bd0445ad17111b4e6de583e7136aeccb9f362 --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,1186 @@ +{ + "best_global_step": 4000, + "best_metric": 0.8817942142486572, + "best_model_checkpoint": "runs/emotts_ravdess\\checkpoint-4000", + "epoch": 97.5679012345679, + "eval_steps": 1000, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.6172839506172839, + "grad_norm": 46.678199768066406, + "learning_rate": 4.800000000000001e-07, + "loss": 3.4472, + "step": 25 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 26.903335571289062, + "learning_rate": 9.800000000000001e-07, + "loss": 2.9051, + "step": 50 + }, + { + "epoch": 1.8395061728395061, + "grad_norm": 16.712799072265625, + "learning_rate": 1.48e-06, + "loss": 2.2302, + "step": 75 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 11.607951164245605, + "learning_rate": 1.98e-06, + "loss": 1.7683, + "step": 100 + }, + { + "epoch": 3.049382716049383, + "grad_norm": 7.216983318328857, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.5434, + "step": 125 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 10.899630546569824, + "learning_rate": 2.9800000000000003e-06, + "loss": 1.4385, + "step": 150 + }, + { + "epoch": 4.271604938271605, + "grad_norm": 6.701765537261963, + "learning_rate": 3.48e-06, + "loss": 1.3262, + "step": 175 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 9.419053077697754, + "learning_rate": 3.980000000000001e-06, + "loss": 1.285, + "step": 200 + }, + { + "epoch": 5.493827160493828, + "grad_norm": 5.913278579711914, + "learning_rate": 4.48e-06, + "loss": 1.2503, + "step": 225 + }, + { + "epoch": 6.098765432098766, + "grad_norm": 8.171669006347656, + "learning_rate": 4.980000000000001e-06, + "loss": 1.1868, + "step": 250 + }, + { + "epoch": 6.716049382716049, + "grad_norm": 5.54558801651001, + "learning_rate": 5.480000000000001e-06, + "loss": 1.1478, + "step": 275 + }, + { + "epoch": 7.320987654320987, + "grad_norm": 5.325434684753418, + "learning_rate": 5.98e-06, + "loss": 1.1245, + "step": 300 + }, + { + "epoch": 7.938271604938271, + "grad_norm": 5.406148433685303, + "learning_rate": 6.480000000000001e-06, + "loss": 1.1145, + "step": 325 + }, + { + "epoch": 8.54320987654321, + "grad_norm": 8.461536407470703, + "learning_rate": 6.98e-06, + "loss": 1.0641, + "step": 350 + }, + { + "epoch": 9.148148148148149, + "grad_norm": 3.8533031940460205, + "learning_rate": 7.48e-06, + "loss": 1.0573, + "step": 375 + }, + { + "epoch": 9.765432098765432, + "grad_norm": 7.569976806640625, + "learning_rate": 7.980000000000002e-06, + "loss": 1.061, + "step": 400 + }, + { + "epoch": 10.37037037037037, + "grad_norm": 10.156228065490723, + "learning_rate": 8.48e-06, + "loss": 1.0485, + "step": 425 + }, + { + "epoch": 10.987654320987655, + "grad_norm": 4.668756484985352, + "learning_rate": 8.98e-06, + "loss": 1.0216, + "step": 450 + }, + { + "epoch": 11.592592592592592, + "grad_norm": 5.087125301361084, + "learning_rate": 9.48e-06, + "loss": 1.0319, + "step": 475 + }, + { + "epoch": 12.197530864197532, + "grad_norm": 7.943349361419678, + "learning_rate": 9.980000000000001e-06, + "loss": 1.0, + "step": 500 + }, + { + "epoch": 12.814814814814815, + "grad_norm": 7.655898571014404, + "learning_rate": 9.931428571428571e-06, + "loss": 1.0052, + "step": 525 + }, + { + "epoch": 13.419753086419753, + "grad_norm": 4.458106994628906, + "learning_rate": 9.86e-06, + "loss": 1.0001, + "step": 550 + }, + { + "epoch": 14.024691358024691, + "grad_norm": 9.058222770690918, + "learning_rate": 9.78857142857143e-06, + "loss": 1.0015, + "step": 575 + }, + { + "epoch": 14.641975308641975, + "grad_norm": 4.795205593109131, + "learning_rate": 9.717142857142858e-06, + "loss": 0.9836, + "step": 600 + }, + { + "epoch": 15.246913580246913, + "grad_norm": 10.566876411437988, + "learning_rate": 9.645714285714286e-06, + "loss": 1.0019, + "step": 625 + }, + { + "epoch": 15.864197530864198, + "grad_norm": 7.610626220703125, + "learning_rate": 9.574285714285715e-06, + "loss": 0.9779, + "step": 650 + }, + { + "epoch": 16.469135802469136, + "grad_norm": 6.008159637451172, + "learning_rate": 9.502857142857144e-06, + "loss": 0.9798, + "step": 675 + }, + { + "epoch": 17.074074074074073, + "grad_norm": 6.685286521911621, + "learning_rate": 9.431428571428573e-06, + "loss": 0.9753, + "step": 700 + }, + { + "epoch": 17.691358024691358, + "grad_norm": 2.7540247440338135, + "learning_rate": 9.360000000000002e-06, + "loss": 0.967, + "step": 725 + }, + { + "epoch": 18.296296296296298, + "grad_norm": 4.825072288513184, + "learning_rate": 9.28857142857143e-06, + "loss": 0.9575, + "step": 750 + }, + { + "epoch": 18.91358024691358, + "grad_norm": 6.618119716644287, + "learning_rate": 9.217142857142858e-06, + "loss": 0.9675, + "step": 775 + }, + { + "epoch": 19.51851851851852, + "grad_norm": 5.465808391571045, + "learning_rate": 9.145714285714287e-06, + "loss": 0.9626, + "step": 800 + }, + { + "epoch": 20.123456790123456, + "grad_norm": 4.9501051902771, + "learning_rate": 9.074285714285716e-06, + "loss": 0.9638, + "step": 825 + }, + { + "epoch": 20.74074074074074, + "grad_norm": 4.926831245422363, + "learning_rate": 9.002857142857144e-06, + "loss": 0.9582, + "step": 850 + }, + { + "epoch": 21.34567901234568, + "grad_norm": 6.605464458465576, + "learning_rate": 8.931428571428573e-06, + "loss": 0.9551, + "step": 875 + }, + { + "epoch": 21.962962962962962, + "grad_norm": 5.774538040161133, + "learning_rate": 8.860000000000002e-06, + "loss": 0.9596, + "step": 900 + }, + { + "epoch": 22.567901234567902, + "grad_norm": 4.304802417755127, + "learning_rate": 8.788571428571429e-06, + "loss": 0.9489, + "step": 925 + }, + { + "epoch": 23.17283950617284, + "grad_norm": 5.171604633331299, + "learning_rate": 8.717142857142858e-06, + "loss": 0.953, + "step": 950 + }, + { + "epoch": 23.790123456790123, + "grad_norm": 7.152281761169434, + "learning_rate": 8.645714285714287e-06, + "loss": 0.9604, + "step": 975 + }, + { + "epoch": 24.395061728395063, + "grad_norm": 4.954558849334717, + "learning_rate": 8.574285714285714e-06, + "loss": 0.9489, + "step": 1000 + }, + { + "epoch": 24.395061728395063, + "eval_loss": 0.9205830097198486, + "eval_runtime": 2.2708, + "eval_samples_per_second": 63.413, + "eval_steps_per_second": 31.707, + "step": 1000 + }, + { + "epoch": 25.0, + "grad_norm": 10.266937255859375, + "learning_rate": 8.502857142857143e-06, + "loss": 0.9541, + "step": 1025 + }, + { + "epoch": 25.617283950617285, + "grad_norm": 3.225881814956665, + "learning_rate": 8.431428571428572e-06, + "loss": 0.9451, + "step": 1050 + }, + { + "epoch": 26.22222222222222, + "grad_norm": 4.001440048217773, + "learning_rate": 8.36e-06, + "loss": 0.9422, + "step": 1075 + }, + { + "epoch": 26.839506172839506, + "grad_norm": 5.347984313964844, + "learning_rate": 8.288571428571429e-06, + "loss": 0.9434, + "step": 1100 + }, + { + "epoch": 27.444444444444443, + "grad_norm": 4.1566901206970215, + "learning_rate": 8.217142857142858e-06, + "loss": 0.942, + "step": 1125 + }, + { + "epoch": 28.049382716049383, + "grad_norm": 3.2101686000823975, + "learning_rate": 8.145714285714287e-06, + "loss": 0.9365, + "step": 1150 + }, + { + "epoch": 28.666666666666668, + "grad_norm": 5.183631896972656, + "learning_rate": 8.074285714285714e-06, + "loss": 0.941, + "step": 1175 + }, + { + "epoch": 29.271604938271604, + "grad_norm": 4.704529285430908, + "learning_rate": 8.002857142857143e-06, + "loss": 0.9374, + "step": 1200 + }, + { + "epoch": 29.88888888888889, + "grad_norm": 4.460058689117432, + "learning_rate": 7.931428571428572e-06, + "loss": 0.9383, + "step": 1225 + }, + { + "epoch": 30.493827160493826, + "grad_norm": 3.616530418395996, + "learning_rate": 7.860000000000001e-06, + "loss": 0.9321, + "step": 1250 + }, + { + "epoch": 31.098765432098766, + "grad_norm": 3.92207932472229, + "learning_rate": 7.788571428571428e-06, + "loss": 0.9347, + "step": 1275 + }, + { + "epoch": 31.71604938271605, + "grad_norm": 3.6962461471557617, + "learning_rate": 7.717142857142857e-06, + "loss": 0.9305, + "step": 1300 + }, + { + "epoch": 32.32098765432099, + "grad_norm": 4.276056289672852, + "learning_rate": 7.645714285714286e-06, + "loss": 0.9336, + "step": 1325 + }, + { + "epoch": 32.93827160493827, + "grad_norm": 5.176277160644531, + "learning_rate": 7.574285714285715e-06, + "loss": 0.9351, + "step": 1350 + }, + { + "epoch": 33.54320987654321, + "grad_norm": 7.2538347244262695, + "learning_rate": 7.502857142857144e-06, + "loss": 0.9241, + "step": 1375 + }, + { + "epoch": 34.148148148148145, + "grad_norm": 4.3576273918151855, + "learning_rate": 7.431428571428572e-06, + "loss": 0.9316, + "step": 1400 + }, + { + "epoch": 34.76543209876543, + "grad_norm": 9.138855934143066, + "learning_rate": 7.360000000000001e-06, + "loss": 0.9277, + "step": 1425 + }, + { + "epoch": 35.370370370370374, + "grad_norm": 4.475003719329834, + "learning_rate": 7.28857142857143e-06, + "loss": 0.9245, + "step": 1450 + }, + { + "epoch": 35.98765432098765, + "grad_norm": 7.28753137588501, + "learning_rate": 7.217142857142858e-06, + "loss": 0.9266, + "step": 1475 + }, + { + "epoch": 36.592592592592595, + "grad_norm": 5.1342949867248535, + "learning_rate": 7.145714285714286e-06, + "loss": 0.9297, + "step": 1500 + }, + { + "epoch": 37.19753086419753, + "grad_norm": 2.7765142917633057, + "learning_rate": 7.074285714285715e-06, + "loss": 0.9253, + "step": 1525 + }, + { + "epoch": 37.81481481481482, + "grad_norm": 3.8011326789855957, + "learning_rate": 7.002857142857143e-06, + "loss": 0.9203, + "step": 1550 + }, + { + "epoch": 38.41975308641975, + "grad_norm": 7.432782173156738, + "learning_rate": 6.931428571428572e-06, + "loss": 0.9196, + "step": 1575 + }, + { + "epoch": 39.02469135802469, + "grad_norm": 4.179474830627441, + "learning_rate": 6.860000000000001e-06, + "loss": 0.9188, + "step": 1600 + }, + { + "epoch": 39.641975308641975, + "grad_norm": 8.513073921203613, + "learning_rate": 6.7885714285714286e-06, + "loss": 0.9268, + "step": 1625 + }, + { + "epoch": 40.24691358024691, + "grad_norm": 3.699882984161377, + "learning_rate": 6.7171428571428576e-06, + "loss": 0.9216, + "step": 1650 + }, + { + "epoch": 40.864197530864196, + "grad_norm": 3.949507713317871, + "learning_rate": 6.645714285714287e-06, + "loss": 0.9238, + "step": 1675 + }, + { + "epoch": 41.46913580246913, + "grad_norm": 3.7951810359954834, + "learning_rate": 6.574285714285716e-06, + "loss": 0.9198, + "step": 1700 + }, + { + "epoch": 42.074074074074076, + "grad_norm": 5.373620986938477, + "learning_rate": 6.502857142857143e-06, + "loss": 0.9135, + "step": 1725 + }, + { + "epoch": 42.69135802469136, + "grad_norm": 6.875067234039307, + "learning_rate": 6.431428571428572e-06, + "loss": 0.918, + "step": 1750 + }, + { + "epoch": 43.2962962962963, + "grad_norm": 7.167726039886475, + "learning_rate": 6.360000000000001e-06, + "loss": 0.9276, + "step": 1775 + }, + { + "epoch": 43.91358024691358, + "grad_norm": 3.7067105770111084, + "learning_rate": 6.288571428571429e-06, + "loss": 0.9169, + "step": 1800 + }, + { + "epoch": 44.51851851851852, + "grad_norm": 4.474793434143066, + "learning_rate": 6.217142857142857e-06, + "loss": 0.9191, + "step": 1825 + }, + { + "epoch": 45.123456790123456, + "grad_norm": 5.386421203613281, + "learning_rate": 6.145714285714286e-06, + "loss": 0.9145, + "step": 1850 + }, + { + "epoch": 45.74074074074074, + "grad_norm": 3.068861246109009, + "learning_rate": 6.0742857142857145e-06, + "loss": 0.9095, + "step": 1875 + }, + { + "epoch": 46.34567901234568, + "grad_norm": 3.804973840713501, + "learning_rate": 6.0028571428571435e-06, + "loss": 0.912, + "step": 1900 + }, + { + "epoch": 46.96296296296296, + "grad_norm": 2.9225473403930664, + "learning_rate": 5.9314285714285725e-06, + "loss": 0.9049, + "step": 1925 + }, + { + "epoch": 47.5679012345679, + "grad_norm": 4.022708892822266, + "learning_rate": 5.86e-06, + "loss": 0.9049, + "step": 1950 + }, + { + "epoch": 48.17283950617284, + "grad_norm": 3.421691417694092, + "learning_rate": 5.788571428571429e-06, + "loss": 0.9101, + "step": 1975 + }, + { + "epoch": 48.79012345679013, + "grad_norm": 6.732350826263428, + "learning_rate": 5.717142857142858e-06, + "loss": 0.9105, + "step": 2000 + }, + { + "epoch": 48.79012345679013, + "eval_loss": 0.8953001499176025, + "eval_runtime": 2.1587, + "eval_samples_per_second": 66.707, + "eval_steps_per_second": 33.353, + "step": 2000 + }, + { + "epoch": 49.39506172839506, + "grad_norm": 5.506401538848877, + "learning_rate": 5.645714285714287e-06, + "loss": 0.9036, + "step": 2025 + }, + { + "epoch": 50.0, + "grad_norm": 9.19892406463623, + "learning_rate": 5.574285714285714e-06, + "loss": 0.9107, + "step": 2050 + }, + { + "epoch": 50.617283950617285, + "grad_norm": 3.324119806289673, + "learning_rate": 5.502857142857143e-06, + "loss": 0.9118, + "step": 2075 + }, + { + "epoch": 51.22222222222222, + "grad_norm": 5.142299652099609, + "learning_rate": 5.431428571428572e-06, + "loss": 0.9098, + "step": 2100 + }, + { + "epoch": 51.839506172839506, + "grad_norm": 2.8806934356689453, + "learning_rate": 5.36e-06, + "loss": 0.9013, + "step": 2125 + }, + { + "epoch": 52.44444444444444, + "grad_norm": 4.728231430053711, + "learning_rate": 5.2885714285714285e-06, + "loss": 0.9049, + "step": 2150 + }, + { + "epoch": 53.04938271604938, + "grad_norm": 4.9596991539001465, + "learning_rate": 5.2171428571428575e-06, + "loss": 0.9128, + "step": 2175 + }, + { + "epoch": 53.666666666666664, + "grad_norm": 3.160998821258545, + "learning_rate": 5.145714285714286e-06, + "loss": 0.9003, + "step": 2200 + }, + { + "epoch": 54.27160493827161, + "grad_norm": 3.833195924758911, + "learning_rate": 5.074285714285715e-06, + "loss": 0.9088, + "step": 2225 + }, + { + "epoch": 54.888888888888886, + "grad_norm": 5.242589950561523, + "learning_rate": 5.002857142857144e-06, + "loss": 0.9005, + "step": 2250 + }, + { + "epoch": 55.49382716049383, + "grad_norm": 3.781388759613037, + "learning_rate": 4.931428571428572e-06, + "loss": 0.9028, + "step": 2275 + }, + { + "epoch": 56.098765432098766, + "grad_norm": 6.0595574378967285, + "learning_rate": 4.86e-06, + "loss": 0.9124, + "step": 2300 + }, + { + "epoch": 56.71604938271605, + "grad_norm": 2.7515597343444824, + "learning_rate": 4.788571428571429e-06, + "loss": 0.9025, + "step": 2325 + }, + { + "epoch": 57.32098765432099, + "grad_norm": 6.520521640777588, + "learning_rate": 4.717142857142857e-06, + "loss": 0.9065, + "step": 2350 + }, + { + "epoch": 57.93827160493827, + "grad_norm": 3.289445638656616, + "learning_rate": 4.645714285714286e-06, + "loss": 0.9004, + "step": 2375 + }, + { + "epoch": 58.54320987654321, + "grad_norm": 3.6132805347442627, + "learning_rate": 4.574285714285714e-06, + "loss": 0.9021, + "step": 2400 + }, + { + "epoch": 59.148148148148145, + "grad_norm": 5.021145343780518, + "learning_rate": 4.5028571428571434e-06, + "loss": 0.8957, + "step": 2425 + }, + { + "epoch": 59.76543209876543, + "grad_norm": 5.366466522216797, + "learning_rate": 4.431428571428572e-06, + "loss": 0.8986, + "step": 2450 + }, + { + "epoch": 60.370370370370374, + "grad_norm": 5.833218574523926, + "learning_rate": 4.360000000000001e-06, + "loss": 0.9045, + "step": 2475 + }, + { + "epoch": 60.98765432098765, + "grad_norm": 5.301181793212891, + "learning_rate": 4.288571428571429e-06, + "loss": 0.8975, + "step": 2500 + }, + { + "epoch": 61.592592592592595, + "grad_norm": 3.989539861679077, + "learning_rate": 4.217142857142858e-06, + "loss": 0.9021, + "step": 2525 + }, + { + "epoch": 62.19753086419753, + "grad_norm": 13.111737251281738, + "learning_rate": 4.145714285714286e-06, + "loss": 0.9043, + "step": 2550 + }, + { + "epoch": 62.81481481481482, + "grad_norm": 3.4066903591156006, + "learning_rate": 4.074285714285714e-06, + "loss": 0.8929, + "step": 2575 + }, + { + "epoch": 63.41975308641975, + "grad_norm": 3.9170608520507812, + "learning_rate": 4.002857142857143e-06, + "loss": 0.8998, + "step": 2600 + }, + { + "epoch": 64.0246913580247, + "grad_norm": 3.5934042930603027, + "learning_rate": 3.931428571428571e-06, + "loss": 0.898, + "step": 2625 + }, + { + "epoch": 64.64197530864197, + "grad_norm": 3.3771822452545166, + "learning_rate": 3.86e-06, + "loss": 0.901, + "step": 2650 + }, + { + "epoch": 65.24691358024691, + "grad_norm": 3.5741279125213623, + "learning_rate": 3.7885714285714285e-06, + "loss": 0.903, + "step": 2675 + }, + { + "epoch": 65.8641975308642, + "grad_norm": 4.369333267211914, + "learning_rate": 3.7171428571428575e-06, + "loss": 0.8907, + "step": 2700 + }, + { + "epoch": 66.46913580246914, + "grad_norm": 2.9996423721313477, + "learning_rate": 3.6457142857142857e-06, + "loss": 0.9008, + "step": 2725 + }, + { + "epoch": 67.07407407407408, + "grad_norm": 5.098217487335205, + "learning_rate": 3.5742857142857147e-06, + "loss": 0.8979, + "step": 2750 + }, + { + "epoch": 67.69135802469135, + "grad_norm": 3.8548665046691895, + "learning_rate": 3.5028571428571433e-06, + "loss": 0.8906, + "step": 2775 + }, + { + "epoch": 68.29629629629629, + "grad_norm": 4.787322521209717, + "learning_rate": 3.431428571428572e-06, + "loss": 0.8949, + "step": 2800 + }, + { + "epoch": 68.91358024691358, + "grad_norm": 2.8501498699188232, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.8932, + "step": 2825 + }, + { + "epoch": 69.51851851851852, + "grad_norm": 7.697382926940918, + "learning_rate": 3.2885714285714286e-06, + "loss": 0.8961, + "step": 2850 + }, + { + "epoch": 70.12345679012346, + "grad_norm": 3.5617403984069824, + "learning_rate": 3.2171428571428576e-06, + "loss": 0.8975, + "step": 2875 + }, + { + "epoch": 70.74074074074075, + "grad_norm": 4.286247253417969, + "learning_rate": 3.1457142857142858e-06, + "loss": 0.8988, + "step": 2900 + }, + { + "epoch": 71.34567901234568, + "grad_norm": 3.0174379348754883, + "learning_rate": 3.074285714285715e-06, + "loss": 0.8986, + "step": 2925 + }, + { + "epoch": 71.96296296296296, + "grad_norm": 5.708584308624268, + "learning_rate": 3.002857142857143e-06, + "loss": 0.8888, + "step": 2950 + }, + { + "epoch": 72.5679012345679, + "grad_norm": 7.933815956115723, + "learning_rate": 2.9314285714285716e-06, + "loss": 0.9, + "step": 2975 + }, + { + "epoch": 73.17283950617283, + "grad_norm": 3.4261972904205322, + "learning_rate": 2.86e-06, + "loss": 0.8951, + "step": 3000 + }, + { + "epoch": 73.17283950617283, + "eval_loss": 0.8869494795799255, + "eval_runtime": 2.1798, + "eval_samples_per_second": 66.061, + "eval_steps_per_second": 33.03, + "step": 3000 + }, + { + "epoch": 73.79012345679013, + "grad_norm": 4.3120646476745605, + "learning_rate": 2.7885714285714287e-06, + "loss": 0.8897, + "step": 3025 + }, + { + "epoch": 74.39506172839506, + "grad_norm": 3.6650469303131104, + "learning_rate": 2.7171428571428577e-06, + "loss": 0.8961, + "step": 3050 + }, + { + "epoch": 75.0, + "grad_norm": 7.670346736907959, + "learning_rate": 2.645714285714286e-06, + "loss": 0.9003, + "step": 3075 + }, + { + "epoch": 75.61728395061728, + "grad_norm": 3.292160987854004, + "learning_rate": 2.574285714285715e-06, + "loss": 0.8946, + "step": 3100 + }, + { + "epoch": 76.22222222222223, + "grad_norm": 3.5280263423919678, + "learning_rate": 2.502857142857143e-06, + "loss": 0.8934, + "step": 3125 + }, + { + "epoch": 76.8395061728395, + "grad_norm": 5.340327739715576, + "learning_rate": 2.4314285714285717e-06, + "loss": 0.8984, + "step": 3150 + }, + { + "epoch": 77.44444444444444, + "grad_norm": 6.106954574584961, + "learning_rate": 2.3600000000000003e-06, + "loss": 0.8958, + "step": 3175 + }, + { + "epoch": 78.04938271604938, + "grad_norm": 3.5689122676849365, + "learning_rate": 2.288571428571429e-06, + "loss": 0.8968, + "step": 3200 + }, + { + "epoch": 78.66666666666667, + "grad_norm": 3.928802967071533, + "learning_rate": 2.2171428571428575e-06, + "loss": 0.8909, + "step": 3225 + }, + { + "epoch": 79.27160493827161, + "grad_norm": 3.5558717250823975, + "learning_rate": 2.145714285714286e-06, + "loss": 0.8881, + "step": 3250 + }, + { + "epoch": 79.88888888888889, + "grad_norm": 3.194141387939453, + "learning_rate": 2.0742857142857146e-06, + "loss": 0.8868, + "step": 3275 + }, + { + "epoch": 80.49382716049382, + "grad_norm": 5.6881232261657715, + "learning_rate": 2.0028571428571432e-06, + "loss": 0.8973, + "step": 3300 + }, + { + "epoch": 81.09876543209876, + "grad_norm": 3.105429172515869, + "learning_rate": 1.9314285714285714e-06, + "loss": 0.8935, + "step": 3325 + }, + { + "epoch": 81.71604938271605, + "grad_norm": 3.2020113468170166, + "learning_rate": 1.8600000000000002e-06, + "loss": 0.89, + "step": 3350 + }, + { + "epoch": 82.32098765432099, + "grad_norm": 3.5079753398895264, + "learning_rate": 1.7885714285714288e-06, + "loss": 0.8957, + "step": 3375 + }, + { + "epoch": 82.93827160493827, + "grad_norm": 3.3880198001861572, + "learning_rate": 1.7171428571428572e-06, + "loss": 0.8914, + "step": 3400 + }, + { + "epoch": 83.54320987654322, + "grad_norm": 6.072048664093018, + "learning_rate": 1.6457142857142857e-06, + "loss": 0.8904, + "step": 3425 + }, + { + "epoch": 84.14814814814815, + "grad_norm": 2.919877529144287, + "learning_rate": 1.5742857142857143e-06, + "loss": 0.8842, + "step": 3450 + }, + { + "epoch": 84.76543209876543, + "grad_norm": 3.742579936981201, + "learning_rate": 1.502857142857143e-06, + "loss": 0.8903, + "step": 3475 + }, + { + "epoch": 85.37037037037037, + "grad_norm": 3.9216341972351074, + "learning_rate": 1.4314285714285717e-06, + "loss": 0.8962, + "step": 3500 + }, + { + "epoch": 85.98765432098766, + "grad_norm": 3.594411849975586, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.894, + "step": 3525 + }, + { + "epoch": 86.5925925925926, + "grad_norm": 3.7163913249969482, + "learning_rate": 1.2885714285714287e-06, + "loss": 0.8931, + "step": 3550 + }, + { + "epoch": 87.19753086419753, + "grad_norm": 2.8378684520721436, + "learning_rate": 1.2171428571428573e-06, + "loss": 0.8855, + "step": 3575 + }, + { + "epoch": 87.81481481481481, + "grad_norm": 3.5566790103912354, + "learning_rate": 1.1457142857142859e-06, + "loss": 0.8911, + "step": 3600 + }, + { + "epoch": 88.41975308641975, + "grad_norm": 3.187382936477661, + "learning_rate": 1.0742857142857145e-06, + "loss": 0.8979, + "step": 3625 + }, + { + "epoch": 89.0246913580247, + "grad_norm": 3.7930212020874023, + "learning_rate": 1.0028571428571428e-06, + "loss": 0.8824, + "step": 3650 + }, + { + "epoch": 89.64197530864197, + "grad_norm": 3.2194180488586426, + "learning_rate": 9.314285714285714e-07, + "loss": 0.8915, + "step": 3675 + }, + { + "epoch": 90.24691358024691, + "grad_norm": 3.050337076187134, + "learning_rate": 8.6e-07, + "loss": 0.8866, + "step": 3700 + }, + { + "epoch": 90.8641975308642, + "grad_norm": 5.006812572479248, + "learning_rate": 7.885714285714287e-07, + "loss": 0.8877, + "step": 3725 + }, + { + "epoch": 91.46913580246914, + "grad_norm": 3.206684112548828, + "learning_rate": 7.171428571428572e-07, + "loss": 0.8894, + "step": 3750 + }, + { + "epoch": 92.07407407407408, + "grad_norm": 4.174693584442139, + "learning_rate": 6.457142857142858e-07, + "loss": 0.8906, + "step": 3775 + }, + { + "epoch": 92.69135802469135, + "grad_norm": 5.580083847045898, + "learning_rate": 5.742857142857143e-07, + "loss": 0.8962, + "step": 3800 + }, + { + "epoch": 93.29629629629629, + "grad_norm": 4.221833229064941, + "learning_rate": 5.028571428571429e-07, + "loss": 0.8886, + "step": 3825 + }, + { + "epoch": 93.91358024691358, + "grad_norm": 3.687716484069824, + "learning_rate": 4.3142857142857146e-07, + "loss": 0.8911, + "step": 3850 + }, + { + "epoch": 94.51851851851852, + "grad_norm": 4.194035530090332, + "learning_rate": 3.6e-07, + "loss": 0.889, + "step": 3875 + }, + { + "epoch": 95.12345679012346, + "grad_norm": 4.321438312530518, + "learning_rate": 2.885714285714286e-07, + "loss": 0.8926, + "step": 3900 + }, + { + "epoch": 95.74074074074075, + "grad_norm": 4.36216926574707, + "learning_rate": 2.1714285714285715e-07, + "loss": 0.8923, + "step": 3925 + }, + { + "epoch": 96.34567901234568, + "grad_norm": 3.93856143951416, + "learning_rate": 1.4571428571428574e-07, + "loss": 0.8849, + "step": 3950 + }, + { + "epoch": 96.96296296296296, + "grad_norm": 2.968627691268921, + "learning_rate": 7.428571428571429e-08, + "loss": 0.8904, + "step": 3975 + }, + { + "epoch": 97.5679012345679, + "grad_norm": 16.590002059936523, + "learning_rate": 2.8571428571428576e-09, + "loss": 0.8853, + "step": 4000 + }, + { + "epoch": 97.5679012345679, + "eval_loss": 0.8817942142486572, + "eval_runtime": 2.1317, + "eval_samples_per_second": 67.551, + "eval_steps_per_second": 33.775, + "step": 4000 + } + ], + "logging_steps": 25, + "max_steps": 4000, + "num_input_tokens_seen": 0, + "num_train_epochs": 98, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3285475532317440.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf416a3be0d19e0ce5aadbb31f093c5d913fee53 --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d367dc04409d63341644c780ebdb997e8756f9aa9f6d110afc5d9ab8de84be +size 5905 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1197099a882c66f2bbab2210646af21b05d05ee4 --- /dev/null +++ b/config.json @@ -0,0 +1,91 @@ +{ + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "SpeechT5ForTextToSpeech" + ], + "attention_dropout": 0.1, + "bos_token_id": 0, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "decoder_attention_heads": 12, + "decoder_ffn_dim": 3072, + "decoder_layerdrop": 0.1, + "decoder_layers": 6, + "decoder_start_token_id": 2, + "encoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "encoder_layerdrop": 0.1, + "encoder_layers": 12, + "encoder_max_relative_position": 160, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "guided_attention_loss_num_heads": 2, + "guided_attention_loss_scale": 10.0, + "guided_attention_loss_sigma": 0.4, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "layer_norm_eps": 1e-05, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "max_length": null, + "max_speech_positions": 1876, + "max_text_positions": 600, + "model_type": "speecht5", + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_mel_bins": 80, + "pad_token_id": 1, + "positional_dropout": 0.1, + "reduction_factor": 2, + "scale_embedding": false, + "speaker_embedding_dim": 512, + "speech_decoder_postnet_dropout": 0.5, + "speech_decoder_postnet_kernel": 5, + "speech_decoder_postnet_layers": 5, + "speech_decoder_postnet_units": 256, + "speech_decoder_prenet_dropout": 0.5, + "speech_decoder_prenet_layers": 2, + "speech_decoder_prenet_units": 256, + "torch_dtype": "float32", + "transformers_version": "4.55.4", + "use_cache": false, + "use_guided_attention_loss": true, + "vocab_size": 81 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1748094b7e4b8c1fb66bf2357c44c5f625a36179 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "decoder_start_token_id": 2, + "eos_token_id": 2, + "max_length": 1876, + "pad_token_id": 1, + "transformers_version": "4.55.4" +} diff --git a/logs/events.out.tfevents.1756106108.MSI.40384.0 b/logs/events.out.tfevents.1756106108.MSI.40384.0 new file mode 100644 index 0000000000000000000000000000000000000000..05eb1425642137076330ffafc058c20c6583c5e0 --- /dev/null +++ b/logs/events.out.tfevents.1756106108.MSI.40384.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acb5c639753cb4d122ae95afcb18ae1214d10c2d954d97a6d44274330c775c93 +size 41712 diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..063a7debbe0b96bed7d11b8b1e6151197d55864b --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b10dd87b217ab2fc492088d02d67c7955fbbff9f22b6fda9133dfa1744e6d9d +size 577789320 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..98a41e66e16f3225a15cb2b51c963fe2213aa273 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,19 @@ +{ + "do_normalize": false, + "feature_extractor_type": "SpeechT5FeatureExtractor", + "feature_size": 1, + "fmax": 7600, + "fmin": 80, + "frame_signal_scale": 1.0, + "hop_length": 16, + "mel_floor": 1e-10, + "num_mel_bins": 80, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "SpeechT5Processor", + "reduction_factor": 2, + "return_attention_mask": true, + "sampling_rate": 16000, + "win_function": "hann_window", + "win_length": 64 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee24ec69861cfc94abbe2c8c934aa0744aa623c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,13 @@ +{ + "bos_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": "" +} diff --git a/spkrec_cache/classifier.ckpt b/spkrec_cache/classifier.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..d6f70f222826ea5490bfad657373af3d6d2a08ba --- /dev/null +++ b/spkrec_cache/classifier.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd9e3634fe68bd0a427c95e354c0c677374f62b3f434e45b78599950d860d535 +size 5534328 diff --git a/spkrec_cache/embedding_model.ckpt b/spkrec_cache/embedding_model.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8ba959669e5570dbbf8076bc6b8a79555a81d5c4 --- /dev/null +++ b/spkrec_cache/embedding_model.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0575cb64845e6b9a10db9bcb74d5ac32b326b8dc90352671d345e2ee3d0126a2 +size 83316686 diff --git a/spkrec_cache/hyperparams.yaml b/spkrec_cache/hyperparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70e4cd0beb74ca08a2df9de6bd79d938670a4d15 --- /dev/null +++ b/spkrec_cache/hyperparams.yaml @@ -0,0 +1,58 @@ +# ############################################################################ +# Model: ECAPA big for Speaker verification +# ############################################################################ + +# Feature parameters +n_mels: 80 + +# Pretrain folder (HuggingFace) +pretrained_path: speechbrain/spkrec-ecapa-voxceleb + +# Output parameters +out_n_neurons: 7205 + +# Model params +compute_features: !new:speechbrain.lobes.features.Fbank + n_mels: !ref + +mean_var_norm: !new:speechbrain.processing.features.InputNormalization + norm_type: sentence + std_norm: False + +embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN + input_size: !ref + channels: [1024, 1024, 1024, 1024, 3072] + kernel_sizes: [5, 3, 3, 3, 1] + dilations: [1, 2, 3, 4, 1] + attention_channels: 128 + lin_neurons: 192 + +classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier + input_size: 192 + out_neurons: !ref + +mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization + norm_type: global + std_norm: False + +modules: + compute_features: !ref + mean_var_norm: !ref + embedding_model: !ref + mean_var_norm_emb: !ref + classifier: !ref + +label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder + + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + loadables: + embedding_model: !ref + mean_var_norm_emb: !ref + classifier: !ref + label_encoder: !ref + paths: + embedding_model: !ref /embedding_model.ckpt + mean_var_norm_emb: !ref /mean_var_norm_emb.ckpt + classifier: !ref /classifier.ckpt + label_encoder: !ref /label_encoder.txt diff --git a/spkrec_cache/label_encoder.ckpt b/spkrec_cache/label_encoder.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..1b4ba4327aeb75727f85395533bc448740cb1d1d --- /dev/null +++ b/spkrec_cache/label_encoder.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13c3a167bb4112685670ee896d20e2b565af16b3a4ceeaa8689fa4d22adb8b9 +size 128619 diff --git a/spkrec_cache/mean_var_norm_emb.ckpt b/spkrec_cache/mean_var_norm_emb.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..a7fb23a2c35884d02fbf2803755228109852ba43 --- /dev/null +++ b/spkrec_cache/mean_var_norm_emb.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd70225b05b37be64fc5a95e24395d804231d43f74b2e1e5a513db7b69b34c33 +size 1921 diff --git a/spm_char.model b/spm_char.model new file mode 100644 index 0000000000000000000000000000000000000000..8fb73691942626fa75df80b61aab0e9b9340d8e2 --- /dev/null +++ b/spm_char.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560 +size 238473 diff --git a/style_adaptor.pt b/style_adaptor.pt new file mode 100644 index 0000000000000000000000000000000000000000..3567c7bc822095537006823f57dd4d90a068077e --- /dev/null +++ b/style_adaptor.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b08618225d68d0398c44b020f795f8ba9c763c67cde2c5834c0dcb5ff44019 +size 2118433 diff --git a/style_fusion.pt b/style_fusion.pt new file mode 100644 index 0000000000000000000000000000000000000000..543f13b773613cd52dbc387660b28fd9ad1d6906 --- /dev/null +++ b/style_fusion.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:891f2873fa8f0d20580c329a718e9b3674489cfd035d93cf9fe753de4d359b65 +size 1972815 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e88d44ba3be31ac8f53461ae7c1b02b4c5c830ab --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "80": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 600, + "normalize": false, + "pad_token": "", + "processor_class": "SpeechT5Processor", + "sp_model_kwargs": {}, + "tokenizer_class": "SpeechT5Tokenizer", + "unk_token": "" +}