Amirhossein75 commited on
Commit
40d8691
·
1 Parent(s): 71e4fd0

Initial model upload

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .idea/.gitignore +3 -0
  2. added_tokens.json +4 -0
  3. checkpoint-1000/added_tokens.json +4 -0
  4. checkpoint-1000/config.json +91 -0
  5. checkpoint-1000/generation_config.json +9 -0
  6. checkpoint-1000/model.safetensors +3 -0
  7. checkpoint-1000/optimizer.pt +3 -0
  8. checkpoint-1000/preprocessor_config.json +19 -0
  9. checkpoint-1000/rng_state.pth +3 -0
  10. checkpoint-1000/scheduler.pt +3 -0
  11. checkpoint-1000/special_tokens_map.json +13 -0
  12. checkpoint-1000/spm_char.model +3 -0
  13. checkpoint-1000/tokenizer_config.json +64 -0
  14. checkpoint-1000/trainer_state.json +322 -0
  15. checkpoint-1000/training_args.bin +3 -0
  16. checkpoint-2000/added_tokens.json +4 -0
  17. checkpoint-2000/config.json +91 -0
  18. checkpoint-2000/generation_config.json +9 -0
  19. checkpoint-2000/model.safetensors +3 -0
  20. checkpoint-2000/optimizer.pt +3 -0
  21. checkpoint-2000/preprocessor_config.json +19 -0
  22. checkpoint-2000/rng_state.pth +3 -0
  23. checkpoint-2000/scheduler.pt +3 -0
  24. checkpoint-2000/special_tokens_map.json +13 -0
  25. checkpoint-2000/spm_char.model +3 -0
  26. checkpoint-2000/tokenizer_config.json +64 -0
  27. checkpoint-2000/trainer_state.json +610 -0
  28. checkpoint-2000/training_args.bin +3 -0
  29. checkpoint-3000/added_tokens.json +4 -0
  30. checkpoint-3000/config.json +91 -0
  31. checkpoint-3000/generation_config.json +9 -0
  32. checkpoint-3000/model.safetensors +3 -0
  33. checkpoint-3000/optimizer.pt +3 -0
  34. checkpoint-3000/preprocessor_config.json +19 -0
  35. checkpoint-3000/rng_state.pth +3 -0
  36. checkpoint-3000/scheduler.pt +3 -0
  37. checkpoint-3000/special_tokens_map.json +13 -0
  38. checkpoint-3000/spm_char.model +3 -0
  39. checkpoint-3000/tokenizer_config.json +64 -0
  40. checkpoint-3000/trainer_state.json +898 -0
  41. checkpoint-3000/training_args.bin +3 -0
  42. checkpoint-4000/added_tokens.json +4 -0
  43. checkpoint-4000/config.json +91 -0
  44. checkpoint-4000/generation_config.json +9 -0
  45. checkpoint-4000/model.safetensors +3 -0
  46. checkpoint-4000/optimizer.pt +3 -0
  47. checkpoint-4000/preprocessor_config.json +19 -0
  48. checkpoint-4000/rng_state.pth +3 -0
  49. checkpoint-4000/scheduler.pt +3 -0
  50. checkpoint-4000/special_tokens_map.json +13 -0
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-1000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-1000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.55.4",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.55.4"
9
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb5d6fe49ff85411787439f9ad2e6bfa7affebb9cb657848d6ca12433db4e10a
3
+ size 577789320
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a801f2d4ec47bf11dfadfa6c068daebd7c9d851603bd0a0eef429e5a22f6bb2e
3
+ size 1155777946
checkpoint-1000/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
4
+ "feature_size": 1,
5
+ "fmax": 7600,
6
+ "fmin": 80,
7
+ "frame_signal_scale": 1.0,
8
+ "hop_length": 16,
9
+ "mel_floor": 1e-10,
10
+ "num_mel_bins": 80,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "processor_class": "SpeechT5Processor",
14
+ "reduction_factor": 2,
15
+ "return_attention_mask": true,
16
+ "sampling_rate": 16000,
17
+ "win_function": "hann_window",
18
+ "win_length": 64
19
+ }
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f27257904c7decb41a03da01a49d9f6fdf1f1b8f5e5d56fe64ef4572336d6eb
3
+ size 14645
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5186565a906d7db433e54fbfdb3d62aa206e2cb82464d6a3316608741a692047
3
+ size 1465
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }
checkpoint-1000/spm_char.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
3
+ size 238473
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "79": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "80": {
44
+ "content": "<ctc_blank>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<mask>",
57
+ "model_max_length": 600,
58
+ "normalize": false,
59
+ "pad_token": "<pad>",
60
+ "processor_class": "SpeechT5Processor",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "SpeechT5Tokenizer",
63
+ "unk_token": "<unk>"
64
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.9205830097198486,
4
+ "best_model_checkpoint": "runs/emotts_ravdess\\checkpoint-1000",
5
+ "epoch": 24.395061728395063,
6
+ "eval_steps": 1000,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.6172839506172839,
14
+ "grad_norm": 46.678199768066406,
15
+ "learning_rate": 4.800000000000001e-07,
16
+ "loss": 3.4472,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 1.2222222222222223,
21
+ "grad_norm": 26.903335571289062,
22
+ "learning_rate": 9.800000000000001e-07,
23
+ "loss": 2.9051,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 1.8395061728395061,
28
+ "grad_norm": 16.712799072265625,
29
+ "learning_rate": 1.48e-06,
30
+ "loss": 2.2302,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 2.4444444444444446,
35
+ "grad_norm": 11.607951164245605,
36
+ "learning_rate": 1.98e-06,
37
+ "loss": 1.7683,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 3.049382716049383,
42
+ "grad_norm": 7.216983318328857,
43
+ "learning_rate": 2.4800000000000004e-06,
44
+ "loss": 1.5434,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 3.6666666666666665,
49
+ "grad_norm": 10.899630546569824,
50
+ "learning_rate": 2.9800000000000003e-06,
51
+ "loss": 1.4385,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 4.271604938271605,
56
+ "grad_norm": 6.701765537261963,
57
+ "learning_rate": 3.48e-06,
58
+ "loss": 1.3262,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 4.888888888888889,
63
+ "grad_norm": 9.419053077697754,
64
+ "learning_rate": 3.980000000000001e-06,
65
+ "loss": 1.285,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 5.493827160493828,
70
+ "grad_norm": 5.913278579711914,
71
+ "learning_rate": 4.48e-06,
72
+ "loss": 1.2503,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 6.098765432098766,
77
+ "grad_norm": 8.171669006347656,
78
+ "learning_rate": 4.980000000000001e-06,
79
+ "loss": 1.1868,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 6.716049382716049,
84
+ "grad_norm": 5.54558801651001,
85
+ "learning_rate": 5.480000000000001e-06,
86
+ "loss": 1.1478,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 7.320987654320987,
91
+ "grad_norm": 5.325434684753418,
92
+ "learning_rate": 5.98e-06,
93
+ "loss": 1.1245,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 7.938271604938271,
98
+ "grad_norm": 5.406148433685303,
99
+ "learning_rate": 6.480000000000001e-06,
100
+ "loss": 1.1145,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 8.54320987654321,
105
+ "grad_norm": 8.461536407470703,
106
+ "learning_rate": 6.98e-06,
107
+ "loss": 1.0641,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 9.148148148148149,
112
+ "grad_norm": 3.8533031940460205,
113
+ "learning_rate": 7.48e-06,
114
+ "loss": 1.0573,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 9.765432098765432,
119
+ "grad_norm": 7.569976806640625,
120
+ "learning_rate": 7.980000000000002e-06,
121
+ "loss": 1.061,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 10.37037037037037,
126
+ "grad_norm": 10.156228065490723,
127
+ "learning_rate": 8.48e-06,
128
+ "loss": 1.0485,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 10.987654320987655,
133
+ "grad_norm": 4.668756484985352,
134
+ "learning_rate": 8.98e-06,
135
+ "loss": 1.0216,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 11.592592592592592,
140
+ "grad_norm": 5.087125301361084,
141
+ "learning_rate": 9.48e-06,
142
+ "loss": 1.0319,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 12.197530864197532,
147
+ "grad_norm": 7.943349361419678,
148
+ "learning_rate": 9.980000000000001e-06,
149
+ "loss": 1.0,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 12.814814814814815,
154
+ "grad_norm": 7.655898571014404,
155
+ "learning_rate": 9.931428571428571e-06,
156
+ "loss": 1.0052,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 13.419753086419753,
161
+ "grad_norm": 4.458106994628906,
162
+ "learning_rate": 9.86e-06,
163
+ "loss": 1.0001,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 14.024691358024691,
168
+ "grad_norm": 9.058222770690918,
169
+ "learning_rate": 9.78857142857143e-06,
170
+ "loss": 1.0015,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 14.641975308641975,
175
+ "grad_norm": 4.795205593109131,
176
+ "learning_rate": 9.717142857142858e-06,
177
+ "loss": 0.9836,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 15.246913580246913,
182
+ "grad_norm": 10.566876411437988,
183
+ "learning_rate": 9.645714285714286e-06,
184
+ "loss": 1.0019,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 15.864197530864198,
189
+ "grad_norm": 7.610626220703125,
190
+ "learning_rate": 9.574285714285715e-06,
191
+ "loss": 0.9779,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 16.469135802469136,
196
+ "grad_norm": 6.008159637451172,
197
+ "learning_rate": 9.502857142857144e-06,
198
+ "loss": 0.9798,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 17.074074074074073,
203
+ "grad_norm": 6.685286521911621,
204
+ "learning_rate": 9.431428571428573e-06,
205
+ "loss": 0.9753,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 17.691358024691358,
210
+ "grad_norm": 2.7540247440338135,
211
+ "learning_rate": 9.360000000000002e-06,
212
+ "loss": 0.967,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 18.296296296296298,
217
+ "grad_norm": 4.825072288513184,
218
+ "learning_rate": 9.28857142857143e-06,
219
+ "loss": 0.9575,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 18.91358024691358,
224
+ "grad_norm": 6.618119716644287,
225
+ "learning_rate": 9.217142857142858e-06,
226
+ "loss": 0.9675,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 19.51851851851852,
231
+ "grad_norm": 5.465808391571045,
232
+ "learning_rate": 9.145714285714287e-06,
233
+ "loss": 0.9626,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 20.123456790123456,
238
+ "grad_norm": 4.9501051902771,
239
+ "learning_rate": 9.074285714285716e-06,
240
+ "loss": 0.9638,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 20.74074074074074,
245
+ "grad_norm": 4.926831245422363,
246
+ "learning_rate": 9.002857142857144e-06,
247
+ "loss": 0.9582,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 21.34567901234568,
252
+ "grad_norm": 6.605464458465576,
253
+ "learning_rate": 8.931428571428573e-06,
254
+ "loss": 0.9551,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 21.962962962962962,
259
+ "grad_norm": 5.774538040161133,
260
+ "learning_rate": 8.860000000000002e-06,
261
+ "loss": 0.9596,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 22.567901234567902,
266
+ "grad_norm": 4.304802417755127,
267
+ "learning_rate": 8.788571428571429e-06,
268
+ "loss": 0.9489,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 23.17283950617284,
273
+ "grad_norm": 5.171604633331299,
274
+ "learning_rate": 8.717142857142858e-06,
275
+ "loss": 0.953,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 23.790123456790123,
280
+ "grad_norm": 7.152281761169434,
281
+ "learning_rate": 8.645714285714287e-06,
282
+ "loss": 0.9604,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 24.395061728395063,
287
+ "grad_norm": 4.954558849334717,
288
+ "learning_rate": 8.574285714285714e-06,
289
+ "loss": 0.9489,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 24.395061728395063,
294
+ "eval_loss": 0.9205830097198486,
295
+ "eval_runtime": 2.2708,
296
+ "eval_samples_per_second": 63.413,
297
+ "eval_steps_per_second": 31.707,
298
+ "step": 1000
299
+ }
300
+ ],
301
+ "logging_steps": 25,
302
+ "max_steps": 4000,
303
+ "num_input_tokens_seen": 0,
304
+ "num_train_epochs": 98,
305
+ "save_steps": 1000,
306
+ "stateful_callbacks": {
307
+ "TrainerControl": {
308
+ "args": {
309
+ "should_epoch_stop": false,
310
+ "should_evaluate": false,
311
+ "should_log": false,
312
+ "should_save": true,
313
+ "should_training_stop": false
314
+ },
315
+ "attributes": {}
316
+ }
317
+ },
318
+ "total_flos": 821472814356480.0,
319
+ "train_batch_size": 4,
320
+ "trial_name": null,
321
+ "trial_params": null
322
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66d367dc04409d63341644c780ebdb997e8756f9aa9f6d110afc5d9ab8de84be
3
+ size 5905
checkpoint-2000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-2000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.55.4",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-2000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.55.4"
9
+ }
checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:295a846f5d0ead4e65b737b369b8205cd013a02d08d0220b3caa7e8e4b777b77
3
+ size 577789320
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2388ca0f503df54eb4d30573ff0fc9814dd98cc0759ae40bf1b7438f984e1ab6
3
+ size 1155777946
checkpoint-2000/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
4
+ "feature_size": 1,
5
+ "fmax": 7600,
6
+ "fmin": 80,
7
+ "frame_signal_scale": 1.0,
8
+ "hop_length": 16,
9
+ "mel_floor": 1e-10,
10
+ "num_mel_bins": 80,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "processor_class": "SpeechT5Processor",
14
+ "reduction_factor": 2,
15
+ "return_attention_mask": true,
16
+ "sampling_rate": 16000,
17
+ "win_function": "hann_window",
18
+ "win_length": 64
19
+ }
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:396a8cc8a565882c2cc697e78085381bcb24a262358918ccaa5445eb5232e231
3
+ size 14645
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e92941487269a9e704ed42d0796c2eb3245e8d6d83c68a723be04187c99b397
3
+ size 1465
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }
checkpoint-2000/spm_char.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
3
+ size 238473
checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "79": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "80": {
44
+ "content": "<ctc_blank>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<mask>",
57
+ "model_max_length": 600,
58
+ "normalize": false,
59
+ "pad_token": "<pad>",
60
+ "processor_class": "SpeechT5Processor",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "SpeechT5Tokenizer",
63
+ "unk_token": "<unk>"
64
+ }
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2000,
3
+ "best_metric": 0.8953001499176025,
4
+ "best_model_checkpoint": "runs/emotts_ravdess\\checkpoint-2000",
5
+ "epoch": 48.79012345679013,
6
+ "eval_steps": 1000,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.6172839506172839,
14
+ "grad_norm": 46.678199768066406,
15
+ "learning_rate": 4.800000000000001e-07,
16
+ "loss": 3.4472,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 1.2222222222222223,
21
+ "grad_norm": 26.903335571289062,
22
+ "learning_rate": 9.800000000000001e-07,
23
+ "loss": 2.9051,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 1.8395061728395061,
28
+ "grad_norm": 16.712799072265625,
29
+ "learning_rate": 1.48e-06,
30
+ "loss": 2.2302,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 2.4444444444444446,
35
+ "grad_norm": 11.607951164245605,
36
+ "learning_rate": 1.98e-06,
37
+ "loss": 1.7683,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 3.049382716049383,
42
+ "grad_norm": 7.216983318328857,
43
+ "learning_rate": 2.4800000000000004e-06,
44
+ "loss": 1.5434,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 3.6666666666666665,
49
+ "grad_norm": 10.899630546569824,
50
+ "learning_rate": 2.9800000000000003e-06,
51
+ "loss": 1.4385,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 4.271604938271605,
56
+ "grad_norm": 6.701765537261963,
57
+ "learning_rate": 3.48e-06,
58
+ "loss": 1.3262,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 4.888888888888889,
63
+ "grad_norm": 9.419053077697754,
64
+ "learning_rate": 3.980000000000001e-06,
65
+ "loss": 1.285,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 5.493827160493828,
70
+ "grad_norm": 5.913278579711914,
71
+ "learning_rate": 4.48e-06,
72
+ "loss": 1.2503,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 6.098765432098766,
77
+ "grad_norm": 8.171669006347656,
78
+ "learning_rate": 4.980000000000001e-06,
79
+ "loss": 1.1868,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 6.716049382716049,
84
+ "grad_norm": 5.54558801651001,
85
+ "learning_rate": 5.480000000000001e-06,
86
+ "loss": 1.1478,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 7.320987654320987,
91
+ "grad_norm": 5.325434684753418,
92
+ "learning_rate": 5.98e-06,
93
+ "loss": 1.1245,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 7.938271604938271,
98
+ "grad_norm": 5.406148433685303,
99
+ "learning_rate": 6.480000000000001e-06,
100
+ "loss": 1.1145,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 8.54320987654321,
105
+ "grad_norm": 8.461536407470703,
106
+ "learning_rate": 6.98e-06,
107
+ "loss": 1.0641,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 9.148148148148149,
112
+ "grad_norm": 3.8533031940460205,
113
+ "learning_rate": 7.48e-06,
114
+ "loss": 1.0573,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 9.765432098765432,
119
+ "grad_norm": 7.569976806640625,
120
+ "learning_rate": 7.980000000000002e-06,
121
+ "loss": 1.061,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 10.37037037037037,
126
+ "grad_norm": 10.156228065490723,
127
+ "learning_rate": 8.48e-06,
128
+ "loss": 1.0485,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 10.987654320987655,
133
+ "grad_norm": 4.668756484985352,
134
+ "learning_rate": 8.98e-06,
135
+ "loss": 1.0216,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 11.592592592592592,
140
+ "grad_norm": 5.087125301361084,
141
+ "learning_rate": 9.48e-06,
142
+ "loss": 1.0319,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 12.197530864197532,
147
+ "grad_norm": 7.943349361419678,
148
+ "learning_rate": 9.980000000000001e-06,
149
+ "loss": 1.0,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 12.814814814814815,
154
+ "grad_norm": 7.655898571014404,
155
+ "learning_rate": 9.931428571428571e-06,
156
+ "loss": 1.0052,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 13.419753086419753,
161
+ "grad_norm": 4.458106994628906,
162
+ "learning_rate": 9.86e-06,
163
+ "loss": 1.0001,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 14.024691358024691,
168
+ "grad_norm": 9.058222770690918,
169
+ "learning_rate": 9.78857142857143e-06,
170
+ "loss": 1.0015,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 14.641975308641975,
175
+ "grad_norm": 4.795205593109131,
176
+ "learning_rate": 9.717142857142858e-06,
177
+ "loss": 0.9836,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 15.246913580246913,
182
+ "grad_norm": 10.566876411437988,
183
+ "learning_rate": 9.645714285714286e-06,
184
+ "loss": 1.0019,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 15.864197530864198,
189
+ "grad_norm": 7.610626220703125,
190
+ "learning_rate": 9.574285714285715e-06,
191
+ "loss": 0.9779,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 16.469135802469136,
196
+ "grad_norm": 6.008159637451172,
197
+ "learning_rate": 9.502857142857144e-06,
198
+ "loss": 0.9798,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 17.074074074074073,
203
+ "grad_norm": 6.685286521911621,
204
+ "learning_rate": 9.431428571428573e-06,
205
+ "loss": 0.9753,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 17.691358024691358,
210
+ "grad_norm": 2.7540247440338135,
211
+ "learning_rate": 9.360000000000002e-06,
212
+ "loss": 0.967,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 18.296296296296298,
217
+ "grad_norm": 4.825072288513184,
218
+ "learning_rate": 9.28857142857143e-06,
219
+ "loss": 0.9575,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 18.91358024691358,
224
+ "grad_norm": 6.618119716644287,
225
+ "learning_rate": 9.217142857142858e-06,
226
+ "loss": 0.9675,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 19.51851851851852,
231
+ "grad_norm": 5.465808391571045,
232
+ "learning_rate": 9.145714285714287e-06,
233
+ "loss": 0.9626,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 20.123456790123456,
238
+ "grad_norm": 4.9501051902771,
239
+ "learning_rate": 9.074285714285716e-06,
240
+ "loss": 0.9638,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 20.74074074074074,
245
+ "grad_norm": 4.926831245422363,
246
+ "learning_rate": 9.002857142857144e-06,
247
+ "loss": 0.9582,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 21.34567901234568,
252
+ "grad_norm": 6.605464458465576,
253
+ "learning_rate": 8.931428571428573e-06,
254
+ "loss": 0.9551,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 21.962962962962962,
259
+ "grad_norm": 5.774538040161133,
260
+ "learning_rate": 8.860000000000002e-06,
261
+ "loss": 0.9596,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 22.567901234567902,
266
+ "grad_norm": 4.304802417755127,
267
+ "learning_rate": 8.788571428571429e-06,
268
+ "loss": 0.9489,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 23.17283950617284,
273
+ "grad_norm": 5.171604633331299,
274
+ "learning_rate": 8.717142857142858e-06,
275
+ "loss": 0.953,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 23.790123456790123,
280
+ "grad_norm": 7.152281761169434,
281
+ "learning_rate": 8.645714285714287e-06,
282
+ "loss": 0.9604,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 24.395061728395063,
287
+ "grad_norm": 4.954558849334717,
288
+ "learning_rate": 8.574285714285714e-06,
289
+ "loss": 0.9489,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 24.395061728395063,
294
+ "eval_loss": 0.9205830097198486,
295
+ "eval_runtime": 2.2708,
296
+ "eval_samples_per_second": 63.413,
297
+ "eval_steps_per_second": 31.707,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 25.0,
302
+ "grad_norm": 10.266937255859375,
303
+ "learning_rate": 8.502857142857143e-06,
304
+ "loss": 0.9541,
305
+ "step": 1025
306
+ },
307
+ {
308
+ "epoch": 25.617283950617285,
309
+ "grad_norm": 3.225881814956665,
310
+ "learning_rate": 8.431428571428572e-06,
311
+ "loss": 0.9451,
312
+ "step": 1050
313
+ },
314
+ {
315
+ "epoch": 26.22222222222222,
316
+ "grad_norm": 4.001440048217773,
317
+ "learning_rate": 8.36e-06,
318
+ "loss": 0.9422,
319
+ "step": 1075
320
+ },
321
+ {
322
+ "epoch": 26.839506172839506,
323
+ "grad_norm": 5.347984313964844,
324
+ "learning_rate": 8.288571428571429e-06,
325
+ "loss": 0.9434,
326
+ "step": 1100
327
+ },
328
+ {
329
+ "epoch": 27.444444444444443,
330
+ "grad_norm": 4.1566901206970215,
331
+ "learning_rate": 8.217142857142858e-06,
332
+ "loss": 0.942,
333
+ "step": 1125
334
+ },
335
+ {
336
+ "epoch": 28.049382716049383,
337
+ "grad_norm": 3.2101686000823975,
338
+ "learning_rate": 8.145714285714287e-06,
339
+ "loss": 0.9365,
340
+ "step": 1150
341
+ },
342
+ {
343
+ "epoch": 28.666666666666668,
344
+ "grad_norm": 5.183631896972656,
345
+ "learning_rate": 8.074285714285714e-06,
346
+ "loss": 0.941,
347
+ "step": 1175
348
+ },
349
+ {
350
+ "epoch": 29.271604938271604,
351
+ "grad_norm": 4.704529285430908,
352
+ "learning_rate": 8.002857142857143e-06,
353
+ "loss": 0.9374,
354
+ "step": 1200
355
+ },
356
+ {
357
+ "epoch": 29.88888888888889,
358
+ "grad_norm": 4.460058689117432,
359
+ "learning_rate": 7.931428571428572e-06,
360
+ "loss": 0.9383,
361
+ "step": 1225
362
+ },
363
+ {
364
+ "epoch": 30.493827160493826,
365
+ "grad_norm": 3.616530418395996,
366
+ "learning_rate": 7.860000000000001e-06,
367
+ "loss": 0.9321,
368
+ "step": 1250
369
+ },
370
+ {
371
+ "epoch": 31.098765432098766,
372
+ "grad_norm": 3.92207932472229,
373
+ "learning_rate": 7.788571428571428e-06,
374
+ "loss": 0.9347,
375
+ "step": 1275
376
+ },
377
+ {
378
+ "epoch": 31.71604938271605,
379
+ "grad_norm": 3.6962461471557617,
380
+ "learning_rate": 7.717142857142857e-06,
381
+ "loss": 0.9305,
382
+ "step": 1300
383
+ },
384
+ {
385
+ "epoch": 32.32098765432099,
386
+ "grad_norm": 4.276056289672852,
387
+ "learning_rate": 7.645714285714286e-06,
388
+ "loss": 0.9336,
389
+ "step": 1325
390
+ },
391
+ {
392
+ "epoch": 32.93827160493827,
393
+ "grad_norm": 5.176277160644531,
394
+ "learning_rate": 7.574285714285715e-06,
395
+ "loss": 0.9351,
396
+ "step": 1350
397
+ },
398
+ {
399
+ "epoch": 33.54320987654321,
400
+ "grad_norm": 7.2538347244262695,
401
+ "learning_rate": 7.502857142857144e-06,
402
+ "loss": 0.9241,
403
+ "step": 1375
404
+ },
405
+ {
406
+ "epoch": 34.148148148148145,
407
+ "grad_norm": 4.3576273918151855,
408
+ "learning_rate": 7.431428571428572e-06,
409
+ "loss": 0.9316,
410
+ "step": 1400
411
+ },
412
+ {
413
+ "epoch": 34.76543209876543,
414
+ "grad_norm": 9.138855934143066,
415
+ "learning_rate": 7.360000000000001e-06,
416
+ "loss": 0.9277,
417
+ "step": 1425
418
+ },
419
+ {
420
+ "epoch": 35.370370370370374,
421
+ "grad_norm": 4.475003719329834,
422
+ "learning_rate": 7.28857142857143e-06,
423
+ "loss": 0.9245,
424
+ "step": 1450
425
+ },
426
+ {
427
+ "epoch": 35.98765432098765,
428
+ "grad_norm": 7.28753137588501,
429
+ "learning_rate": 7.217142857142858e-06,
430
+ "loss": 0.9266,
431
+ "step": 1475
432
+ },
433
+ {
434
+ "epoch": 36.592592592592595,
435
+ "grad_norm": 5.1342949867248535,
436
+ "learning_rate": 7.145714285714286e-06,
437
+ "loss": 0.9297,
438
+ "step": 1500
439
+ },
440
+ {
441
+ "epoch": 37.19753086419753,
442
+ "grad_norm": 2.7765142917633057,
443
+ "learning_rate": 7.074285714285715e-06,
444
+ "loss": 0.9253,
445
+ "step": 1525
446
+ },
447
+ {
448
+ "epoch": 37.81481481481482,
449
+ "grad_norm": 3.8011326789855957,
450
+ "learning_rate": 7.002857142857143e-06,
451
+ "loss": 0.9203,
452
+ "step": 1550
453
+ },
454
+ {
455
+ "epoch": 38.41975308641975,
456
+ "grad_norm": 7.432782173156738,
457
+ "learning_rate": 6.931428571428572e-06,
458
+ "loss": 0.9196,
459
+ "step": 1575
460
+ },
461
+ {
462
+ "epoch": 39.02469135802469,
463
+ "grad_norm": 4.179474830627441,
464
+ "learning_rate": 6.860000000000001e-06,
465
+ "loss": 0.9188,
466
+ "step": 1600
467
+ },
468
+ {
469
+ "epoch": 39.641975308641975,
470
+ "grad_norm": 8.513073921203613,
471
+ "learning_rate": 6.7885714285714286e-06,
472
+ "loss": 0.9268,
473
+ "step": 1625
474
+ },
475
+ {
476
+ "epoch": 40.24691358024691,
477
+ "grad_norm": 3.699882984161377,
478
+ "learning_rate": 6.7171428571428576e-06,
479
+ "loss": 0.9216,
480
+ "step": 1650
481
+ },
482
+ {
483
+ "epoch": 40.864197530864196,
484
+ "grad_norm": 3.949507713317871,
485
+ "learning_rate": 6.645714285714287e-06,
486
+ "loss": 0.9238,
487
+ "step": 1675
488
+ },
489
+ {
490
+ "epoch": 41.46913580246913,
491
+ "grad_norm": 3.7951810359954834,
492
+ "learning_rate": 6.574285714285716e-06,
493
+ "loss": 0.9198,
494
+ "step": 1700
495
+ },
496
+ {
497
+ "epoch": 42.074074074074076,
498
+ "grad_norm": 5.373620986938477,
499
+ "learning_rate": 6.502857142857143e-06,
500
+ "loss": 0.9135,
501
+ "step": 1725
502
+ },
503
+ {
504
+ "epoch": 42.69135802469136,
505
+ "grad_norm": 6.875067234039307,
506
+ "learning_rate": 6.431428571428572e-06,
507
+ "loss": 0.918,
508
+ "step": 1750
509
+ },
510
+ {
511
+ "epoch": 43.2962962962963,
512
+ "grad_norm": 7.167726039886475,
513
+ "learning_rate": 6.360000000000001e-06,
514
+ "loss": 0.9276,
515
+ "step": 1775
516
+ },
517
+ {
518
+ "epoch": 43.91358024691358,
519
+ "grad_norm": 3.7067105770111084,
520
+ "learning_rate": 6.288571428571429e-06,
521
+ "loss": 0.9169,
522
+ "step": 1800
523
+ },
524
+ {
525
+ "epoch": 44.51851851851852,
526
+ "grad_norm": 4.474793434143066,
527
+ "learning_rate": 6.217142857142857e-06,
528
+ "loss": 0.9191,
529
+ "step": 1825
530
+ },
531
+ {
532
+ "epoch": 45.123456790123456,
533
+ "grad_norm": 5.386421203613281,
534
+ "learning_rate": 6.145714285714286e-06,
535
+ "loss": 0.9145,
536
+ "step": 1850
537
+ },
538
+ {
539
+ "epoch": 45.74074074074074,
540
+ "grad_norm": 3.068861246109009,
541
+ "learning_rate": 6.0742857142857145e-06,
542
+ "loss": 0.9095,
543
+ "step": 1875
544
+ },
545
+ {
546
+ "epoch": 46.34567901234568,
547
+ "grad_norm": 3.804973840713501,
548
+ "learning_rate": 6.0028571428571435e-06,
549
+ "loss": 0.912,
550
+ "step": 1900
551
+ },
552
+ {
553
+ "epoch": 46.96296296296296,
554
+ "grad_norm": 2.9225473403930664,
555
+ "learning_rate": 5.9314285714285725e-06,
556
+ "loss": 0.9049,
557
+ "step": 1925
558
+ },
559
+ {
560
+ "epoch": 47.5679012345679,
561
+ "grad_norm": 4.022708892822266,
562
+ "learning_rate": 5.86e-06,
563
+ "loss": 0.9049,
564
+ "step": 1950
565
+ },
566
+ {
567
+ "epoch": 48.17283950617284,
568
+ "grad_norm": 3.421691417694092,
569
+ "learning_rate": 5.788571428571429e-06,
570
+ "loss": 0.9101,
571
+ "step": 1975
572
+ },
573
+ {
574
+ "epoch": 48.79012345679013,
575
+ "grad_norm": 6.732350826263428,
576
+ "learning_rate": 5.717142857142858e-06,
577
+ "loss": 0.9105,
578
+ "step": 2000
579
+ },
580
+ {
581
+ "epoch": 48.79012345679013,
582
+ "eval_loss": 0.8953001499176025,
583
+ "eval_runtime": 2.1587,
584
+ "eval_samples_per_second": 66.707,
585
+ "eval_steps_per_second": 33.353,
586
+ "step": 2000
587
+ }
588
+ ],
589
+ "logging_steps": 25,
590
+ "max_steps": 4000,
591
+ "num_input_tokens_seen": 0,
592
+ "num_train_epochs": 98,
593
+ "save_steps": 1000,
594
+ "stateful_callbacks": {
595
+ "TrainerControl": {
596
+ "args": {
597
+ "should_epoch_stop": false,
598
+ "should_evaluate": false,
599
+ "should_log": false,
600
+ "should_save": true,
601
+ "should_training_stop": false
602
+ },
603
+ "attributes": {}
604
+ }
605
+ },
606
+ "total_flos": 1642945628712960.0,
607
+ "train_batch_size": 4,
608
+ "trial_name": null,
609
+ "trial_params": null
610
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66d367dc04409d63341644c780ebdb997e8756f9aa9f6d110afc5d9ab8de84be
3
+ size 5905
checkpoint-3000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-3000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.55.4",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-3000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.55.4"
9
+ }
checkpoint-3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f6eca9575648e4c7d7eb1ea916fee7b23eafefa0db8bf09a04bd46beac454f2
3
+ size 577789320
checkpoint-3000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8c7d0e8b916fd9a744e0e04850570d8a6297e6bac0767ebd63b53e0cefe4057
3
+ size 1155777946
checkpoint-3000/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
4
+ "feature_size": 1,
5
+ "fmax": 7600,
6
+ "fmin": 80,
7
+ "frame_signal_scale": 1.0,
8
+ "hop_length": 16,
9
+ "mel_floor": 1e-10,
10
+ "num_mel_bins": 80,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "processor_class": "SpeechT5Processor",
14
+ "reduction_factor": 2,
15
+ "return_attention_mask": true,
16
+ "sampling_rate": 16000,
17
+ "win_function": "hann_window",
18
+ "win_length": 64
19
+ }
checkpoint-3000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bd7be6ad18d8737c21def51bc146679a3086895043a68047db9ee35a01b64e8
3
+ size 14645
checkpoint-3000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32cb3a0b1d61782860d37955716f6b5e952b190320ed6c3b93171c974f9325c9
3
+ size 1465
checkpoint-3000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }
checkpoint-3000/spm_char.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
3
+ size 238473
checkpoint-3000/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "79": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "80": {
44
+ "content": "<ctc_blank>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<mask>",
57
+ "model_max_length": 600,
58
+ "normalize": false,
59
+ "pad_token": "<pad>",
60
+ "processor_class": "SpeechT5Processor",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "SpeechT5Tokenizer",
63
+ "unk_token": "<unk>"
64
+ }
checkpoint-3000/trainer_state.json ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 3000,
3
+ "best_metric": 0.8869494795799255,
4
+ "best_model_checkpoint": "runs/emotts_ravdess\\checkpoint-3000",
5
+ "epoch": 73.17283950617283,
6
+ "eval_steps": 1000,
7
+ "global_step": 3000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.6172839506172839,
14
+ "grad_norm": 46.678199768066406,
15
+ "learning_rate": 4.800000000000001e-07,
16
+ "loss": 3.4472,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 1.2222222222222223,
21
+ "grad_norm": 26.903335571289062,
22
+ "learning_rate": 9.800000000000001e-07,
23
+ "loss": 2.9051,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 1.8395061728395061,
28
+ "grad_norm": 16.712799072265625,
29
+ "learning_rate": 1.48e-06,
30
+ "loss": 2.2302,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 2.4444444444444446,
35
+ "grad_norm": 11.607951164245605,
36
+ "learning_rate": 1.98e-06,
37
+ "loss": 1.7683,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 3.049382716049383,
42
+ "grad_norm": 7.216983318328857,
43
+ "learning_rate": 2.4800000000000004e-06,
44
+ "loss": 1.5434,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 3.6666666666666665,
49
+ "grad_norm": 10.899630546569824,
50
+ "learning_rate": 2.9800000000000003e-06,
51
+ "loss": 1.4385,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 4.271604938271605,
56
+ "grad_norm": 6.701765537261963,
57
+ "learning_rate": 3.48e-06,
58
+ "loss": 1.3262,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 4.888888888888889,
63
+ "grad_norm": 9.419053077697754,
64
+ "learning_rate": 3.980000000000001e-06,
65
+ "loss": 1.285,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 5.493827160493828,
70
+ "grad_norm": 5.913278579711914,
71
+ "learning_rate": 4.48e-06,
72
+ "loss": 1.2503,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 6.098765432098766,
77
+ "grad_norm": 8.171669006347656,
78
+ "learning_rate": 4.980000000000001e-06,
79
+ "loss": 1.1868,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 6.716049382716049,
84
+ "grad_norm": 5.54558801651001,
85
+ "learning_rate": 5.480000000000001e-06,
86
+ "loss": 1.1478,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 7.320987654320987,
91
+ "grad_norm": 5.325434684753418,
92
+ "learning_rate": 5.98e-06,
93
+ "loss": 1.1245,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 7.938271604938271,
98
+ "grad_norm": 5.406148433685303,
99
+ "learning_rate": 6.480000000000001e-06,
100
+ "loss": 1.1145,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 8.54320987654321,
105
+ "grad_norm": 8.461536407470703,
106
+ "learning_rate": 6.98e-06,
107
+ "loss": 1.0641,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 9.148148148148149,
112
+ "grad_norm": 3.8533031940460205,
113
+ "learning_rate": 7.48e-06,
114
+ "loss": 1.0573,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 9.765432098765432,
119
+ "grad_norm": 7.569976806640625,
120
+ "learning_rate": 7.980000000000002e-06,
121
+ "loss": 1.061,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 10.37037037037037,
126
+ "grad_norm": 10.156228065490723,
127
+ "learning_rate": 8.48e-06,
128
+ "loss": 1.0485,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 10.987654320987655,
133
+ "grad_norm": 4.668756484985352,
134
+ "learning_rate": 8.98e-06,
135
+ "loss": 1.0216,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 11.592592592592592,
140
+ "grad_norm": 5.087125301361084,
141
+ "learning_rate": 9.48e-06,
142
+ "loss": 1.0319,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 12.197530864197532,
147
+ "grad_norm": 7.943349361419678,
148
+ "learning_rate": 9.980000000000001e-06,
149
+ "loss": 1.0,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 12.814814814814815,
154
+ "grad_norm": 7.655898571014404,
155
+ "learning_rate": 9.931428571428571e-06,
156
+ "loss": 1.0052,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 13.419753086419753,
161
+ "grad_norm": 4.458106994628906,
162
+ "learning_rate": 9.86e-06,
163
+ "loss": 1.0001,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 14.024691358024691,
168
+ "grad_norm": 9.058222770690918,
169
+ "learning_rate": 9.78857142857143e-06,
170
+ "loss": 1.0015,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 14.641975308641975,
175
+ "grad_norm": 4.795205593109131,
176
+ "learning_rate": 9.717142857142858e-06,
177
+ "loss": 0.9836,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 15.246913580246913,
182
+ "grad_norm": 10.566876411437988,
183
+ "learning_rate": 9.645714285714286e-06,
184
+ "loss": 1.0019,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 15.864197530864198,
189
+ "grad_norm": 7.610626220703125,
190
+ "learning_rate": 9.574285714285715e-06,
191
+ "loss": 0.9779,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 16.469135802469136,
196
+ "grad_norm": 6.008159637451172,
197
+ "learning_rate": 9.502857142857144e-06,
198
+ "loss": 0.9798,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 17.074074074074073,
203
+ "grad_norm": 6.685286521911621,
204
+ "learning_rate": 9.431428571428573e-06,
205
+ "loss": 0.9753,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 17.691358024691358,
210
+ "grad_norm": 2.7540247440338135,
211
+ "learning_rate": 9.360000000000002e-06,
212
+ "loss": 0.967,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 18.296296296296298,
217
+ "grad_norm": 4.825072288513184,
218
+ "learning_rate": 9.28857142857143e-06,
219
+ "loss": 0.9575,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 18.91358024691358,
224
+ "grad_norm": 6.618119716644287,
225
+ "learning_rate": 9.217142857142858e-06,
226
+ "loss": 0.9675,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 19.51851851851852,
231
+ "grad_norm": 5.465808391571045,
232
+ "learning_rate": 9.145714285714287e-06,
233
+ "loss": 0.9626,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 20.123456790123456,
238
+ "grad_norm": 4.9501051902771,
239
+ "learning_rate": 9.074285714285716e-06,
240
+ "loss": 0.9638,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 20.74074074074074,
245
+ "grad_norm": 4.926831245422363,
246
+ "learning_rate": 9.002857142857144e-06,
247
+ "loss": 0.9582,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 21.34567901234568,
252
+ "grad_norm": 6.605464458465576,
253
+ "learning_rate": 8.931428571428573e-06,
254
+ "loss": 0.9551,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 21.962962962962962,
259
+ "grad_norm": 5.774538040161133,
260
+ "learning_rate": 8.860000000000002e-06,
261
+ "loss": 0.9596,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 22.567901234567902,
266
+ "grad_norm": 4.304802417755127,
267
+ "learning_rate": 8.788571428571429e-06,
268
+ "loss": 0.9489,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 23.17283950617284,
273
+ "grad_norm": 5.171604633331299,
274
+ "learning_rate": 8.717142857142858e-06,
275
+ "loss": 0.953,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 23.790123456790123,
280
+ "grad_norm": 7.152281761169434,
281
+ "learning_rate": 8.645714285714287e-06,
282
+ "loss": 0.9604,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 24.395061728395063,
287
+ "grad_norm": 4.954558849334717,
288
+ "learning_rate": 8.574285714285714e-06,
289
+ "loss": 0.9489,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 24.395061728395063,
294
+ "eval_loss": 0.9205830097198486,
295
+ "eval_runtime": 2.2708,
296
+ "eval_samples_per_second": 63.413,
297
+ "eval_steps_per_second": 31.707,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 25.0,
302
+ "grad_norm": 10.266937255859375,
303
+ "learning_rate": 8.502857142857143e-06,
304
+ "loss": 0.9541,
305
+ "step": 1025
306
+ },
307
+ {
308
+ "epoch": 25.617283950617285,
309
+ "grad_norm": 3.225881814956665,
310
+ "learning_rate": 8.431428571428572e-06,
311
+ "loss": 0.9451,
312
+ "step": 1050
313
+ },
314
+ {
315
+ "epoch": 26.22222222222222,
316
+ "grad_norm": 4.001440048217773,
317
+ "learning_rate": 8.36e-06,
318
+ "loss": 0.9422,
319
+ "step": 1075
320
+ },
321
+ {
322
+ "epoch": 26.839506172839506,
323
+ "grad_norm": 5.347984313964844,
324
+ "learning_rate": 8.288571428571429e-06,
325
+ "loss": 0.9434,
326
+ "step": 1100
327
+ },
328
+ {
329
+ "epoch": 27.444444444444443,
330
+ "grad_norm": 4.1566901206970215,
331
+ "learning_rate": 8.217142857142858e-06,
332
+ "loss": 0.942,
333
+ "step": 1125
334
+ },
335
+ {
336
+ "epoch": 28.049382716049383,
337
+ "grad_norm": 3.2101686000823975,
338
+ "learning_rate": 8.145714285714287e-06,
339
+ "loss": 0.9365,
340
+ "step": 1150
341
+ },
342
+ {
343
+ "epoch": 28.666666666666668,
344
+ "grad_norm": 5.183631896972656,
345
+ "learning_rate": 8.074285714285714e-06,
346
+ "loss": 0.941,
347
+ "step": 1175
348
+ },
349
+ {
350
+ "epoch": 29.271604938271604,
351
+ "grad_norm": 4.704529285430908,
352
+ "learning_rate": 8.002857142857143e-06,
353
+ "loss": 0.9374,
354
+ "step": 1200
355
+ },
356
+ {
357
+ "epoch": 29.88888888888889,
358
+ "grad_norm": 4.460058689117432,
359
+ "learning_rate": 7.931428571428572e-06,
360
+ "loss": 0.9383,
361
+ "step": 1225
362
+ },
363
+ {
364
+ "epoch": 30.493827160493826,
365
+ "grad_norm": 3.616530418395996,
366
+ "learning_rate": 7.860000000000001e-06,
367
+ "loss": 0.9321,
368
+ "step": 1250
369
+ },
370
+ {
371
+ "epoch": 31.098765432098766,
372
+ "grad_norm": 3.92207932472229,
373
+ "learning_rate": 7.788571428571428e-06,
374
+ "loss": 0.9347,
375
+ "step": 1275
376
+ },
377
+ {
378
+ "epoch": 31.71604938271605,
379
+ "grad_norm": 3.6962461471557617,
380
+ "learning_rate": 7.717142857142857e-06,
381
+ "loss": 0.9305,
382
+ "step": 1300
383
+ },
384
+ {
385
+ "epoch": 32.32098765432099,
386
+ "grad_norm": 4.276056289672852,
387
+ "learning_rate": 7.645714285714286e-06,
388
+ "loss": 0.9336,
389
+ "step": 1325
390
+ },
391
+ {
392
+ "epoch": 32.93827160493827,
393
+ "grad_norm": 5.176277160644531,
394
+ "learning_rate": 7.574285714285715e-06,
395
+ "loss": 0.9351,
396
+ "step": 1350
397
+ },
398
+ {
399
+ "epoch": 33.54320987654321,
400
+ "grad_norm": 7.2538347244262695,
401
+ "learning_rate": 7.502857142857144e-06,
402
+ "loss": 0.9241,
403
+ "step": 1375
404
+ },
405
+ {
406
+ "epoch": 34.148148148148145,
407
+ "grad_norm": 4.3576273918151855,
408
+ "learning_rate": 7.431428571428572e-06,
409
+ "loss": 0.9316,
410
+ "step": 1400
411
+ },
412
+ {
413
+ "epoch": 34.76543209876543,
414
+ "grad_norm": 9.138855934143066,
415
+ "learning_rate": 7.360000000000001e-06,
416
+ "loss": 0.9277,
417
+ "step": 1425
418
+ },
419
+ {
420
+ "epoch": 35.370370370370374,
421
+ "grad_norm": 4.475003719329834,
422
+ "learning_rate": 7.28857142857143e-06,
423
+ "loss": 0.9245,
424
+ "step": 1450
425
+ },
426
+ {
427
+ "epoch": 35.98765432098765,
428
+ "grad_norm": 7.28753137588501,
429
+ "learning_rate": 7.217142857142858e-06,
430
+ "loss": 0.9266,
431
+ "step": 1475
432
+ },
433
+ {
434
+ "epoch": 36.592592592592595,
435
+ "grad_norm": 5.1342949867248535,
436
+ "learning_rate": 7.145714285714286e-06,
437
+ "loss": 0.9297,
438
+ "step": 1500
439
+ },
440
+ {
441
+ "epoch": 37.19753086419753,
442
+ "grad_norm": 2.7765142917633057,
443
+ "learning_rate": 7.074285714285715e-06,
444
+ "loss": 0.9253,
445
+ "step": 1525
446
+ },
447
+ {
448
+ "epoch": 37.81481481481482,
449
+ "grad_norm": 3.8011326789855957,
450
+ "learning_rate": 7.002857142857143e-06,
451
+ "loss": 0.9203,
452
+ "step": 1550
453
+ },
454
+ {
455
+ "epoch": 38.41975308641975,
456
+ "grad_norm": 7.432782173156738,
457
+ "learning_rate": 6.931428571428572e-06,
458
+ "loss": 0.9196,
459
+ "step": 1575
460
+ },
461
+ {
462
+ "epoch": 39.02469135802469,
463
+ "grad_norm": 4.179474830627441,
464
+ "learning_rate": 6.860000000000001e-06,
465
+ "loss": 0.9188,
466
+ "step": 1600
467
+ },
468
+ {
469
+ "epoch": 39.641975308641975,
470
+ "grad_norm": 8.513073921203613,
471
+ "learning_rate": 6.7885714285714286e-06,
472
+ "loss": 0.9268,
473
+ "step": 1625
474
+ },
475
+ {
476
+ "epoch": 40.24691358024691,
477
+ "grad_norm": 3.699882984161377,
478
+ "learning_rate": 6.7171428571428576e-06,
479
+ "loss": 0.9216,
480
+ "step": 1650
481
+ },
482
+ {
483
+ "epoch": 40.864197530864196,
484
+ "grad_norm": 3.949507713317871,
485
+ "learning_rate": 6.645714285714287e-06,
486
+ "loss": 0.9238,
487
+ "step": 1675
488
+ },
489
+ {
490
+ "epoch": 41.46913580246913,
491
+ "grad_norm": 3.7951810359954834,
492
+ "learning_rate": 6.574285714285716e-06,
493
+ "loss": 0.9198,
494
+ "step": 1700
495
+ },
496
+ {
497
+ "epoch": 42.074074074074076,
498
+ "grad_norm": 5.373620986938477,
499
+ "learning_rate": 6.502857142857143e-06,
500
+ "loss": 0.9135,
501
+ "step": 1725
502
+ },
503
+ {
504
+ "epoch": 42.69135802469136,
505
+ "grad_norm": 6.875067234039307,
506
+ "learning_rate": 6.431428571428572e-06,
507
+ "loss": 0.918,
508
+ "step": 1750
509
+ },
510
+ {
511
+ "epoch": 43.2962962962963,
512
+ "grad_norm": 7.167726039886475,
513
+ "learning_rate": 6.360000000000001e-06,
514
+ "loss": 0.9276,
515
+ "step": 1775
516
+ },
517
+ {
518
+ "epoch": 43.91358024691358,
519
+ "grad_norm": 3.7067105770111084,
520
+ "learning_rate": 6.288571428571429e-06,
521
+ "loss": 0.9169,
522
+ "step": 1800
523
+ },
524
+ {
525
+ "epoch": 44.51851851851852,
526
+ "grad_norm": 4.474793434143066,
527
+ "learning_rate": 6.217142857142857e-06,
528
+ "loss": 0.9191,
529
+ "step": 1825
530
+ },
531
+ {
532
+ "epoch": 45.123456790123456,
533
+ "grad_norm": 5.386421203613281,
534
+ "learning_rate": 6.145714285714286e-06,
535
+ "loss": 0.9145,
536
+ "step": 1850
537
+ },
538
+ {
539
+ "epoch": 45.74074074074074,
540
+ "grad_norm": 3.068861246109009,
541
+ "learning_rate": 6.0742857142857145e-06,
542
+ "loss": 0.9095,
543
+ "step": 1875
544
+ },
545
+ {
546
+ "epoch": 46.34567901234568,
547
+ "grad_norm": 3.804973840713501,
548
+ "learning_rate": 6.0028571428571435e-06,
549
+ "loss": 0.912,
550
+ "step": 1900
551
+ },
552
+ {
553
+ "epoch": 46.96296296296296,
554
+ "grad_norm": 2.9225473403930664,
555
+ "learning_rate": 5.9314285714285725e-06,
556
+ "loss": 0.9049,
557
+ "step": 1925
558
+ },
559
+ {
560
+ "epoch": 47.5679012345679,
561
+ "grad_norm": 4.022708892822266,
562
+ "learning_rate": 5.86e-06,
563
+ "loss": 0.9049,
564
+ "step": 1950
565
+ },
566
+ {
567
+ "epoch": 48.17283950617284,
568
+ "grad_norm": 3.421691417694092,
569
+ "learning_rate": 5.788571428571429e-06,
570
+ "loss": 0.9101,
571
+ "step": 1975
572
+ },
573
+ {
574
+ "epoch": 48.79012345679013,
575
+ "grad_norm": 6.732350826263428,
576
+ "learning_rate": 5.717142857142858e-06,
577
+ "loss": 0.9105,
578
+ "step": 2000
579
+ },
580
+ {
581
+ "epoch": 48.79012345679013,
582
+ "eval_loss": 0.8953001499176025,
583
+ "eval_runtime": 2.1587,
584
+ "eval_samples_per_second": 66.707,
585
+ "eval_steps_per_second": 33.353,
586
+ "step": 2000
587
+ },
588
+ {
589
+ "epoch": 49.39506172839506,
590
+ "grad_norm": 5.506401538848877,
591
+ "learning_rate": 5.645714285714287e-06,
592
+ "loss": 0.9036,
593
+ "step": 2025
594
+ },
595
+ {
596
+ "epoch": 50.0,
597
+ "grad_norm": 9.19892406463623,
598
+ "learning_rate": 5.574285714285714e-06,
599
+ "loss": 0.9107,
600
+ "step": 2050
601
+ },
602
+ {
603
+ "epoch": 50.617283950617285,
604
+ "grad_norm": 3.324119806289673,
605
+ "learning_rate": 5.502857142857143e-06,
606
+ "loss": 0.9118,
607
+ "step": 2075
608
+ },
609
+ {
610
+ "epoch": 51.22222222222222,
611
+ "grad_norm": 5.142299652099609,
612
+ "learning_rate": 5.431428571428572e-06,
613
+ "loss": 0.9098,
614
+ "step": 2100
615
+ },
616
+ {
617
+ "epoch": 51.839506172839506,
618
+ "grad_norm": 2.8806934356689453,
619
+ "learning_rate": 5.36e-06,
620
+ "loss": 0.9013,
621
+ "step": 2125
622
+ },
623
+ {
624
+ "epoch": 52.44444444444444,
625
+ "grad_norm": 4.728231430053711,
626
+ "learning_rate": 5.2885714285714285e-06,
627
+ "loss": 0.9049,
628
+ "step": 2150
629
+ },
630
+ {
631
+ "epoch": 53.04938271604938,
632
+ "grad_norm": 4.9596991539001465,
633
+ "learning_rate": 5.2171428571428575e-06,
634
+ "loss": 0.9128,
635
+ "step": 2175
636
+ },
637
+ {
638
+ "epoch": 53.666666666666664,
639
+ "grad_norm": 3.160998821258545,
640
+ "learning_rate": 5.145714285714286e-06,
641
+ "loss": 0.9003,
642
+ "step": 2200
643
+ },
644
+ {
645
+ "epoch": 54.27160493827161,
646
+ "grad_norm": 3.833195924758911,
647
+ "learning_rate": 5.074285714285715e-06,
648
+ "loss": 0.9088,
649
+ "step": 2225
650
+ },
651
+ {
652
+ "epoch": 54.888888888888886,
653
+ "grad_norm": 5.242589950561523,
654
+ "learning_rate": 5.002857142857144e-06,
655
+ "loss": 0.9005,
656
+ "step": 2250
657
+ },
658
+ {
659
+ "epoch": 55.49382716049383,
660
+ "grad_norm": 3.781388759613037,
661
+ "learning_rate": 4.931428571428572e-06,
662
+ "loss": 0.9028,
663
+ "step": 2275
664
+ },
665
+ {
666
+ "epoch": 56.098765432098766,
667
+ "grad_norm": 6.0595574378967285,
668
+ "learning_rate": 4.86e-06,
669
+ "loss": 0.9124,
670
+ "step": 2300
671
+ },
672
+ {
673
+ "epoch": 56.71604938271605,
674
+ "grad_norm": 2.7515597343444824,
675
+ "learning_rate": 4.788571428571429e-06,
676
+ "loss": 0.9025,
677
+ "step": 2325
678
+ },
679
+ {
680
+ "epoch": 57.32098765432099,
681
+ "grad_norm": 6.520521640777588,
682
+ "learning_rate": 4.717142857142857e-06,
683
+ "loss": 0.9065,
684
+ "step": 2350
685
+ },
686
+ {
687
+ "epoch": 57.93827160493827,
688
+ "grad_norm": 3.289445638656616,
689
+ "learning_rate": 4.645714285714286e-06,
690
+ "loss": 0.9004,
691
+ "step": 2375
692
+ },
693
+ {
694
+ "epoch": 58.54320987654321,
695
+ "grad_norm": 3.6132805347442627,
696
+ "learning_rate": 4.574285714285714e-06,
697
+ "loss": 0.9021,
698
+ "step": 2400
699
+ },
700
+ {
701
+ "epoch": 59.148148148148145,
702
+ "grad_norm": 5.021145343780518,
703
+ "learning_rate": 4.5028571428571434e-06,
704
+ "loss": 0.8957,
705
+ "step": 2425
706
+ },
707
+ {
708
+ "epoch": 59.76543209876543,
709
+ "grad_norm": 5.366466522216797,
710
+ "learning_rate": 4.431428571428572e-06,
711
+ "loss": 0.8986,
712
+ "step": 2450
713
+ },
714
+ {
715
+ "epoch": 60.370370370370374,
716
+ "grad_norm": 5.833218574523926,
717
+ "learning_rate": 4.360000000000001e-06,
718
+ "loss": 0.9045,
719
+ "step": 2475
720
+ },
721
+ {
722
+ "epoch": 60.98765432098765,
723
+ "grad_norm": 5.301181793212891,
724
+ "learning_rate": 4.288571428571429e-06,
725
+ "loss": 0.8975,
726
+ "step": 2500
727
+ },
728
+ {
729
+ "epoch": 61.592592592592595,
730
+ "grad_norm": 3.989539861679077,
731
+ "learning_rate": 4.217142857142858e-06,
732
+ "loss": 0.9021,
733
+ "step": 2525
734
+ },
735
+ {
736
+ "epoch": 62.19753086419753,
737
+ "grad_norm": 13.111737251281738,
738
+ "learning_rate": 4.145714285714286e-06,
739
+ "loss": 0.9043,
740
+ "step": 2550
741
+ },
742
+ {
743
+ "epoch": 62.81481481481482,
744
+ "grad_norm": 3.4066903591156006,
745
+ "learning_rate": 4.074285714285714e-06,
746
+ "loss": 0.8929,
747
+ "step": 2575
748
+ },
749
+ {
750
+ "epoch": 63.41975308641975,
751
+ "grad_norm": 3.9170608520507812,
752
+ "learning_rate": 4.002857142857143e-06,
753
+ "loss": 0.8998,
754
+ "step": 2600
755
+ },
756
+ {
757
+ "epoch": 64.0246913580247,
758
+ "grad_norm": 3.5934042930603027,
759
+ "learning_rate": 3.931428571428571e-06,
760
+ "loss": 0.898,
761
+ "step": 2625
762
+ },
763
+ {
764
+ "epoch": 64.64197530864197,
765
+ "grad_norm": 3.3771822452545166,
766
+ "learning_rate": 3.86e-06,
767
+ "loss": 0.901,
768
+ "step": 2650
769
+ },
770
+ {
771
+ "epoch": 65.24691358024691,
772
+ "grad_norm": 3.5741279125213623,
773
+ "learning_rate": 3.7885714285714285e-06,
774
+ "loss": 0.903,
775
+ "step": 2675
776
+ },
777
+ {
778
+ "epoch": 65.8641975308642,
779
+ "grad_norm": 4.369333267211914,
780
+ "learning_rate": 3.7171428571428575e-06,
781
+ "loss": 0.8907,
782
+ "step": 2700
783
+ },
784
+ {
785
+ "epoch": 66.46913580246914,
786
+ "grad_norm": 2.9996423721313477,
787
+ "learning_rate": 3.6457142857142857e-06,
788
+ "loss": 0.9008,
789
+ "step": 2725
790
+ },
791
+ {
792
+ "epoch": 67.07407407407408,
793
+ "grad_norm": 5.098217487335205,
794
+ "learning_rate": 3.5742857142857147e-06,
795
+ "loss": 0.8979,
796
+ "step": 2750
797
+ },
798
+ {
799
+ "epoch": 67.69135802469135,
800
+ "grad_norm": 3.8548665046691895,
801
+ "learning_rate": 3.5028571428571433e-06,
802
+ "loss": 0.8906,
803
+ "step": 2775
804
+ },
805
+ {
806
+ "epoch": 68.29629629629629,
807
+ "grad_norm": 4.787322521209717,
808
+ "learning_rate": 3.431428571428572e-06,
809
+ "loss": 0.8949,
810
+ "step": 2800
811
+ },
812
+ {
813
+ "epoch": 68.91358024691358,
814
+ "grad_norm": 2.8501498699188232,
815
+ "learning_rate": 3.3600000000000004e-06,
816
+ "loss": 0.8932,
817
+ "step": 2825
818
+ },
819
+ {
820
+ "epoch": 69.51851851851852,
821
+ "grad_norm": 7.697382926940918,
822
+ "learning_rate": 3.2885714285714286e-06,
823
+ "loss": 0.8961,
824
+ "step": 2850
825
+ },
826
+ {
827
+ "epoch": 70.12345679012346,
828
+ "grad_norm": 3.5617403984069824,
829
+ "learning_rate": 3.2171428571428576e-06,
830
+ "loss": 0.8975,
831
+ "step": 2875
832
+ },
833
+ {
834
+ "epoch": 70.74074074074075,
835
+ "grad_norm": 4.286247253417969,
836
+ "learning_rate": 3.1457142857142858e-06,
837
+ "loss": 0.8988,
838
+ "step": 2900
839
+ },
840
+ {
841
+ "epoch": 71.34567901234568,
842
+ "grad_norm": 3.0174379348754883,
843
+ "learning_rate": 3.074285714285715e-06,
844
+ "loss": 0.8986,
845
+ "step": 2925
846
+ },
847
+ {
848
+ "epoch": 71.96296296296296,
849
+ "grad_norm": 5.708584308624268,
850
+ "learning_rate": 3.002857142857143e-06,
851
+ "loss": 0.8888,
852
+ "step": 2950
853
+ },
854
+ {
855
+ "epoch": 72.5679012345679,
856
+ "grad_norm": 7.933815956115723,
857
+ "learning_rate": 2.9314285714285716e-06,
858
+ "loss": 0.9,
859
+ "step": 2975
860
+ },
861
+ {
862
+ "epoch": 73.17283950617283,
863
+ "grad_norm": 3.4261972904205322,
864
+ "learning_rate": 2.86e-06,
865
+ "loss": 0.8951,
866
+ "step": 3000
867
+ },
868
+ {
869
+ "epoch": 73.17283950617283,
870
+ "eval_loss": 0.8869494795799255,
871
+ "eval_runtime": 2.1798,
872
+ "eval_samples_per_second": 66.061,
873
+ "eval_steps_per_second": 33.03,
874
+ "step": 3000
875
+ }
876
+ ],
877
+ "logging_steps": 25,
878
+ "max_steps": 4000,
879
+ "num_input_tokens_seen": 0,
880
+ "num_train_epochs": 98,
881
+ "save_steps": 1000,
882
+ "stateful_callbacks": {
883
+ "TrainerControl": {
884
+ "args": {
885
+ "should_epoch_stop": false,
886
+ "should_evaluate": false,
887
+ "should_log": false,
888
+ "should_save": true,
889
+ "should_training_stop": false
890
+ },
891
+ "attributes": {}
892
+ }
893
+ },
894
+ "total_flos": 2464002717960960.0,
895
+ "train_batch_size": 4,
896
+ "trial_name": null,
897
+ "trial_params": null
898
+ }
checkpoint-3000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66d367dc04409d63341644c780ebdb997e8756f9aa9f6d110afc5d9ab8de84be
3
+ size 5905
checkpoint-4000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-4000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.55.4",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-4000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.55.4"
9
+ }
checkpoint-4000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b10dd87b217ab2fc492088d02d67c7955fbbff9f22b6fda9133dfa1744e6d9d
3
+ size 577789320
checkpoint-4000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e3c148661528c4aa2cc3b96d89de7440a524fdfc4c68416d7a8438ea0d22f51
3
+ size 1155777946
checkpoint-4000/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
4
+ "feature_size": 1,
5
+ "fmax": 7600,
6
+ "fmin": 80,
7
+ "frame_signal_scale": 1.0,
8
+ "hop_length": 16,
9
+ "mel_floor": 1e-10,
10
+ "num_mel_bins": 80,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "processor_class": "SpeechT5Processor",
14
+ "reduction_factor": 2,
15
+ "return_attention_mask": true,
16
+ "sampling_rate": 16000,
17
+ "win_function": "hann_window",
18
+ "win_length": 64
19
+ }
checkpoint-4000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba3f2e2e55ab9cb538d7b0b1066ff8ea9c9ba098fb7f0715213c6343cb11c11
3
+ size 14645
checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:700b408dba7ef9825c572f76cd9846e502c0ecd58f44e9e252d68786437bee70
3
+ size 1465
checkpoint-4000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }