Commit 85a3918 by ftshijt (parent: bfe0cb2)

Update model
README.md ADDED
@@ -0,0 +1,292 @@
---
tags:
- espnet
- audio
- universa
language: multilingual
datasets:
- universa_unite
license: cc-by-4.0
---

## ESPnet2 universa model

### `espnet/arecho_base_v0.1-large-decoder`

This model was trained by ftshijt using the universa_unite recipe in [espnet](https://github.com/espnet/espnet/).

### Demo: How to use in ESPnet2

Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
if you haven't done so already.

```bash
cd espnet
git checkout 69996dc206e556ec48db77b6cc385ff1d32895b3
pip install -e .
cd egs2/universa_unite/uni_versa1
./run.sh --skip_data_prep false --skip_train true --download_model espnet/arecho_base_v0.1-large-decoder
```

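If you only want the released files rather than a full recipe run, the repository can also be fetched directly with `huggingface_hub`. This is a minimal sketch, assuming the Hub repo id matches the model name above.

```python
# Minimal sketch: download the packed model files (meta.yaml, config, checkpoint)
# directly from the Hub. Assumption: the repo id matches the model name above.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="espnet/arecho_base_v0.1-large-decoder")
print(local_dir)
```
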
## universa config

<details><summary>expand</summary>

```
config: conf/train_aruniversa_wavlm_large.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: exp/universa_universa_ar_overall_base_token_wavlm_large
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
gradient_as_bucket_view: true
ddp_comm_hook: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - train
  - loss
  - min
- - valid
  - loss
  - min
- - train
  - acc
  - max
- - valid
  - acc
  - max
keep_nbest_models: 1
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 2
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- exp/universa_stats_overall_base/train/audio_shape
- exp/universa_stats_overall_base/train/ref_audio_shape
valid_shape_file:
- exp/universa_stats_overall_base/valid/audio_shape
- exp/universa_stats_overall_base/valid/ref_audio_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
- - dump/raw/overall_base/wav.scp
  - audio
  - kaldi_ark
- - dump/raw/overall_base/metric.scp
  - metrics
  - metric
- - dump/raw/overall_base/ref_wav.scp
  - ref_audio
  - kaldi_ark
valid_data_path_and_name_and_type:
- - dump/raw/overall_dev/wav.scp
  - audio
  - kaldi_ark
- - dump/raw/overall_dev/metric.scp
  - metrics
  - metric
- - dump/raw/overall_dev/ref_wav.scp
  - ref_audio
  - kaldi_ark
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
  lr: 0.001
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 25000
metric2id: dump/raw/overall_base/metric2id
metric2type: dump/raw/overall_base/metric2type
metric_pad_value: -100
token_list: null
metric_token_info: data/token_list/metric_500_percentile_overall_base_w-numerical/tokens.json
metric_token_pad_value: 0
tokenize_numerical_metric: true
init: null
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: null
non_linguistic_symbols: null
cleaner: null
g2p: null
sequential_metric: true
randomize_sequential_metric: true
frontend: s3prl
frontend_conf:
  frontend_conf:
    upstream: wavlm_large
  download_dir: ./hub
  multilayer_feature: true
universa: ar_universa
universa_conf:
  embedding_dim: 512
  audio_encoder_type: transformer
  audio_encoder_params:
    num_blocks: 4
    attention_heads: 4
    linear_units: 1024
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    concat_after: false
    positionwise_layer_type: linear
    positionwise_conv_kernel_size: 1
    layer_drop_rate: 0.1
    qk_norm: false
    use_flash_attn: false
  cross_attention_type: multihead
  cross_attention_params:
    n_head: 2
    dropout_rate: 0.1
  metric_decoder_params:
    num_blocks: 12
    attention_heads: 8
    linear_units: 2048
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    input_layer: embed
    normalize_before: true
    concat_after: false
    layer_drop_rate: 0.1
    qk_norm: false
    use_flash_attn: false
    use_rope: true
  lsm_weight: 0.1
  sym_sos: <sos>
  sym_eos: <eos>
required:
- output_dir
- metric2id
version: '202503'
distributed: false
```

</details>

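For reference, the optimizer block above combines AdamW (lr 0.001) with the `warmuplr` scheduler and 25000 warmup steps. The sketch below shows the usual Noam-style inverse-square-root warmup rule this corresponds to; the exact formula is my assumption about what `warmuplr` computes, so treat it as illustrative.

```python
# Illustrative sketch of a Noam-style warmup schedule with the values from the
# config above (base_lr=0.001, warmup_steps=25000). The formula is an assumption.
def warmup_lr(step: int, base_lr: float = 0.001, warmup_steps: int = 25000) -> float:
    # Roughly linear ramp up to warmup_steps, then inverse-sqrt decay.
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

# The learning rate peaks at base_lr around step 25000, then decays.
for step in (1000, 25000, 100000):
    print(step, round(warmup_lr(step), 6))
```
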
### Citing ESPnet

```bibtex
@inproceedings{watanabe2018espnet,
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  title={{ESPnet}: End-to-End Speech Processing Toolkit},
  year={2018},
  booktitle={Proceedings of Interspeech},
  pages={2207--2211},
  doi={10.21437/Interspeech.2018-1456},
  url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
}
```

or arXiv:

```bibtex
@misc{watanabe2018espnet,
  title={ESPnet: End-to-End Speech Processing Toolkit},
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  year={2018},
  eprint={1804.00015},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}
```
dump/raw/overall_base/metric2id ADDED
@@ -0,0 +1,87 @@
srmr
language
nisqa_mos_pred
nisqa_noi_pred
nisqa_dis_pred
nisqa_col_pred
nisqa_loud_pred
sheet_ssqa
utmos
utmosv2
dns_overall
dns_p808
plcmos
singmos
scoreq_nr
se_sdr
se_sar
se_si_snr
se_ci_sdr
pam_score
speaking_rate
audiobox_aesthetics_CE
audiobox_aesthetics_CU
audiobox_aesthetics_PC
audiobox_aesthetics_PQ
asvspoof_score
real_language
qwen_speaker_count
qwen_speaker_gender
qwen_speaker_age
qwen_speech_impairment
qwen_voice_pitch
qwen_pitch_range
qwen_voice_type
qwen_speech_volume_level
qwen_language
qwen_speech_register
qwen_vocabulary_complexity
qwen_speech_purpose
qwen_speech_emotion
qwen_speech_clarity
qwen_speech_rate
qwen_speaking_style
qwen_laughter_crying
qwen_speech_background_environment
qwen_recording_quality
qwen_channel_type
snr_simulation
rir_room_size
nomad
emotion_similarity
noresqa_score
speech_bert
speech_bleu
speech_token_distance
scoreq_ref
asr_match_error_rate
ref_text_length
pred_text_length
spk_similarity
rt60
visqol
pysepm_fwsegsnr
pysepm_llr
pysepm_wss
pysepm_cd
pysepm_c_sig
pysepm_c_bak
pysepm_c_ovl
pysepm_csii_high
pysepm_csii_mid
pysepm_csii_low
pysepm_ncm
mcd
f0rmse
f0corr
pesq
stoi
sdr
sar
si_snr
ci_sdr
nisqa_real_mos
wer
cer
urgent_mos
voicemos_real_mos
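
The metric2id file above is consumed by the training config (`metric2id: dump/raw/overall_base/metric2id`). Below is a minimal sketch of loading it into a name-to-index mapping; the assumption that ids are simply zero-based line indices is mine and not stated in the file itself.

```python
# Minimal sketch: build a metric-name -> id mapping from the metric2id file.
# Assumption: the id is the zero-based line index (not stated in the file).
def load_metric2id(path: str) -> dict[str, int]:
    with open(path, encoding="utf-8") as f:
        names = [line.strip() for line in f if line.strip()]
    return {name: idx for idx, name in enumerate(names)}

metric2id = load_metric2id("dump/raw/overall_base/metric2id")
print(metric2id["utmos"], metric2id["pesq"])
```
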
dump/raw/overall_base/metric2type ADDED
@@ -0,0 +1,87 @@
srmr numerical
language categorical
nisqa_mos_pred numerical
nisqa_noi_pred numerical
nisqa_dis_pred numerical
nisqa_col_pred numerical
nisqa_loud_pred numerical
sheet_ssqa numerical
utmos numerical
utmosv2 numerical
dns_overall numerical
dns_p808 numerical
plcmos numerical
singmos numerical
scoreq_nr numerical
se_sdr numerical
se_sar numerical
se_si_snr numerical
se_ci_sdr numerical
pam_score numerical
speaking_rate numerical
audiobox_aesthetics_CE numerical
audiobox_aesthetics_CU numerical
audiobox_aesthetics_PC numerical
audiobox_aesthetics_PQ numerical
asvspoof_score numerical
real_language categorical
qwen_speaker_count numerical
qwen_speaker_gender categorical
qwen_speaker_age categorical
qwen_speech_impairment categorical
qwen_voice_pitch categorical
qwen_pitch_range categorical
qwen_voice_type categorical
qwen_speech_volume_level categorical
qwen_language categorical
qwen_speech_register categorical
qwen_vocabulary_complexity categorical
qwen_speech_purpose categorical
qwen_speech_emotion categorical
qwen_speech_clarity categorical
qwen_speech_rate categorical
qwen_speaking_style categorical
qwen_laughter_crying categorical
qwen_speech_background_environment categorical
qwen_recording_quality categorical
qwen_channel_type categorical
snr_simulation numerical
rir_room_size categorical
nomad numerical
emotion_similarity numerical
noresqa_score numerical
speech_bert numerical
speech_bleu numerical
speech_token_distance numerical
scoreq_ref numerical
asr_match_error_rate numerical
ref_text_length numerical
pred_text_length numerical
spk_similarity numerical
rt60 numerical
visqol numerical
pysepm_fwsegsnr numerical
pysepm_llr numerical
pysepm_wss numerical
pysepm_cd numerical
pysepm_c_sig numerical
pysepm_c_bak numerical
pysepm_c_ovl numerical
pysepm_csii_high numerical
pysepm_csii_mid numerical
pysepm_csii_low numerical
pysepm_ncm numerical
mcd numerical
f0rmse numerical
f0corr numerical
pesq numerical
stoi numerical
sdr numerical
sar numerical
si_snr numerical
ci_sdr numerical
nisqa_real_mos numerical
wer numerical
cer numerical
urgent_mos numerical
voicemos_real_mos numerical
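
metric2type pairs each metric with `numerical` or `categorical`, which determines how it is handled during training (see `tokenize_numerical_metric` and `metric_pad_value` in the config). A small sketch for splitting the two groups, assuming the whitespace-separated format shown above:

```python
# Minimal sketch: split metrics into numerical and categorical groups,
# assuming each line is "<metric_name> <type>" as shown above.
def load_metric2type(path: str) -> dict[str, str]:
    mapping = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                name, mtype = line.split()
                mapping[name] = mtype
    return mapping

m2t = load_metric2type("dump/raw/overall_base/metric2type")
numerical = [m for m, t in m2t.items() if t == "numerical"]
categorical = [m for m, t in m2t.items() if t == "categorical"]
print(len(numerical), len(categorical))  # 87 metrics in total
```
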
exp/universa_universa_ar_overall_base_token_wavlm_large/24epoch.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:011b048aa2986a73de04ab5859d71a3ff822e73bbb7a357f082a9872367b8bf6
size 2762035766
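
The checkpoint is stored via Git LFS (about 2.8 GB). If you only need this file rather than the whole repository, a single-file fetch is possible; this sketch assumes the Hub repo id matches the model name in the README.

```python
# Minimal sketch: fetch only the ~2.8 GB checkpoint from the Hub.
# Assumption: the repo id matches the model name (espnet/arecho_base_v0.1-large-decoder).
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="espnet/arecho_base_v0.1-large-decoder",
    filename="exp/universa_universa_ar_overall_base_token_wavlm_large/24epoch.pth",
)
print(ckpt_path)
```
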
exp/universa_universa_ar_overall_base_token_wavlm_large/config.yaml ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/acc_ar_decoder.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/backward_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/clip.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/forward_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/gpu_max_cached_mem_GB.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/grad_norm.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/iter_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss_ar_decoder.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss_scale.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/optim0_lr0.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/optim_step_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/train_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm_large/images/value_ar_decoder.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
espnet: '202503'
files:
  model_file: exp/universa_universa_ar_overall_base_token_wavlm_large/24epoch.pth
python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
timestamp: 1749800668.161494
torch: 2.6.0.dev20241210+cu124
yaml_files:
  train_config: exp/universa_universa_ar_overall_base_token_wavlm_large/config.yaml
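
meta.yaml records which checkpoint and training config the packed model points to. A minimal sketch for reading it with PyYAML, assuming the repository has already been downloaded locally (for example via the snapshot download shown earlier):

```python
# Minimal sketch: locate the checkpoint and training config listed in meta.yaml.
# Assumes the repository files are available in the current directory.
import os
import yaml

with open("meta.yaml", encoding="utf-8") as f:
    meta = yaml.safe_load(f)

model_file = meta["files"]["model_file"]            # exp/.../24epoch.pth
train_config = meta["yaml_files"]["train_config"]   # exp/.../config.yaml
print(os.path.exists(model_file), os.path.exists(train_config))
```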