Update model
Browse files- README.md +292 -0
- dump/raw/overall_base/metric2id +87 -0
- dump/raw/overall_base/metric2type +87 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/24epoch.pth +3 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/config.yaml +217 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/acc_ar_decoder.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/backward_time.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/clip.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/forward_time.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/gpu_max_cached_mem_GB.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/grad_norm.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/iter_time.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss_ar_decoder.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss_scale.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/optim0_lr0.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/optim_step_time.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/train_time.png +0 -0
- exp/universa_universa_ar_overall_base_token_wavlm_large/images/value_ar_decoder.png +0 -0
- meta.yaml +8 -0
README.md
ADDED
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- espnet
|
4 |
+
- audio
|
5 |
+
- universa
|
6 |
+
language: multilingual
|
7 |
+
datasets:
|
8 |
+
- universa_unite
|
9 |
+
license: cc-by-4.0
|
10 |
+
---
|
11 |
+
|
12 |
+
## ESPnet2 universa model
|
13 |
+
|
14 |
+
### `espnet/arecho_base_v0.1-large-decoder`
|
15 |
+
|
16 |
+
This model was trained by ftshijt using the universa_unite recipe in [espnet](https://github.com/espnet/espnet/).
|
17 |
+
|
18 |
+
### Demo: How to use in ESPnet2
|
19 |
+
|
20 |
+
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
|
21 |
+
if you haven't done that already.
|
22 |
+
|
23 |
+
```bash
|
24 |
+
cd espnet
|
25 |
+
git checkout 69996dc206e556ec48db77b6cc385ff1d32895b3
|
26 |
+
pip install -e .
|
27 |
+
cd egs2/universa_unite/uni_versa1
|
28 |
+
./run.sh --skip_data_prep false --skip_train true --download_model espnet/arecho_base_v0.1-large-decoder
|
29 |
+
```
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
## universa config
|
34 |
+
|
35 |
+
<details><summary>expand</summary>
|
36 |
+
|
37 |
+
```yaml
|
38 |
+
config: conf/train_aruniversa_wavlm_large.yaml
|
39 |
+
print_config: false
|
40 |
+
log_level: INFO
|
41 |
+
drop_last_iter: false
|
42 |
+
dry_run: false
|
43 |
+
iterator_type: sequence
|
44 |
+
valid_iterator_type: null
|
45 |
+
output_dir: exp/universa_universa_ar_overall_base_token_wavlm_large
|
46 |
+
ngpu: 1
|
47 |
+
seed: 777
|
48 |
+
num_workers: 1
|
49 |
+
num_att_plot: 0
|
50 |
+
dist_backend: nccl
|
51 |
+
dist_init_method: env://
|
52 |
+
dist_world_size: null
|
53 |
+
dist_rank: null
|
54 |
+
local_rank: 0
|
55 |
+
dist_master_addr: null
|
56 |
+
dist_master_port: null
|
57 |
+
dist_launcher: null
|
58 |
+
multiprocessing_distributed: false
|
59 |
+
unused_parameters: false
|
60 |
+
sharded_ddp: false
|
61 |
+
use_deepspeed: false
|
62 |
+
deepspeed_config: null
|
63 |
+
gradient_as_bucket_view: true
|
64 |
+
ddp_comm_hook: null
|
65 |
+
cudnn_enabled: true
|
66 |
+
cudnn_benchmark: false
|
67 |
+
cudnn_deterministic: false
|
68 |
+
use_tf32: false
|
69 |
+
collect_stats: false
|
70 |
+
write_collected_feats: false
|
71 |
+
max_epoch: 100
|
72 |
+
patience: null
|
73 |
+
val_scheduler_criterion:
|
74 |
+
- valid
|
75 |
+
- loss
|
76 |
+
early_stopping_criterion:
|
77 |
+
- valid
|
78 |
+
- loss
|
79 |
+
- min
|
80 |
+
best_model_criterion:
|
81 |
+
- - train
|
82 |
+
- loss
|
83 |
+
- min
|
84 |
+
- - valid
|
85 |
+
- loss
|
86 |
+
- min
|
87 |
+
- - train
|
88 |
+
- acc
|
89 |
+
- max
|
90 |
+
- - valid
|
91 |
+
- acc
|
92 |
+
- max
|
93 |
+
keep_nbest_models: 1
|
94 |
+
nbest_averaging_interval: 0
|
95 |
+
grad_clip: -1
|
96 |
+
grad_clip_type: 2.0
|
97 |
+
grad_noise: false
|
98 |
+
accum_grad: 2
|
99 |
+
no_forward_run: false
|
100 |
+
resume: true
|
101 |
+
train_dtype: float32
|
102 |
+
use_amp: false
|
103 |
+
log_interval: 50
|
104 |
+
use_matplotlib: true
|
105 |
+
use_tensorboard: true
|
106 |
+
create_graph_in_tensorboard: false
|
107 |
+
use_wandb: false
|
108 |
+
wandb_project: null
|
109 |
+
wandb_id: null
|
110 |
+
wandb_entity: null
|
111 |
+
wandb_name: null
|
112 |
+
wandb_model_log_interval: -1
|
113 |
+
detect_anomaly: false
|
114 |
+
use_adapter: false
|
115 |
+
adapter: lora
|
116 |
+
save_strategy: all
|
117 |
+
adapter_conf: {}
|
118 |
+
pretrain_path: null
|
119 |
+
init_param: []
|
120 |
+
ignore_init_mismatch: false
|
121 |
+
freeze_param:
|
122 |
+
- frontend.upstream
|
123 |
+
num_iters_per_epoch: null
|
124 |
+
batch_size: 16
|
125 |
+
valid_batch_size: null
|
126 |
+
batch_bins: 1000000
|
127 |
+
valid_batch_bins: null
|
128 |
+
category_sample_size: 10
|
129 |
+
train_shape_file:
|
130 |
+
- exp/universa_stats_overall_base/train/audio_shape
|
131 |
+
- exp/universa_stats_overall_base/train/ref_audio_shape
|
132 |
+
valid_shape_file:
|
133 |
+
- exp/universa_stats_overall_base/valid/audio_shape
|
134 |
+
- exp/universa_stats_overall_base/valid/ref_audio_shape
|
135 |
+
batch_type: sorted
|
136 |
+
valid_batch_type: null
|
137 |
+
fold_length:
|
138 |
+
- 256000
|
139 |
+
sort_in_batch: descending
|
140 |
+
shuffle_within_batch: false
|
141 |
+
sort_batch: descending
|
142 |
+
multiple_iterator: false
|
143 |
+
chunk_length: 500
|
144 |
+
chunk_shift_ratio: 0.5
|
145 |
+
num_cache_chunks: 1024
|
146 |
+
chunk_excluded_key_prefixes: []
|
147 |
+
chunk_default_fs: null
|
148 |
+
chunk_max_abs_length: null
|
149 |
+
chunk_discard_short_samples: true
|
150 |
+
train_data_path_and_name_and_type:
|
151 |
+
- - dump/raw/overall_base/wav.scp
|
152 |
+
- audio
|
153 |
+
- kaldi_ark
|
154 |
+
- - dump/raw/overall_base/metric.scp
|
155 |
+
- metrics
|
156 |
+
- metric
|
157 |
+
- - dump/raw/overall_base/ref_wav.scp
|
158 |
+
- ref_audio
|
159 |
+
- kaldi_ark
|
160 |
+
valid_data_path_and_name_and_type:
|
161 |
+
- - dump/raw/overall_dev/wav.scp
|
162 |
+
- audio
|
163 |
+
- kaldi_ark
|
164 |
+
- - dump/raw/overall_dev/metric.scp
|
165 |
+
- metrics
|
166 |
+
- metric
|
167 |
+
- - dump/raw/overall_dev/ref_wav.scp
|
168 |
+
- ref_audio
|
169 |
+
- kaldi_ark
|
170 |
+
multi_task_dataset: false
|
171 |
+
allow_variable_data_keys: false
|
172 |
+
max_cache_size: 0.0
|
173 |
+
max_cache_fd: 32
|
174 |
+
allow_multi_rates: false
|
175 |
+
valid_max_cache_size: null
|
176 |
+
exclude_weight_decay: false
|
177 |
+
exclude_weight_decay_conf: {}
|
178 |
+
optim: adamw
|
179 |
+
optim_conf:
|
180 |
+
lr: 0.001
|
181 |
+
scheduler: warmuplr
|
182 |
+
scheduler_conf:
|
183 |
+
warmup_steps: 25000
|
184 |
+
metric2id: dump/raw/overall_base/metric2id
|
185 |
+
metric2type: dump/raw/overall_base/metric2type
|
186 |
+
metric_pad_value: -100
|
187 |
+
token_list: null
|
188 |
+
metric_token_info: data/token_list/metric_500_percentile_overall_base_w-numerical/tokens.json
|
189 |
+
metric_token_pad_value: 0
|
190 |
+
tokenize_numerical_metric: true
|
191 |
+
init: null
|
192 |
+
model_conf: {}
|
193 |
+
use_ref_audio: true
|
194 |
+
use_ref_text: false
|
195 |
+
use_preprocessor: true
|
196 |
+
token_type: bpe
|
197 |
+
bpemodel: null
|
198 |
+
non_linguistic_symbols: null
|
199 |
+
cleaner: null
|
200 |
+
g2p: null
|
201 |
+
sequential_metric: true
|
202 |
+
randomize_sequential_metric: true
|
203 |
+
frontend: s3prl
|
204 |
+
frontend_conf:
|
205 |
+
frontend_conf:
|
206 |
+
upstream: wavlm_large
|
207 |
+
download_dir: ./hub
|
208 |
+
multilayer_feature: true
|
209 |
+
universa: ar_universa
|
210 |
+
universa_conf:
|
211 |
+
embedding_dim: 512
|
212 |
+
audio_encoder_type: transformer
|
213 |
+
audio_encoder_params:
|
214 |
+
num_blocks: 4
|
215 |
+
attention_heads: 4
|
216 |
+
linear_units: 1024
|
217 |
+
dropout_rate: 0.1
|
218 |
+
positional_dropout_rate: 0.1
|
219 |
+
attention_dropout_rate: 0.1
|
220 |
+
input_layer: conv2d
|
221 |
+
normalize_before: true
|
222 |
+
concat_after: false
|
223 |
+
positionwise_layer_type: linear
|
224 |
+
positionwise_conv_kernel_size: 1
|
225 |
+
layer_drop_rate: 0.1
|
226 |
+
qk_norm: false
|
227 |
+
use_flash_attn: false
|
228 |
+
cross_attention_type: multihead
|
229 |
+
cross_attention_params:
|
230 |
+
n_head: 2
|
231 |
+
dropout_rate: 0.1
|
232 |
+
metric_decoder_params:
|
233 |
+
num_blocks: 12
|
234 |
+
attention_heads: 8
|
235 |
+
linear_units: 2048
|
236 |
+
dropout_rate: 0.1
|
237 |
+
positional_dropout_rate: 0.1
|
238 |
+
src_attention_dropout_rate: 0.1
|
239 |
+
self_attention_dropout_rate: 0.1
|
240 |
+
input_layer: embed
|
241 |
+
normalize_before: true
|
242 |
+
concat_after: false
|
243 |
+
layer_drop_rate: 0.1
|
244 |
+
qk_norm: false
|
245 |
+
use_flash_attn: false
|
246 |
+
use_rope: true
|
247 |
+
lsm_weight: 0.1
|
248 |
+
sym_sos: <sos>
|
249 |
+
sym_eos: <eos>
|
250 |
+
required:
|
251 |
+
- output_dir
|
252 |
+
- metric2id
|
253 |
+
version: '202503'
|
254 |
+
distributed: false
|
255 |
+
```
|
256 |
+
|
257 |
+
</details>
|
258 |
+
|
259 |
+
|
260 |
+
|
261 |
+
### Citing ESPnet
|
262 |
+
|
263 |
+
```bibtex
|
264 |
+
@inproceedings{watanabe2018espnet,
|
265 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
266 |
+
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
267 |
+
year={2018},
|
268 |
+
booktitle={Proceedings of Interspeech},
|
269 |
+
pages={2207--2211},
|
270 |
+
doi={10.21437/Interspeech.2018-1456},
|
271 |
+
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
272 |
+
}
|
273 |
+
|
274 |
+
|
275 |
+
|
276 |
+
|
277 |
+
|
278 |
+
|
279 |
+
```
|
280 |
+
|
281 |
+
or arXiv:
|
282 |
+
|
283 |
+
```bibtex
|
284 |
+
@misc{watanabe2018espnet,
|
285 |
+
title={ESPnet: End-to-End Speech Processing Toolkit},
|
286 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
287 |
+
year={2018},
|
288 |
+
eprint={1804.00015},
|
289 |
+
archivePrefix={arXiv},
|
290 |
+
primaryClass={cs.CL}
|
291 |
+
}
|
292 |
+
```
|
dump/raw/overall_base/metric2id
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
srmr
|
2 |
+
language
|
3 |
+
nisqa_mos_pred
|
4 |
+
nisqa_noi_pred
|
5 |
+
nisqa_dis_pred
|
6 |
+
nisqa_col_pred
|
7 |
+
nisqa_loud_pred
|
8 |
+
sheet_ssqa
|
9 |
+
utmos
|
10 |
+
utmosv2
|
11 |
+
dns_overall
|
12 |
+
dns_p808
|
13 |
+
plcmos
|
14 |
+
singmos
|
15 |
+
scoreq_nr
|
16 |
+
se_sdr
|
17 |
+
se_sar
|
18 |
+
se_si_snr
|
19 |
+
se_ci_sdr
|
20 |
+
pam_score
|
21 |
+
speaking_rate
|
22 |
+
audiobox_aesthetics_CE
|
23 |
+
audiobox_aesthetics_CU
|
24 |
+
audiobox_aesthetics_PC
|
25 |
+
audiobox_aesthetics_PQ
|
26 |
+
asvspoof_score
|
27 |
+
real_language
|
28 |
+
qwen_speaker_count
|
29 |
+
qwen_speaker_gender
|
30 |
+
qwen_speaker_age
|
31 |
+
qwen_speech_impairment
|
32 |
+
qwen_voice_pitch
|
33 |
+
qwen_pitch_range
|
34 |
+
qwen_voice_type
|
35 |
+
qwen_speech_volume_level
|
36 |
+
qwen_language
|
37 |
+
qwen_speech_register
|
38 |
+
qwen_vocabulary_complexity
|
39 |
+
qwen_speech_purpose
|
40 |
+
qwen_speech_emotion
|
41 |
+
qwen_speech_clarity
|
42 |
+
qwen_speech_rate
|
43 |
+
qwen_speaking_style
|
44 |
+
qwen_laughter_crying
|
45 |
+
qwen_speech_background_environment
|
46 |
+
qwen_recording_quality
|
47 |
+
qwen_channel_type
|
48 |
+
snr_simulation
|
49 |
+
rir_room_size
|
50 |
+
nomad
|
51 |
+
emotion_similarity
|
52 |
+
noresqa_score
|
53 |
+
speech_bert
|
54 |
+
speech_bleu
|
55 |
+
speech_token_distance
|
56 |
+
scoreq_ref
|
57 |
+
asr_match_error_rate
|
58 |
+
ref_text_length
|
59 |
+
pred_text_length
|
60 |
+
spk_similarity
|
61 |
+
rt60
|
62 |
+
visqol
|
63 |
+
pysepm_fwsegsnr
|
64 |
+
pysepm_llr
|
65 |
+
pysepm_wss
|
66 |
+
pysepm_cd
|
67 |
+
pysepm_c_sig
|
68 |
+
pysepm_c_bak
|
69 |
+
pysepm_c_ovl
|
70 |
+
pysepm_csii_high
|
71 |
+
pysepm_csii_mid
|
72 |
+
pysepm_csii_low
|
73 |
+
pysepm_ncm
|
74 |
+
mcd
|
75 |
+
f0rmse
|
76 |
+
f0corr
|
77 |
+
pesq
|
78 |
+
stoi
|
79 |
+
sdr
|
80 |
+
sar
|
81 |
+
si_snr
|
82 |
+
ci_sdr
|
83 |
+
nisqa_real_mos
|
84 |
+
wer
|
85 |
+
cer
|
86 |
+
urgent_mos
|
87 |
+
voicemos_real_mos
|
dump/raw/overall_base/metric2type
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
srmr numerical
|
2 |
+
language categorical
|
3 |
+
nisqa_mos_pred numerical
|
4 |
+
nisqa_noi_pred numerical
|
5 |
+
nisqa_dis_pred numerical
|
6 |
+
nisqa_col_pred numerical
|
7 |
+
nisqa_loud_pred numerical
|
8 |
+
sheet_ssqa numerical
|
9 |
+
utmos numerical
|
10 |
+
utmosv2 numerical
|
11 |
+
dns_overall numerical
|
12 |
+
dns_p808 numerical
|
13 |
+
plcmos numerical
|
14 |
+
singmos numerical
|
15 |
+
scoreq_nr numerical
|
16 |
+
se_sdr numerical
|
17 |
+
se_sar numerical
|
18 |
+
se_si_snr numerical
|
19 |
+
se_ci_sdr numerical
|
20 |
+
pam_score numerical
|
21 |
+
speaking_rate numerical
|
22 |
+
audiobox_aesthetics_CE numerical
|
23 |
+
audiobox_aesthetics_CU numerical
|
24 |
+
audiobox_aesthetics_PC numerical
|
25 |
+
audiobox_aesthetics_PQ numerical
|
26 |
+
asvspoof_score numerical
|
27 |
+
real_language categorical
|
28 |
+
qwen_speaker_count numerical
|
29 |
+
qwen_speaker_gender categorical
|
30 |
+
qwen_speaker_age categorical
|
31 |
+
qwen_speech_impairment categorical
|
32 |
+
qwen_voice_pitch categorical
|
33 |
+
qwen_pitch_range categorical
|
34 |
+
qwen_voice_type categorical
|
35 |
+
qwen_speech_volume_level categorical
|
36 |
+
qwen_language categorical
|
37 |
+
qwen_speech_register categorical
|
38 |
+
qwen_vocabulary_complexity categorical
|
39 |
+
qwen_speech_purpose categorical
|
40 |
+
qwen_speech_emotion categorical
|
41 |
+
qwen_speech_clarity categorical
|
42 |
+
qwen_speech_rate categorical
|
43 |
+
qwen_speaking_style categorical
|
44 |
+
qwen_laughter_crying categorical
|
45 |
+
qwen_speech_background_environment categorical
|
46 |
+
qwen_recording_quality categorical
|
47 |
+
qwen_channel_type categorical
|
48 |
+
snr_simulation numerical
|
49 |
+
rir_room_size categorical
|
50 |
+
nomad numerical
|
51 |
+
emotion_similarity numerical
|
52 |
+
noresqa_score numerical
|
53 |
+
speech_bert numerical
|
54 |
+
speech_bleu numerical
|
55 |
+
speech_token_distance numerical
|
56 |
+
scoreq_ref numerical
|
57 |
+
asr_match_error_rate numerical
|
58 |
+
ref_text_length numerical
|
59 |
+
pred_text_length numerical
|
60 |
+
spk_similarity numerical
|
61 |
+
rt60 numerical
|
62 |
+
visqol numerical
|
63 |
+
pysepm_fwsegsnr numerical
|
64 |
+
pysepm_llr numerical
|
65 |
+
pysepm_wss numerical
|
66 |
+
pysepm_cd numerical
|
67 |
+
pysepm_c_sig numerical
|
68 |
+
pysepm_c_bak numerical
|
69 |
+
pysepm_c_ovl numerical
|
70 |
+
pysepm_csii_high numerical
|
71 |
+
pysepm_csii_mid numerical
|
72 |
+
pysepm_csii_low numerical
|
73 |
+
pysepm_ncm numerical
|
74 |
+
mcd numerical
|
75 |
+
f0rmse numerical
|
76 |
+
f0corr numerical
|
77 |
+
pesq numerical
|
78 |
+
stoi numerical
|
79 |
+
sdr numerical
|
80 |
+
sar numerical
|
81 |
+
si_snr numerical
|
82 |
+
ci_sdr numerical
|
83 |
+
nisqa_real_mos numerical
|
84 |
+
wer numerical
|
85 |
+
cer numerical
|
86 |
+
urgent_mos numerical
|
87 |
+
voicemos_real_mos numerical
|
exp/universa_universa_ar_overall_base_token_wavlm_large/24epoch.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:011b048aa2986a73de04ab5859d71a3ff822e73bbb7a357f082a9872367b8bf6
|
3 |
+
size 2762035766
|
exp/universa_universa_ar_overall_base_token_wavlm_large/config.yaml
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/train_aruniversa_wavlm_large.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
drop_last_iter: false
|
5 |
+
dry_run: false
|
6 |
+
iterator_type: sequence
|
7 |
+
valid_iterator_type: null
|
8 |
+
output_dir: exp/universa_universa_ar_overall_base_token_wavlm_large
|
9 |
+
ngpu: 1
|
10 |
+
seed: 777
|
11 |
+
num_workers: 1
|
12 |
+
num_att_plot: 0
|
13 |
+
dist_backend: nccl
|
14 |
+
dist_init_method: env://
|
15 |
+
dist_world_size: null
|
16 |
+
dist_rank: null
|
17 |
+
local_rank: 0
|
18 |
+
dist_master_addr: null
|
19 |
+
dist_master_port: null
|
20 |
+
dist_launcher: null
|
21 |
+
multiprocessing_distributed: false
|
22 |
+
unused_parameters: false
|
23 |
+
sharded_ddp: false
|
24 |
+
use_deepspeed: false
|
25 |
+
deepspeed_config: null
|
26 |
+
gradient_as_bucket_view: true
|
27 |
+
ddp_comm_hook: null
|
28 |
+
cudnn_enabled: true
|
29 |
+
cudnn_benchmark: false
|
30 |
+
cudnn_deterministic: false
|
31 |
+
use_tf32: false
|
32 |
+
collect_stats: false
|
33 |
+
write_collected_feats: false
|
34 |
+
max_epoch: 100
|
35 |
+
patience: null
|
36 |
+
val_scheduler_criterion:
|
37 |
+
- valid
|
38 |
+
- loss
|
39 |
+
early_stopping_criterion:
|
40 |
+
- valid
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
best_model_criterion:
|
44 |
+
- - train
|
45 |
+
- loss
|
46 |
+
- min
|
47 |
+
- - valid
|
48 |
+
- loss
|
49 |
+
- min
|
50 |
+
- - train
|
51 |
+
- acc
|
52 |
+
- max
|
53 |
+
- - valid
|
54 |
+
- acc
|
55 |
+
- max
|
56 |
+
keep_nbest_models: 1
|
57 |
+
nbest_averaging_interval: 0
|
58 |
+
grad_clip: -1
|
59 |
+
grad_clip_type: 2.0
|
60 |
+
grad_noise: false
|
61 |
+
accum_grad: 2
|
62 |
+
no_forward_run: false
|
63 |
+
resume: true
|
64 |
+
train_dtype: float32
|
65 |
+
use_amp: false
|
66 |
+
log_interval: 50
|
67 |
+
use_matplotlib: true
|
68 |
+
use_tensorboard: true
|
69 |
+
create_graph_in_tensorboard: false
|
70 |
+
use_wandb: false
|
71 |
+
wandb_project: null
|
72 |
+
wandb_id: null
|
73 |
+
wandb_entity: null
|
74 |
+
wandb_name: null
|
75 |
+
wandb_model_log_interval: -1
|
76 |
+
detect_anomaly: false
|
77 |
+
use_adapter: false
|
78 |
+
adapter: lora
|
79 |
+
save_strategy: all
|
80 |
+
adapter_conf: {}
|
81 |
+
pretrain_path: null
|
82 |
+
init_param: []
|
83 |
+
ignore_init_mismatch: false
|
84 |
+
freeze_param:
|
85 |
+
- frontend.upstream
|
86 |
+
num_iters_per_epoch: null
|
87 |
+
batch_size: 16
|
88 |
+
valid_batch_size: null
|
89 |
+
batch_bins: 1000000
|
90 |
+
valid_batch_bins: null
|
91 |
+
category_sample_size: 10
|
92 |
+
train_shape_file:
|
93 |
+
- exp/universa_stats_overall_base/train/audio_shape
|
94 |
+
- exp/universa_stats_overall_base/train/ref_audio_shape
|
95 |
+
valid_shape_file:
|
96 |
+
- exp/universa_stats_overall_base/valid/audio_shape
|
97 |
+
- exp/universa_stats_overall_base/valid/ref_audio_shape
|
98 |
+
batch_type: sorted
|
99 |
+
valid_batch_type: null
|
100 |
+
fold_length:
|
101 |
+
- 256000
|
102 |
+
sort_in_batch: descending
|
103 |
+
shuffle_within_batch: false
|
104 |
+
sort_batch: descending
|
105 |
+
multiple_iterator: false
|
106 |
+
chunk_length: 500
|
107 |
+
chunk_shift_ratio: 0.5
|
108 |
+
num_cache_chunks: 1024
|
109 |
+
chunk_excluded_key_prefixes: []
|
110 |
+
chunk_default_fs: null
|
111 |
+
chunk_max_abs_length: null
|
112 |
+
chunk_discard_short_samples: true
|
113 |
+
train_data_path_and_name_and_type:
|
114 |
+
- - dump/raw/overall_base/wav.scp
|
115 |
+
- audio
|
116 |
+
- kaldi_ark
|
117 |
+
- - dump/raw/overall_base/metric.scp
|
118 |
+
- metrics
|
119 |
+
- metric
|
120 |
+
- - dump/raw/overall_base/ref_wav.scp
|
121 |
+
- ref_audio
|
122 |
+
- kaldi_ark
|
123 |
+
valid_data_path_and_name_and_type:
|
124 |
+
- - dump/raw/overall_dev/wav.scp
|
125 |
+
- audio
|
126 |
+
- kaldi_ark
|
127 |
+
- - dump/raw/overall_dev/metric.scp
|
128 |
+
- metrics
|
129 |
+
- metric
|
130 |
+
- - dump/raw/overall_dev/ref_wav.scp
|
131 |
+
- ref_audio
|
132 |
+
- kaldi_ark
|
133 |
+
multi_task_dataset: false
|
134 |
+
allow_variable_data_keys: false
|
135 |
+
max_cache_size: 0.0
|
136 |
+
max_cache_fd: 32
|
137 |
+
allow_multi_rates: false
|
138 |
+
valid_max_cache_size: null
|
139 |
+
exclude_weight_decay: false
|
140 |
+
exclude_weight_decay_conf: {}
|
141 |
+
optim: adamw
|
142 |
+
optim_conf:
|
143 |
+
lr: 0.001
|
144 |
+
scheduler: warmuplr
|
145 |
+
scheduler_conf:
|
146 |
+
warmup_steps: 25000
|
147 |
+
metric2id: dump/raw/overall_base/metric2id
|
148 |
+
metric2type: dump/raw/overall_base/metric2type
|
149 |
+
metric_pad_value: -100
|
150 |
+
token_list: null
|
151 |
+
metric_token_info: data/token_list/metric_500_percentile_overall_base_w-numerical/tokens.json
|
152 |
+
metric_token_pad_value: 0
|
153 |
+
tokenize_numerical_metric: true
|
154 |
+
init: null
|
155 |
+
model_conf: {}
|
156 |
+
use_ref_audio: true
|
157 |
+
use_ref_text: false
|
158 |
+
use_preprocessor: true
|
159 |
+
token_type: bpe
|
160 |
+
bpemodel: null
|
161 |
+
non_linguistic_symbols: null
|
162 |
+
cleaner: null
|
163 |
+
g2p: null
|
164 |
+
sequential_metric: true
|
165 |
+
randomize_sequential_metric: true
|
166 |
+
frontend: s3prl
|
167 |
+
frontend_conf:
|
168 |
+
frontend_conf:
|
169 |
+
upstream: wavlm_large
|
170 |
+
download_dir: ./hub
|
171 |
+
multilayer_feature: true
|
172 |
+
universa: ar_universa
|
173 |
+
universa_conf:
|
174 |
+
embedding_dim: 512
|
175 |
+
audio_encoder_type: transformer
|
176 |
+
audio_encoder_params:
|
177 |
+
num_blocks: 4
|
178 |
+
attention_heads: 4
|
179 |
+
linear_units: 1024
|
180 |
+
dropout_rate: 0.1
|
181 |
+
positional_dropout_rate: 0.1
|
182 |
+
attention_dropout_rate: 0.1
|
183 |
+
input_layer: conv2d
|
184 |
+
normalize_before: true
|
185 |
+
concat_after: false
|
186 |
+
positionwise_layer_type: linear
|
187 |
+
positionwise_conv_kernel_size: 1
|
188 |
+
layer_drop_rate: 0.1
|
189 |
+
qk_norm: false
|
190 |
+
use_flash_attn: false
|
191 |
+
cross_attention_type: multihead
|
192 |
+
cross_attention_params:
|
193 |
+
n_head: 2
|
194 |
+
dropout_rate: 0.1
|
195 |
+
metric_decoder_params:
|
196 |
+
num_blocks: 12
|
197 |
+
attention_heads: 8
|
198 |
+
linear_units: 2048
|
199 |
+
dropout_rate: 0.1
|
200 |
+
positional_dropout_rate: 0.1
|
201 |
+
src_attention_dropout_rate: 0.1
|
202 |
+
self_attention_dropout_rate: 0.1
|
203 |
+
input_layer: embed
|
204 |
+
normalize_before: true
|
205 |
+
concat_after: false
|
206 |
+
layer_drop_rate: 0.1
|
207 |
+
qk_norm: false
|
208 |
+
use_flash_attn: false
|
209 |
+
use_rope: true
|
210 |
+
lsm_weight: 0.1
|
211 |
+
sym_sos: <sos>
|
212 |
+
sym_eos: <eos>
|
213 |
+
required:
|
214 |
+
- output_dir
|
215 |
+
- metric2id
|
216 |
+
version: '202503'
|
217 |
+
distributed: false
|
exp/universa_universa_ar_overall_base_token_wavlm_large/images/acc_ar_decoder.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/backward_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/clip.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/forward_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/gpu_max_cached_mem_GB.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/grad_norm.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/iter_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss_ar_decoder.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/loss_scale.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/optim0_lr0.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/optim_step_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/train_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_base_token_wavlm_large/images/value_ar_decoder.png
ADDED
![]() |
meta.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
espnet: '202503'
|
2 |
+
files:
|
3 |
+
model_file: exp/universa_universa_ar_overall_base_token_wavlm_large/24epoch.pth
|
4 |
+
python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
|
5 |
+
timestamp: 1749800668.161494
|
6 |
+
torch: 2.6.0.dev20241210+cu124
|
7 |
+
yaml_files:
|
8 |
+
train_config: exp/universa_universa_ar_overall_base_token_wavlm_large/config.yaml
|