---
# network architecture
# encoder related
encoder: conformer
encoder_conf:
  output_size: 512  # dimension of attention
  attention_heads: 8
  linear_units: 2048  # the number of units of position-wise feed forward
  num_blocks: 12  # the number of encoder blocks
  dropout_rate: 0.1
  positional_dropout_rate: 0.0
  attention_dropout_rate: 0.0
  input_layer: conv2d  # encoder input type, you can choose conv2d, conv2d6 and conv2d8
  normalize_before: true
  cnn_module_kernel: 31
  use_cnn_module: true
  activation_type: 'swish'
  pos_enc_layer_type: 'rel_pos'
  selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
  attention_heads: 8
  linear_units: 2048
  num_blocks: 6
  dropout_rate: 0.1
  positional_dropout_rate: 0.0
  self_attention_dropout_rate: 0.0
  src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
  ctc_weight: 1.0
  lsm_weight: 0.1  # label smoothing option
  length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# dataset related
dataset_conf:
  filter_conf:
    max_length: 2000
    min_length: 50
    token_max_length: 400
    token_min_length: 1
  resample_conf:
    resample_rate: 16000
  speed_perturb: false
  fbank_conf:
    num_mel_bins: 80
    frame_shift: 10
    frame_length: 25
    dither: 1.0
  spec_aug: false
  spec_aug_conf:
    num_t_mask: 3
    num_f_mask: 2
    max_t: 50
    max_f: 10
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500  # sort_size should be less than shuffle_size
  batch_conf:
    batch_type: 'dynamic'  # static or dynamic
    max_frames_in_batch: 20000
    batch_size: 3

# wav2vec 2.0 pre-training related
pretrain: true
wav2vec_conf:
  pretrain: true
  quantize_targets: true
  project_targets: true
  latent_vars: 320
  latent_dim: 512
  latent_groups: 2
  w2v_ext_loss: true
  w2v_loss_weights: [1.5, 0]
  mask: true
  mask_prob: 0.65

# training related
grad_clip: 5
accum_grad: 4
max_epoch: 280
log_interval: 100

optim: adam
optim_conf:
  lr: 0.002
scheduler: warmuplr  # pytorch v1.1.0+ required
scheduler_conf:
  warmup_steps: 25000