{
  "model": {
    "fm_decoder_downsampling_factor": [1, 2, 4, 2, 1],
    "fm_decoder_num_layers": [2, 2, 4, 4, 4],
    "fm_decoder_cnn_module_kernel": [31, 15, 7, 15, 31],
    "fm_decoder_feedforward_dim": 1536,
    "fm_decoder_num_heads": 4,
    "fm_decoder_dim": 512,
    "text_encoder_num_layers": 4,
    "text_encoder_feedforward_dim": 512,
    "text_encoder_cnn_module_kernel": 9,
    "text_encoder_num_heads": 4,
    "text_encoder_dim": 192,
    "query_head_dim": 32,
    "value_head_dim": 12,
    "pos_head_dim": 4,
    "pos_dim": 48,
    "time_embed_dim": 192,
    "text_embed_dim": 192,
    "feat_dim": 100
  },
  "feature": {
    "sampling_rate": 24000,
    "type": "vocos"
  }
}