{
  "architectures": [
    "AudioVAE"
  ],
  "dec_kwargs": {
    "backbone": {
      "_attn_implementation": "flash_attention_2",
      "attention_dropout": 0.0,
      "attn_implementation": null,
      "bos_token_id": 151643,
      "eos_token_id": 151645,
      "hidden_act": "silu",
      "hidden_size": 896,
      "initializer_range": 0.02,
      "intermediate_size": 4864,
      "is_causal": true,
      "max_position_embeddings": 32768,
      "max_window_layers": 0,
      "model_type": "qwen2",
      "num_attention_heads": 14,
      "num_hidden_layers": 24,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.43.1",
      "use_cache": false,
      "use_sliding_window": true,
      "vocab_size": 1
    },
    "latent_dim": 64,
    "output_dim": 320
  },
  "enc_kwargs": {
    "backbone": {
      "_attn_implementation": "flash_attention_2",
      "attention_dropout": 0.0,
      "attn_implementation": null,
      "bos_token_id": 151643,
      "eos_token_id": 151645,
      "hidden_act": "silu",
      "hidden_size": 896,
      "initializer_range": 0.02,
      "intermediate_size": 4864,
      "is_causal": true,
      "max_position_embeddings": 32768,
      "max_window_layers": 0,
      "model_type": "qwen2",
      "num_attention_heads": 14,
      "num_hidden_layers": 24,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.43.1",
      "use_cache": false,
      "use_sliding_window": true,
      "vocab_size": 1
    },
    "hop_size": 320,
    "input_dim": 320,
    "latent_dim": 64
  },
  "hifi_gan_disc_kwargs": {
    "channel_increasing_factor": 4,
    "channels": 16,
    "max_downsample_channels": 512,
    "periods": [
      2,
      3,
      5,
      7,
      11
    ]
  },
  "init_method": "kaiming",
  "lambda_adv": 1.0,
  "lambda_disc": 1.0,
  "lambda_feat_match_loss": 1.0,
  "lambda_mel_loss": 1.0,
  "lambda_semantic": 2.0,
  "patch_size": -1,
  "semantic_module_kwargs": {
    "causal": true,
    "whisper_encoder": {
      "n_ctx": 1500,
      "n_head": 20,
      "n_layer": 32,
      "n_mels": 128,
      "n_state": 1280
    }
  },
  "spec_disc_kwargs": {
    "channels": 32,
    "downsample_scales": [
      2,
      2,
      2
    ],
    "in_channels": 1,
    "kernel_sizes": [
      5,
      3
    ],
    "max_downsample_channels": 512,
    "out_channels": 1,
    "stft_params": {
      "fft_sizes": [
        78,
        126,
        206,
        334,
        542,
        876,
        1418,
        2296
      ],
      "hop_sizes": [
        39,
        63,
        103,
        167,
        271,
        438,
        709,
        1148
      ],
      "win_lengths": [
        78,
        126,
        206,
        334,
        542,
        876,
        1418,
        2296
      ],
      "window": "hann_window"
    },
    "use_weight_norm": true
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4"
}