Add model configuration
Browse files- config.json +83 -83
config.json
CHANGED
@@ -1,111 +1,99 @@
|
|
1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
"architectures": [
|
3 |
"Dolphin"
|
4 |
],
|
5 |
"auto_map": {
|
6 |
"AutoModel": "dolphin.Dolphin"
|
7 |
},
|
8 |
-
"
|
9 |
-
"
|
10 |
-
"
|
11 |
-
"
|
12 |
-
|
13 |
-
|
14 |
-
"kernel_size": 16,
|
15 |
-
"out_channels": 1,
|
16 |
-
"stride": 4
|
17 |
-
},
|
18 |
"module_audio_enc": {
|
19 |
-
"bias": false,
|
20 |
-
"groups": 1,
|
21 |
"in_channels": 1,
|
22 |
-
"kernel_size": 16,
|
23 |
"out_channels": 256,
|
24 |
-
"
|
|
|
|
|
|
|
25 |
},
|
26 |
"module_feature_projector": {
|
27 |
-
"bias": false,
|
28 |
-
"in_channels": 256,
|
29 |
-
"kernel_size": 1,
|
30 |
"num_channels": 256,
|
31 |
-
"out_channels": 128
|
32 |
-
},
|
33 |
-
"module_output_layer": {
|
34 |
"in_channels": 256,
|
35 |
-
"out_channels": 128
|
|
|
|
|
36 |
},
|
37 |
"module_separator": {
|
38 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
"global_blocks": {
|
40 |
-
"dropout_rate": 0.05,
|
41 |
"in_channels": 128,
|
42 |
-
"num_mha_heads": 8
|
|
|
43 |
},
|
44 |
"local_blocks": {
|
45 |
-
"dropout_rate": 0.05,
|
46 |
"in_channels": 128,
|
47 |
-
"kernel_size": 65
|
|
|
48 |
},
|
49 |
-
"spk_attention": {
|
50 |
-
"dropout_rate": 0.05,
|
51 |
-
"in_channels": 128,
|
52 |
-
"num_mha_heads": 8
|
53 |
-
}
|
54 |
-
},
|
55 |
-
"enc_stage": {
|
56 |
"down_conv_layer": {
|
57 |
"in_channels": 128,
|
58 |
"samp_kernel_size": 5
|
59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
60 |
"global_blocks": {
|
61 |
-
"dropout_rate": 0.05,
|
62 |
"in_channels": 128,
|
63 |
-
"num_mha_heads": 8
|
|
|
64 |
},
|
65 |
"local_blocks": {
|
66 |
-
"dropout_rate": 0.05,
|
67 |
"in_channels": 128,
|
68 |
-
"kernel_size": 65
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
}
|
70 |
-
},
|
71 |
-
"num_stages": 4,
|
72 |
-
"relative_positional_encoding": {
|
73 |
-
"embed_v": false,
|
74 |
-
"in_channels": 128,
|
75 |
-
"maxlen": 2000,
|
76 |
-
"num_heads": 8
|
77 |
-
},
|
78 |
-
"simple_fusion": {
|
79 |
-
"out_channels": 128
|
80 |
}
|
81 |
},
|
82 |
-
"
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
"
|
88 |
-
"
|
89 |
-
"
|
90 |
-
|
91 |
-
|
|
|
92 |
"video_encoder_params": {
|
93 |
-
"attn_dim_head": 32,
|
94 |
-
"attn_dropout": 0.0,
|
95 |
-
"attn_heads": 8,
|
96 |
-
"codebook_dim": 64,
|
97 |
-
"codebook_size": 256,
|
98 |
-
"commitment_cost": 1.0,
|
99 |
-
"distill_cost": 1.0,
|
100 |
-
"flash_attn": true,
|
101 |
-
"image_size": 88,
|
102 |
-
"in_channel": 1,
|
103 |
-
"init_channel": 4,
|
104 |
-
"input_conv_kernel_size": [
|
105 |
-
7,
|
106 |
-
7,
|
107 |
-
7
|
108 |
-
],
|
109 |
"layers": [
|
110 |
"residual",
|
111 |
"compress_space",
|
@@ -117,20 +105,32 @@
|
|
117 |
"consecutive_residual",
|
118 |
"attend_space"
|
119 |
],
|
120 |
-
"
|
121 |
-
"
|
|
|
122 |
"max_dim": 32,
|
123 |
-
"
|
|
|
|
|
|
|
|
|
124 |
"output_conv_kernel_size": [
|
125 |
3,
|
126 |
3,
|
127 |
3
|
128 |
],
|
|
|
129 |
"pad_mode": "constant",
|
130 |
-
"
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
}
|
|
|
1 |
{
|
2 |
+
"model_type": "dolphin",
|
3 |
+
"task": "audio_visual_speech_separation",
|
4 |
+
"framework": "pytorch",
|
5 |
+
"license": "apache-2.0",
|
6 |
+
"tags": [
|
7 |
+
"audio",
|
8 |
+
"speech-separation",
|
9 |
+
"audio-visual",
|
10 |
+
"pytorch",
|
11 |
+
"dolphin"
|
12 |
+
],
|
13 |
"architectures": [
|
14 |
"Dolphin"
|
15 |
],
|
16 |
"auto_map": {
|
17 |
"AutoModel": "dolphin.Dolphin"
|
18 |
},
|
19 |
+
"num_stages": 4,
|
20 |
+
"sample_rate": 16000,
|
21 |
+
"vpre_channels": 3872,
|
22 |
+
"vmid_channels": 512,
|
23 |
+
"vin_channels": 64,
|
24 |
+
"vout_channels": 64,
|
|
|
|
|
|
|
|
|
25 |
"module_audio_enc": {
|
|
|
|
|
26 |
"in_channels": 1,
|
|
|
27 |
"out_channels": 256,
|
28 |
+
"kernel_size": 16,
|
29 |
+
"stride": 4,
|
30 |
+
"groups": 1,
|
31 |
+
"bias": false
|
32 |
},
|
33 |
"module_feature_projector": {
|
|
|
|
|
|
|
34 |
"num_channels": 256,
|
|
|
|
|
|
|
35 |
"in_channels": 256,
|
36 |
+
"out_channels": 128,
|
37 |
+
"kernel_size": 1,
|
38 |
+
"bias": false
|
39 |
},
|
40 |
"module_separator": {
|
41 |
+
"num_stages": 4,
|
42 |
+
"relative_positional_encoding": {
|
43 |
+
"in_channels": 128,
|
44 |
+
"num_heads": 8,
|
45 |
+
"maxlen": 2000,
|
46 |
+
"embed_v": false
|
47 |
+
},
|
48 |
+
"enc_stage": {
|
49 |
"global_blocks": {
|
|
|
50 |
"in_channels": 128,
|
51 |
+
"num_mha_heads": 8,
|
52 |
+
"dropout_rate": 0.05
|
53 |
},
|
54 |
"local_blocks": {
|
|
|
55 |
"in_channels": 128,
|
56 |
+
"kernel_size": 65,
|
57 |
+
"dropout_rate": 0.05
|
58 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
"down_conv_layer": {
|
60 |
"in_channels": 128,
|
61 |
"samp_kernel_size": 5
|
62 |
+
}
|
63 |
+
},
|
64 |
+
"simple_fusion": {
|
65 |
+
"out_channels": 128
|
66 |
+
},
|
67 |
+
"dec_stage": {
|
68 |
"global_blocks": {
|
|
|
69 |
"in_channels": 128,
|
70 |
+
"num_mha_heads": 8,
|
71 |
+
"dropout_rate": 0.05
|
72 |
},
|
73 |
"local_blocks": {
|
|
|
74 |
"in_channels": 128,
|
75 |
+
"kernel_size": 65,
|
76 |
+
"dropout_rate": 0.05
|
77 |
+
},
|
78 |
+
"spk_attention": {
|
79 |
+
"in_channels": 128,
|
80 |
+
"num_mha_heads": 8,
|
81 |
+
"dropout_rate": 0.05
|
82 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
}
|
84 |
},
|
85 |
+
"module_output_layer": {
|
86 |
+
"in_channels": 256,
|
87 |
+
"out_channels": 128
|
88 |
+
},
|
89 |
+
"module_audio_dec": {
|
90 |
+
"in_channels": 256,
|
91 |
+
"out_channels": 1,
|
92 |
+
"kernel_size": 16,
|
93 |
+
"stride": 4,
|
94 |
+
"bias": false
|
95 |
+
},
|
96 |
"video_encoder_params": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
"layers": [
|
98 |
"residual",
|
99 |
"compress_space",
|
|
|
105 |
"consecutive_residual",
|
106 |
"attend_space"
|
107 |
],
|
108 |
+
"image_size": 88,
|
109 |
+
"in_channel": 1,
|
110 |
+
"init_channel": 4,
|
111 |
"max_dim": 32,
|
112 |
+
"input_conv_kernel_size": [
|
113 |
+
7,
|
114 |
+
7,
|
115 |
+
7
|
116 |
+
],
|
117 |
"output_conv_kernel_size": [
|
118 |
3,
|
119 |
3,
|
120 |
3
|
121 |
],
|
122 |
+
"residual_conv_kernel_size": 3,
|
123 |
"pad_mode": "constant",
|
124 |
+
"attn_dim_head": 32,
|
125 |
+
"attn_heads": 8,
|
126 |
+
"attn_dropout": 0.0,
|
127 |
+
"flash_attn": true,
|
128 |
+
"linear_attn_dim_head": 8,
|
129 |
+
"linear_attn_heads": 16,
|
130 |
+
"num_quantizers": 1,
|
131 |
+
"codebook_size": 256,
|
132 |
+
"codebook_dim": 64,
|
133 |
+
"commitment_cost": 1.0,
|
134 |
+
"distill_cost": 1.0
|
135 |
+
}
|
136 |
}
|