JusperLee committed on
Commit
5c69d39
·
verified ·
1 Parent(s): 44a284d

Add model configuration

Browse files
Files changed (1) hide show
  1. config.json +83 -83
config.json CHANGED
@@ -1,111 +1,99 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
2
  "architectures": [
3
  "Dolphin"
4
  ],
5
  "auto_map": {
6
  "AutoModel": "dolphin.Dolphin"
7
  },
8
- "framework": "pytorch",
9
- "license": "apache-2.0",
10
- "model_type": "dolphin",
11
- "module_audio_dec": {
12
- "bias": false,
13
- "in_channels": 256,
14
- "kernel_size": 16,
15
- "out_channels": 1,
16
- "stride": 4
17
- },
18
  "module_audio_enc": {
19
- "bias": false,
20
- "groups": 1,
21
  "in_channels": 1,
22
- "kernel_size": 16,
23
  "out_channels": 256,
24
- "stride": 4
 
 
 
25
  },
26
  "module_feature_projector": {
27
- "bias": false,
28
- "in_channels": 256,
29
- "kernel_size": 1,
30
  "num_channels": 256,
31
- "out_channels": 128
32
- },
33
- "module_output_layer": {
34
  "in_channels": 256,
35
- "out_channels": 128
 
 
36
  },
37
  "module_separator": {
38
- "dec_stage": {
 
 
 
 
 
 
 
39
  "global_blocks": {
40
- "dropout_rate": 0.05,
41
  "in_channels": 128,
42
- "num_mha_heads": 8
 
43
  },
44
  "local_blocks": {
45
- "dropout_rate": 0.05,
46
  "in_channels": 128,
47
- "kernel_size": 65
 
48
  },
49
- "spk_attention": {
50
- "dropout_rate": 0.05,
51
- "in_channels": 128,
52
- "num_mha_heads": 8
53
- }
54
- },
55
- "enc_stage": {
56
  "down_conv_layer": {
57
  "in_channels": 128,
58
  "samp_kernel_size": 5
59
- },
 
 
 
 
 
60
  "global_blocks": {
61
- "dropout_rate": 0.05,
62
  "in_channels": 128,
63
- "num_mha_heads": 8
 
64
  },
65
  "local_blocks": {
66
- "dropout_rate": 0.05,
67
  "in_channels": 128,
68
- "kernel_size": 65
 
 
 
 
 
 
69
  }
70
- },
71
- "num_stages": 4,
72
- "relative_positional_encoding": {
73
- "embed_v": false,
74
- "in_channels": 128,
75
- "maxlen": 2000,
76
- "num_heads": 8
77
- },
78
- "simple_fusion": {
79
- "out_channels": 128
80
  }
81
  },
82
- "num_stages": 4,
83
- "sample_rate": 16000,
84
- "tags": [
85
- "audio",
86
- "speech-separation",
87
- "audio-visual",
88
- "pytorch",
89
- "dolphin"
90
- ],
91
- "task": "audio_visual_speech_separation",
 
92
  "video_encoder_params": {
93
- "attn_dim_head": 32,
94
- "attn_dropout": 0.0,
95
- "attn_heads": 8,
96
- "codebook_dim": 64,
97
- "codebook_size": 256,
98
- "commitment_cost": 1.0,
99
- "distill_cost": 1.0,
100
- "flash_attn": true,
101
- "image_size": 88,
102
- "in_channel": 1,
103
- "init_channel": 4,
104
- "input_conv_kernel_size": [
105
- 7,
106
- 7,
107
- 7
108
- ],
109
  "layers": [
110
  "residual",
111
  "compress_space",
@@ -117,20 +105,32 @@
117
  "consecutive_residual",
118
  "attend_space"
119
  ],
120
- "linear_attn_dim_head": 8,
121
- "linear_attn_heads": 16,
 
122
  "max_dim": 32,
123
- "num_quantizers": 1,
 
 
 
 
124
  "output_conv_kernel_size": [
125
  3,
126
  3,
127
  3
128
  ],
 
129
  "pad_mode": "constant",
130
- "residual_conv_kernel_size": 3
131
- },
132
- "vin_channels": 64,
133
- "vmid_channels": 512,
134
- "vout_channels": 64,
135
- "vpre_channels": 3872
 
 
 
 
 
 
136
  }
 
1
  {
2
+ "model_type": "dolphin",
3
+ "task": "audio_visual_speech_separation",
4
+ "framework": "pytorch",
5
+ "license": "apache-2.0",
6
+ "tags": [
7
+ "audio",
8
+ "speech-separation",
9
+ "audio-visual",
10
+ "pytorch",
11
+ "dolphin"
12
+ ],
13
  "architectures": [
14
  "Dolphin"
15
  ],
16
  "auto_map": {
17
  "AutoModel": "dolphin.Dolphin"
18
  },
19
+ "num_stages": 4,
20
+ "sample_rate": 16000,
21
+ "vpre_channels": 3872,
22
+ "vmid_channels": 512,
23
+ "vin_channels": 64,
24
+ "vout_channels": 64,
 
 
 
 
25
  "module_audio_enc": {
 
 
26
  "in_channels": 1,
 
27
  "out_channels": 256,
28
+ "kernel_size": 16,
29
+ "stride": 4,
30
+ "groups": 1,
31
+ "bias": false
32
  },
33
  "module_feature_projector": {
 
 
 
34
  "num_channels": 256,
 
 
 
35
  "in_channels": 256,
36
+ "out_channels": 128,
37
+ "kernel_size": 1,
38
+ "bias": false
39
  },
40
  "module_separator": {
41
+ "num_stages": 4,
42
+ "relative_positional_encoding": {
43
+ "in_channels": 128,
44
+ "num_heads": 8,
45
+ "maxlen": 2000,
46
+ "embed_v": false
47
+ },
48
+ "enc_stage": {
49
  "global_blocks": {
 
50
  "in_channels": 128,
51
+ "num_mha_heads": 8,
52
+ "dropout_rate": 0.05
53
  },
54
  "local_blocks": {
 
55
  "in_channels": 128,
56
+ "kernel_size": 65,
57
+ "dropout_rate": 0.05
58
  },
 
 
 
 
 
 
 
59
  "down_conv_layer": {
60
  "in_channels": 128,
61
  "samp_kernel_size": 5
62
+ }
63
+ },
64
+ "simple_fusion": {
65
+ "out_channels": 128
66
+ },
67
+ "dec_stage": {
68
  "global_blocks": {
 
69
  "in_channels": 128,
70
+ "num_mha_heads": 8,
71
+ "dropout_rate": 0.05
72
  },
73
  "local_blocks": {
 
74
  "in_channels": 128,
75
+ "kernel_size": 65,
76
+ "dropout_rate": 0.05
77
+ },
78
+ "spk_attention": {
79
+ "in_channels": 128,
80
+ "num_mha_heads": 8,
81
+ "dropout_rate": 0.05
82
  }
 
 
 
 
 
 
 
 
 
 
83
  }
84
  },
85
+ "module_output_layer": {
86
+ "in_channels": 256,
87
+ "out_channels": 128
88
+ },
89
+ "module_audio_dec": {
90
+ "in_channels": 256,
91
+ "out_channels": 1,
92
+ "kernel_size": 16,
93
+ "stride": 4,
94
+ "bias": false
95
+ },
96
  "video_encoder_params": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  "layers": [
98
  "residual",
99
  "compress_space",
 
105
  "consecutive_residual",
106
  "attend_space"
107
  ],
108
+ "image_size": 88,
109
+ "in_channel": 1,
110
+ "init_channel": 4,
111
  "max_dim": 32,
112
+ "input_conv_kernel_size": [
113
+ 7,
114
+ 7,
115
+ 7
116
+ ],
117
  "output_conv_kernel_size": [
118
  3,
119
  3,
120
  3
121
  ],
122
+ "residual_conv_kernel_size": 3,
123
  "pad_mode": "constant",
124
+ "attn_dim_head": 32,
125
+ "attn_heads": 8,
126
+ "attn_dropout": 0.0,
127
+ "flash_attn": true,
128
+ "linear_attn_dim_head": 8,
129
+ "linear_attn_heads": 16,
130
+ "num_quantizers": 1,
131
+ "codebook_size": 256,
132
+ "codebook_dim": 64,
133
+ "commitment_cost": 1.0,
134
+ "distill_cost": 1.0
135
+ }
136
  }