mrfakename CypressYang committed on
Commit
a230ee6
·
verified ·
0 Parent(s):

Duplicate from CypressYang/SongBloom

Browse files

Co-authored-by: CypressYang <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
autoencoder_music_dsp1920.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10ccb6c83613781ad32e998a90597ba7eb9292911a224598da1fd53728eb4cd3
3
+ size 674920616
songbloom_full_150s.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaeaee1dc889c8790e53064189ddcb7c66396cbfef3b15794a53e41d17d55fd2
3
+ size 7827256017
songbloom_full_150s.yaml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cfg_file:
2
+ precision: 'bf16-mixed' # allowed values: '16-mixed', 'bf16-mixed'
3
+ min_dur: 60
4
+ max_dur: 150
5
+ sr: 48000
6
+
7
+ pretrained_path: ${dynamic_path:???/songbloom_full_150s.pt}
8
+ continue_checkpoint:
9
+
10
+ train_dataset:
11
+ lyric_processor: phoneme
12
+ prompt_len: 10
13
+
14
+ vae:
15
+ vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json}
16
+ vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt}
17
+ sr: ${sr}
18
+
19
+ model:
20
+ block_size: 16
21
+ latent_dim: 64
22
+ dim: 1536
23
+ num_heads: 24
24
+ lm_layers: 36
25
+ diff_layers: 12
26
+ num_pitch: 16384
27
+ time_cond_type: prepend
28
+ timestep_features_dim: 256
29
+ diffusion_objective: rectified_flow
30
+ timestep_sampler: logit_normal
31
+ backend: llama
32
+ rotary_base_val: 20000
33
+ init_std: 0.02
34
+ h_dropout: 0.05
35
+
36
+ condition_provider_cfg:
37
+ prompt_wav:
38
+ type: audio_tokenizer_wrapper
39
+ output_dim: ${model.dim}
40
+ audio_tokenizer:
41
+ max_len: 250 # 25 frames/s * 10 s prompt (presumably sr 48000 / downsampling 1920 = 25 Hz; matches prompt_len: 10)
42
+ lyrics:
43
+ type: phoneme_tokenizer
44
+ output_dim: ${model.dim}
45
+ vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}}
46
+ max_len: 600
47
+ max_sentence_per_structure: 50
48
+ mode: sum
49
+
50
+
51
+ cfg_dropout: 0.1
52
+ attribute_dropout:
53
+ text:
54
+ lyrics: 0.
55
+ wav:
56
+ prompt_wav: 0.1
57
+
58
+ fuser_cfg:
59
+ cross_attention_pos_emb: false
60
+ cross_attention_pos_emb_scale: 1
61
+ sum: []
62
+ prepend: [lyrics, prompt_wav]
63
+ cross: []
64
+ input_interpolate: []
65
+
66
+
67
+
68
+ inference:
69
+ cfg_coef: 1.5
70
+ temp: 0.9
71
+ diff_temp: 0.95
72
+ top_k: 100
73
+ penalty_repeat: True
74
+ penalty_window: 50
75
+ steps: 36
76
+ dit_cfg_type: h
stable_audio_1920_vae.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "autoencoder",
3
+ "sample_size": 403200,
4
+ "sample_rate": 48000,
5
+ "audio_channels": 2,
6
+ "model": {
7
+ "encoder": {
8
+ "type": "oobleck",
9
+ "config": {
10
+ "in_channels": 2,
11
+ "channels": 128,
12
+ "c_mults": [1, 2, 4, 8, 16],
13
+ "strides": [2, 4, 4, 6, 10],
14
+ "latent_dim": 128,
15
+ "use_snake": true
16
+ }
17
+ },
18
+ "decoder": {
19
+ "type": "oobleck",
20
+ "config": {
21
+ "out_channels": 2,
22
+ "channels": 128,
23
+ "c_mults": [1, 2, 4, 8, 16],
24
+ "strides": [2, 4, 4, 6, 10],
25
+ "latent_dim": 64,
26
+ "use_snake": true,
27
+ "final_tanh": false
28
+ }
29
+ },
30
+ "bottleneck": {
31
+ "type": "vae"
32
+ },
33
+ "latent_dim": 64,
34
+ "downsampling_ratio": 1920,
35
+ "io_channels": 2
36
+ },
37
+ "training": {
38
+ "learning_rate": 1.5e-4,
39
+ "warmup_steps": 0,
40
+ "use_ema": true,
41
+ "optimizer_configs": {
42
+ "autoencoder": {
43
+ "optimizer": {
44
+ "type": "AdamW",
45
+ "config": {
46
+ "betas": [0.8, 0.99],
47
+ "lr": 1.5e-4,
48
+ "weight_decay": 1e-3
49
+ }
50
+ },
51
+ "scheduler": {
52
+ "type": "InverseLR",
53
+ "config": {
54
+ "inv_gamma": 200000,
55
+ "power": 0.5,
56
+ "warmup": 0.999
57
+ }
58
+ }
59
+ },
60
+ "discriminator": {
61
+ "optimizer": {
62
+ "type": "AdamW",
63
+ "config": {
64
+ "betas": [0.8, 0.99],
65
+ "lr": 3e-4,
66
+ "weight_decay": 1e-3
67
+ }
68
+ },
69
+ "scheduler": {
70
+ "type": "InverseLR",
71
+ "config": {
72
+ "inv_gamma": 200000,
73
+ "power": 0.5,
74
+ "warmup": 0.999
75
+ }
76
+ }
77
+ }
78
+ },
79
+ "loss_configs": {
80
+ "discriminator": {
81
+ "type": "encodec",
82
+ "config": {
83
+ "filters": 64,
84
+ "n_ffts": [2048, 1024, 512, 256, 128],
85
+ "hop_lengths": [512, 256, 128, 64, 32],
86
+ "win_lengths": [2048, 1024, 512, 256, 128]
87
+ },
88
+ "weights": {
89
+ "adversarial": 0.1,
90
+ "feature_matching": 5.0
91
+ }
92
+ },
93
+ "spectral": {
94
+ "type": "mrstft",
95
+ "config": {
96
+ "fft_sizes": [2048, 1024, 512, 256, 128, 64, 32],
97
+ "hop_sizes": [512, 256, 128, 64, 32, 16, 8],
98
+ "win_lengths": [2048, 1024, 512, 256, 128, 64, 32],
99
+ "perceptual_weighting": true
100
+ },
101
+ "weights": {
102
+ "mrstft": 1.0
103
+ }
104
+ },
105
+ "time": {
106
+ "type": "l1",
107
+ "weights": {
108
+ "l1": 0.0
109
+ }
110
+ },
111
+ "bottleneck": {
112
+ "type": "kl",
113
+ "weights": {
114
+ "kl": 1e-4
115
+ }
116
+ }
117
+ },
118
+ "demo": {
119
+ "demo_every": 2000
120
+ }
121
+ }
122
+ }
vocab_g2p.yaml ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - ''
2
+ - '[verse]'
3
+ - '[chorus]'
4
+ - '[bridge]'
5
+ - '[intro]'
6
+ - '[outro]'
7
+ - '[inst]'
8
+ - '[silence]'
9
+ - '!'
10
+ - ','
11
+ - '-'
12
+ - .
13
+ - '?'
14
+ - AA
15
+ - AA0
16
+ - AA1
17
+ - AA2
18
+ - AE0
19
+ - AE1
20
+ - AE2
21
+ - AH0
22
+ - AH1
23
+ - AH2
24
+ - AO0
25
+ - AO1
26
+ - AO2
27
+ - AW0
28
+ - AW1
29
+ - AW2
30
+ - AY0
31
+ - AY1
32
+ - AY2
33
+ - B
34
+ - CH
35
+ - D
36
+ - DH
37
+ - E1
38
+ - E2
39
+ - E3
40
+ - E4
41
+ - E5
42
+ - EE
43
+ - EH0
44
+ - EH1
45
+ - EH2
46
+ - ER
47
+ - ER0
48
+ - ER1
49
+ - ER2
50
+ - EY0
51
+ - EY1
52
+ - EY2
53
+ - En1
54
+ - En2
55
+ - En3
56
+ - En4
57
+ - En5
58
+ - F
59
+ - G
60
+ - HH
61
+ - I
62
+ - IH
63
+ - IH0
64
+ - IH1
65
+ - IH2
66
+ - IY0
67
+ - IY1
68
+ - IY2
69
+ - JH
70
+ - K
71
+ - L
72
+ - M
73
+ - N
74
+ - NG
75
+ - OO
76
+ - OW0
77
+ - OW1
78
+ - OW2
79
+ - OY0
80
+ - OY1
81
+ - OY2
82
+ - P
83
+ - R
84
+ - S
85
+ - SH
86
+ - SP
87
+ - SP2
88
+ - SP3
89
+ - T
90
+ - TH
91
+ - U
92
+ - UH0
93
+ - UH1
94
+ - UH2
95
+ - UNK
96
+ - UW0
97
+ - UW1
98
+ - UW2
99
+ - V
100
+ - W
101
+ - Y
102
+ - Z
103
+ - ZH
104
+ - _
105
+ - a
106
+ - a1
107
+ - a2
108
+ - a3
109
+ - a4
110
+ - a5
111
+ - ai1
112
+ - ai2
113
+ - ai3
114
+ - ai4
115
+ - ai5
116
+ - an1
117
+ - an2
118
+ - an3
119
+ - an4
120
+ - an5
121
+ - ang1
122
+ - ang2
123
+ - ang3
124
+ - ang4
125
+ - ang5
126
+ - ao1
127
+ - ao2
128
+ - ao3
129
+ - ao4
130
+ - ao5
131
+ - b
132
+ - by
133
+ - c
134
+ - ch
135
+ - cl
136
+ - d
137
+ - dy
138
+ - e
139
+ - e1
140
+ - e2
141
+ - e3
142
+ - e4
143
+ - e5
144
+ - ei1
145
+ - ei2
146
+ - ei3
147
+ - ei4
148
+ - ei5
149
+ - en1
150
+ - en2
151
+ - en3
152
+ - en4
153
+ - en5
154
+ - eng1
155
+ - eng2
156
+ - eng3
157
+ - eng4
158
+ - eng5
159
+ - er1
160
+ - er2
161
+ - er3
162
+ - er4
163
+ - er5
164
+ - f
165
+ - g
166
+ - gy
167
+ - h
168
+ - hy
169
+ - i
170
+ - i01
171
+ - i02
172
+ - i03
173
+ - i04
174
+ - i05
175
+ - i1
176
+ - i2
177
+ - i3
178
+ - i4
179
+ - i5
180
+ - ia1
181
+ - ia2
182
+ - ia3
183
+ - ia4
184
+ - ia5
185
+ - ian1
186
+ - ian2
187
+ - ian3
188
+ - ian4
189
+ - ian5
190
+ - iang1
191
+ - iang2
192
+ - iang3
193
+ - iang4
194
+ - iang5
195
+ - iao1
196
+ - iao2
197
+ - iao3
198
+ - iao4
199
+ - iao5
200
+ - ie1
201
+ - ie2
202
+ - ie3
203
+ - ie4
204
+ - ie5
205
+ - in1
206
+ - in2
207
+ - in3
208
+ - in4
209
+ - in5
210
+ - ing1
211
+ - ing2
212
+ - ing3
213
+ - ing4
214
+ - ing5
215
+ - iong1
216
+ - iong2
217
+ - iong3
218
+ - iong4
219
+ - iong5
220
+ - ir1
221
+ - ir2
222
+ - ir3
223
+ - ir4
224
+ - ir5
225
+ - iu1
226
+ - iu2
227
+ - iu3
228
+ - iu4
229
+ - iu5
230
+ - j
231
+ - k
232
+ - ky
233
+ - l
234
+ - m
235
+ - my
236
+ - n
237
+ - ny
238
+ - o
239
+ - o1
240
+ - o2
241
+ - o3
242
+ - o4
243
+ - o5
244
+ - ong1
245
+ - ong2
246
+ - ong3
247
+ - ong4
248
+ - ong5
249
+ - ou1
250
+ - ou2
251
+ - ou3
252
+ - ou4
253
+ - ou5
254
+ - p
255
+ - py
256
+ - q
257
+ - r
258
+ - ry
259
+ - s
260
+ - sh
261
+ - t
262
+ - ts
263
+ - u
264
+ - u1
265
+ - u2
266
+ - u3
267
+ - u4
268
+ - u5
269
+ - ua1
270
+ - ua2
271
+ - ua3
272
+ - ua4
273
+ - ua5
274
+ - uai1
275
+ - uai2
276
+ - uai3
277
+ - uai4
278
+ - uai5
279
+ - uan1
280
+ - uan2
281
+ - uan3
282
+ - uan4
283
+ - uan5
284
+ - uang1
285
+ - uang2
286
+ - uang3
287
+ - uang4
288
+ - uang5
289
+ - ui1
290
+ - ui2
291
+ - ui3
292
+ - ui4
293
+ - ui5
294
+ - un1
295
+ - un2
296
+ - un3
297
+ - un4
298
+ - un5
299
+ - uo1
300
+ - uo2
301
+ - uo3
302
+ - uo4
303
+ - uo5
304
+ - v
305
+ - v1
306
+ - v2
307
+ - v3
308
+ - v4
309
+ - v5
310
+ - van1
311
+ - van2
312
+ - van3
313
+ - van4
314
+ - van5
315
+ - ve1
316
+ - ve2
317
+ - ve3
318
+ - ve4
319
+ - ve5
320
+ - vn1
321
+ - vn2
322
+ - vn3
323
+ - vn4
324
+ - vn5
325
+ - w
326
+ - x
327
+ - y
328
+ - z
329
+ - zh
330
+ - "\u2026"