choihj0706 committed on
Commit
2905751
·
1 Parent(s): f2d34c1

Add fine-tuned MusicGen model

Browse files
Files changed (1) hide show
  1. config.json +54 -335
config.json CHANGED
@@ -1,338 +1,57 @@
1
  {
2
- "transformer_lm.norm_first": {
3
- "value": true
4
- },
5
- "wandb.with_media_logging": {
6
- "value": true
7
- },
8
- "generate.lm.prompt_duration": {
9
- "value": "None"
10
- },
11
- "slurm.time": {
12
- "value": 3600
13
- },
14
- "fuser.cross": {
15
- "value": "['description']"
16
- },
17
- "fsdp.per_block": {
18
- "value": true
19
- },
20
- "fsdp.buffer_dtype": {
21
- "value": "float32"
22
- },
23
- "autocast": {
24
- "value": true
25
- },
26
- "fsdp.param_dtype": {
27
- "value": "float16"
28
- },
29
- "optim.eager_sync": {
30
- "value": true
31
- },
32
- "transformer_lm.emb_lr": {
33
- "value": "None"
34
- },
35
- "channels": {
36
- "value": 1
37
- },
38
- "optim.ema.use": {
39
- "value": true
40
- },
41
- "dataset.shuffle": {
42
- "value": false
43
- },
44
- "generate.every": {
45
- "value": 25
46
- },
47
- "codebooks_pattern.modeling": {
48
- "value": "delay"
49
- },
50
- "metrics.text_consistency.clap.model_arch": {
51
- "value": "HTSAT-base"
52
- },
53
- "generate.audio.loudness_headroom_db": {
54
- "value": 14
55
- },
56
- "fuser.sum": {
57
- "value": "[]"
58
- },
59
- "conditioners.description.t5.word_dropout": {
60
- "value": 0.3
61
- },
62
- "dora.dir": {
63
- "value": "/checkpoint/choihj/experiments/audiocraft/outputs"
64
- },
65
- "tensorboard.with_media_logging": {
66
- "value": true
67
- },
68
- "generate.audio.format": {
69
- "value": "wav"
70
- },
71
- "logging.level": {
72
- "value": "INFO"
73
- },
74
- "slurm.gpus": {
75
- "value": 4
76
- },
77
- "dataset.min_segment_ratio": {
78
- "value": 0.8
79
- },
80
- "interleave_stereo_codebooks.use": {
81
- "value": false
82
- },
83
- "codebooks_pattern.unroll.flattening": {
84
- "value": "[0, 1, 2, 3]"
85
- },
86
- "transformer_lm.two_step_cfg": {
87
- "value": false
88
- },
89
- "optim.updates_per_epoch": {
90
- "value": 100
91
- },
92
- "transformer_lm.depthwise_init": {
93
- "value": "current"
94
- },
95
- "transformer_lm.past_context": {
96
- "value": "None"
97
- },
98
- "metrics.chroma_cosine.chroma_base.sample_rate": {
99
- "value": 32000
100
- },
101
- "fuser.cross_attention_pos_emb_scale": {
102
- "value": 1
103
- },
104
- "optim.epochs": {
105
- "value": 100
106
- },
107
- "transformer_lm.bias_attn": {
108
- "value": false
109
- },
110
- "datasource.valid": {
111
- "value": "/content/drive/MyDrive/projects/carecruise_intern/audiocraft/egs/eval"
112
- },
113
- "tensorboard.sub_dir": {
114
- "value": "None"
115
- },
116
- "generate.num_workers": {
117
- "value": 5
118
- },
119
- "metrics.fad.tf.bin": {
120
- "value": "None"
121
- },
122
- "fsdp.reduce_dtype": {
123
- "value": "float32"
124
- },
125
- "dataset.train.merge_text_p": {
126
- "value": 0.25
127
- },
128
- "schedule.step.gamma": {
129
- "value": "None"
130
- },
131
- "transformer_lm.kv_repeat": {
132
- "value": 1
133
- },
134
- "wandb.group": {
135
- "value": "None"
136
- },
137
- "cache.write": {
138
- "value": false
139
- },
140
- "transformer_lm.causal": {
141
- "value": true
142
- },
143
- "generate.lm.remove_prompts": {
144
- "value": false
145
- },
146
- "metrics.fad.tf.model_path": {
147
- "value": "//reference/fad/vggish_model.ckpt"
148
- },
149
- "evaluate.metrics.base": {
150
- "value": false
151
- },
152
- "generate.num_samples": {
153
- "value": 5
154
- },
155
- "autocast_dtype": {
156
- "value": "float16"
157
- },
158
- "classifier_free_guidance.inference_coef": {
159
- "value": 3
160
- },
161
- "codebooks_pattern.delay.flatten_first": {
162
- "value": 0
163
- },
164
- "dataset.segment_duration": {
165
- "value": 30
166
- },
167
- "slurm.mem_per_gpu": {
168
- "value": 40
169
- },
170
- "datasource.train": {
171
- "value": "/content/drive/MyDrive/projects/carecruise_intern/audiocraft/egs/train"
172
- },
173
- "transformer_lm.layer_scale": {
174
- "value": "None"
175
- },
176
- "num_threads": {
177
- "value": 1
178
- },
179
- "optim.ema.device": {
180
- "value": "cuda"
181
- },
182
- "metrics.text_consistency.use_gt": {
183
- "value": false
184
- },
185
- "schedule.inverse_sqrt.warmup_init_lr": {
186
- "value": 0
187
- },
188
- "evaluate.metrics.text_consistency": {
189
- "value": false
190
- },
191
- "schedule.polynomial_decay.end_lr": {
192
- "value": 0
193
- },
194
- "transformer_lm.num_heads": {
195
- "value": 16
196
- },
197
- "metrics.chroma_cosine.chroma_base.n_chroma": {
198
- "value": 12
199
- },
200
- "dtype": {
201
- "value": "float32"
202
- },
203
- "metrics.kld.model": {
204
- "value": "passt"
205
- },
206
- "evaluate.truncate_audio": {
207
- "value": "None"
208
- },
209
- "checkpoint.save_last": {
210
- "value": true
211
- },
212
- "evaluate.metrics.kld": {
213
- "value": false
214
- },
215
- "optim.optimizer": {
216
- "value": "adamw"
217
- },
218
- "dataset.train.drop_other_p": {
219
- "value": 0.5
220
- },
221
- "transformer_lm.activation": {
222
- "value": "gelu"
223
- },
224
- "evaluate.every": {
225
- "value": 25
226
- },
227
- "fsdp.use": {
228
- "value": false
229
- },
230
- "tokens.padding_with_special_token": {
231
- "value": false
232
- },
233
- "transformer_lm.qk_layer_norm": {
234
- "value": false
235
- },
236
- "device": {
237
- "value": "cuda"
238
- },
239
- "fsdp.sharding_strategy": {
240
- "value": "shard_grad_op"
241
- },
242
- "dataset.train.shuffle": {
243
- "value": true
244
- },
245
- "optim.adam.betas": {
246
- "value": "[0.9, 0.95]"
247
- },
248
- "metrics.kld.use_gt": {
249
- "value": false
250
- },
251
- "dataset.generate.return_info": {
252
- "value": true
253
- },
254
- "dataset.batch_size": {
255
- "value": 1
256
- },
257
- "dataset.sample_on_duration": {
258
- "value": false
259
- },
260
- "schedule.inverse_sqrt.warmup": {
261
- "value": "None"
262
- },
263
- "fuser.prepend": {
264
- "value": "[]"
265
- },
266
- "efficient_attention_backend": {
267
- "value": "torch"
268
- },
269
- "codebooks_pattern.unroll.delays": {
270
- "value": "[0, 0, 0, 0]"
271
- },
272
- "schedule.cosine.warmup": {
273
- "value": 8
274
- },
275
- "schedule.lr_scheduler": {
276
- "value": "cosine"
277
- },
278
- "dataset.valid.num_samples": {
279
- "value": 1
280
- },
281
- "transformer_lm.hidden_scale": {
282
- "value": 4
283
- },
284
- "schedule.exponential.lr_decay": {
285
- "value": "None"
286
- },
287
- "show": {
288
- "value": false
289
- },
290
- "transformer_lm.card": {
291
- "value": 2048
292
- },
293
- "fuser.cross_attention_pos_emb": {
294
- "value": false
295
- },
296
- "conditioners.description.model": {
297
- "value": "t5"
298
- },
299
- "generate.path": {
300
- "value": "samples"
301
- },
302
- "codebooks_pattern.delay.delays": {
303
- "value": "[0, 1, 2, 3]"
304
- },
305
- "transformer_lm.xpos": {
306
- "value": false
307
- },
308
- "logging.log_tensorboard": {
309
- "value": true
310
- },
311
- "benchmark_no_load": {
312
- "value": false
313
- },
314
- "schedule.cosine.lr_min_ratio": {
315
- "value": 0
316
- },
317
- "transformer_lm.custom": {
318
- "value": false
319
- },
320
- "evaluate.metrics.chroma_cosine": {
321
- "value": false
322
- },
323
- "cache.write_shard": {
324
- "value": 0
325
- },
326
- "schedule.polynomial_decay.power": {
327
- "value": 1
328
- },
329
- "generate.audio.strategy": {
330
- "value": "loudness"
331
- },
332
- "transformer_lm.dim": {
333
- "value": 1024
334
- },
335
- "compression_model_checkpoint": {
336
- "value": "//pretrained/facebook/encodec_32khz"
337
  }
338
  }
 
1
  {
2
+ "model_type": "musicgen",
3
+ "text_encoder": {
4
+ "type": "T5",
5
+ "name_or_path": "t5-base",
6
+ "config": {
7
+ "vocab_size": 32128,
8
+ "d_model": 1024,
9
+ "num_layers": 12,
10
+ "num_heads": 16,
11
+ "dropout_rate": 0.1
12
+ }
13
+ },
14
+ "audio_encoder": {
15
+ "type": "AudioEncoder",
16
+ "config": {
17
+ "sample_rate": 32000,
18
+ "num_channels": 1,
19
+ "embedding_size": 512
20
+ }
21
+ },
22
+ "decoder": {
23
+ "type": "TransformerDecoder",
24
+ "config": {
25
+ "d_model": 1024,
26
+ "num_heads": 16,
27
+ "num_layers": 24,
28
+ "dropout_rate": 0.1
29
+ }
30
+ },
31
+ "training": {
32
+ "batch_size": 16,
33
+ "num_epochs": 100,
34
+ "learning_rate": 0.0001,
35
+ "weight_decay": 0.01,
36
+ "gradient_clipping": 1.0
37
+ },
38
+ "generation": {
39
+ "sample_rate": 32000,
40
+ "audio_format": "wav",
41
+ "num_samples": 5,
42
+ "max_duration": 30.0,
43
+ "temperature": 1.0,
44
+ "top_k": 250,
45
+ "top_p": 0.9
46
+ },
47
+ "logging": {
48
+ "log_tensorboard": true,
49
+ "log_wandb": true,
50
+ "wandb_project": "music_generation",
51
+ "log_updates": 10
52
+ },
53
+ "hardware": {
54
+ "device": "cuda",
55
+ "num_gpus": 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  }
57
  }