Zeyue7 committed on
Commit
2185c05
·
1 Parent(s): a90b58b
Files changed (3) hide show
  1. README.md +1 -1
  2. config.json +150 -0
  3. model.ckpt +3 -0
README.md CHANGED
@@ -9,7 +9,7 @@ license: cc-by-nc-4.0
9
  [TL;DR]: AudioX is a unified Diffusion Transformer model for Anything-to-Audio and Music Generation, capable of generating high-quality general audio and music, offering flexible natural language control, and seamlessly processing various modalities including text, video, image, music, and audio.
10
 
11
  ### Links
12
- - **[Paper](https://arxiv.org/abs/2503.10522)**: Explore the research behind VidMuse.
13
  - **[Project](https://zeyuet.github.io/AudioX/)**: Visit the official project page for more information and updates.
14
 
15
 
 
9
  [TL;DR]: AudioX is a unified Diffusion Transformer model for Anything-to-Audio and Music Generation, capable of generating high-quality general audio and music, offering flexible natural language control, and seamlessly processing various modalities including text, video, image, music, and audio.
10
 
11
  ### Links
12
+ - **[Paper](https://arxiv.org/abs/2503.10522)**: Explore the research behind AudioX.
13
  - **[Project](https://zeyuet.github.io/AudioX/)**: Visit the official project page for more information and updates.
14
 
15
 
config.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "diffusion_cond",
3
+ "sample_size": 485100,
4
+ "sample_rate": 44100,
5
+ "video_fps": 5,
6
+ "audio_channels": 2,
7
+ "model": {
8
+ "pretransform": {
9
+ "type": "autoencoder",
10
+ "iterate_batch": true,
11
+ "config": {
12
+ "encoder": {
13
+ "type": "oobleck",
14
+ "requires_grad": false,
15
+ "config": {
16
+ "in_channels": 2,
17
+ "channels": 128,
18
+ "c_mults": [1, 2, 4, 8, 16],
19
+ "strides": [2, 4, 4, 8, 8],
20
+ "latent_dim": 128,
21
+ "use_snake": true
22
+ }
23
+ },
24
+ "decoder": {
25
+ "type": "oobleck",
26
+ "config": {
27
+ "out_channels": 2,
28
+ "channels": 128,
29
+ "c_mults": [1, 2, 4, 8, 16],
30
+ "strides": [2, 4, 4, 8, 8],
31
+ "latent_dim": 64,
32
+ "use_snake": true,
33
+ "final_tanh": false
34
+ }
35
+ },
36
+ "bottleneck": {
37
+ "type": "vae"
38
+ },
39
+ "latent_dim": 64,
40
+ "downsampling_ratio": 2048,
41
+ "io_channels": 2
42
+ }
43
+ },
44
+ "conditioning": {
45
+ "configs": [
46
+ {
47
+ "id": "video_prompt",
48
+ "type": "clip",
49
+ "config": {
50
+ "clip_model_name": "clip-vit-base-patch32"
51
+ }
52
+ },
53
+ {
54
+ "id": "text_prompt",
55
+ "type": "t5",
56
+ "config": {
57
+ "t5_model_name": "t5-base",
58
+ "max_length": 128
59
+ }
60
+ },
61
+ {
62
+ "id": "audio_prompt",
63
+ "type": "audio_autoencoder-videomae",
64
+ "config": {
65
+
66
+ "sample_rate": 44100,
67
+ "pretransform_config": {
68
+ "type": "autoencoder",
69
+ "iterate_batch": true,
70
+ "config": {
71
+ "encoder": {
72
+ "type": "oobleck",
73
+ "requires_grad": false,
74
+ "config": {
75
+ "in_channels": 2,
76
+ "channels": 128,
77
+ "c_mults": [1, 2, 4, 8, 16],
78
+ "strides": [2, 4, 4, 8, 8],
79
+ "latent_dim": 128,
80
+ "use_snake": true
81
+ }
82
+ },
83
+ "decoder": {
84
+ "type": "oobleck",
85
+ "config": {
86
+ "out_channels": 2,
87
+ "channels": 128,
88
+ "c_mults": [1, 2, 4, 8, 16],
89
+ "strides": [2, 4, 4, 8, 8],
90
+ "latent_dim": 64,
91
+ "use_snake": true,
92
+ "final_tanh": false
93
+ }
94
+ },
95
+ "bottleneck": {
96
+ "type": "vae"
97
+ },
98
+ "latent_dim": 64,
99
+ "downsampling_ratio": 2048,
100
+ "io_channels": 2
101
+ }
102
+ }
103
+ }
104
+ }
105
+ ],
106
+ "cond_dim": 768
107
+ },
108
+ "diffusion": {
109
+ "cross_attention_cond_ids": ["video_prompt", "text_prompt", "audio_prompt"],
110
+ "global_cond_ids": [],
111
+ "type": "dit",
112
+ "config": {
113
+ "io_channels": 64,
114
+ "embed_dim": 1536,
115
+ "depth": 24,
116
+ "num_heads": 24,
117
+ "cond_token_dim": 768,
118
+ "global_cond_dim": 1536,
119
+ "project_cond_tokens": false,
120
+ "transformer_type": "continuous_transformer",
121
+ "video_fps": 5
122
+ }
123
+ },
124
+ "io_channels": 64
125
+ },
126
+ "training": {
127
+ "use_ema": true,
128
+ "log_loss_info": false,
129
+ "optimizer_configs": {
130
+ "diffusion": {
131
+ "optimizer": {
132
+ "type": "AdamW",
133
+ "config": {
134
+ "lr": 5e-5,
135
+ "betas": [0.9, 0.999],
136
+ "weight_decay": 1e-3
137
+ }
138
+ },
139
+ "scheduler": {
140
+ "type": "InverseLR",
141
+ "config": {
142
+ "inv_gamma": 1000000,
143
+ "power": 0.5,
144
+ "warmup": 0.99
145
+ }
146
+ }
147
+ }
148
+ }
149
+ }
150
+ }
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a668d11406a81a372dc21f018ff4256d7fcd1a1a7c9fda30899f23a16b37f79a
3
+ size 5958909851