Zeyue7
/

AudioX

diffusion_cond

Model card Files Files and versions

xet

Community

Zeyue7 commited on Apr 1

Commit

2185c05

1 Parent(s): a90b58b

AudioX

Browse files

Files changed (3) hide show

README.md +1 -1
config.json +150 -0
model.ckpt +3 -0

README.md CHANGED Viewed

@@ -9,7 +9,7 @@ license: cc-by-nc-4.0
 [TL;DR]: AudioX is a unified Diffusion Transformer model for Anything-to-Audio and Music Generation, capable of generating high-quality general audio and music, offering flexible natural language control, and seamlessly processing various modalities including text, video, image, music, and audio.
 ### Links
-- **[Paper](https://arxiv.org/abs/2503.10522)**: Explore the research behind VidMuse.
 - **[Project](https://zeyuet.github.io/AudioX/)**: Visit the official project page for more information and updates.

 [TL;DR]: AudioX is a unified Diffusion Transformer model for Anything-to-Audio and Music Generation, capable of generating high-quality general audio and music, offering flexible natural language control, and seamlessly processing various modalities including text, video, image, music, and audio.
 ### Links
+- **[Paper](https://arxiv.org/abs/2503.10522)**: Explore the research behind AudioX.
 - **[Project](https://zeyuet.github.io/AudioX/)**: Visit the official project page for more information and updates.

config.json ADDED Viewed

	@@ -0,0 +1,150 @@

+{
+    "model_type": "diffusion_cond",
+    "sample_size": 485100,
+    "sample_rate": 44100,
+    "video_fps": 5,
+    "audio_channels": 2,
+    "model": {
+        "pretransform": {
+            "type": "autoencoder",
+            "iterate_batch": true,
+            "config": {
+                "encoder": {
+                    "type": "oobleck",
+                    "requires_grad": false,
+                    "config": {
+                        "in_channels": 2,
+                        "channels": 128,
+                        "c_mults": [1, 2, 4, 8, 16],
+                        "strides": [2, 4, 4, 8, 8],
+                        "latent_dim": 128,
+                        "use_snake": true
+                    }
+                },
+                "decoder": {
+                    "type": "oobleck",
+                    "config": {
+                        "out_channels": 2,
+                        "channels": 128,
+                        "c_mults": [1, 2, 4, 8, 16],
+                        "strides": [2, 4, 4, 8, 8],
+                        "latent_dim": 64,
+                        "use_snake": true,
+                        "final_tanh": false
+                    }
+                },
+                "bottleneck": {
+                    "type": "vae"
+                },
+                "latent_dim": 64,
+                "downsampling_ratio": 2048,
+                "io_channels": 2
+            }
+        },
+        "conditioning": {
+            "configs": [
+                {
+                    "id": "video_prompt",
+                    "type": "clip",
+                    "config": {
+                        "clip_model_name": "clip-vit-base-patch32"
+                    }
+                },
+                {
+                    "id": "text_prompt",
+                    "type": "t5",
+                    "config": {
+                        "t5_model_name": "t5-base",
+                        "max_length": 128
+                    }
+                },
+                {
+                    "id": "audio_prompt",
+                    "type": "audio_autoencoder-videomae",
+                    "config": {
+                        "sample_rate": 44100,
+                        "pretransform_config": {
+                            "type": "autoencoder",
+                            "iterate_batch": true,
+                            "config": {
+                                "encoder": {
+                                    "type": "oobleck",
+                                    "requires_grad": false,
+                                    "config": {
+                                        "in_channels": 2,
+                                        "channels": 128,
+                                        "c_mults": [1, 2, 4, 8, 16],
+                                        "strides": [2, 4, 4, 8, 8],
+                                        "latent_dim": 128,
+                                        "use_snake": true
+                                    }
+                                },
+                                "decoder": {
+                                    "type": "oobleck",
+                                    "config": {
+                                        "out_channels": 2,
+                                        "channels": 128,
+                                        "c_mults": [1, 2, 4, 8, 16],
+                                        "strides": [2, 4, 4, 8, 8],
+                                        "latent_dim": 64,
+                                        "use_snake": true,
+                                        "final_tanh": false
+                                    }
+                                },
+                                "bottleneck": {
+                                    "type": "vae"
+                                },
+                                "latent_dim": 64,
+                                "downsampling_ratio": 2048,
+                                "io_channels": 2
+                            }
+                        }
+                    }
+                }
+            ],
+            "cond_dim": 768
+        },
+        "diffusion": {
+            "cross_attention_cond_ids": ["video_prompt", "text_prompt", "audio_prompt"],
+            "global_cond_ids": [],
+            "type": "dit",
+            "config": {
+                "io_channels": 64,
+                "embed_dim": 1536,
+                "depth": 24,
+                "num_heads": 24,
+                "cond_token_dim": 768,
+                "global_cond_dim": 1536,
+                "project_cond_tokens": false,
+                "transformer_type": "continuous_transformer",
+                "video_fps": 5
+            }
+        },
+        "io_channels": 64
+    },
+    "training": {
+        "use_ema": true,
+        "log_loss_info": false,
+        "optimizer_configs": {
+            "diffusion": {
+                "optimizer": {
+                    "type": "AdamW",
+                    "config": {
+                        "lr": 5e-5,
+                        "betas": [0.9, 0.999],
+                        "weight_decay": 1e-3
+                    }
+                },
+                "scheduler": {
+                    "type": "InverseLR",
+                    "config": {
+                        "inv_gamma": 1000000,
+                        "power": 0.5,
+                        "warmup": 0.99
+                    }
+                }
+            }
+        }
+    }
+}

model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a668d11406a81a372dc21f018ff4256d7fcd1a1a7c9fda30899f23a16b37f79a
+size 5958909851