# This Pull Request upgrades the Space with a newer model

This PR uses _Stable Audio Open Zero_ instead of _AudioLDM_. The new model can generate up to 47 seconds of sound.

Click on _Merge_ to add this feature.
Files changed:

- README.md: +5 −10
- app.py: +108 −276
- requirements.txt: +2 −7
    	
### README.md

```diff
@@ -1,5 +1,5 @@
 ---
-title: …
+title: Stable Audio Open Zero
 emoji: π
 colorFrom: indigo
 colorTo: red
@@ -8,15 +8,10 @@ sdk_version: 4.37.2
 app_file: app.py
 pinned: false
 license: bigscience-openrail-m
-
+tags:
+  - Text-to-Audio
+  - LLM
+short_description: Text-to-Audio Generation
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
-## Reference
-Part of the code from this repo is borrowed from the following repos. We would like to thank the authors of them for their contribution.
-
-> https://github.com/LAION-AI/CLAP
-> https://github.com/CompVis/stable-diffusion
-> https://github.com/v-iashin/SpecVQGAN
-> https://github.com/toshas/torch-fidelity
```
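Assembled from the two hunks above, the merged front matter would read as follows. The unchanged lines hidden between the hunks are elided; the second hunk header only tells us they end at `sdk_version: 4.37.2`:

```yaml
---
title: Stable Audio Open Zero
emoji: π
colorFrom: indigo
colorTo: red
# … unchanged sdk fields, ending at sdk_version: 4.37.2 …
app_file: app.py
pinned: false
license: bigscience-openrail-m
tags:
  - Text-to-Audio
  - LLM
short_description: Text-to-Audio Generation
---
```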
    	
### app.py

The old AudioLDM Gradio Blocks app is replaced wholesale with a Stable Audio Open script. Runs of `…` below mark removed lines that the rendered diff did not capture:

```diff
@@ -1,278 +1,110 @@
-import …
+import random
 import torch
-
-from …
-…
-# …
-…
-        } input[type='range'] {
-            accent-color: #000000;
-        } .dark input[type='range'] {
-            accent-color: #dfdfdf;
-        } .container {
-            max-width: 730px; margin: auto; padding-top: 1.5rem;
-        } #gallery {
-            min-height: 22rem; margin-bottom: 15px; margin-left: auto; margin-right: auto; border-bottom-right-radius:
-            .5rem !important; border-bottom-left-radius: .5rem !important;
-        } #gallery>div>.h-full {
-            min-height: 20rem;
-        } .details:hover {
-            text-decoration: underline;
-        } .gr-button {
-            white-space: nowrap;
-        } .gr-button:focus {
-            border-color: rgb(147 197 253 / var(--tw-border-opacity)); outline: none; box-shadow:
-            var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); --tw-border-opacity: 1;
-            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width)
-            var(--tw-ring-offset-color); --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px
-            var(--tw-ring-offset-width)) var(--tw-ring-color); --tw-ring-color: rgb(191 219 254 /
-            var(--tw-ring-opacity)); --tw-ring-opacity: .5;
-        } #advanced-btn {
-            font-size: .7rem !important; line-height: 19px; margin-top: 12px; margin-bottom: 12px; padding: 2px 8px;
-            border-radius: 14px !important;
-        } #advanced-options {
-            margin-bottom: 20px;
-        } .footer {
-            margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5;
-        } .footer>p {
-            font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white;
-        } .dark .footer {
-            border-color: #303030;
-        } .dark .footer>p {
-            background: #0b0f19;
-        } .acknowledgments h4{
-            margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%;
-        } #container-advanced-btns{
-            display: flex; flex-wrap: wrap; justify-content: space-between; align-items: center;
-        } .animate-spin {
-            animation: spin 1s linear infinite;
-        } @keyframes spin {
-            from {
-                transform: rotate(0deg);
-            } to {
-                transform: rotate(360deg);
-            }
-        } #share-btn-container {
-            display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color:
-            #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
-            margin-top: 10px; margin-left: auto;
-        } #share-btn {
-            all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif;
-            margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem
-            !important;right:0;
-        } #share-btn * {
-            all: unset;
-        } #share-btn-container div:nth-child(-n+2){
-            width: auto !important; min-height: 0px !important;
-        } #share-btn-container .wrap {
-            display: none !important;
-        } .gr-form{
-            flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
-        } #prompt-container{
-            gap: 0;
-        } #generated_id{
-            min-height: 700px
-        } #setting_id{
-          margin-bottom: 12px; text-align: center; font-weight: 900;
-        }
-"""
-iface = gr.Blocks(css=css)
-
-with iface:
-    gr.HTML(
-        """
-            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-              <div
-                style="
-                  display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
-                "
-              >
-                <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-                  AudioLDM: Text-to-Audio Generation with Latent Diffusion Models
-                </h1>
-              </div> <p style="margin-bottom: 10px; font-size: 94%">
-                <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project
-                page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm">[🧨
-                Diffusers]</a>
-              </p>
-            </div>
-        """
-    )
-    gr.HTML(
-        """
-        <p>This is the demo for AudioLDM, powered by 🧨 Diffusers. Demo uses the checkpoint <a
-        href="https://huggingface.co/cvssp/audioldm-m-full"> audioldm-m-full </a>. For faster inference without waiting in
-        queue, you may duplicate the space and upgrade to a GPU in the settings. <br/> <a
-        href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation?duplicate=true"> <img
-        style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> <p/>
-    """
+import torchaudio
+from einops import rearrange
+import gradio as gr
+import spaces
+import os
+import uuid
+
+# Importing the model-related functions
+from stable_audio_tools import get_pretrained_model
+from stable_audio_tools.inference.generation import generate_diffusion_cond
+
+# Load the model outside of the GPU-decorated function
+def load_model():
+    print("Loading model...")
+    model, model_config = get_pretrained_model("chaowenguo/stable-audio-open-1.0")
+    print("Model loaded successfully.")
+    return model, model_config
+
+# Function to set up, generate, and process the audio
+@spaces.GPU(duration=120)  # Allocate GPU only when this function is called
+def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
+    print(f"Prompt received: {prompt}")
+    print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
+
+    seed = random.randint(0, 2**63 - 1)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    print(f"Using seed: {seed}")
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+
+    # Fetch the Hugging Face token from the environment variable
+    hf_token = os.getenv('HF_TOKEN')
+    print(f"Hugging Face token: {hf_token}")
+
+    # Use pre-loaded model and configuration
+    model, model_config = load_model()
+    sample_rate = model_config["sample_rate"]
+    sample_size = model_config["sample_size"]
+
+    print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
+
+    model = model.to(device)
+    print("Model moved to device.")
+
+    # Set up text and timing conditioning
+    conditioning = [{
+        "prompt": prompt,
+        "seconds_start": 0,
+        "seconds_total": seconds_total
+    }]
+    print(f"Conditioning: {conditioning}")
+
+    # Generate stereo audio
+    print("Generating audio...")
+    output = generate_diffusion_cond(
+        model,
+        steps=steps,
+        cfg_scale=cfg_scale,
+        conditioning=conditioning,
+        sample_size=sample_size,
+        sigma_min=0.3,
+        sigma_max=500,
+        sampler_type="dpmpp-3m-sde",
+        device=device
     )
-…
-        outputs = gr.Video(label="Output", elem_id="output-video")
-        btn = gr.Button("Submit", elem_id=".gr-Button") # .style(full_width=True)
-
-    with gr.Group(elem_id="share-btn-container", visible=False):
-        community_icon = gr.HTML(community_icon_html)
-        loading_icon = gr.HTML(loading_icon_html)
-        share_button = gr.Button("Share to community", elem_id="share-btn")
-
-    btn.click(
-        text2audio,
-        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
-        outputs=[outputs],
-    )
-
-    share_button.click(None, [], [], js=share_js)
-    gr.HTML(
-        """
-    <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
-                <p>Follow the latest update of AudioLDM on our<a href="https://github.com/haoheliu/AudioLDM"
-                style="text-decoration: underline;" target="_blank"> Github repo</a> </p> <br> <p>Model by <a
-                href="https://twitter.com/LiuHaohe" style="text-decoration: underline;" target="_blank">Haohe
-                Liu</a>. Code and demo by 🤗 Hugging Face.</p> <br>
-    </div>
-    """
-    )
-    gr.Examples(
-        [
-            ["A hammer is hitting a wooden surface", "low quality, average quality", 5, 2.5, 45, 3],
-            ["Peaceful and calming ambient music with singing bowl and other instruments.", "low quality, average quality", 5, 2.5, 45, 3],
-            ["A man is speaking in a small room.", "low quality, average quality", 5, 2.5, 45, 3],
-            ["A female is speaking followed by footstep sound", "low quality, average quality", 5, 2.5, 45, 3],
-            ["Wooden table tapping sound followed by water pouring sound.", "low quality, average quality", 5, 2.5, 45, 3],
-        ],
-        fn=text2audio,
-        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
-        outputs=[outputs],
-        cache_examples=True,
-    )
-    gr.HTML(
-        """
-            <div class="acknowledgements"> <p>Essential Tricks for Enhancing the Quality of Your Generated
-            Audio</p> <p>1. Try to use more adjectives to describe your sound. For example: "A man is speaking
-            clearly and slowly in a large room" is better than "A man is speaking". This can make sure AudioLDM
-            understands what you want.</p> <p>2. Try to use different random seeds, which can affect the generation
-            quality significantly sometimes.</p> <p>3. It's better to use general terms like 'man' or 'woman'
-            instead of specific names for individuals or abstract objects that humans may not be familiar with,
-            such as 'mummy'.</p> <p>4. Using a negative prompt to not guide the diffusion process can improve the
-            audio quality significantly. Try using negative prompts like 'low quality'.</p> </div>
-            """
-    )
-    with gr.Accordion("Additional information", open=False):
-        gr.HTML(
-            """
-            <div class="acknowledgments">
-                <p> We build the model with data from <a href="http://research.google.com/audioset/">AudioSet</a>,
-                <a href="https://freesound.org/">Freesound</a> and <a
-                href="https://sound-effects.bbcrewind.co.uk/">BBC Sound Effect library</a>. We share this demo
-                based on the <a
-                href="https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/375954/Research.pdf">UK
-                copyright exception</a> of data for academic research. </p>
-                        </div>
-                    """
-        )
-# <p>This demo is strictly for research demo purpose only. For commercial use please <a href="[email protected]">contact us</a>.</p>
-
-iface.queue(max_size=10).launch(debug=True)
+    print("Audio generated.")
+
+    # Rearrange audio batch to a single sequence
+    output = rearrange(output, "b d n -> d (b n)")
+    print("Audio rearranged.")
+
+    # Peak normalize, clip, convert to int16
+    output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+    print("Audio normalized and converted.")
+
+    # Generate a unique filename for the output
+    unique_filename = f"output_{uuid.uuid4().hex}.wav"
+    print(f"Saving audio to file: {unique_filename}")
+
+    # Save to file
+    torchaudio.save(unique_filename, output, sample_rate)
+    print(f"Audio saved: {unique_filename}")
+
+    # Return the path to the generated audio file
+    return unique_filename
+
+# Setting up the Gradio Interface
+interface = gr.Interface(
+    fn=generate_audio,
+    inputs=[
+        gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
+        gr.Slider(0, 47, value=5, label="Duration in Seconds"),
+        gr.Slider(10, 150, value=10, step=10, label="Number of Diffusion Steps"),
+        gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
+    ],
+    outputs=gr.Audio(type="filepath", label="Generated Audio"),
+    title="Stable Audio Generator",
+    description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0."
+)
+
+
+# Pre-load the model to avoid multiprocessing issues
+model, model_config = load_model()
+
+# Launch the Interface
+interface.launch()
```
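For anyone wanting to sanity-check the new generation path outside the Space, here is a minimal headless sketch. It uses only calls that appear in the new app.py (`get_pretrained_model`, `generate_diffusion_cond`, and the `rearrange`/normalize post-processing); the prompt text and output filename are illustrative, and the `stable-audio-tools` API is assumed to match what the Space imports:

```python
# Headless sketch of the generation path in app.py above; parameter values
# mirror the app's defaults, the prompt and filename are illustrative.
import torch
import torchaudio
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same checkpoint the Space loads.
model, model_config = get_pretrained_model("chaowenguo/stable-audio-open-1.0")
model = model.to(device)

# Text plus timing conditioning, as in generate_audio().
conditioning = [{
    "prompt": "A hammer hitting a wooden surface",  # illustrative prompt
    "seconds_start": 0,
    "seconds_total": 10,
}]

output = generate_diffusion_cond(
    model,
    steps=100,
    cfg_scale=7,
    conditioning=conditioning,
    sample_size=model_config["sample_size"],
    sigma_min=0.3,
    sigma_max=500,
    sampler_type="dpmpp-3m-sde",
    device=device,
)

# Collapse the batch dimension, peak-normalize, and write a 16-bit WAV,
# mirroring the post-processing in the Space.
output = rearrange(output, "b d n -> d (b n)")
output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
torchaudio.save("demo.wav", output, model_config["sample_rate"])
```

One design note: as written, `generate_audio` calls `load_model()` again on every request rather than reusing the module-level `model`; the Hugging Face download cache keeps this from re-fetching the weights, but the model is still rebuilt per call.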
    	
### requirements.txt

```diff
@@ -1,8 +1,3 @@
-git+https://github.com/huggingface/diffusers.git
-git+https://github.com/huggingface/transformers.git
---extra-index-url https://download.pytorch.org/whl/cu113
 torch
-
-
-fastapi
-gradio
+torchaudio
+stable-audio-tools
```
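To try the merged app outside of Spaces, `pip install -r requirements.txt` should be enough, plus `gradio`; on Spaces itself the Gradio SDK image already provides `gradio`, and `einops` presumably arrives as a dependency of `stable-audio-tools`. The `spaces` helper that app.py imports is preinstalled on Hugging Face Spaces but would need `pip install spaces` locally.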

