import gradio as gr
import os
import torch
import librosa
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from huggingface_hub import snapshot_download
import soundfile as sf
from auffusion_pipeline import AuffusionPipeline

# ---- diffusers pipelines and local spectrogram / vocoder helpers ----

from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline
from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormalize_spectrogram, Generator, get_mel_spectrogram_from_audio
from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize, prepare_mask_and_masked_image

# ---- audio helpers ----
def resample_audio(input_audio, original_sr, target_sr=16000):
    """
    Resample the audio to the target sample rate (16000 Hz by default).
    
    Args:
    - input_audio (numpy array): The raw audio data.
    - original_sr (int): The original sample rate of the input audio.
    - target_sr (int): The target sample rate (default is 16000 Hz).
    
    Returns:
    - numpy array: The resampled audio.
    """
    if original_sr != target_sr:
        # Resample the audio using librosa
        audio_resampled = librosa.resample(input_audio, orig_sr=original_sr, target_sr=target_sr)
        return audio_resampled
    else:
        # No resampling needed; the audio is already at the target sample rate
        return input_audio
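
# Example usage (a sketch; "example.wav" is a hypothetical local file):
#   y, sr = librosa.load("example.wav", sr=None)   # load at the native sample rate
#   y_16k = resample_audio(y, sr)                   # resample to 16 kHz only if needed
#   sf.write("example_16k.wav", y_16k, 16000)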


def save_spectrogram_image(spectrogram, filename):
    """Save a spectrogram as an image."""
    plt.figure(figsize=(10, 4))
    plt.imshow(spectrogram.squeeze(), aspect='auto', origin='lower', cmap='magma')
    plt.axis('off')  # Hide axes for a cleaner image
    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
    plt.close()
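
# Note: save_spectrogram_image is a standalone matplotlib utility; the Gradio handlers below
# export spectrograms via torch_to_pil / image_add_color rather than calling it.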


def infer(prompt, progress=gr.Progress(track_tqdm=True)):
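    # Text-to-audio: load the pretrained Auffusion pipeline (weights are cached by
    # huggingface_hub after the first download, but the pipeline is re-built on every call)
    # and render the prompt to a 16 kHz waveform named after the prompt text.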
    pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
    output = pipeline(prompt=prompt)
    audio = output.audios[0]
    sf.write(f"{prompt}.wav", audio, samplerate=16000)

    return f"{prompt}.wav"

def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
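    # Audio-to-audio: resample the reference clip to 16 kHz, convert it to a normalized
    # mel-spectrogram image, run Stable Diffusion img2img on that image guided by the prompt,
    # then vocode the edited spectrogram back into a waveform.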

    # Load your audio file
    input_audio, original_sr = librosa.load(audio_path, sr=None)  # Load with original sampling rate
    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
    # Save the resampled audio to a new file
    sf.write('resampled_audio.wav', resampled_audio, 16000)
    audio_path = 'resampled_audio.wav'
    
    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
    dtype = torch.float16
    device = "cuda"
    
    if not os.path.isdir(pretrained_model_name_or_path):
        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)     
    
    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
    vocoder = vocoder.to(device=device, dtype=dtype)

    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
    pipe = pipe.to(device)

    width_start, width = 0, 160
    strength_list = [desired_strength]
    seed = 42

    # Loading
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    
    # Normalize the spectrogram
    norm_spec = normalize_spectrogram(spec)

    # norm_spec = norm_spec[:,:, width_start:width_start+width]
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
    
    # raw_image = image_add_color(torch_to_pil(norm_spec[:,:,:width]))
    raw_image = image_add_color(torch_to_pil(norm_spec))
    
    # Generation for different strength
    image_list = []
    audio_list = []
    
    generator = torch.Generator(device=device).manual_seed(seed)
    
    for strength in strength_list:
        with torch.autocast("cuda"):
            output_spec = pipe(
                prompt=prompt, image=norm_spec, num_inference_steps=100, generator=generator, output_type="pt", strength=strength, guidance_scale=7.5
            ).images[0]
    
        # add to image_list
        # output_spec = output_spec[:, :, :width]
        output_spec_image = torch_to_pil(output_spec)
        color_output_spec_image = image_add_color(output_spec_image)
        image_list.append(color_output_spec_image)
    
        # add to audio_list
        denorm_spec = denormalize_spectrogram(output_spec)
        denorm_spec_audio = vocoder.inference(denorm_spec)
        audio_list.append(denorm_spec_audio)

    # Display

    # Concat image with different strength & add interval between images with black color 
    concat_image_list = []
    for i in range(len(image_list)):
        if i == len(image_list) - 1:
            concat_image_list.append(np.array(image_list[i]))
        else:
            concat_image_list.append(np.concatenate([np.array(image_list[i]), np.zeros((256, 20, 3))], axis=1))
    
    concat_image = np.concatenate(concat_image_list, axis=1)
    concat_image = Image.fromarray(np.uint8(concat_image))
    
    ### Concat audio
    concat_audio_list = [np.concatenate([audio, np.zeros((1, 16000))], axis=1) for audio in audio_list]
    concat_audio = np.concatenate(concat_audio_list, axis=1)
    
    print("audio_path:", audio_path)
    print("width_start:", width_start, "width:", width)
    print("text prompt:", prompt)
    print("strength_list:", strength_list)

    # Ensure correct shape
    concat_audio = concat_audio.flatten()  # converts (1, N) -> (N,)

    # Normalize the audio to prevent clipping or excessive loudness
    concat_audio = concat_audio / np.max(np.abs(concat_audio))  # Scale between -1 and 1

    # Save as WAV
    sf.write("output.wav", concat_audio, 16000)
    
    # Save input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)
    
    # Save concatenated spectrogram image
    output_spec_image_path = "output_spectrogram.png"
    concat_image.save(output_spec_image_path)

    return "output.wav", input_spec_image_path, output_spec_image_path

def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
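    # Audio inpainting: mask a span of spectrogram frames (chosen with the sliders), let the
    # Stable Diffusion inpainting pipeline regenerate that span from the prompt, then vocode
    # the completed spectrogram back into audio.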

    # Load your audio file
    input_audio, original_sr = librosa.load(audio_path, sr=None)  # Load with original sampling rate
    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
    # Save the resampled audio to a new file
    sf.write('resampled_audio.wav', resampled_audio, 16000)
    audio_path = 'resampled_audio.wav'

    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
    dtype = torch.float16
    device = "cuda"
    
    if not os.path.isdir(pretrained_model_name_or_path):
        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)     
    
    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
    vocoder = vocoder.to(device=device, dtype=dtype)

    pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
    pipe = pipe.to(device)

    width_start, width = mask_start_point, mask_end_point-mask_start_point
    seed = 42

    # Loading
    audio, sampling_rate = load_wav(audio_path) 
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)

    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
    
    raw_image = image_add_color(torch_to_pil(norm_spec))
    
    # Add Mask
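    # The mask is a single-channel tensor over the 1024-frame spectrogram: columns in
    # [width_start, width_start + width) are set to 1 and will be regenerated by inpainting.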
    mask = torch.zeros_like(norm_spec)[:1,...]
    mask[:, :, width_start:width_start+width] = 1
    mask_image = torch_to_pil(mask)
    
    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
    masked_spec_image = torch_to_pil(masked_spec)

    # color masked spec and paint masked area to black
    color_masked_spec_image = image_add_color(masked_spec_image)
    color_masked_spec_image = np.array(color_masked_spec_image)
    color_masked_spec_image[:, width_start:width_start+width, :] = 0
    color_masked_spec_image = Image.fromarray(color_masked_spec_image)

    # Generation
    generator = torch.Generator(device=device).manual_seed(seed)
    
    with torch.autocast("cuda"):
        output_spec = pipe(
            prompt=prompt, image=norm_spec, mask_image=mask, num_inference_steps=100, generator=generator, height=256, width=1024, output_type="pt"
        ).images[0]
    
    output_spec_image = torch_to_pil(output_spec)
    color_output_spec_image = image_add_color(output_spec_image)

    # Display audio result: raw audio, masked raw audio, generated audio
    post_norm_spec = denormalize(norm_spec).to(device, dtype)
    raw_chunk_spec = denormalize_spectrogram(post_norm_spec)
    raw_chunk_audio = vocoder.inference(raw_chunk_spec)
    
    post_masked_spec = denormalize(masked_spec).to(device, dtype)
    denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
    denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
    
    denorm_spec = denormalize_spectrogram(output_spec)
    denorm_spec_audio = vocoder.inference(denorm_spec)

    # ---- post-process and save outputs ----

    # Ensure correct shape
    denorm_spec_audio = denorm_spec_audio.flatten()  # converts (1, N) -> (N,)
    denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio))  # Scale between -1 and 1

    # Save as WAV
    sf.write("generated_output.wav", denorm_spec_audio, 16000)
    
    # Save input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)
    
    # Save output spectrogram image
    output_spec_image_path = "output_spectrogram.png"
    color_output_spec_image.save(output_spec_image_path)

    return "generated_output.wav", input_spec_image_path, color_output_spec_image

def load_input_spectrogram(audio_path):
    # Loading
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
    
    raw_image = image_add_color(torch_to_pil(norm_spec))
    
    # Save input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)

    return input_spec_image_path

def preview_masked_area(audio_path, mask_start_point, mask_end_point):
    # Loading
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs

    # Add Mask
    width_start, width = mask_start_point, mask_end_point-mask_start_point
    mask = torch.zeros_like(norm_spec)[:1,...]
    mask[:, :, width_start:width_start+width] = 1
    mask_image = torch_to_pil(mask)
    
    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
    masked_spec_image = torch_to_pil(masked_spec)

    # color masked spec and paint masked area to black
    color_masked_spec_image = image_add_color(masked_spec_image)
    color_masked_spec_image = np.array(color_masked_spec_image)
    color_masked_spec_image[:, width_start:width_start+width, :] = 0
    color_masked_spec_image = Image.fromarray(color_masked_spec_image)

    # Save the masked spectrogram image
    masked_spec_image_path = "masked_spectrogram.png"
    color_masked_spec_image.save(masked_spec_image_path)

    return masked_spec_image_path

def load_inpaint_example(prompt_inp, audio_path):

    in_spec_path = load_input_spectrogram(audio_path)
    masked_spec_path = preview_masked_area(audio_path, 256, 768)
    
    return in_spec_path, masked_spec_path
    
css="""
div#col-container{
    margin: 0 auto;
    max-width: 640px;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# Auffusion")
        gr.Markdown("Auffusion can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. ")
        gr.HTML("""
        <div style="display:flex;column-gap:4px;">
            <a href="https://auffusion.github.io/">
                <img src='https://img.shields.io/badge/Project-Page-green'>
            </a>
            <a href="https://github.com/happylittlecat2333/Auffusion">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a> 
            <a href="https://arxiv.org/pdf/2401.01044">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
            <a href="https://huggingface.co/spaces/fffiloni/auffusion?duplicate=true">
                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
            </a>
        </div>
        """)
        with gr.Tab("Text-to-Audio"):
            prompt = gr.Textbox(label="Prompt")
            submit_btn = gr.Button("Submit")
            audio_out = gr.Audio(label="Audio Ressult")
    
            gr.Examples(
                examples = [
                    "Rolling thunder with lightning strikes",
                    "Two gunshots followed by birds chirping",
                    "A train whistle blowing in the distance"
                ],
                inputs = [prompt]
            )

            submit_btn.click(
                fn = infer,
                inputs = [prompt],
                outputs = [audio_out]
            )

        with gr.Tab("Audio-to-Audio"):
            prompt_img2img = gr.Textbox(label="Prompt")
            audio_in_img2img = gr.Audio(label="Audio Reference", type="filepath", format="wav")
            prompt_strength = gr.Slider(label="Prompt Strength", minimum=0.0, maximum=1.0, step=0.1, value=0.7)
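            # Strength near 0 keeps the reference spectrogram almost unchanged; near 1 the prompt dominates.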
            submit_btn_img2img = gr.Button("Submit")
            audio_out_img2img = gr.Audio(label="Audio Result")

            with gr.Accordion("Compare Spectrograms", open=False):
                with gr.Column():
                    input_spectrogram = gr.Image(label="Input Spectrogram")
                    output_spectrogram = gr.Image(label="Output Spectrogram")
    
            gr.Examples(
                examples = [
                    ["Ambulance siren", "./notebooks/examples/img2img/GIOApFAWDOc_160.wav"],
                    ["A cat is moewing", "./notebooks/examples/img2img/YniwgMbB6tpQ_01.wav"],
                    ["A car racing", "./notebooks/examples/img2img/_GI7meqlYZk_30.wav"]
                ],
                inputs = [prompt_img2img, audio_in_img2img]
            )

            submit_btn_img2img.click(
                fn = infer_img2img,
                inputs = [prompt_img2img, audio_in_img2img, prompt_strength],
                outputs = [audio_out_img2img, input_spectrogram, output_spectrogram]
            )     

        with gr.Tab("Audio InPainting"):
            prompt_inp = gr.Textbox(label="Prompt")
            audio_in_inp = gr.Audio(label="Audio Reference", type="filepath", format="wav")
            
            audio_in_spec = gr.Image(label="Input Audio Spectrogram")
            mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256)
            mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768)
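            # Mask boundaries are expressed in spectrogram frames; the padded spectrogram is 1024 frames wide.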
            preview_mask_btn = gr.Button("Preview Mask")
            
            masked_spec_preview = gr.Image(label="Spectrogram Mask Preview")
            submit_btn_inp = gr.Button("Submit")
            
            audio_out_inp = gr.Audio(label="Audio Result")

            with gr.Accordion("Compare Spectrograms", open=False):
                with gr.Column():
                    input_spectrogram_inp = gr.Image(label="Input Spectrogram")
                    output_spectrogram_inp = gr.Image(label="Output Spectrogram")
            
            gr.Examples(
                examples = [
                    ["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"],
                    ["A woman speaking", "./notebooks/examples/inpainting/9z8XIRyUq9Q_30.wav"],
                    ["An infant crying", "./notebooks/examples/inpainting/14ekd4nkpwc_28.wav"],
                    ["A dog barking and growling", "./notebooks/examples/inpainting/3ek-xLwr05Q_30.wav"]
                ],
                fn = load_inpaint_example,
                inputs = [prompt_inp, audio_in_inp],
                outputs = [audio_in_spec, masked_spec_preview],
                cache_examples = True
            )

            audio_in_inp.upload(
                fn = load_input_spectrogram,
                inputs = [audio_in_inp],
                outputs = [audio_in_spec]
            )

            audio_in_inp.stop_recording(
                fn = load_input_spectrogram,
                inputs = [audio_in_inp],
                outputs = [audio_in_spec]
            )

            preview_mask_btn.click(
                fn = preview_masked_area,
                inputs = [audio_in_inp, mask_start_point, mask_end_point],
                outputs = [masked_spec_preview]
            )
            
            submit_btn_inp.click(
                fn = infer_inp,
                inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
                outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp]
            )

demo.queue().launch(show_api=False, show_error=True)