import gradio as gr
import torch, os
import wave
import librosa
import numpy as np
from scipy.io.wavfile import write
from PIL import Image
import matplotlib.pyplot as plt
from huggingface_hub import snapshot_download
import soundfile as sf
from auffusion_pipeline import AuffusionPipeline
# ---------------------------------------------------------------
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline
from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormalize_spectrogram, Generator, get_mel_spectrogram_from_audio
from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize, prepare_mask_and_masked_image
# ---------------------------------------------------------------
def resample_audio(input_audio, original_sr, target_sr=16000):
"""
Resample the audio to the target sample rate (16000 Hz by default).
Args:
- input_audio (numpy array): The raw audio data.
- original_sr (int): The original sample rate of the input audio.
- target_sr (int): The target sample rate (default is 16000 Hz).
Returns:
- numpy array: The resampled audio.
"""
if original_sr != target_sr:
# Resample the audio using librosa
audio_resampled = librosa.resample(input_audio, orig_sr=original_sr, target_sr=target_sr)
return audio_resampled
else:
# If sample rate is already 16000, no resampling is needed
return input_audio
def save_spectrogram_image(spectrogram, filename):
"""Save a spectrogram as an image."""
plt.figure(figsize=(10, 4))
plt.imshow(spectrogram.squeeze(), aspect='auto', origin='lower', cmap='magma')
plt.axis('off') # Hide axes for a cleaner image
plt.savefig(filename, bbox_inches='tight', pad_inches=0)
plt.close()
def infer(prompt, progress=gr.Progress(track_tqdm=True)):
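    # Text-to-audio: the Auffusion pipeline generates a spectrogram from the prompt,
    # decodes it to a waveform, and the result is saved as a 16 kHz WAV file.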
pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
output = pipeline(prompt=prompt)
audio = output.audios[0]
sf.write(f"{prompt}.wav", audio, samplerate=16000)
return f"{prompt}.wav"
def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
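    # Audio-to-audio: re-generate the input audio's mel spectrogram under the text prompt,
    # with desired_strength controlling how far the result departs from the original audio.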
# Load your audio file
input_audio, original_sr = librosa.load(audio_path, sr=None) # Load with original sampling rate
resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
# Save the resampled audio to a new file
sf.write('resampled_audio.wav', resampled_audio, 16000)
audio_path = 'resampled_audio.wav'
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
dtype = torch.float16
device = "cuda"
if not os.path.isdir(pretrained_model_name_or_path):
pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
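    # Load the neural vocoder that converts (denormalized) mel spectrograms back into waveforms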
vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
vocoder = vocoder.to(device=device, dtype=dtype)
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
pipe = pipe.to(device)
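    # Diffusion img2img over the spectrogram image: the input spectrogram provides the
    # starting structure and `strength` controls how strongly the prompt re-shapes it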
width_start, width = 0, 160
strength_list = [desired_strength]
seed = 42
# Loading
audio, sampling_rate = load_wav(audio_path)
audio, spec = get_mel_spectrogram_from_audio(audio)
# Normalize the spectrogram
norm_spec = normalize_spectrogram(spec)
# norm_spec = norm_spec[:,:, width_start:width_start+width]
norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
# raw_image = image_add_color(torch_to_pil(norm_spec[:,:,:width]))
raw_image = image_add_color(torch_to_pil(norm_spec))
# Generation for different strength
image_list = []
audio_list = []
generator = torch.Generator(device=device).manual_seed(seed)
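    # Fixed seed so repeated runs with the same inputs produce the same output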
for strength in strength_list:
with torch.autocast("cuda"):
output_spec = pipe(
prompt=prompt, image=norm_spec, num_inference_steps=100, generator=generator, output_type="pt", strength=strength, guidance_scale=7.5
).images[0]
# add to image_list
# output_spec = output_spec[:, :, :width]
output_spec_image = torch_to_pil(output_spec)
color_output_spec_image = image_add_color(output_spec_image)
image_list.append(color_output_spec_image)
# add to audio_list
denorm_spec = denormalize_spectrogram(output_spec)
denorm_spec_audio = vocoder.inference(denorm_spec)
audio_list.append(denorm_spec_audio)
# Display
    # Concatenate the images for the different strengths, inserting black spacer blocks between them
concat_image_list = []
for i in range(len(image_list)):
if i == len(image_list) - 1:
concat_image_list.append(np.array(image_list[i]))
else:
            concat_image_list.append(np.concatenate([np.array(image_list[i]), np.zeros((256, 20, 3))], axis=1))
concat_image = np.concatenate(concat_image_list, axis=1)
concat_image = Image.fromarray(np.uint8(concat_image))
    ### Concatenate the audio clips, each followed by 16000 zero samples (1 s of silence at 16 kHz)
concat_audio_list = [np.concatenate([audio, np.zeros((1, 16000))], axis=1) for audio in audio_list]
concat_audio = np.concatenate(concat_audio_list, axis=1)
print("audio_path:", audio_path)
print("width_start:", width_start, "width:", width)
print("text prompt:", prompt)
print("strength_list:", strength_list)
# Ensure correct shape
    concat_audio = concat_audio.flatten()  # Converts (1, N) -> (N,)
# Normalize the audio to prevent clipping or excessive loudness
concat_audio = concat_audio / np.max(np.abs(concat_audio)) # Scale between -1 and 1
# Save as WAV
sf.write("output.wav", concat_audio, 16000)
# Save input spectrogram image
input_spec_image_path = "input_spectrogram.png"
raw_image.save(input_spec_image_path)
# Save concatenated spectrogram image
output_spec_image_path = "output_spectrogram.png"
concat_image.save(output_spec_image_path)
return "output.wav", input_spec_image_path, output_spec_image_path
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
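    # Inpainting: regenerate a user-selected span of spectrogram frames under the text prompt,
    # keeping the rest of the input audio's spectrogram as context.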
# Load your audio file
input_audio, original_sr = librosa.load(audio_path, sr=None) # Load with original sampling rate
resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
# Save the resampled audio to a new file
sf.write('resampled_audio.wav', resampled_audio, 16000)
audio_path = 'resampled_audio.wav'
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
dtype = torch.float16
device = "cuda"
if not os.path.isdir(pretrained_model_name_or_path):
pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
vocoder = vocoder.to(device=device, dtype=dtype)
pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
pipe = pipe.to(device)
width_start, width = mask_start_point, mask_end_point-mask_start_point
seed = 42
# Loading
audio, sampling_rate = load_wav(audio_path)
audio, spec = get_mel_spectrogram_from_audio(audio)
norm_spec = normalize_spectrogram(spec)
norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
raw_image = image_add_color(torch_to_pil(norm_spec))
# Add Mask
mask = torch.zeros_like(norm_spec)[:1,...]
mask[:, :, width_start:width_start+width] = 1
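    # mask == 1 marks the spectrogram columns (time frames) to regenerate; 0 keeps the original content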
mask_image = torch_to_pil(mask)
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
masked_spec_image = torch_to_pil(masked_spec)
    # colorize the masked spectrogram and paint the masked region black
color_masked_spec_image = image_add_color(masked_spec_image)
color_masked_spec_image = np.array(color_masked_spec_image)
color_masked_spec_image[:, width_start:width_start+width, :] = 0
color_masked_spec_image = Image.fromarray(color_masked_spec_image)
# Generation
generator = torch.Generator(device=device).manual_seed(seed)
with torch.autocast("cuda"):
output_spec = pipe(
prompt=prompt, image=norm_spec, mask_image=mask, num_inference_steps=100, generator=generator, height=256, width=1024, output_type="pt"
).images[0]
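    # The inpainting pipeline synthesizes new content inside the masked region, conditioned on the prompt and the unmasked context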
output_spec_image = torch_to_pil(output_spec)
color_output_spec_image = image_add_color(output_spec_image)
# Display audio result: raw audio, masked raw audio, generated audio
post_norm_spec = denormalize(norm_spec).to(device, dtype)
raw_chunk_spec = denormalize_spectrogram(post_norm_spec)
raw_chunk_audio = vocoder.inference(raw_chunk_spec)
post_masked_spec = denormalize(masked_spec).to(device, dtype)
denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
denorm_spec = denormalize_spectrogram(output_spec)
denorm_spec_audio = vocoder.inference(denorm_spec)
    # ---------------------------------------------------------------
# Ensure correct shape
    denorm_spec_audio = denorm_spec_audio.flatten()  # Converts (1, N) -> (N,)
denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio)) # Scale between -1 and 1
# Save as WAV
sf.write("generated_output.wav", denorm_spec_audio, 16000)
# Save input spectrogram image
input_spec_image_path = "input_spectrogram.png"
raw_image.save(input_spec_image_path)
# Save output spectrogram image
output_spec_image_path = "output_spectrogram.png"
color_output_spec_image.save(output_spec_image_path)
return "generated_output.wav", input_spec_image_path, color_output_spec_image
def load_input_spectrogram(audio_path):
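    # Compute and save the input mel spectrogram image; shown in the UI when audio is uploaded or recorded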
# Loading
audio, sampling_rate = load_wav(audio_path)
audio, spec = get_mel_spectrogram_from_audio(audio)
norm_spec = normalize_spectrogram(spec)
norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
raw_image = image_add_color(torch_to_pil(norm_spec))
# Save input spectrogram image
input_spec_image_path = "input_spectrogram.png"
raw_image.save(input_spec_image_path)
return input_spec_image_path
def preview_masked_area(audio_path, mask_start_point, mask_end_point):
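    # Render the input spectrogram with the selected mask region blacked out,
    # so the user can preview which time span will be inpainted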
# Loading
audio, sampling_rate = load_wav(audio_path)
audio, spec = get_mel_spectrogram_from_audio(audio)
norm_spec = normalize_spectrogram(spec)
norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
# Add Mask
width_start, width = mask_start_point, mask_end_point-mask_start_point
mask = torch.zeros_like(norm_spec)[:1,...]
mask[:, :, width_start:width_start+width] = 1
mask_image = torch_to_pil(mask)
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
masked_spec_image = torch_to_pil(masked_spec)
    # colorize the masked spectrogram and paint the masked region black
color_masked_spec_image = image_add_color(masked_spec_image)
color_masked_spec_image = np.array(color_masked_spec_image)
color_masked_spec_image[:, width_start:width_start+width, :] = 0
color_masked_spec_image = Image.fromarray(color_masked_spec_image)
# Save the masked spectrogram image
masked_spec_image_path = "masked_spectrogram.png"
color_masked_spec_image.save(masked_spec_image_path)
return masked_spec_image_path
def load_inpaint_example(prompt_inp, audio_path):
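    # Helper for the cached inpainting examples: shows the example's input spectrogram
    # and a default mask preview over frames 256-768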
in_spec_path = load_input_spectrogram(audio_path)
masked_spec_path = preview_masked_area(audio_path, 256, 768)
return in_spec_path, masked_spec_path
css="""
div#col-container{
margin: 0 auto;
max-width: 640px;
}
"""
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.Markdown("# Auffusion")
gr.Markdown("Auffusion can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. ")
gr.HTML("""
<div style="display:flex;column-gap:4px;">
<a href="https://auffusion.github.io/">
<img src='https://img.shields.io/badge/Project-Page-green'>
</a>
<a href="https://github.com/happylittlecat2333/Auffusion">
<img src='https://img.shields.io/badge/GitHub-Repo-blue'>
</a>
<a href="https://arxiv.org/pdf/2401.01044">
<img src='https://img.shields.io/badge/ArXiv-Paper-red'>
</a>
<a href="https://huggingface.co/spaces/fffiloni/auffusion?duplicate=true">
<img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
</a>
</div>
""")
with gr.Tab("Text-to-Audio"):
prompt = gr.Textbox(label="Prompt")
submit_btn = gr.Button("Submit")
            audio_out = gr.Audio(label="Audio Result")
gr.Examples(
examples = [
"Rolling thunder with lightning strikes",
"Two gunshots followed by birds chirping",
"A train whistle blowing in the distance"
],
inputs = [prompt]
)
submit_btn.click(
fn = infer,
inputs = [prompt],
outputs = [audio_out]
)
with gr.Tab("Audio-to-Audio"):
prompt_img2img = gr.Textbox(label="Prompt")
audio_in_img2img = gr.Audio(label="Audio Reference", type="filepath", format="wav")
prompt_strength = gr.Slider(label="Prompt Strength", minimum=0.0, maximum=1.0, step=0.1, value=0.7)
submit_btn_img2img = gr.Button("Submit")
            audio_out_img2img = gr.Audio(label="Audio Result")
with gr.Accordion("Compare Spectrograms", open=False):
with gr.Column():
input_spectrogram = gr.Image(label="Input Spectrogram")
output_spectrogram = gr.Image(label="Output Spectrogram")
gr.Examples(
examples = [
["Ambulance siren", "./notebooks/examples/img2img/GIOApFAWDOc_160.wav"],
["A cat is moewing", "./notebooks/examples/img2img/YniwgMbB6tpQ_01.wav"],
["A car racing", "./notebooks/examples/img2img/_GI7meqlYZk_30.wav"]
],
inputs = [prompt_img2img, audio_in_img2img]
)
submit_btn_img2img.click(
fn = infer_img2img,
inputs = [prompt_img2img, audio_in_img2img, prompt_strength],
outputs = [audio_out_img2img, input_spectrogram, output_spectrogram]
)
with gr.Tab("Audio InPainting"):
prompt_inp = gr.Textbox(label="Prompt")
audio_in_inp = gr.Audio(label="Audio Reference", type="filepath", format="wav")
            audio_in_spec = gr.Image(label="Input Audio Spectrogram")
            mask_start_point = gr.Slider(label="Mask Start Point", minimum=0, maximum=1024, step=1, value=256)
            mask_end_point = gr.Slider(label="Mask End Point", minimum=0, maximum=1024, step=1, value=768)
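            # Mask start/end are measured in spectrogram frames (columns of the 1024-frame padded mel spectrogram)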
preview_mask_btn = gr.Button("Preview Mask")
masked_spec_preview = gr.Image(label="Spectrogram Mask Preview")
submit_btn_inp = gr.Button("Submit")
            audio_out_inp = gr.Audio(label="Audio Result")
with gr.Accordion("Compare Spectrograms", open=False):
with gr.Column():
input_spectrogram_inp = gr.Image(label="Input Spectrogram")
output_spectrogram_inp = gr.Image(label="Output Spectrogram")
gr.Examples(
examples = [
["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"],
["A woman speaking", "./notebooks/examples/inpainting/9z8XIRyUq9Q_30.wav"],
["An infant crying", "./notebooks/examples/inpainting/14ekd4nkpwc_28.wav"],
["A dog barking and growling", "./notebooks/examples/inpainting/3ek-xLwr05Q_30.wav"]
],
fn = load_inpaint_example,
inputs = [prompt_inp, audio_in_inp],
outputs = [audio_in_spec, masked_spec_preview],
cache_examples = True
)
audio_in_inp.upload(
fn = load_input_spectrogram,
inputs = [audio_in_inp],
outputs = [audio_in_spec]
)
audio_in_inp.stop_recording(
fn = load_input_spectrogram,
inputs = [audio_in_inp],
outputs = [audio_in_spec]
)
preview_mask_btn.click(
fn = preview_masked_area,
inputs = [audio_in_inp, mask_start_point, mask_end_point],
outputs = [masked_spec_preview]
)
submit_btn_inp.click(
fn = infer_inp,
inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp]
)
demo.queue().launch(show_api=False, show_error=True)