import gradio as gr import torch, os import wave import numpy as np from scipy.io.wavfile import write from PIL import Image import matplotlib.pyplot as plt from huggingface_hub import snapshot_download import soundfile as sf from auffusion_pipeline import AuffusionPipeline # —— from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormalize_spectrogram, Generator, get_mel_spectrogram_from_audio from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize, prepare_mask_and_masked_image # —— def convert_wav_to_16khz(input_path, output_path): with wave.open(input_path, "rb") as wav_in: params = wav_in.getparams() channels, sampwidth, framerate, nframes = params[:4] # Read and convert audio data audio_data = np.frombuffer(wav_in.readframes(nframes), dtype=np.int16) new_framerate = 16000 # Save as a new WAV file write(output_path, new_framerate, audio_data) return output_path def save_spectrogram_image(spectrogram, filename): """Save a spectrogram as an image.""" plt.figure(figsize=(10, 4)) plt.imshow(spectrogram.squeeze(), aspect='auto', origin='lower', cmap='magma') plt.axis('off') # Hide axes for a cleaner image plt.savefig(filename, bbox_inches='tight', pad_inches=0) plt.close() def debug_spectrogram(audio, spec, label="Current File"): print(f"==== [{label}] ====") print(f"🔹 Raw Audio Min/Max: {audio.min()}, {audio.max()}") print(f"🔹 Spectrogram Min/Max Before Normalization: {spec.min()}, {spec.max()}") print(f"🔹 Spectrogram Mean Before Normalization: {spec.mean()}") normalized_spec = normalize_spectrogram(spec) print(f"🔹 Spectrogram Min/Max After Normalization: {normalized_spec.min()}, {normalized_spec.max()}") print(f"🔹 Spectrogram Mean After Normalization: {normalized_spec.mean()}") return normalized_spec def infer(prompt, progress=gr.Progress(track_tqdm=True)): pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion") prompt = prompt output = pipeline(prompt=prompt) audio = output.audios[0] sf.write(f"{prompt}.wav", audio, samplerate=16000) return f"{prompt}.wav" def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)): audio_path = convert_wav_to_16khz(audio_path, "output_16khz.wav") pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter" dtype = torch.float16 device = "cuda" if not os.path.isdir(pretrained_model_name_or_path): pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path) vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder") vocoder = vocoder.to(device=device, dtype=dtype) pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype) pipe = pipe.to(device) width_start, width = 0, 160 strength_list = [desired_strength] prompt = prompt seed = 42 # Loading audio, sampling_rate = load_wav(audio_path) audio, spec = get_mel_spectrogram_from_audio(audio) # Check if the spectrogram mean before normalization is too low spec_mean_before = spec.mean().item() # Apply fix only if the spectrogram mean is too low if spec_mean_before < -5.0: print(f"⚠️ Spectrogram too low (Mean: {spec_mean_before}).") else: print(f"✅ Spectrogram looks normal (Mean: {spec_mean_before}). No boost needed.") # Normalize the spectrogram norm_spec = normalize_spectrogram(spec) # norm_spec = norm_spec[:,:, width_start:width_start+width] norm_spec = pad_spec(norm_spec, 1024) norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input # raw_image = image_add_color(torch_to_pil(norm_spec[:,:,:width])) raw_image = image_add_color(torch_to_pil(norm_spec)) # Generation for different strength image_list = [] audio_list = [] generator = torch.Generator(device=device).manual_seed(seed) for strength in strength_list: with torch.autocast("cuda"): output_spec = pipe( prompt=prompt, image=norm_spec, num_inference_steps=100, generator=generator, output_type="pt", strength=strength, guidance_scale=7.5 ).images[0] # add to image_list # output_spec = output_spec[:, :, :width] output_spec_image = torch_to_pil(output_spec) color_output_spec_image = image_add_color(output_spec_image) image_list.append(color_output_spec_image) # add to audio_list denorm_spec = denormalize_spectrogram(output_spec) denorm_spec_audio = vocoder.inference(denorm_spec) audio_list.append(denorm_spec_audio) # Display # Concat image with different strength & add interval between images with black color concat_image_list = [] for i in range(len(image_list)): if i == len(image_list) - 1: concat_image_list.append(np.array(image_list[i])) else: concat_image_list.append(np.concatenate([np.array(image_list[i]), np.ones((256, 20, 3))*0], axis=1)) concat_image = np.concatenate(concat_image_list, axis=1) concat_image = Image.fromarray(np.uint8(concat_image)) ### Concat audio concat_audio_list = [np.concatenate([audio, np.zeros((1, 16000))], axis=1) for audio in audio_list] concat_audio = np.concatenate(concat_audio_list, axis=1) print("audio_path:", audio_path) print("width_start:", width_start, "width:", width) print("text prompt:", prompt) print("strength_list:", strength_list) # Ensure correct shape concat_audio = concat_audio.flatten() # Converts (1, N) → (N,) # Normalize the audio to prevent clipping or excessive loudness concat_audio = concat_audio / np.max(np.abs(concat_audio)) # Scale between -1 and 1 # Save as WAV sf.write("output.wav", concat_audio, 16000) # Save input spectrogram image input_spec_image_path = "input_spectrogram.png" raw_image.save(input_spec_image_path) # Save concatenated spectrogram image output_spec_image_path = "output_spectrogram.png" concat_image.save(output_spec_image_path) return "output.wav", input_spec_image_path, output_spec_image_path def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)): audio_path = convert_wav_to_16khz(audio_path, "output_16khz.wav") pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter" dtype = torch.float16 device = "cuda" if not os.path.isdir(pretrained_model_name_or_path): pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path) vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder") vocoder = vocoder.to(device=device, dtype=dtype) pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype) pipe = pipe.to(device) width_start, width = mask_start_point, mask_end_point-mask_start_point prompt = prompt seed = 42 # Loading audio, sampling_rate = load_wav(audio_path) print(f"Raw audio min/max: {audio.min()}, {audio.max()}") audio, spec = get_mel_spectrogram_from_audio(audio) print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}") norm_spec = normalize_spectrogram(spec) print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}") norm_spec = pad_spec(norm_spec, 1024) norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input raw_image = image_add_color(torch_to_pil(norm_spec)) # Add Mask mask = torch.zeros_like(norm_spec)[:1,...] mask[:, :, width_start:width_start+width] = 1 mask_image = torch_to_pil(mask) mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask) masked_spec_image = torch_to_pil(masked_spec) # color masked spec and paint masked area to black color_masked_spec_image = image_add_color(masked_spec_image) color_masked_spec_image = np.array(color_masked_spec_image) color_masked_spec_image[:, width_start:width_start+width, :] = 0 color_masked_spec_image = Image.fromarray(color_masked_spec_image) # Generation generator = torch.Generator(device=device).manual_seed(seed) with torch.autocast("cuda"): output_spec = pipe( prompt=prompt, image=norm_spec, mask_image=mask, num_inference_steps=100, generator=generator, height=256, width=1024, output_type="pt" ).images[0] output_spec_image = torch_to_pil(output_spec) color_output_spec_image = image_add_color(output_spec_image) # Display audio result: raw audio, masked raw audio, generated audio post_norm_spec = denormalize(norm_spec).to(device, dtype) raw_chunk_spec = denormalize_spectrogram(post_norm_spec) raw_chunk_audio = vocoder.inference(raw_chunk_spec) post_masked_spec = denormalize(masked_spec).to(device, dtype) denorm_masked_spec = denormalize_spectrogram(post_masked_spec) denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec) denorm_spec = denormalize_spectrogram(output_spec) denorm_spec_audio = vocoder.inference(denorm_spec) #——— # Ensure correct shape denorm_spec_audio = denorm_spec_audio.flatten() # Converts (1, N) → (N,) denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio)) # Scale between -1 and 1 # Save as WAV sf.write("generated_output.wav", denorm_spec_audio, 16000) # Save input spectrogram image input_spec_image_path = "input_spectrogram.png" raw_image.save(input_spec_image_path) # Save output spectrogram image output_spec_image_path = "output_spectrogram.png" color_output_spec_image.save(output_spec_image_path) return "generated_output.wav", input_spec_image_path, color_output_spec_image def load_input_spectrogram(audio_path): # Loading audio, sampling_rate = load_wav(audio_path) audio, spec = get_mel_spectrogram_from_audio(audio) norm_spec = normalize_spectrogram(spec) norm_spec = pad_spec(norm_spec, 1024) norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input raw_image = image_add_color(torch_to_pil(norm_spec)) # Save input spectrogram image input_spec_image_path = "input_spectrogram.png" raw_image.save(input_spec_image_path) return input_spec_image_path def preview_masked_area(audio_path, mask_start_point, mask_end_point): # Loading audio, sampling_rate = load_wav(audio_path) audio, spec = get_mel_spectrogram_from_audio(audio) norm_spec = normalize_spectrogram(spec) norm_spec = pad_spec(norm_spec, 1024) norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input # Add Mask width_start, width = mask_start_point, mask_end_point-mask_start_point mask = torch.zeros_like(norm_spec)[:1,...] mask[:, :, width_start:width_start+width] = 1 mask_image = torch_to_pil(mask) mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask) masked_spec_image = torch_to_pil(masked_spec) # color masked spec and paint masked area to black color_masked_spec_image = image_add_color(masked_spec_image) color_masked_spec_image = np.array(color_masked_spec_image) color_masked_spec_image[:, width_start:width_start+width, :] = 0 color_masked_spec_image = Image.fromarray(color_masked_spec_image) # Save the masked spectrogram image masked_spec_image_path = "masked_spectrogram.png" color_masked_spec_image.save(masked_spec_image_path) return masked_spec_image_path def load_inpaint_example(prompt_inp, audio_path): in_spec_path = load_input_spectrogram(audio_path) masked_spec_path = preview_masked_area(audio_path, 256, 768) return in_spec_path, masked_spec_path css=""" div#col-container{ margin: 0 auto; max-width: 640px; } """ with gr.Blocks(css=css) as demo: with gr.Column(elem_id="col-container"): gr.Markdown("# Auffusion") gr.Markdown("Auffusion can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. ") gr.HTML("""
""") with gr.Tab("Text-to-Audio"): prompt = gr.Textbox(label="Prompt") submit_btn = gr.Button("Submit") audio_out = gr.Audio(label="Audio Ressult") gr.Examples( examples = [ "Rolling thunder with lightning strikes", "Two gunshots followed by birds chirping", "A train whistle blowing in the distance" ], inputs = [prompt] ) submit_btn.click( fn = infer, inputs = [prompt], outputs = [audio_out] ) with gr.Tab("Audio-to-Audio"): prompt_img2img = gr.Textbox(label="Prompt") audio_in_img2img = gr.Audio(label="Audio Reference", type="filepath", format="wav") prompt_strength = gr.Slider(label="Prompt Strength", minimum=0.0, maximum=1.0, step=0.1, value=0.7) submit_btn_img2img = gr.Button("Submit") audio_out_img2img = gr.Audio(label="Audio Ressult") with gr.Accordion("Compare Spectrograms", open=False): with gr.Column(): input_spectrogram = gr.Image(label="Input Spectrogram") output_spectrogram = gr.Image(label="Output Spectrogram") gr.Examples( examples = [ ["Ambulance siren", "./notebooks/examples/img2img/GIOApFAWDOc_160.wav"], ["A cat is moewing", "./notebooks/examples/img2img/YniwgMbB6tpQ_01.wav"], ["A car racing", "./notebooks/examples/img2img/_GI7meqlYZk_30.wav"] ], inputs = [prompt_img2img, audio_in_img2img] ) submit_btn_img2img.click( fn = infer_img2img, inputs = [prompt_img2img, audio_in_img2img, prompt_strength], outputs = [audio_out_img2img, input_spectrogram, output_spectrogram] ) with gr.Tab("Audio InPainting"): prompt_inp = gr.Textbox(label="Prompt") audio_in_inp = gr.Audio(label="Audio Reference", type="filepath", format="wav") audio_in_spec = gr.Image(label="Audio IN spectrogram") mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256) mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768) preview_mask_btn = gr.Button("Preview Mask") masked_spec_preview = gr.Image(label="Spectrogram Mask Preview") submit_btn_inp = gr.Button("Submit") audio_out_inp = gr.Audio(label="Audio Ressult") with gr.Accordion("Compare Spectrograms", open=False): with gr.Column(): input_spectrogram_inp = gr.Image(label="Input Spectrogram") output_spectrogram_inp = gr.Image(label="Output Spectrogram") gr.Examples( examples = [ ["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"], ["A woman speaking", "./notebooks/examples/inpainting/9z8XIRyUq9Q_30.wav"], ["An infant crying", "./notebooks/examples/inpainting/14ekd4nkpwc_28.wav"], ["A dog barking and growling", "./notebooks/examples/inpainting/3ek-xLwr05Q_30.wav"] ], fn = load_inpaint_example, inputs = [prompt_inp, audio_in_inp], outputs = [audio_in_spec, masked_spec_preview], cache_examples = True ) audio_in_inp.upload( fn = load_input_spectrogram, inputs = [audio_in_inp], outputs = [audio_in_spec] ) audio_in_inp.stop_recording( fn = load_input_spectrogram, inputs = [audio_in_inp], outputs = [audio_in_spec] ) preview_mask_btn.click( fn = preview_masked_area, inputs = [audio_in_inp, mask_start_point, mask_end_point], outputs = [masked_spec_preview] ) submit_btn_inp.click( fn = infer_inp, inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point], outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp] ) demo.queue().launch(show_api=False, show_error=True)