fffiloni committed
Commit cf62874 (verified)
Parent: 9b94d3b

Update app.py

Files changed (1): app.py (+27 -2)
app.py CHANGED

@@ -17,6 +17,26 @@ from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormal
 from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize, prepare_mask_and_masked_image
 
 # ——
+def resample_audio(input_audio, original_sr, target_sr=16000):
+    """
+    Resample the audio to the target sample rate (16000 Hz by default).
+
+    Args:
+    - input_audio (numpy array): The raw audio data.
+    - original_sr (int): The original sample rate of the input audio.
+    - target_sr (int): The target sample rate (default is 16000 Hz).
+
+    Returns:
+    - numpy array: The resampled audio.
+    """
+    if original_sr != target_sr:
+        # Resample the audio using librosa
+        audio_resampled = librosa.resample(input_audio, orig_sr=original_sr, target_sr=target_sr)
+        return audio_resampled
+    else:
+        # If sample rate is already 16000, no resampling is needed
+        return input_audio
+
 def convert_wav_to_16khz(input_path, output_path):
     with wave.open(input_path, "rb") as wav_in:
         params = wav_in.getparams()
@@ -102,8 +122,13 @@ def infer(prompt, progress=gr.Progress(track_tqdm=True)):
 
 def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
 
-    audio_path = convert_wav_to_16khz(audio_path, "output_16khz.wav")
-
+    # Load your audio file
+    input_audio, original_sr = librosa.load(audio_path, sr=None)  # Load with original sampling rate
+    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
+    # Save the resampled audio to a new file
+    sf.write('resampled_audio.wav', resampled_audio, 16000)
+    audio_path = 'resampled_audio.wav'
+
     pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
     dtype = torch.float16
     device = "cuda"
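
For context, the sketch below is a minimal, standalone version of the resampling path this commit introduces, not the commit itself: it assumes librosa and soundfile are importable (the diff does not show app.py's import section) and uses "input.wav" as a placeholder input path.

# Minimal sketch of the 16 kHz resampling flow added in this commit.
# Assumes librosa and soundfile are installed; "input.wav" is a placeholder path.
import librosa
import soundfile as sf

def resample_audio(input_audio, original_sr, target_sr=16000):
    # Return the audio resampled to target_sr; pass it through unchanged
    # if the sample rates already match.
    if original_sr != target_sr:
        return librosa.resample(input_audio, orig_sr=original_sr, target_sr=target_sr)
    return input_audio

# Load at the file's native rate (sr=None), resample to 16 kHz, and write the
# result so the rest of the pipeline always receives 16 kHz audio.
audio, sr = librosa.load("input.wav", sr=None)
audio_16k = resample_audio(audio, sr, target_sr=16000)
sf.write("resampled_audio.wav", audio_16k, 16000)

Compared with the wave-based convert_wav_to_16khz helper, librosa.load plus librosa.resample decodes a wider range of formats and sample rates (and mixes down to mono by default), which is presumably why the commit routes infer_img2img through this path instead.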