Update main.py
Browse files
main.py
CHANGED
@@ -8,7 +8,6 @@ import random
|
|
8 |
import numpy as np
|
9 |
from scipy.signal.windows import hann
|
10 |
import soundfile as sf
|
11 |
-
import torch
|
12 |
import librosa
|
13 |
from audiosr import build_model, super_resolution
|
14 |
from scipy import signal
|
@@ -16,25 +15,27 @@ import pyloudnorm as pyln
|
|
16 |
import tempfile
|
17 |
import spaces
|
18 |
|
|
|
19 |
class AudioUpscaler:
|
20 |
"""
|
21 |
Upscales audio using the AudioSR model.
|
22 |
"""
|
23 |
|
24 |
-
def __init__(self, model_name="basic", device="
|
25 |
"""
|
26 |
Initializes the AudioUpscaler.
|
27 |
|
28 |
Args:
|
29 |
model_name (str, optional): Name of the AudioSR model to use. Defaults to "basic".
|
30 |
-
device (str, optional): Device to use for inference. Defaults to "
|
31 |
"""
|
32 |
|
33 |
self.model_name = model_name
|
34 |
self.device = device
|
35 |
-
self.sr =
|
36 |
self.audiosr = None # Model will be loaded in setup()
|
37 |
-
|
|
|
38 |
def setup(self):
|
39 |
"""
|
40 |
Loads the AudioSR model.
|
@@ -107,12 +108,12 @@ class AudioUpscaler:
|
|
107 |
self,
|
108 |
input_file,
|
109 |
chunk_size=5.12,
|
110 |
-
overlap=0.
|
111 |
seed=None,
|
112 |
guidance_scale=3.5,
|
113 |
ddim_steps=50,
|
114 |
multiband_ensemble=True,
|
115 |
-
input_cutoff=
|
116 |
):
|
117 |
"""
|
118 |
Processes the audio in chunks and performs upsampling.
|
@@ -130,7 +131,7 @@ class AudioUpscaler:
|
|
130 |
Returns:
|
131 |
np.ndarray: Upsampled audio data.
|
132 |
"""
|
133 |
-
|
134 |
audio, sr = librosa.load(input_file, sr=input_cutoff * 2, mono=False)
|
135 |
audio = audio.T
|
136 |
sr = input_cutoff * 2
|
@@ -141,12 +142,13 @@ class AudioUpscaler:
|
|
141 |
else:
|
142 |
audio_ch1 = audio
|
143 |
|
144 |
-
chunk_samples
|
145 |
overlap_samples = int(overlap * chunk_samples)
|
146 |
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
150 |
|
151 |
def process_chunks(audio):
|
152 |
chunks = []
|
@@ -320,7 +322,7 @@ class AudioUpscaler:
|
|
320 |
chunk_size=10.24,
|
321 |
seed=None,
|
322 |
multiband_ensemble=True,
|
323 |
-
input_cutoff=
|
324 |
):
|
325 |
"""
|
326 |
Upscales the audio and saves the result.
|
@@ -338,6 +340,7 @@ class AudioUpscaler:
|
|
338 |
"""
|
339 |
if seed == 0:
|
340 |
seed = random.randint(0, 2**32 - 1)
|
|
|
341 |
|
342 |
os.makedirs(output_folder, exist_ok=True)
|
343 |
waveform = self._process_audio(
|
@@ -385,7 +388,6 @@ def inference(audio_file, model_name, guidance_scale, ddim_steps, seed):
|
|
385 |
|
386 |
return (48000, waveform)
|
387 |
|
388 |
-
@spaces.GPU(duration=300)
|
389 |
def upscale_audio(
|
390 |
input_file,
|
391 |
output_folder,
|
@@ -415,6 +417,7 @@ def upscale_audio(
|
|
415 |
tuple: Upscaled audio data and sample rate.
|
416 |
"""
|
417 |
torch.cuda.empty_cache()
|
|
|
418 |
|
419 |
gc.collect()
|
420 |
upscaler = AudioUpscaler()
|
|
|
8 |
import numpy as np
|
9 |
from scipy.signal.windows import hann
|
10 |
import soundfile as sf
|
|
|
11 |
import librosa
|
12 |
from audiosr import build_model, super_resolution
|
13 |
from scipy import signal
|
|
|
15 |
import tempfile
|
16 |
import spaces
|
17 |
|
18 |
+
|
19 |
class AudioUpscaler:
|
20 |
"""
|
21 |
Upscales audio using the AudioSR model.
|
22 |
"""
|
23 |
|
24 |
+
def __init__(self, model_name="basic", device="cuda"):
|
25 |
"""
|
26 |
Initializes the AudioUpscaler.
|
27 |
|
28 |
Args:
|
29 |
model_name (str, optional): Name of the AudioSR model to use. Defaults to "basic".
|
30 |
+
device (str, optional): Device to use for inference. Defaults to "cuda".
|
31 |
"""
|
32 |
|
33 |
self.model_name = model_name
|
34 |
self.device = device
|
35 |
+
self.sr = 44100
|
36 |
self.audiosr = None # Model will be loaded in setup()
|
37 |
+
|
38 |
+
@spaces.GPU(duration=120)
|
39 |
def setup(self):
|
40 |
"""
|
41 |
Loads the AudioSR model.
|
|
|
108 |
self,
|
109 |
input_file,
|
110 |
chunk_size=5.12,
|
111 |
+
overlap=0.16,
|
112 |
seed=None,
|
113 |
guidance_scale=3.5,
|
114 |
ddim_steps=50,
|
115 |
multiband_ensemble=True,
|
116 |
+
input_cutoff=8000,
|
117 |
):
|
118 |
"""
|
119 |
Processes the audio in chunks and performs upsampling.
|
|
|
131 |
Returns:
|
132 |
np.ndarray: Upsampled audio data.
|
133 |
"""
|
134 |
+
chunk_size = random.randint(a=0, b=10)*0.08
|
135 |
audio, sr = librosa.load(input_file, sr=input_cutoff * 2, mono=False)
|
136 |
audio = audio.T
|
137 |
sr = input_cutoff * 2
|
|
|
142 |
else:
|
143 |
audio_ch1 = audio
|
144 |
|
145 |
+
chunk_samples = int(chunk_size * sr)
|
146 |
overlap_samples = int(overlap * chunk_samples)
|
147 |
|
148 |
+
|
149 |
+
output_chunk_samples = int(chunk_size * self.sr)
|
150 |
+
output_overlap_samples = int(overlap * output_chunk_samples)
|
151 |
+
enable_overlap = True if overlap > 0 else False
|
152 |
|
153 |
def process_chunks(audio):
|
154 |
chunks = []
|
|
|
322 |
chunk_size=10.24,
|
323 |
seed=None,
|
324 |
multiband_ensemble=True,
|
325 |
+
input_cutoff=8000,
|
326 |
):
|
327 |
"""
|
328 |
Upscales the audio and saves the result.
|
|
|
340 |
"""
|
341 |
if seed == 0:
|
342 |
seed = random.randint(0, 2**32 - 1)
|
343 |
+
chunk_size = random.randint(0, 10) * 0.08
|
344 |
|
345 |
os.makedirs(output_folder, exist_ok=True)
|
346 |
waveform = self._process_audio(
|
|
|
388 |
|
389 |
return (48000, waveform)
|
390 |
|
|
|
391 |
def upscale_audio(
|
392 |
input_file,
|
393 |
output_folder,
|
|
|
417 |
tuple: Upscaled audio data and sample rate.
|
418 |
"""
|
419 |
torch.cuda.empty_cache()
|
420 |
+
chunk_size = random.randint(a=0, b=10)*0.08
|
421 |
|
422 |
gc.collect()
|
423 |
upscaler = AudioUpscaler()
|