Update app.py
app.py CHANGED
@@ -37,18 +37,6 @@ def resample_audio(input_audio, original_sr, target_sr=16000):
     # If sample rate is already 16000, no resampling is needed
     return input_audio
 
-def convert_wav_to_16khz(input_path, output_path):
-    with wave.open(input_path, "rb") as wav_in:
-        params = wav_in.getparams()
-        channels, sampwidth, framerate, nframes = params[:4]
-
-        # Read and convert audio data
-        audio_data = np.frombuffer(wav_in.readframes(nframes), dtype=np.int16)
-    new_framerate = 16000
-
-    # Save as a new WAV file
-    write(output_path, new_framerate, audio_data)
-    return output_path
 
 def save_spectrogram_image(spectrogram, filename):
     """Save a spectrogram as an image."""
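A note on the deleted helper: convert_wav_to_16khz never truly resampled. It wrote the original samples back under a 16 kHz header, which relabels the rate and changes playback speed and pitch instead of converting the audio. If a standalone converter is ever needed again, here is a minimal sketch using librosa and soundfile (both already used by this app; the function name is just illustrative):

import librosa
import soundfile as sf

def wav_to_16khz(input_path, output_path, target_sr=16000):
    # Load at the file's native rate, then resample the samples themselves
    audio, original_sr = librosa.load(input_path, sr=None)
    resampled = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
    # Write the resampled samples with a header that matches the new rate
    sf.write(output_path, resampled, target_sr)
    return output_path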
@@ -58,58 +46,6 @@ def save_spectrogram_image(spectrogram, filename):
     plt.savefig(filename, bbox_inches='tight', pad_inches=0)
     plt.close()
 
-def debug_spectrogram(audio, spec, label="Current File"):
-    print(f"==== [{label}] ====")
-    print(f"🔹 Raw Audio Min/Max: {audio.min()}, {audio.max()}")
-    print(f"🔹 Spectrogram Min/Max Before Normalization: {spec.min()}, {spec.max()}")
-    print(f"🔹 Spectrogram Mean Before Normalization: {spec.mean()}")
-
-    normalized_spec = normalize_spectrogram(spec)
-
-    print(f"🔹 Spectrogram Min/Max After Normalization: {normalized_spec.min()}, {normalized_spec.max()}")
-    print(f"🔹 Spectrogram Mean After Normalization: {normalized_spec.mean()}")
-
-    return normalized_spec
-
-def extract_pitch(y, sr, hop_length=512):
-    # Use librosa's pyin method to estimate the pitch (fundamental frequency)
-    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C1'), fmax=librosa.note_to_hz('C8'))
-
-    # Calculate the mean pitch (only for voiced segments)
-    f0_mean = np.mean(f0[voiced_flag]) if len(f0[voiced_flag]) > 0 else None
-    return f0_mean
-
-def compare_pitch(original_audio, processed_audio, sr=16000):
-    # Extract pitch from the original and processed audio
-    pitch_original = extract_pitch(original_audio, sr)
-    pitch_processed = extract_pitch(processed_audio, sr)
-
-    if pitch_original is not None and pitch_processed is not None:
-        pitch_diff = pitch_original - pitch_processed
-        print(f"Original Pitch: {pitch_original} Hz")
-        print(f"Processed Pitch: {pitch_processed} Hz")
-        print(f"Pitch Difference: {pitch_diff} Hz")
-    else:
-        print("Could not extract pitch from one of the signals.")
-
-def adjust_spectrogram_mean(spec, target_mean=-5.0):
-    # Calculate the current mean of the spectrogram
-    current_mean = spec.mean().item()
-
-    # If the current mean is below the target mean, shift the values up
-    if current_mean < target_mean:
-        shift_value = target_mean - current_mean
-        print(f"Current mean: {current_mean}. Shifting by: {shift_value}")
-
-        # Shift the entire spectrogram by the calculated shift value
-        adjusted_spec = spec + shift_value
-
-        # Ensure that the adjusted values are still valid (in the expected range)
-        adjusted_spec = torch.clamp(adjusted_spec, min=0.0)  # Optional: prevent negative values if needed
-        return adjusted_spec
-    else:
-        print(f"Current mean: {current_mean}. No adjustment needed.")
-        return spec
 
 def infer(prompt, progress=gr.Progress(track_tqdm=True)):
     pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
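Also worth flagging as these pitch helpers leave the codebase: extract_pitch accepted sr and hop_length but never forwarded them to librosa.pyin, which defaults to sr=22050, so the app's 16 kHz audio was analyzed at the wrong rate. A corrected sketch, in case the helper returns:

import librosa
import numpy as np

def extract_pitch(y, sr, hop_length=512):
    # Forward sr and hop_length so pyin analyzes at the true sample rate
    f0, voiced_flag, voiced_probs = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C1'),
        fmax=librosa.note_to_hz('C8'),
        sr=sr,
        hop_length=hop_length,
    )
    voiced = f0[voiced_flag]
    # Mean pitch over voiced frames only; None if no frame was voiced
    return float(np.mean(voiced)) if voiced.size > 0 else None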
@@ -151,16 +87,6 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
     audio, sampling_rate = load_wav(audio_path)
     audio, spec = get_mel_spectrogram_from_audio(audio)
 
-    # Check if the spectrogram mean before normalization is too low
-    spec_mean_before = spec.mean().item()
-
-    # Apply fix only if the spectrogram mean is too low
-    if spec_mean_before < -5.0:
-        print(f"⚠️ Spectrogram too low (Mean: {spec_mean_before}).")
-    else:
-        print(f"✅ Spectrogram looks normal (Mean: {spec_mean_before}). No boost needed.")
-
-
     # Normalize the spectrogram
     norm_spec = normalize_spectrogram(spec)
 
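For the record, the deleted check only printed a warning; it never called the (also deleted) adjust_spectrogram_mean, so quiet spectrograms were flagged but left untouched. Had the boost been the intent, the wiring would have looked roughly like this (hypothetical, reusing the deleted helper):

# Hypothetical wiring: actually apply the boost the warning implied
spec_mean_before = spec.mean().item()
if spec_mean_before < -5.0:
    print(f"Spectrogram mean {spec_mean_before} is below -5.0; boosting.")
    spec = adjust_spectrogram_mean(spec, target_mean=-5.0)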
@@ -233,17 +159,16 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
     output_spec_image_path = "output_spectrogram.png"
     concat_image.save(output_spec_image_path)
 
-    # ——
-    original_audio, sr = librosa.load(audio_path, sr=None)
-    processed_audio, sr = librosa.load("output.wav", sr=None)
-
-    compare_pitch(original_audio, processed_audio)
-
     return "output.wav", input_spec_image_path, output_spec_image_path
 
 def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
 
-
+    # Load your audio file
+    input_audio, original_sr = librosa.load(audio_path, sr=None)  # Load with original sampling rate
+    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
+    # Save the resampled audio to a new file
+    sf.write('resampled_audio.wav', resampled_audio, 16000)
+    audio_path = 'resampled_audio.wav'
 
     pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
     dtype = torch.float16
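One caveat on the new resampling block: it writes to the fixed name 'resampled_audio.wav', so two concurrent Gradio requests can clobber each other's files. A per-request temporary file avoids that (sketch, using the same sf writer):

import os
import tempfile

# Write the resampled audio to a unique per-request path instead of a shared name
fd, tmp_path = tempfile.mkstemp(suffix=".wav")
os.close(fd)  # sf.write reopens the path itself
sf.write(tmp_path, resampled_audio, 16000)
audio_path = tmp_path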
@@ -263,14 +188,9 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
     seed = 42
 
     # Loading
-    audio, sampling_rate = load_wav(audio_path)
-    print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
-
+    audio, sampling_rate = load_wav(audio_path)
     audio, spec = get_mel_spectrogram_from_audio(audio)
-    print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
-
     norm_spec = normalize_spectrogram(spec)
-    print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
 
     norm_spec = pad_spec(norm_spec, 1024)
     norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs