fffiloni committed
Commit c01e8f8 · verified · 1 Parent(s): cf62874

Update app.py

Files changed (1):
  1. app.py +7 -87
app.py CHANGED
@@ -37,18 +37,6 @@ def resample_audio(input_audio, original_sr, target_sr=16000):
     # If sample rate is already 16000, no resampling is needed
     return input_audio
 
-def convert_wav_to_16khz(input_path, output_path):
-    with wave.open(input_path, "rb") as wav_in:
-        params = wav_in.getparams()
-        channels, sampwidth, framerate, nframes = params[:4]
-
-        # Read and convert audio data
-        audio_data = np.frombuffer(wav_in.readframes(nframes), dtype=np.int16)
-        new_framerate = 16000
-
-        # Save as a new WAV file
-        write(output_path, new_framerate, audio_data)
-        return output_path
 
 def save_spectrogram_image(spectrogram, filename):
     """Save a spectrogram as an image."""
@@ -58,58 +46,6 @@ def save_spectrogram_image(spectrogram, filename):
     plt.savefig(filename, bbox_inches='tight', pad_inches=0)
     plt.close()
 
-def debug_spectrogram(audio, spec, label="Current File"):
-    print(f"==== [{label}] ====")
-    print(f"🔹 Raw Audio Min/Max: {audio.min()}, {audio.max()}")
-    print(f"🔹 Spectrogram Min/Max Before Normalization: {spec.min()}, {spec.max()}")
-    print(f"🔹 Spectrogram Mean Before Normalization: {spec.mean()}")
-
-    normalized_spec = normalize_spectrogram(spec)
-
-    print(f"🔹 Spectrogram Min/Max After Normalization: {normalized_spec.min()}, {normalized_spec.max()}")
-    print(f"🔹 Spectrogram Mean After Normalization: {normalized_spec.mean()}")
-
-    return normalized_spec
-
-def extract_pitch(y, sr, hop_length=512):
-    # Use librosa's pyin method to estimate the pitch (fundamental frequency)
-    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C1'), fmax=librosa.note_to_hz('C8'))
-
-    # Calculate the mean pitch (only for voiced segments)
-    f0_mean = np.mean(f0[voiced_flag]) if len(f0[voiced_flag]) > 0 else None
-    return f0_mean
-
-def compare_pitch(original_audio, processed_audio, sr=16000):
-    # Extract pitch from the original and processed audio
-    pitch_original = extract_pitch(original_audio, sr)
-    pitch_processed = extract_pitch(processed_audio, sr)
-
-    if pitch_original is not None and pitch_processed is not None:
-        pitch_diff = pitch_original - pitch_processed
-        print(f"Original Pitch: {pitch_original} Hz")
-        print(f"Processed Pitch: {pitch_processed} Hz")
-        print(f"Pitch Difference: {pitch_diff} Hz")
-    else:
-        print("Could not extract pitch from one of the signals.")
-
-def adjust_spectrogram_mean(spec, target_mean=-5.0):
-    # Calculate the current mean of the spectrogram
-    current_mean = spec.mean().item()
-
-    # If the current mean is below the target mean, shift the values up
-    if current_mean < target_mean:
-        shift_value = target_mean - current_mean
-        print(f"Current mean: {current_mean}. Shifting by: {shift_value}")
-
-        # Shift the entire spectrogram by the calculated shift value
-        adjusted_spec = spec + shift_value
-
-        # Ensure that the adjusted values are still valid (in the expected range)
-        adjusted_spec = torch.clamp(adjusted_spec, min=0.0)  # Optional: prevent negative values if needed
-        return adjusted_spec
-    else:
-        print(f"Current mean: {current_mean}. No adjustment needed.")
-        return spec
 
 def infer(prompt, progress=gr.Progress(track_tqdm=True)):
     pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
@@ -151,16 +87,6 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
     audio, sampling_rate = load_wav(audio_path)
     audio, spec = get_mel_spectrogram_from_audio(audio)
 
-    # Check if the spectrogram mean before normalization is too low
-    spec_mean_before = spec.mean().item()
-
-    # Apply fix only if the spectrogram mean is too low
-    if spec_mean_before < -5.0:
-        print(f"⚠️ Spectrogram too low (Mean: {spec_mean_before}).")
-    else:
-        print(f"✅ Spectrogram looks normal (Mean: {spec_mean_before}). No boost needed.")
-
-
     # Normalize the spectrogram
     norm_spec = normalize_spectrogram(spec)
 
@@ -233,17 +159,16 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
     output_spec_image_path = "output_spectrogram.png"
     concat_image.save(output_spec_image_path)
 
-    # ——
-    original_audio, sr = librosa.load(audio_path, sr=None)
-    processed_audio, sr = librosa.load("output.wav", sr=None)
-
-    compare_pitch(original_audio, processed_audio)
-
     return "output.wav", input_spec_image_path, output_spec_image_path
 
 def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
 
-    audio_path = convert_wav_to_16khz(audio_path, "output_16khz.wav")
+    # Load your audio file
+    input_audio, original_sr = librosa.load(audio_path, sr=None)  # Load with original sampling rate
+    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
+    # Save the resampled audio to a new file
+    sf.write('resampled_audio.wav', resampled_audio, 16000)
+    audio_path = 'resampled_audio.wav'
 
     pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
     dtype = torch.float16
@@ -263,14 +188,9 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
     seed = 42
 
     # Loading
-    audio, sampling_rate = load_wav(audio_path)
-    print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
-
+    audio, sampling_rate = load_wav(audio_path)
     audio, spec = get_mel_spectrogram_from_audio(audio)
-    print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
-
     norm_spec = normalize_spectrogram(spec)
-    print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
 
     norm_spec = pad_spec(norm_spec, 1024)
     norm_spec = normalize(norm_spec)  # normalize to [-1, 1], because the pipeline does not normalize torch.Tensor input
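A note on what this commit fixes: the removed convert_wav_to_16khz rewrote the WAV container with a 16 kHz frame rate without interpolating the samples, so any input not already at 16 kHz came out pitch- and speed-shifted (the removed pyin-based pitch-comparison helpers existed to debug exactly that). The replacement path resamples the waveform itself through resample_audio. Only the early-return branch of resample_audio is visible in this diff; below is a minimal sketch of a plausible implementation, assuming it wraps librosa.resample:

import librosa

def resample_audio(input_audio, original_sr, target_sr=16000):
    # Plausible implementation (assumption): interpolate the waveform to the
    # target rate instead of merely relabeling the container's frame rate.
    if original_sr == target_sr:
        # If sample rate is already 16000, no resampling is needed
        return input_audio
    return librosa.resample(input_audio, orig_sr=original_sr, target_sr=target_sr)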
 
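The preprocessing added to infer_inp can also be exercised on its own. A short usage sketch of the added lines (the input file name here is hypothetical):

import librosa
import soundfile as sf

# Load at the file's native rate, then resample to the 16 kHz the model expects
input_audio, original_sr = librosa.load("some_input.wav", sr=None)
resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)

# Persist the resampled waveform; the downstream load_wav/mel pipeline reads this file
sf.write("resampled_audio.wav", resampled_audio, 16000)
audio_path = "resampled_audio.wav"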