fffiloni committed
Commit f081ac6 · verified · 1 Parent(s): f814738

Update app.py

Files changed (1)
  1. app.py +32 -4
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import torch, os
 import wave
+import librosa
 import numpy as np
 from scipy.io.wavfile import write
 from PIL import Image
@@ -50,6 +51,27 @@ def debug_spectrogram(audio, spec, label="Current File"):
 
     return normalized_spec
 
+def extract_pitch(y, sr, hop_length=512):
+    # Use librosa's yin method to estimate the pitch (fundamental frequency)
+    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C1'), fmax=librosa.note_to_hz('C8'))
+
+    # Calculate the mean pitch (only for voiced segments)
+    f0_mean = np.mean(f0[voiced_flag]) if len(f0[voiced_flag]) > 0 else None
+    return f0_mean
+
+def compare_pitch(original_audio, processed_audio, sr=16000):
+    # Extract pitch from the original and processed audio
+    pitch_original = extract_pitch(original_audio, sr)
+    pitch_processed = extract_pitch(processed_audio, sr)
+
+    if pitch_original is not None and pitch_processed is not None:
+        pitch_diff = pitch_original - pitch_processed
+        print(f"Original Pitch: {pitch_original} Hz")
+        print(f"Processed Pitch: {pitch_processed} Hz")
+        print(f"Pitch Difference: {pitch_diff} Hz")
+    else:
+        print("Could not extract pitch from one of the signals.")
+
 def infer(prompt, progress=gr.Progress(track_tqdm=True)):
     pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
     prompt = prompt
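Note on the new helper: extract_pitch accepts sr and hop_length but never forwards them to librosa.pyin, so pyin falls back to its default sr=22050 and, for the 16 kHz audio this app works with, the reported f0 values are scaled by roughly 22050/16000 (about 1.38x). A minimal corrected sketch (assuming librosa >= 0.10 and numpy; the function name is illustrative, not part of the commit):

import librosa
import numpy as np

def extract_pitch_sr_aware(y, sr, hop_length=512):
    # pyin (probabilistic YIN) returns per-frame f0 estimates, a
    # voiced/unvoiced flag per frame, and voicing probabilities.
    f0, voiced_flag, voiced_probs = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C1'),
        fmax=librosa.note_to_hz('C8'),
        sr=sr,                  # forward the actual sample rate
        hop_length=hop_length,  # forward the hop length as well
    )
    voiced = f0[voiced_flag]    # keep only frames judged voiced
    return float(np.mean(voiced)) if voiced.size > 0 else None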
@@ -91,10 +113,10 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
     # Apply fix only if the spectrogram mean is too low
     if spec_mean_before < -5.0:
         print(f"⚠️ Spectrogram too low (Mean: {spec_mean_before}).")
-        def pitch_shift_spectrogram(spec, sr=16000, n_steps=4):
-            spec = librosa.effects.pitch_shift(spec, sr, n_steps=n_steps)
-            return spec
-        spec = pitch_shift_spectrogram(spec, sr=16000, n_steps=4)
+        #def pitch_shift_spectrogram(spec, sr=16000, n_steps=4):
+        #    spec = librosa.effects.pitch_shift(spec, sr, n_steps=n_steps)
+        #    return spec
+        #spec = pitch_shift_spectrogram(spec, sr=16000, n_steps=4)
     else:
         print(f"✅ Spectrogram looks normal (Mean: {spec_mean_before}). No boost needed.")
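For reference, the block commented out here was broken on two counts: librosa.effects.pitch_shift expects a time-domain waveform rather than a spectrogram, and in librosa 0.10+ sr and n_steps are keyword-only, so the positional sr in pitch_shift(spec, sr, n_steps=n_steps) raises a TypeError. If a pitch boost is ever reinstated it would have to run on the audio samples, roughly like this sketch (librosa >= 0.10 assumed; the function name is illustrative):

import librosa

def pitch_shift_waveform(y, sr=16000, n_steps=4):
    # Shift the waveform up by n_steps semitones. This operates on raw
    # samples, not a spectrogram, and passes sr by keyword as required.
    return librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)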
 
@@ -170,6 +192,12 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
     output_spec_image_path = "output_spectrogram.png"
     concat_image.save(output_spec_image_path)
 
+    # ——
+    original_audio, sr = librosa.load(audio_path, sr=None)
+    processed_audio, sr = librosa.load("output.wav", sr=None)
+
+    compare_pitch(original_audio, processed_audio)
+
     return "output.wav", input_spec_image_path, output_spec_image_path
 
 def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
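One wrinkle in the comparison wiring above: both files are loaded with sr=None (native rate), the second load silently overwrites sr from the first, and compare_pitch is then called without sr, so it falls back to its 16000 default. A sketch that threads the actual rates through (it assumes the compare_pitch helper added in this commit is in scope, ideally together with an sr-aware extractor as sketched earlier; the function name is illustrative):

import librosa

def compare_pitch_native(original_path, processed_path):
    # Load each file at its own native rate and keep the rates separate.
    original_audio, sr_in = librosa.load(original_path, sr=None)
    processed_audio, sr_out = librosa.load(processed_path, sr=None)
    if sr_in != sr_out:
        print(f"Sample rates differ: {sr_in} Hz vs {sr_out} Hz")
    # compare_pitch is the helper added in this commit.
    compare_pitch(original_audio, processed_audio, sr=sr_in)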
 
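A quick standalone sanity check for the pyin-based extraction added in this commit (not part of the commit; assumes librosa and numpy): estimate the pitch of a synthetic 440 Hz tone and confirm the result lands near 440.

import librosa
import numpy as np

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # one second of A4

f0, voiced_flag, voiced_probs = librosa.pyin(
    tone,
    fmin=librosa.note_to_hz('C1'),
    fmax=librosa.note_to_hz('C8'),
    sr=sr,
)
print(np.nanmean(f0))  # expected: close to 440 Hz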