Update app.py
app.py
CHANGED
@@ -127,6 +127,92 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
 
     return "output.wav", input_spec_image_path, output_spec_image_path
 
+def infer_inp(prompt, audio_path, progress=gr.Progress(track_tqdm=True)):
+
+    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
+    dtype = torch.float16
+    device = "cuda"
+
+    if not os.path.isdir(pretrained_model_name_or_path):
+        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
+
+    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
+    vocoder = vocoder.to(device=device, dtype=dtype)
+
+    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
+    pipe = pipe.to(device)
+
+    width_start, width = 256, 512
+    prompt = prompt or "A siren ringing with a vehicle speeding closer"  # fall back to an example prompt if the textbox is empty
+    seed = 42
+
+    # Loading
+    audio, sampling_rate = load_wav(audio_path)
+    audio, spec = get_mel_spectrogram_from_audio(audio)
+    norm_spec = normalize_spectrogram(spec)
+    norm_spec = pad_spec(norm_spec, 1024)
+    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
+
+    raw_image = image_add_color(torch_to_pil(norm_spec))
+
+    # Add mask
+    mask = torch.zeros_like(norm_spec)[:1, ...]
+    mask[:, :, width_start:width_start + width] = 1
+    mask_image = torch_to_pil(mask)
+
+    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
+    masked_spec_image = torch_to_pil(masked_spec)
+
+    # Color the masked spec and paint the masked area black
+    color_masked_spec_image = image_add_color(masked_spec_image)
+    color_masked_spec_image = np.array(color_masked_spec_image)
+    color_masked_spec_image[:, width_start:width_start + width, :] = 0
+    color_masked_spec_image = Image.fromarray(color_masked_spec_image)
+
+    # Generation
+    generator = torch.Generator(device=device).manual_seed(seed)
+
+    with torch.autocast("cuda"):
+        output_spec = pipe(
+            prompt=prompt, image=norm_spec, mask_image=mask, num_inference_steps=100, generator=generator, height=256, width=1024, output_type="pt"
+        ).images[0]
+
+    output_spec_image = torch_to_pil(output_spec)
+    color_output_spec_image = image_add_color(output_spec_image)
+
+    # Display audio result: raw audio, masked raw audio, generated audio
+    post_norm_spec = denormalize(norm_spec).to(device, dtype)
+    raw_chunk_spec = denormalize_spectrogram(post_norm_spec)
+    raw_chunk_audio = vocoder.inference(raw_chunk_spec)
+
+    post_masked_spec = denormalize(masked_spec).to(device, dtype)
+    denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
+    denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
+
+    denorm_spec = denormalize_spectrogram(output_spec)
+    denorm_spec_audio = vocoder.inference(denorm_spec)
+
+    # ---
+
+    # Ensure correct shape
+    denorm_spec_audio = denorm_spec_audio.flatten()  # converts (1, N) -> (N,)
+
+    # Normalize the audio to prevent clipping or excessive loudness
+    denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio))  # scale to [-1, 1]
+
+    # Save as WAV
+    sf.write("output.wav", denorm_spec_audio, 16000)
+
+    # Save input spectrogram image (currently disabled)
+    #input_spec_image_path = "input_spectrogram.png"
+    #raw_image.save(input_spec_image_path)
+
+    # Save concatenated spectrogram image (currently disabled; concat_image is not defined in this function)
+    #output_spec_image_path = "output_spectrogram.png"
+    #concat_image.save(output_spec_image_path)
+
+    return "output.wav"
+
 css="""
 div#col-container{
     margin: 0 auto;
@@ -185,14 +271,7 @@ with gr.Blocks(css=css) as demo:
                     input_spectrogram = gr.Image(label="Input Spectrogram")
                     output_spectrogram = gr.Image(label="Output Spectrogram")
 
-
-            examples = [
-                "Rolling thunder with lightning strikes",
-                "Two gunshots followed by birds chirping",
-                "A train whistle blowing in the distance"
-            ],
-            inputs = [prompt_img2img]
-        )
+
 
             submit_btn_img2img.click(
                 fn = infer_img2img,
@@ -200,4 +279,23 @@ with gr.Blocks(css=css) as demo:
                 outputs = [audio_out_img2img, input_spectrogram, output_spectrogram]
             )
 
+        with gr.Tab("Audio InPainting"):
+            prompt_inp = gr.Textbox(label="Prompt")
+            audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
+            submit_btn_inp = gr.Button("Submit")
+            audio_out_inp = gr.Audio(label="Audio Result")
+
+            with gr.Accordion("Compare Spectrograms", open=False):
+                with gr.Column():
+                    input_spectrogram_inp = gr.Image(label="Input Spectrogram")
+                    output_spectrogram_inp = gr.Image(label="Output Spectrogram")
+
+
+
+            submit_btn_inp.click(
+                fn = infer_inp,
+                inputs = [prompt_inp, audio_in_inp],
+                outputs = [audio_out_inp]
+            )
+
 demo.queue().launch(show_api=False, show_error=True)
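A note on the masking step in infer_inp: the binary mask marks the block of time frames (columns width_start through width_start + width) that the diffusion pipeline is asked to regenerate, while the surrounding frames are kept as context. Below is a minimal standalone sketch of that logic with illustrative tensor shapes; it is not part of app.py, and Auffusion's prepare_mask_and_masked_image helper is assumed to behave roughly like the last line.

import torch

# Illustrative stand-in for the padded, normalized spectrogram: (channels, mel bins, time frames).
norm_spec = torch.rand(3, 256, 1024) * 2 - 1       # random values in [-1, 1]
width_start, width = 256, 512                      # same window as in infer_inp

# Single-channel binary mask: 1 = frames to inpaint, 0 = frames to keep.
mask = torch.zeros_like(norm_spec)[:1, ...]        # shape (1, 256, 1024)
mask[:, :, width_start:width_start + width] = 1

# The masked conditioning image keeps only the unmasked context.
masked_spec = norm_spec * (1 - mask)
print(mask.shape, masked_spec.shape)               # (1, 256, 1024) and (3, 256, 1024)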
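The tail of infer_inp converts the generated mel spectrogram back to audio with the vocoder, then flattens, peak-normalizes, and writes a 16 kHz WAV. A self-contained sketch of just that post-processing, with a random array standing in for the vocoder output (numpy and soundfile correspond to the np/sf names used by app.py):

import numpy as np
import soundfile as sf

# Stand-in for vocoder.inference(...): a (1, N) float array.
denorm_spec_audio = np.random.randn(1, 160000).astype(np.float32)

denorm_spec_audio = denorm_spec_audio.flatten()                             # (1, N) -> (N,)
denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio))  # peak-normalize to [-1, 1]
sf.write("output.wav", denorm_spec_audio, 16000)                            # 16 kHz, matching the app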
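On the new tab's wiring: submit_btn_inp.click maps the handler's return values one-to-one onto the components in outputs, so infer_inp returns a single audio file path for audio_out_inp (input_spectrogram_inp and output_spectrogram_inp are defined but not connected to any output yet). A minimal, hypothetical example of the same pattern; the component and function names below are illustrative, not from app.py:

import gradio as gr

def fake_infer(prompt, audio_path):
    # A real handler would synthesize audio here; this stub just echoes the reference file.
    return audio_path

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    audio_in = gr.Audio(label="Audio Reference", type="filepath")
    run_btn = gr.Button("Submit")
    audio_out = gr.Audio(label="Audio Result")

    # One return value per component listed in outputs.
    run_btn.click(fn=fake_infer, inputs=[prompt_box, audio_in], outputs=[audio_out])

demo.launch()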