fffiloni committed on
Commit 57a7a6d · verified · 1 Parent(s): 5aa74d7

Update app.py

Files changed (1):
  app.py +16 -77

app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 import torch, os
-from torchvision import transforms
 import numpy as np
 from PIL import Image
 import matplotlib.pyplot as plt
@@ -128,10 +127,7 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
 
     return "output.wav", input_spec_image_path, output_spec_image_path
 
-def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqdm=True)):
-
-    if spec_with_mask:
-        print(spec_with_mask)
+def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
 
     pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
     dtype = torch.float16
@@ -146,7 +142,7 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
     pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
     pipe = pipe.to(device)
 
-    width_start, width = 256, 512
+    width_start, width = mask_start_point, mask_end_point-mask_start_point
     prompt = "A siren ringing with a vehicle speeding closer"
    seed = 42
 
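Note: the new `width_start, width` pair comes straight from the two sliders with no ordering or bounds check, so dragging the start slider past the end slider yields a negative width and an empty (no-op) mask; also note that the hardcoded siren prompt in the context line above still overrides the user's `prompt` argument. A minimal guard sketch — `mask_bounds` is a hypothetical helper, not part of the commit:

```python
# Hypothetical helper, not in the commit: the diff computes
# width = mask_end_point - mask_start_point directly.
def mask_bounds(start: int, end: int, spec_width: int = 1024) -> tuple[int, int]:
    """Clamp both points to the padded spectrogram width and order them."""
    lo, hi = sorted((max(0, min(spec_width, start)), max(0, min(spec_width, end))))
    return lo, hi - lo  # (width_start, width)

width_start, width = mask_bounds(256, 768)  # -> (256, 512), the old hardcoded values
```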
@@ -160,24 +156,11 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
     raw_image = image_add_color(torch_to_pil(norm_spec))
 
     # Add Mask
-    #mask = torch.zeros_like(norm_spec)[:1,...]
-    #mask[:, :, width_start:width_start+width] = 1
-    #mask_image = torch_to_pil(mask)
-
-    # Load the mask image (input from user)
-    mask_pil = spec_with_mask['layers'][0]
-
-    # Convert to tensor and normalize
-    mask_tensor = transforms.ToTensor()(mask_pil) # Shape: (1, H, W), values in [0, 1]
-
-    # Ensure the shape matches expected input (add batch dimension if needed)
-    mask_tensor = mask_tensor[:1, :, :] # Keep only one channel (grayscale)
-    mask_tensor = mask_tensor.to(device, dtype) # Send to correct device and dtype
+    mask = torch.zeros_like(norm_spec)[:1,...]
+    mask[:, :, width_start:width_start+width] = 1
+    mask_image = torch_to_pil(mask)
 
-    # Convert to PIL image if needed for visualization
-    mask_image = torch_to_pil(mask_tensor)
-
-    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask_tensor)
+    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
     masked_spec_image = torch_to_pil(masked_spec)
 
     # color masked spec and paint masked area to black
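Note: this hunk drops the hand-drawn ImageEditor mask and reinstates the previously commented-out rectangular time-band mask. A minimal sketch of its semantics, assuming a (C, H, W) spectrogram padded to 1024 frames (shapes inferred from `pad_spec(norm_spec, 1024)`; the channel/height values here are stand-ins):

```python
import torch

# Stand-in spectrogram; the real tensor comes from the mel pipeline.
norm_spec = torch.zeros(3, 256, 1024)
width_start, width = 256, 512  # old defaults, now driven by the sliders

mask = torch.zeros_like(norm_spec)[:1, ...]      # single-channel, same H x W
mask[:, :, width_start:width_start + width] = 1  # 1 marks the band to repaint

# Under the usual diffusers inpainting convention, mask == 1 regions are
# regenerated from the prompt and mask == 0 regions are kept from the input,
# so this selects one time window of the spectrogram for inpainting.
assert mask.shape == (1, 256, 1024) and mask.sum() == 256 * 512
```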
@@ -221,47 +204,15 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
     sf.write("output.wav", denorm_spec_audio, 16000)
 
     # Save input spectrogram image
-    #input_spec_image_path = "input_spectrogram.png"
-    #raw_image.save(input_spec_image_path)
-
-    # Save concatenated spectrogram image
-    #output_spec_image_path = "output_spectrogram.png"
-    #denorm_spec_audio.save(output_spec_image_path)
-
-    return "output.wav"
-
-def create_transparent_layer(image_path):
-    """Creates a transparent PNG with the same size as the background image."""
-    background = Image.open(image_path)
-    transparent_layer = Image.new("RGBA", background.size, (0, 0, 0, 0))
-
-    layer_path = "layer_one.png"
-    transparent_layer.save(layer_path)
-    return layer_path
-
-def load_spec_for_manual_masking(audio_path):
-    # Loading
-    audio, sampling_rate = load_wav(audio_path)
-    audio, spec = get_mel_spectrogram_from_audio(audio)
-    norm_spec = normalize_spectrogram(spec)
-    norm_spec = pad_spec(norm_spec, 1024)
-    norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
-
-    raw_image = image_add_color(torch_to_pil(norm_spec))
-
     input_spec_image_path = "input_spectrogram.png"
     raw_image.save(input_spec_image_path)
 
-    # Create transparent layer
-    layer_one_path = create_transparent_layer(input_spec_image_path)
-
-    # Return as EditorValue
-    return {
-        "background": input_spec_image_path,
-        "layers": [layer_one_path],
-        "composite": None
-    }
-
+    # Save output spectrogram image
+    output_spec_image_path = "output_spectrogram.png"
+    color_output_spec_image.save(output_spec_image_path)
+
+    return "output.wav", input_spec_image_path, color_output_spec_image
+
 
 css="""
 div#col-container{
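Note: the inpainting path now mirrors `infer_img2img`'s three return values, but mixes a file path for the input spectrogram with an in-memory PIL image for the output. `gr.Image` accepts both, though returning the freshly saved path would keep the two symmetric — a sketch of that hypothetical variant, not the commit:

```python
from PIL import Image

def save_and_return(raw_image: Image.Image, color_output_spec_image: Image.Image):
    """Hypothetical variant: return file paths for both spectrograms."""
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)

    output_spec_image_path = "output_spectrogram.png"
    color_output_spec_image.save(output_spec_image_path)

    return "output.wav", input_spec_image_path, output_spec_image_path
```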
@@ -332,8 +283,8 @@ with gr.Blocks(css=css) as demo:
         with gr.Tab("Audio InPainting"):
             prompt_inp = gr.Textbox(label="Prompt")
             audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
-            brush = gr.Brush(colors=["#000"], color_mode="fixed")
-            spec_for_mask = gr.ImageEditor(label="Draw Mask", type="pil", brush=brush, interactive=True, layers=False)
+            mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256)
+            mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768)
             submit_btn_inp = gr.Button("Submit")
             audio_out_inp = gr.Audio(label="Audio Ressult")
@@ -341,23 +292,11 @@ with gr.Blocks(css=css) as demo:
             with gr.Column():
                 input_spectrogram_inp = gr.Image(label="Input Spectrogram")
                 output_spectrogram_inp = gr.Image(label="Output Spectrogram")
-
-        audio_in_inp.upload(
-            fn = load_spec_for_manual_masking,
-            inputs = [audio_in_inp],
-            outputs = [spec_for_mask]
-        )
-
-        spec_for_mask.clear(
-            fn = load_spec_for_manual_masking,
-            inputs = [audio_in_inp],
-            outputs = [spec_for_mask]
-        )
 
         submit_btn_inp.click(
             fn = infer_inp,
-            inputs = [prompt_inp, audio_in_inp, spec_for_mask],
-            outputs = [audio_out_inp]
+            inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
+            outputs = [audio_out_inp, input_spectrogram, output_spectrogram]
         )
 
 demo.queue().launch(show_api=False, show_error=True)
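Note: the new `outputs` list names `input_spectrogram` and `output_spectrogram`, while the components defined in this tab are `input_spectrogram_inp` and `output_spectrogram_inp`; presumably the unsuffixed names belong to the img2img tab, so the inpainting results would render there. A sketch of the presumed intent (not the commit), with a stand-in for `infer_inp` so it runs on its own:

```python
import gradio as gr

with gr.Blocks() as demo:
    prompt_inp = gr.Textbox(label="Prompt")
    audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
    mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256)
    mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768)
    submit_btn_inp = gr.Button("Submit")
    audio_out_inp = gr.Audio(label="Audio Result")
    input_spectrogram_inp = gr.Image(label="Input Spectrogram")
    output_spectrogram_inp = gr.Image(label="Output Spectrogram")

    submit_btn_inp.click(
        fn=lambda *args: (None, None, None),  # stand-in for infer_inp
        inputs=[prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
        outputs=[audio_out_inp, input_spectrogram_inp, output_spectrogram_inp],  # this tab's components
    )
```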