Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
import torch, os
|
3 |
-
from torchvision import transforms
|
4 |
import numpy as np
|
5 |
from PIL import Image
|
6 |
import matplotlib.pyplot as plt
|
@@ -128,10 +127,7 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
|
|
128 |
|
129 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
130 |
|
131 |
-
def infer_inp(prompt, audio_path,
|
132 |
-
|
133 |
-
if spec_with_mask:
|
134 |
-
print(spec_with_mask)
|
135 |
|
136 |
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
|
137 |
dtype = torch.float16
|
@@ -146,7 +142,7 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
|
|
146 |
pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
|
147 |
pipe = pipe.to(device)
|
148 |
|
149 |
-
width_start, width =
|
150 |
prompt = "A siren ringing with a vehicle speeding closer"
|
151 |
seed = 42
|
152 |
|
@@ -160,24 +156,11 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
|
|
160 |
raw_image = image_add_color(torch_to_pil(norm_spec))
|
161 |
|
162 |
# Add Mask
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
# Load the mask image (input from user)
|
168 |
-
mask_pil = spec_with_mask['layers'][0]
|
169 |
-
|
170 |
-
# Convert to tensor and normalize
|
171 |
-
mask_tensor = transforms.ToTensor()(mask_pil) # Shape: (1, H, W), values in [0, 1]
|
172 |
-
|
173 |
-
# Ensure the shape matches expected input (add batch dimension if needed)
|
174 |
-
mask_tensor = mask_tensor[:1, :, :] # Keep only one channel (grayscale)
|
175 |
-
mask_tensor = mask_tensor.to(device, dtype) # Send to correct device and dtype
|
176 |
|
177 |
-
|
178 |
-
mask_image = torch_to_pil(mask_tensor)
|
179 |
-
|
180 |
-
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask_tensor)
|
181 |
masked_spec_image = torch_to_pil(masked_spec)
|
182 |
|
183 |
# color masked spec and paint masked area to black
|
@@ -221,47 +204,15 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
|
|
221 |
sf.write("output.wav", denorm_spec_audio, 16000)
|
222 |
|
223 |
# Save input spectrogram image
|
224 |
-
#input_spec_image_path = "input_spectrogram.png"
|
225 |
-
#raw_image.save(input_spec_image_path)
|
226 |
-
|
227 |
-
# Save concatenated spectrogram image
|
228 |
-
#output_spec_image_path = "output_spectrogram.png"
|
229 |
-
#denorm_spec_audio.save(output_spec_image_path)
|
230 |
-
|
231 |
-
return "output.wav"
|
232 |
-
|
233 |
-
def create_transparent_layer(image_path):
|
234 |
-
"""Creates a transparent PNG with the same size as the background image."""
|
235 |
-
background = Image.open(image_path)
|
236 |
-
transparent_layer = Image.new("RGBA", background.size, (0, 0, 0, 0))
|
237 |
-
|
238 |
-
layer_path = "layer_one.png"
|
239 |
-
transparent_layer.save(layer_path)
|
240 |
-
return layer_path
|
241 |
-
|
242 |
-
def load_spec_for_manual_masking(audio_path):
|
243 |
-
# Loading
|
244 |
-
audio, sampling_rate = load_wav(audio_path)
|
245 |
-
audio, spec = get_mel_spectrogram_from_audio(audio)
|
246 |
-
norm_spec = normalize_spectrogram(spec)
|
247 |
-
norm_spec = pad_spec(norm_spec, 1024)
|
248 |
-
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
|
249 |
-
|
250 |
-
raw_image = image_add_color(torch_to_pil(norm_spec))
|
251 |
-
|
252 |
input_spec_image_path = "input_spectrogram.png"
|
253 |
raw_image.save(input_spec_image_path)
|
254 |
|
255 |
-
#
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
return
|
260 |
-
|
261 |
-
"layers": [layer_one_path],
|
262 |
-
"composite": None
|
263 |
-
}
|
264 |
-
|
265 |
|
266 |
css="""
|
267 |
div#col-container{
|
@@ -332,8 +283,8 @@ with gr.Blocks(css=css) as demo:
|
|
332 |
with gr.Tab("Audio InPainting"):
|
333 |
prompt_inp = gr.Textbox(label="Prompt")
|
334 |
audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
|
335 |
-
|
336 |
-
|
337 |
submit_btn_inp = gr.Button("Submit")
|
338 |
audio_out_inp = gr.Audio(label="Audio Ressult")
|
339 |
|
@@ -341,23 +292,11 @@ with gr.Blocks(css=css) as demo:
|
|
341 |
with gr.Column():
|
342 |
input_spectrogram_inp = gr.Image(label="Input Spectrogram")
|
343 |
output_spectrogram_inp = gr.Image(label="Output Spectrogram")
|
344 |
-
|
345 |
-
audio_in_inp.upload(
|
346 |
-
fn = load_spec_for_manual_masking,
|
347 |
-
inputs = [audio_in_inp],
|
348 |
-
outputs = [spec_for_mask]
|
349 |
-
)
|
350 |
-
|
351 |
-
spec_for_mask.clear(
|
352 |
-
fn = load_spec_for_manual_masking,
|
353 |
-
inputs = [audio_in_inp],
|
354 |
-
outputs = [spec_for_mask]
|
355 |
-
)
|
356 |
|
357 |
submit_btn_inp.click(
|
358 |
fn = infer_inp,
|
359 |
-
inputs = [prompt_inp, audio_in_inp,
|
360 |
-
outputs = [audio_out_inp]
|
361 |
)
|
362 |
|
363 |
demo.queue().launch(show_api=False, show_error=True)
|
|
|
1 |
import gradio as gr
|
2 |
import torch, os
|
|
|
3 |
import numpy as np
|
4 |
from PIL import Image
|
5 |
import matplotlib.pyplot as plt
|
|
|
127 |
|
128 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
129 |
|
130 |
+
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
131 |
|
132 |
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
|
133 |
dtype = torch.float16
|
|
|
142 |
pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
|
143 |
pipe = pipe.to(device)
|
144 |
|
145 |
+
width_start, width = mask_start_point, mask_end_point-mask_start_point
|
146 |
prompt = "A siren ringing with a vehicle speeding closer"
|
147 |
seed = 42
|
148 |
|
|
|
156 |
raw_image = image_add_color(torch_to_pil(norm_spec))
|
157 |
|
158 |
# Add Mask
|
159 |
+
mask = torch.zeros_like(norm_spec)[:1,...]
|
160 |
+
mask[:, :, width_start:width_start+width] = 1
|
161 |
+
mask_image = torch_to_pil(mask)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
+
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
|
|
|
|
|
|
|
164 |
masked_spec_image = torch_to_pil(masked_spec)
|
165 |
|
166 |
# color masked spec and paint masked area to black
|
|
|
204 |
sf.write("output.wav", denorm_spec_audio, 16000)
|
205 |
|
206 |
# Save input spectrogram image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
input_spec_image_path = "input_spectrogram.png"
|
208 |
raw_image.save(input_spec_image_path)
|
209 |
|
210 |
+
# Save output spectrogram image
|
211 |
+
output_spec_image_path = "output_spectrogram.png"
|
212 |
+
color_output_spec_image.save(output_spec_image_path)
|
213 |
+
|
214 |
+
return "output.wav", input_spec_image_path, color_output_spec_image
|
215 |
+
|
|
|
|
|
|
|
|
|
216 |
|
217 |
css="""
|
218 |
div#col-container{
|
|
|
283 |
with gr.Tab("Audio InPainting"):
|
284 |
prompt_inp = gr.Textbox(label="Prompt")
|
285 |
audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
|
286 |
+
mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256)
|
287 |
+
mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768)
|
288 |
submit_btn_inp = gr.Button("Submit")
|
289 |
audio_out_inp = gr.Audio(label="Audio Ressult")
|
290 |
|
|
|
292 |
with gr.Column():
|
293 |
input_spectrogram_inp = gr.Image(label="Input Spectrogram")
|
294 |
output_spectrogram_inp = gr.Image(label="Output Spectrogram")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
# Run inpainting when the tab's Submit button is clicked.
# infer_inp returns three values (output audio path, input spectrogram,
# output spectrogram); they are routed, in that order, to the components
# declared for the "Audio InPainting" tab. The image outputs use the
# `_inp`-suffixed components created for this tab; the un-suffixed
# `input_spectrogram` / `output_spectrogram` names are not defined in the
# visible part of the file.
# NOTE(review): presumably the un-suffixed names belong to another tab -- confirm.
submit_btn_inp.click(
    fn=infer_inp,
    inputs=[prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
    outputs=[audio_out_inp, input_spectrogram_inp, output_spectrogram_inp],
)
|
301 |
|
302 |
demo.queue().launch(show_api=False, show_error=True)
|