fffiloni committed
Commit 0a20a75 · verified · 1 Parent(s): 20c6475

Update app.py

Files changed (1):
  1. app.py +106 -8
app.py CHANGED
@@ -127,6 +127,92 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
 
     return "output.wav", input_spec_image_path, output_spec_image_path
 
+def infer_inp(prompt, audio_path, progress=gr.Progress(track_tqdm=True)):
+
+    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
+    dtype = torch.float16
+    device = "cuda"
+
+    if not os.path.isdir(pretrained_model_name_or_path):
+        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
+
+    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
+    vocoder = vocoder.to(device=device, dtype=dtype)
+
+    # Inpainting needs a pipeline that accepts mask_image; the img2img pipeline does not.
+    pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
+    pipe = pipe.to(device)
+
+    # Spectrogram columns [width_start, width_start + width) are regenerated
+    width_start, width = 256, 512
+    seed = 42
+
+    # Loading
+    audio, sampling_rate = load_wav(audio_path)
+    audio, spec = get_mel_spectrogram_from_audio(audio)
+    norm_spec = normalize_spectrogram(spec)
+    norm_spec = pad_spec(norm_spec, 1024)
+    norm_spec = normalize(norm_spec)  # scale to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
+
+    raw_image = image_add_color(torch_to_pil(norm_spec))
+
+    # Build the mask: 1 = inpaint, 0 = keep
+    mask = torch.zeros_like(norm_spec)[:1, ...]
+    mask[:, :, width_start:width_start + width] = 1
+    mask_image = torch_to_pil(mask)
+
+    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
+    masked_spec_image = torch_to_pil(masked_spec)
+
+    # Color the masked spectrogram and paint the masked area black
+    color_masked_spec_image = image_add_color(masked_spec_image)
+    color_masked_spec_image = np.array(color_masked_spec_image)
+    color_masked_spec_image[:, width_start:width_start + width, :] = 0
+    color_masked_spec_image = Image.fromarray(color_masked_spec_image)
+
+    # Generation
+    generator = torch.Generator(device=device).manual_seed(seed)
+
+    with torch.autocast("cuda"):
+        output_spec = pipe(
+            prompt=prompt, image=norm_spec, mask_image=mask, num_inference_steps=100,
+            generator=generator, height=256, width=1024, output_type="pt"
+        ).images[0]
+
+    output_spec_image = torch_to_pil(output_spec)
+    color_output_spec_image = image_add_color(output_spec_image)
+
+    # Audio results: raw audio, masked raw audio, generated audio
+    post_norm_spec = denormalize(norm_spec).to(device, dtype)
+    raw_chunk_spec = denormalize_spectrogram(post_norm_spec)
+    raw_chunk_audio = vocoder.inference(raw_chunk_spec)
+
+    post_masked_spec = denormalize(masked_spec).to(device, dtype)
+    denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
+    denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
+
+    denorm_spec = denormalize_spectrogram(output_spec)
+    denorm_spec_audio = vocoder.inference(denorm_spec)
+
+    # Ensure correct shape: (1, N) -> (N,)
+    denorm_spec_audio = denorm_spec_audio.flatten()
+
+    # Normalize the audio to prevent clipping or excessive loudness
+    denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio))
+
+    # Save as WAV
+    sf.write("output.wav", denorm_spec_audio, 16000)
+
+    # Save input spectrogram image
+    input_spec_image_path = "input_spectrogram.png"
+    raw_image.save(input_spec_image_path)
+
+    # Save output spectrogram image
+    output_spec_image_path = "output_spectrogram.png"
+    color_output_spec_image.save(output_spec_image_path)
+
+    return "output.wav", input_spec_image_path, output_spec_image_path
+
 css="""
 div#col-container{
     margin: 0 auto;
@@ -185,14 +271,7 @@ with gr.Blocks(css=css) as demo:
                 input_spectrogram = gr.Image(label="Input Spectrogram")
                 output_spectrogram = gr.Image(label="Output Spectrogram")
 
-            gr.Examples(
-                examples = [
-                    "Rolling thunder with lightning strikes",
-                    "Two gunshots followed by birds chirping",
-                    "A train whistle blowing in the distance"
-                ],
-                inputs = [prompt_img2img]
-            )
+
 
         submit_btn_img2img.click(
             fn = infer_img2img,
@@ -200,4 +279,23 @@ with gr.Blocks(css=css) as demo:
             outputs = [audio_out_img2img, input_spectrogram, output_spectrogram]
         )
 
+        with gr.Tab("Audio InPainting"):
+            prompt_inp = gr.Textbox(label="Prompt")
+            audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
+            submit_btn_inp = gr.Button("Submit")
+            audio_out_inp = gr.Audio(label="Audio Result")
+
+            with gr.Accordion("Compare Spectrograms", open=False):
+                with gr.Column():
+                    input_spectrogram_inp = gr.Image(label="Input Spectrogram")
+                    output_spectrogram_inp = gr.Image(label="Output Spectrogram")
+
+            submit_btn_inp.click(
+                fn = infer_inp,
+                inputs = [prompt_inp, audio_in_inp],
+                outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp]
+            )
+
 demo.queue().launch(show_api=False, show_error=True)
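
A note on the masking step in infer_inp: the mel spectrogram is padded to 1024 time frames and the mask marks columns 256 through 767 for regeneration, so roughly the middle half of the clip is resynthesized from the prompt while the rest is kept. Below is a minimal standalone sketch of that mask construction, torch only. The 16 kHz sample rate matches the sf.write call above; the hop length of 160 samples per frame is an assumption for illustration, not something stated in this commit.

import torch

# Same mask construction as infer_inp, isolated: 1 channel x 256 mel bins x 1024 frames.
n_mels, n_frames = 256, 1024
width_start, width = 256, 512

mask = torch.zeros(1, n_mels, n_frames)           # 0 = keep
mask[:, :, width_start:width_start + width] = 1   # 1 = regenerate

# Frame-to-seconds conversion, assuming 16 kHz audio and a hop of 160 samples
# per frame (the hop value is a guess; only the 16 kHz rate appears in app.py).
sr, hop = 16000, 160
start_s = width_start * hop / sr            # 2.56 s
end_s = (width_start + width) * hop / sr    # 7.68 s
total_s = n_frames * hop / sr               # 10.24 s
print(f"inpainting {start_s:.2f}s-{end_s:.2f}s of a {total_s:.2f}s padded clip")

Everything outside that column range passes through the pipeline unchanged, which is what makes the result an inpaint rather than a full img2img regeneration, and why the click handler only needs the prompt and the reference audio as inputs.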