alibabasglab committed on
Commit
b7f0660
·
verified ·
1 Parent(s): 013095c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -4
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  import spaces
5
  from clearvoice import ClearVoice
6
  import os
 
7
 
8
  @spaces.GPU
9
  def fn_clearvoice_se(input_wav, sr):
@@ -19,7 +20,7 @@ def fn_clearvoice_se(input_wav, sr):
19
  output_wav = output_wav_dict[key]
20
  else:
21
  output_wav = output_wav_dict
22
- sf.write('enhanced.wav', output_wav, fs)
23
  return 'enhanced.wav'
24
 
25
  @spaces.GPU
@@ -35,8 +36,8 @@ def fn_clearvoice_ss(input_wav):
35
  output_wav_list = output_wav_dict
36
  output_wav_s1 = output_wav_list[0]
37
  output_wav_s2 = output_wav_list[1]
38
- sf.write('separated_s1.wav', output_wav_s1, 16000)
39
- sf.write('separated_s2.wav', output_wav_s2, 16000)
40
  return "separated_s1.wav", "separated_s2.wav"
41
 
42
  def find_mp4_files(directory):
@@ -62,7 +63,27 @@ def fn_clearvoice_tse(input_video):
62
  output_list = find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/')
63
 
64
  return output_list
 
 
 
 
 
 
 
 
 
 
 
65
 
 
 
 
 
 
 
 
 
 
66
  demo = gr.Blocks()
67
 
68
  se_demo = gr.Interface(
@@ -129,7 +150,30 @@ tse_demo = gr.Interface(
129
  cache_examples = True,
130
  )
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  with demo:
133
- gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Audio-Visual Speaker Extraction"])
134
 
135
  demo.launch()
 
4
  import spaces
5
  from clearvoice import ClearVoice
6
  import os
7
+ import random
8
 
9
  @spaces.GPU
10
  def fn_clearvoice_se(input_wav, sr):
 
20
  output_wav = output_wav_dict[key]
21
  else:
22
  output_wav = output_wav_dict
23
+ sf.write('enhanced.wav', output_wav[0,:], fs)
24
  return 'enhanced.wav'
25
 
26
  @spaces.GPU
 
36
  output_wav_list = output_wav_dict
37
  output_wav_s1 = output_wav_list[0]
38
  output_wav_s2 = output_wav_list[1]
39
+ sf.write('separated_s1.wav', output_wav_s1[0,:], 16000)
40
+ sf.write('separated_s2.wav', output_wav_s2[0,:], 16000)
41
  return "separated_s1.wav", "separated_s2.wav"
42
 
43
  def find_mp4_files(directory):
 
63
  output_list = find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/')
64
 
65
  return output_list
66
+
67
+ @spaces.GPU
68
+ def fn_clearvoice_sr(input_wav, apply_se):
69
+ wavname = input_wav.split('/')[-1]
70
+ myClearVoice = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])
71
+ fs = 48000
72
+ if apply_se:
73
+ new_wavname = wavname.replace('.wav', str(random.randint(0,1000))+'.wav')
74
+ myClearVoice_se = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
75
+ myClearVoice_se(input_path=input_wav, online_write=True, output_path=new_wavname)
76
+ input_wav = new_wavname
77
 
78
+ output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
79
+ if isinstance(output_wav_dict, dict):
80
+ key = next(iter(output_wav_dict))
81
+ output_wav = output_wav_dict[key]
82
+ else:
83
+ output_wav = output_wav_dict
84
+ sf.write('enhanced_high_res.wav', output_wav[0,:], fs)
85
+ return 'enhanced_high_res.wav'
86
+
87
  demo = gr.Blocks()
88
 
89
  se_demo = gr.Interface(
 
150
  cache_examples = True,
151
  )
152
 
153
+ sr_demo = gr.Interface(
154
+ fn=fn_clearvoice_sr,
155
+ inputs = [
156
+ gr.Audio(label="Input Audio", type="filepath"),
157
+ gr.Checkbox(label="Apply Speech Enhancement", value=True),
158
+ ],
159
+ outputs = [
160
+ gr.Audio(label="Output Audio", type="filepath"),
161
+ ],
162
+ title = "<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Speech Super Resolution",
163
+ description = ("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most of audio types. "
164
+ "To try it, simply upload your audio, or click one of the examples. "),
165
+ article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
166
+ "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
167
+ examples = [
168
+ ["examples/mandarin_speech_16kHz.wav", True],
169
+ ["examples/LJSpeech-001-0001-22k.wav", True],
170
+ ["examples/LibriTTS_986_129388_24k.wav", True],
171
+ ["examples/english_speech_48kHz.wav", True],
172
+ ],
173
+ cache_examples = True,
174
+ )
175
+
176
  with demo:
177
+ gr.TabbedInterface([se_demo, ss_demo, sr_demo, tse_demo], ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Speech Super Resolution", "Task 4: Audio-Visual Speaker Extraction"])
178
 
179
  demo.launch()