csukuangfj committed on
Commit df1c0da · 1 Parent(s): 580c413

first commit

Files changed (2)
  1. app.py +227 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,227 @@
+ #!/usr/bin/env python3
+
+ import logging
+ import random
+ import subprocess
+
+ import gradio as gr
+ import numpy as np
+ import sherpa_onnx
+ import soundfile as sf
+ from huggingface_hub import hf_hub_download
+
+ sample_rate = 16000
+
+
+ def _get_nn_model_filename(
+     repo_id: str,
+     filename: str,
+     subfolder: str = "exp",
+ ) -> str:
+     nn_model_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return nn_model_filename
+
+
+ def get_vad() -> sherpa_onnx.VoiceActivityDetector:
+     vad_model = _get_nn_model_filename(
+         repo_id="csukuangfj/vad",
+         filename="silero_vad.onnx",
+         subfolder=".",
+     )
+
+     config = sherpa_onnx.VadModelConfig()
+     config.silero_vad.model = vad_model
+     config.silero_vad.threshold = 0.5
+     config.silero_vad.min_silence_duration = 0.1
+     config.silero_vad.min_speech_duration = 0.25
+     config.sample_rate = sample_rate
+     config.silero_vad.max_speech_duration = 20  # seconds
+
+     vad = sherpa_onnx.VoiceActivityDetector(
+         config,
+         buffer_size_in_seconds=180,
+     )
+
+     return vad
+
+
+ def build_html_output(s: str, style: str = "result_item_success"):
+     return f"""
+     <div class='result'>
+         <div class='result_item {style}'>
+             {s}
+         </div>
+     </div>
+     """
+
+
+ def process_uploaded_audio_file(
+     in_filename: str,
+ ):
+     logging.warning(f"Processing audio {in_filename}")
+     if in_filename is None or in_filename == "":
+         return (
+             "",
+             build_html_output(
+                 'Please first upload a file and then click the button "Submit"',
+                 "result_item_error",
+             ),
+         )
+
+     return process_file(in_filename)
+
+
+ def process_uploaded_video_file(
+     in_filename: str,
+ ):
+     logging.warning(f"Processing video {in_filename}")
+     if in_filename is None or in_filename == "":
+         return (
+             "",
+             build_html_output(
+                 'Please first upload a file and then click the button "Submit"',
+                 "result_item_error",
+             ),
+         )
+
+     logging.warning(f"Processing uploaded video file: {in_filename}")
+
+     return process_file(in_filename)
+
+
+ def process_file(filename: str):
+     vad = get_vad()
+
+     # Decode the input to 16-bit, 16 kHz, mono PCM written to stdout
+     ffmpeg_cmd = [
+         "ffmpeg",
+         "-i",
+         filename,
+         "-f",
+         "s16le",
+         "-acodec",
+         "pcm_s16le",
+         "-ac",
+         "1",
+         "-ar",
+         str(sample_rate),
+         "-",
+     ]
+
+     process = subprocess.Popen(
+         ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
+     )
+
+     frames_per_read = int(sample_rate * 100)  # 100 seconds
+
+     window_size = 512
+
+     buffer = np.array([], dtype=np.float32)
+     all_samples = []
+     is_last = False
+
+     while True:
+         # *2 because int16_t has two bytes
+         data = process.stdout.read(frames_per_read * 2)
+         if not data:
+             if is_last:
+                 break
+             # Pad with 1 second of silence so the trailing speech segment is closed
+             is_last = True
+             data = np.zeros(sample_rate, dtype=np.int16)
+
+         samples = np.frombuffer(data, dtype=np.int16)
+         samples = samples.astype(np.float32) / 32768
+         buffer = np.concatenate([buffer, samples])
+
+         while len(buffer) > window_size:
+             vad.accept_waveform(buffer[:window_size])
+             buffer = buffer[window_size:]
+
+         if is_last:
+             vad.flush()
+
+         while not vad.empty():
+             all_samples.extend(vad.front.samples)
+             vad.pop()
+
+     suffix = random.randint(1000, 10000)
+     out_filename = f"{filename}-{suffix}.wav"
+
+     speech_samples = np.array(all_samples, dtype=np.float32)
+     sf.write(out_filename, speech_samples, samplerate=sample_rate)
+
+     return (
+         out_filename,
+         build_html_output(
+             "Done! Please download the generated .wav file", "result_item_success"
+         ),
+     )
+
+
+ css = """
+ .result {display:flex;flex-direction:column}
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
+ """
+
+ demo = gr.Blocks(css=css)
+
+ with demo:
+     gr.Markdown("Remove non-speech segments")
+     with gr.Tabs():
+         with gr.TabItem("Upload audio from disk (音频)"):
+             uploaded_audio_file = gr.Audio(
+                 sources=["upload"],  # Choose between "microphone", "upload"
+                 type="filepath",
+                 label="Upload audio from disk",
+             )
+             upload_audio_button = gr.Button("Submit")
+
+             output_audio = gr.Audio(label="Output")
+             output_info_audio = gr.HTML(label="Info")
+
+         with gr.TabItem("Upload video from disk (视频)"):
+             uploaded_video_file = gr.Video(
+                 sources=["upload"],
+                 label="Upload from disk",
+                 show_share_button=True,
+             )
+             upload_video_button = gr.Button("Submit")
+
+             output_video = gr.Video(label="Output")
+             output_info_video = gr.HTML(label="Info")
+
+     upload_video_button.click(
+         process_uploaded_video_file,
+         inputs=[
+             uploaded_video_file,
+         ],
+         outputs=[
+             output_video,
+             output_info_video,
+         ],
+     )
+
+     upload_audio_button.click(
+         process_uploaded_audio_file,
+         inputs=[
+             uploaded_audio_file,
+         ],
+         outputs=[
+             output_audio,
+             output_info_audio,
+         ],
+     )
+
+
+ if __name__ == "__main__":
+     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+     logging.basicConfig(format=formatter, level=logging.WARNING)
+     demo.launch(share=True)
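
For reference, the segmentation logic in process_file can be exercised without the ffmpeg decoding step. Below is a minimal sketch (not part of this commit) that runs the same VAD loop over a WAV file that is already 16 kHz mono; it reuses get_vad() and sample_rate from app.py, and "input.wav" / "speech-only.wav" are placeholder file names.

# Minimal sketch, not part of this commit: the same VAD segmentation loop
# applied to a WAV file that is already 16 kHz mono, so ffmpeg is not needed.
import numpy as np
import soundfile as sf

from app import get_vad, sample_rate

samples, sr = sf.read("input.wav", dtype="float32")  # placeholder name; assumed mono
assert sr == sample_rate, "resample to 16 kHz first"

# Pad with 1 second of silence so the final speech segment is closed,
# mirroring what process_file() does at end of stream.
samples = np.concatenate([samples, np.zeros(sample_rate, dtype=np.float32)])

vad = get_vad()
window_size = 512  # samples per VAD window, as in process_file()

speech = []
for start in range(0, len(samples) - window_size + 1, window_size):
    vad.accept_waveform(samples[start : start + window_size])
    while not vad.empty():
        speech.extend(vad.front.samples)
        vad.pop()

vad.flush()
while not vad.empty():
    speech.extend(vad.front.samples)
    vad.pop()

sf.write("speech-only.wav", np.array(speech, dtype=np.float32), samplerate=sample_rate)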
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ sherpa-onnx>=1.11.4
+ ffmpeg-python
+ soundfile
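
Note that requirements.txt lists ffmpeg-python, but app.py shells out to the ffmpeg binary directly via subprocess, so a system-level ffmpeg must also be on PATH (the standard Gradio Space image may already ship one; otherwise it is typically added via packages.txt). A small, hypothetical startup check along these lines would make that failure mode explicit:

# Hypothetical startup check (not part of the commit): fail early with a clear
# message if the ffmpeg binary that process_file() shells out to is missing.
import shutil

if shutil.which("ffmpeg") is None:
    raise RuntimeError(
        "ffmpeg was not found on PATH; install it (e.g. via packages.txt on "
        "Hugging Face Spaces) before launching the app"
    )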