File size: 5,851 Bytes
df1c0da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/env python3

import logging
import random
import subprocess
import soundfile as sf

import gradio as gr
import numpy as np
import sherpa_onnx
from huggingface_hub import hf_hub_download

# Sample rate (Hz) shared by the VAD config, the ffmpeg "-ar" resampling
# argument, and the output file written by process_file().
sample_rate = 16000


def _get_nn_model_filename(
    repo_id: str,
    filename: str,
    subfolder: str = "exp",
) -> str:
    """Fetch a file through ``hf_hub_download`` and return its result.

    Args:
      repo_id:
        Repository identifier forwarded to ``hf_hub_download``.
      filename:
        Name of the file inside the repository.
      subfolder:
        Sub-directory of the repository that holds ``filename``.

    Returns:
      Whatever ``hf_hub_download`` returns for the requested file
      (annotated here as ``str``).
    """
    request = dict(repo_id=repo_id, filename=filename, subfolder=subfolder)
    return hf_hub_download(**request)


def get_vad() -> sherpa_onnx.VoiceActivityDetector:
    """Build a voice activity detector backed by the Silero VAD model.

    The model file ``silero_vad.onnx`` is obtained from the
    ``csukuangfj/vad`` repository via ``_get_nn_model_filename``.

    Returns:
      A ``sherpa_onnx.VoiceActivityDetector`` constructed with a
      180-second buffer.
    """
    model_path = _get_nn_model_filename(
        repo_id="csukuangfj/vad",
        filename="silero_vad.onnx",
        subfolder=".",
    )

    vad_config = sherpa_onnx.VadModelConfig()
    vad_config.sample_rate = sample_rate

    # Silero-specific settings.
    vad_config.silero_vad.model = model_path
    vad_config.silero_vad.threshold = 0.5
    vad_config.silero_vad.min_speech_duration = 0.25
    vad_config.silero_vad.min_silence_duration = 0.1
    vad_config.silero_vad.max_speech_duration = 20  # seconds

    return sherpa_onnx.VoiceActivityDetector(
        vad_config,
        buffer_size_in_seconds=180,
    )


def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap a message in the ``result`` / ``result_item`` div markup.

    Args:
      s:
        Text placed inside the inner div. It is inserted verbatim.
      style:
        Extra CSS class added next to ``result_item`` on the inner div.

    Returns:
      The HTML snippet as a string, with the same leading newline and
      trailing indentation as a triple-quoted template would have.
    """
    markup_lines = (
        "",
        "    <div class='result'>",
        f"        <div class='result_item {style}'>",
        f"          {s}",
        "        </div>",
        "    </div>",
        "    ",
    )
    return "\n".join(markup_lines)


def process_uploaded_audio_file(
    in_filename: str,
):
    """Handle a click on "Submit" in the audio tab.

    Args:
      in_filename:
        Path of the uploaded audio file, or ``None``/``""`` when nothing
        has been uploaded.

    Returns:
      A 2-tuple ``(output_file, info_html)``. It always has two elements so
      that it matches both the two output components bound to the click
      handler and the tuple returned by ``process_file``. When no file was
      uploaded, ``output_file`` is ``None`` and ``info_html`` carries an
      error message.
    """
    logging.warning("Processing audio %s", in_filename)
    if not in_filename:
        # None (rather than "") so the output component receives
        # "no value" instead of an empty file path.
        return (
            None,
            build_html_output(
                'Please first upload a file and then click the button "Submit"',
                "result_item_error",
            ),
        )

    return process_file(in_filename)


def process_uploaded_video_file(
    in_filename: str,
):
    """Handle a click on "Submit" in the video tab.

    Args:
      in_filename:
        Path of the uploaded video file, or ``None``/``""`` when nothing
        has been uploaded.

    Returns:
      A 2-tuple ``(output_file, info_html)``. It always has two elements so
      that it matches both the two output components bound to the click
      handler and the tuple returned by ``process_file``. When no file was
      uploaded, ``output_file`` is ``None`` and ``info_html`` carries an
      error message.
    """
    logging.warning("Processing video %s", in_filename)
    if not in_filename:
        # None (rather than "") so the output component receives
        # "no value" instead of an empty file path.
        return (
            None,
            build_html_output(
                'Please first upload a file and then click the button "Submit"',
                "result_item_error",
            ),
        )

    return process_file(in_filename)


def process_file(filename: str):
    """Strip non-speech from a media file and save the speech as a wav file.

    The input is decoded by ``ffmpeg`` to mono 16-bit PCM at ``sample_rate``,
    streamed through the VAD in windows of 512 samples, and every detected
    speech segment is concatenated into the output.

    Args:
      filename:
        Path of the input audio/video file; passed to ``ffmpeg -i``.

    Returns:
      A 2-tuple ``(out_filename, info_html)`` where ``out_filename`` is
      ``f"{filename}-{suffix}.wav"`` with a random numeric suffix.
    """
    vad = get_vad()

    ffmpeg_cmd = [
        "ffmpeg",
        "-i",
        filename,
        "-f",
        "s16le",
        "-acodec",
        "pcm_s16le",
        "-ac",
        "1",
        "-ar",
        str(sample_rate),
        "-",
    ]

    frames_per_read = int(sample_rate * 100)  # 100 second

    window_size = 512

    # Start from an empty float32 array so concatenation does not upcast the
    # samples to float64.
    buffer = np.zeros(0, dtype=np.float32)
    speech_segments = []

    # The context manager closes the pipe and waits for ffmpeg on exit, so
    # no file descriptor or zombie process is left behind even on error.
    with subprocess.Popen(
        ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
    ) as process:
        is_last = False
        while not is_last:
            # *2 because int16_t has two bytes
            data = process.stdout.read(frames_per_read * 2)
            if data:
                samples = np.frombuffer(data, dtype=np.int16)
                samples = samples.astype(np.float32) / 32768
            else:
                # End of stream: feed one extra second of silence, then
                # flush the detector after this final iteration's windows.
                is_last = True
                samples = np.zeros(sample_rate, dtype=np.float32)

            buffer = np.concatenate([buffer, samples])

            while len(buffer) > window_size:
                vad.accept_waveform(buffer[:window_size])
                buffer = buffer[window_size:]

            if is_last:
                vad.flush()

            while not vad.empty():
                # Copy the segment before pop() discards it.
                speech_segments.append(
                    np.asarray(vad.front.samples, dtype=np.float32)
                )
                vad.pop()

    if process.returncode != 0:
        # Best effort: still write whatever was decoded, but leave a trace.
        logging.warning(
            "ffmpeg exited with code %s while decoding %s",
            process.returncode,
            filename,
        )

    suffix = random.randint(1000, 10000)
    out_filename = f"{filename}-{suffix}.wav"

    if speech_segments:
        speech_samples = np.concatenate(speech_segments)
    else:
        # np.concatenate rejects an empty list; write an empty file instead.
        speech_samples = np.zeros(0, dtype=np.float32)
    sf.write(out_filename, speech_samples, samplerate=sample_rate)

    return (
        out_filename,
        build_html_output(
            "Done! Please download the generated .wav file", "result_item_success"
        ),
    )


# Stylesheet passed to gr.Blocks below. The class names match the ones
# emitted by build_html_output(): "result", "result_item", and the
# "result_item_success" / "result_item_error" style variants.
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

# Top-level UI container, styled with the stylesheet defined above.
demo = gr.Blocks(css=css)

# Build the UI: two tabs (audio upload, video upload), each with its own
# upload widget, "Submit" button, output widget and HTML info area.
with demo:
    gr.Markdown("Remove non-speeches")
    with gr.Tabs():
        with gr.TabItem("Upload audio from disk (音频)"):
            uploaded_audio_file = gr.Audio(
                sources=["upload"],  # Choose between "microphone", "upload"
                type="filepath",
                label="Upload audio from disk",
            )
            upload_audio_button = gr.Button("Submit")

            output_audio = gr.Audio(label="Output")
            output_info_audio = gr.HTML(label="Info")

        with gr.TabItem("Upload video from disk (视频)"):
            uploaded_video_file = gr.Video(
                sources=["upload"],
                label="Upload from disk",
                show_share_button=True,
            )
            upload_video_button = gr.Button("Submit")

            # NOTE(review): process_file() returns a path ending in ".wav";
            # confirm that a gr.Video output is the intended component for it.
            output_video = gr.Video(label="Output")
            output_info_video = gr.HTML(label="Info")

        # Each handler is bound to exactly two outputs: the processed file
        # and the HTML status message.
        upload_video_button.click(
            process_uploaded_video_file,
            inputs=[
                uploaded_video_file,
            ],
            outputs=[
                output_video,
                output_info_video,
            ],
        )

        upload_audio_button.click(
            process_uploaded_audio_file,
            inputs=[
                uploaded_audio_file,
            ],
            outputs=[
                output_audio,
                output_info_audio,
            ],
        )

if __name__ == "__main__":
    # Log line layout: timestamp, level, [source file:line], message.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    # WARNING level: the handlers above log their progress via
    # logging.warning(), so those messages are shown.
    logging.basicConfig(format=formatter, level=logging.WARNING)
    # NOTE(review): share=True presumably requests a public share link from
    # gradio -- confirm this is wanted for every deployment.
    demo.launch(share=True)