admin321 committed on
Commit fd6a8dc · 1 Parent(s): 10cc38f
.gitattributes ADDED
@@ -0,0 +1 @@
1
+ *.jit filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,129 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 fortypercnt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,48 @@
1
+ # stream-translator
2
+ Command-line utility to transcribe or translate audio from livestreams in real time. Uses [streamlink](https://github.com/streamlink/streamlink) to
3
+ get livestream URLs from various services and OpenAI's [whisper](https://github.com/openai/whisper) for transcription/translation.
4
+ This script is inspired by [audioWhisper](https://github.com/Awexander/audioWhisper) which transcribes/translates desktop audio.
5
+
6
+ ## Prerequisites
7
+
8
+ 1. [**Install and add ffmpeg to your PATH**](https://www.thewindowsclub.com/how-to-install-ffmpeg-on-windows-10#:~:text=Click%20New%20and%20type%20the,Click%20OK%20to%20apply%20changes.)
9
+ 2. [**Install CUDA on your system.**](https://developer.nvidia.com/cuda-downloads) If you installed a CUDA version other than 11.3,
10
+ change cu113 in requirements.txt accordingly (see the example below). You can check the installed CUDA version with ```nvcc --version```.
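For illustration only (assuming CUDA 11.7 is installed; the exact wheel index depends on your CUDA version), the extra index line in requirements.txt would change to something like:

```
--extra-index-url https://download.pytorch.org/whl/cu117
```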
11
+
12
+ ## Setup
13
+
14
+ 1. Set up a virtual environment.
15
+ 2. ```git clone https://github.com/fortypercnt/stream-translator.git```
16
+ 3. ```pip install -r requirements.txt```
17
+ 4. Make sure that PyTorch is installed with CUDA support (a quick check is sketched below). Whisper will probably not run in real time on a CPU.
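A quick way to verify the CUDA build of PyTorch is active (a minimal check, not part of the original setup steps):

```
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```

If this prints `False`, PyTorch was likely installed without CUDA support; reinstalling it against the cu113 extra index listed in requirements.txt should fix that.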
18
+
19
+ ## Command-line usage
20
+
21
+ ```python translator.py URL --flags```
22
+
23
+ By default, the URL can be of the form ```twitch.tv/forsen``` and streamlink is used to obtain the .m3u8 link which is passed to ffmpeg.
24
+ See [streamlink plugins](https://streamlink.github.io/plugins.html) for info on all supported sites.
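For illustration, a typical invocation might look like the following (the channel name and flag values are placeholders, not recommendations):

```
python translator.py twitch.tv/forsen --model medium --task translate --language Japanese --interval 5
```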
25
+
26
+
27
+ | --flags | Default Value | Description |
28
+ |:-------------------------------:|:---------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
29
+ | `--model` | small | Select model size. See [here](https://github.com/openai/whisper#available-models-and-languages) for available models. |
30
+ | `--task` | translate | Whether to transcribe the audio (keep original language) or translate to English. |
31
+ | `--language` | auto | Language spoken in the stream. See [here](https://github.com/openai/whisper#available-models-and-languages) for available languages. |
32
+ | `--interval` | 5 | Interval between calls to the language model in seconds. |
33
+ | `--history_buffer_size` | 0 | Seconds of previous audio/text to use for conditioning the model. Set to 0 to just use audio from the last interval. Note that this can easily lead to repetition/loops if the chosen language/model settings do not produce good results to begin with. |
34
+ | `--beam_size` | 5 | Number of beams in beam search. Set to 0 to use the greedy algorithm instead (faster but less accurate). |
35
+ | `--best_of` | 5 | Number of candidates when sampling with non-zero temperature. |
36
+ | `--preferred_quality` | audio_only | Preferred stream quality option. "best" and "worst" should always be available. Type "streamlink URL" in the console to see quality options for your URL. |
37
+ | `--disable_vad` | | Set this flag to disable additional voice activity detection by Silero VAD. |
38
+ | `--direct_url` | | Set this flag to pass the URL directly to ffmpeg. Otherwise, streamlink is used to obtain the stream URL. |
39
+ | `--use_faster_whisper` | | Set this flag to use the faster_whisper implementation instead of the original OpenAI implementation. |
40
+ | `--faster_whisper_model_path` | whisper-large-v2-ct2/ | Path to a directory containing a Whisper model in the CTranslate2 format. |
41
+ | `--faster_whisper_device` | cuda | Set the device to run faster-whisper on. |
42
+ | `--faster_whisper_compute_type` | float16 | Set the quantization type for faster_whisper. See [here](https://opennmt.net/CTranslate2/quantization.html) for more info. |
43
+
44
+ ## Using faster-whisper
45
+
46
+ faster-whisper provides significant performance upgrades over the original OpenAI implementation (~ 4x faster, ~ 2x less memory).
47
+ To use it, follow the instructions [here](https://github.com/guillaumekln/faster-whisper#installation) to install faster-whisper and convert your models to CTranslate2 format.
48
+ Then you can run the CLI with --use_faster_whisper and set --faster_whisper_model_path to the location of your converted model.
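As a sketch of the full workflow (the converter command follows the faster-whisper/CTranslate2 docs and may differ between versions, so treat it as an assumption and check their README for the exact flags):

```
ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 --quantization float16
python translator.py twitch.tv/forsen --use_faster_whisper --faster_whisper_model_path whisper-large-v2-ct2/ --faster_whisper_compute_type float16
```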
faster_whisper/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from faster_whisper.transcribe import WhisperModel
faster_whisper/audio.py ADDED
@@ -0,0 +1,36 @@
1
+ import av
2
+ import numpy as np
3
+
4
+
5
+ def decode_audio(input_file, sampling_rate=16000):
6
+ """Decodes the audio.
7
+
8
+ Args:
9
+ input_file: Path to the input file or a file-like object.
10
+ sampling_rate: Resample the audio to this sample rate.
11
+
12
+ Returns:
13
+ A float32 Numpy array.
14
+ """
15
+ fifo = av.audio.fifo.AudioFifo()
16
+ resampler = av.audio.resampler.AudioResampler(
17
+ format="s16",
18
+ layout="mono",
19
+ rate=sampling_rate,
20
+ )
21
+
22
+ with av.open(input_file) as container:
23
+ # Decode and resample each audio frame.
24
+ for frame in container.decode(audio=0):
25
+ frame.pts = None
26
+ for new_frame in resampler.resample(frame):
27
+ fifo.write(new_frame)
28
+
29
+ # Flush the resampler.
30
+ for new_frame in resampler.resample(None):
31
+ fifo.write(new_frame)
32
+
33
+ frame = fifo.read()
34
+
35
+ # Convert s16 back to f32.
36
+ return frame.to_ndarray().flatten().astype(np.float32) / 32768.0
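A minimal usage sketch for this helper (the file name is a placeholder; any ffmpeg-readable input should work since PyAV wraps ffmpeg):

```python
from faster_whisper.audio import decode_audio

# Decode and resample a local file to 16 kHz mono float32 samples.
samples = decode_audio("example.wav", sampling_rate=16000)
print(samples.shape, samples.dtype)  # (n_samples,), float32
```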
faster_whisper/feature_extractor.py ADDED
@@ -0,0 +1,163 @@
1
+ import numpy as np
2
+
3
+
4
+ # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py
5
+ class FeatureExtractor:
6
+ def __init__(
7
+ self,
8
+ feature_size=80,
9
+ sampling_rate=16000,
10
+ hop_length=160,
11
+ chunk_length=30,
12
+ n_fft=400,
13
+ ):
14
+ self.n_fft = n_fft
15
+ self.hop_length = hop_length
16
+ self.chunk_length = chunk_length
17
+ self.n_samples = chunk_length * sampling_rate
18
+ self.nb_max_frames = self.n_samples // hop_length
19
+ self.time_per_frame = hop_length / sampling_rate
20
+ self.sampling_rate = sampling_rate
21
+ self.mel_filters = self.get_mel_filters(
22
+ sampling_rate, n_fft, n_mels=feature_size
23
+ )
24
+
25
+ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
26
+ # Initialize the weights
27
+ n_mels = int(n_mels)
28
+ weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
29
+
30
+ # Center freqs of each FFT bin
31
+ fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
32
+
33
+ # 'Center freqs' of mel bands - uniformly spaced between limits
34
+ min_mel = 0.0
35
+ max_mel = 45.245640471924965
36
+
37
+ mels = np.linspace(min_mel, max_mel, n_mels + 2)
38
+
39
+ mels = np.asanyarray(mels)
40
+
41
+ # Fill in the linear scale
42
+ f_min = 0.0
43
+ f_sp = 200.0 / 3
44
+ freqs = f_min + f_sp * mels
45
+
46
+ # And now the nonlinear scale
47
+ min_log_hz = 1000.0 # beginning of log region (Hz)
48
+ min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
49
+ logstep = np.log(6.4) / 27.0 # step size for log region
50
+
51
+ # If we have vector data, vectorize
52
+ log_t = mels >= min_log_mel
53
+ freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
54
+
55
+ mel_f = freqs
56
+
57
+ fdiff = np.diff(mel_f)
58
+ ramps = np.subtract.outer(mel_f, fftfreqs)
59
+
60
+ for i in range(n_mels):
61
+ # lower and upper slopes for all bins
62
+ lower = -ramps[i] / fdiff[i]
63
+ upper = ramps[i + 2] / fdiff[i + 1]
64
+
65
+ # .. then intersect them with each other and zero
66
+ weights[i] = np.maximum(0, np.minimum(lower, upper))
67
+
68
+ # Slaney-style mel is scaled to be approx constant energy per channel
69
+ enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
70
+ weights *= enorm[:, np.newaxis]
71
+
72
+ return weights
73
+
74
+ def fram_wave(self, waveform, center=True):
75
+ """
76
+ Transform a raw waveform into a list of smaller waveforms.
77
+ The window length defines how much of the signal is
78
+ contained in each frame (a smaller waveform), while the hop length defines the step
79
+ between the beginning of each new frame.
80
+ Centering is done by reflecting the waveform which is first centered around
81
+ `frame_idx * hop_length`.
82
+ """
83
+ frames = []
84
+ for i in range(0, waveform.shape[0] + 1, self.hop_length):
85
+ half_window = (self.n_fft - 1) // 2 + 1
86
+ if center:
87
+ start = i - half_window if i > half_window else 0
88
+ end = (
89
+ i + half_window
90
+ if i < waveform.shape[0] - half_window
91
+ else waveform.shape[0]
92
+ )
93
+
94
+ frame = waveform[start:end]
95
+
96
+ if start == 0:
97
+ padd_width = (-i + half_window, 0)
98
+ frame = np.pad(frame, pad_width=padd_width, mode="reflect")
99
+
100
+ elif end == waveform.shape[0]:
101
+ padd_width = (0, (i - waveform.shape[0] + half_window))
102
+ frame = np.pad(frame, pad_width=padd_width, mode="reflect")
103
+
104
+ else:
105
+ frame = waveform[i : i + self.n_fft]
106
+ frame_width = frame.shape[0]
107
+ if frame_width < waveform.shape[0]:
108
+ frame = np.lib.pad(
109
+ frame,
110
+ pad_width=(0, self.n_fft - frame_width),
111
+ mode="constant",
112
+ constant_values=0,
113
+ )
114
+
115
+ frames.append(frame)
116
+ return np.stack(frames, 0)
117
+
118
+ def stft(self, frames, window):
119
+ """
120
+ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal.
121
+ Should give the same results as `torch.stft`.
122
+ """
123
+ frame_size = frames.shape[1]
124
+ fft_size = self.n_fft
125
+
126
+ if fft_size is None:
127
+ fft_size = frame_size
128
+
129
+ if fft_size < frame_size:
130
+ raise ValueError("FFT size must be greater than or equal to the frame size")
131
+ # number of FFT bins to store
132
+ num_fft_bins = (fft_size >> 1) + 1
133
+
134
+ data = np.empty((len(frames), num_fft_bins), dtype=np.complex64)
135
+ fft_signal = np.zeros(fft_size)
136
+
137
+ for f, frame in enumerate(frames):
138
+ if window is not None:
139
+ np.multiply(frame, window, out=fft_signal[:frame_size])
140
+ else:
141
+ fft_signal[:frame_size] = frame
142
+ data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
143
+ return data.T
144
+
145
+ def __call__(self, waveform):
146
+ """
147
+ Compute the log-Mel spectrogram of the provided audio; gives results similar to
148
+ whisper's original torch implementation, within 1e-5 tolerance.
149
+ """
150
+ window = np.hanning(self.n_fft + 1)[:-1]
151
+
152
+ frames = self.fram_wave(waveform)
153
+ stft = self.stft(frames, window=window)
154
+ magnitudes = np.abs(stft[:, :-1]) ** 2
155
+
156
+ filters = self.mel_filters
157
+ mel_spec = filters @ magnitudes
158
+
159
+ log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
160
+ log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
161
+ log_spec = (log_spec + 4.0) / 4.0
162
+
163
+ return log_spec
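A minimal sketch of how this extractor is consumed (one second of synthetic silence stands in for real audio):

```python
import numpy as np
from faster_whisper.feature_extractor import FeatureExtractor

extractor = FeatureExtractor()  # defaults: 80 mel bins, 16 kHz, 30 s chunks
waveform = np.zeros(extractor.sampling_rate, dtype=np.float32)  # 1 s of silence
log_mel = extractor(waveform)
print(log_mel.shape)  # (80, n_frames) log-Mel features
```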
faster_whisper/transcribe.py ADDED
@@ -0,0 +1,406 @@
1
+ import collections
2
+ import zlib
3
+
4
+ import ctranslate2
5
+ import numpy as np
6
+ import tokenizers
7
+
8
+ from faster_whisper.feature_extractor import FeatureExtractor
9
+
10
+
11
+ class Segment(collections.namedtuple("Segment", ("start", "end", "text"))):
12
+ pass
13
+
14
+
15
+ class AudioInfo(
16
+ collections.namedtuple("AudioInfo", ("language", "language_probability"))
17
+ ):
18
+ pass
19
+
20
+
21
+ class TranscriptionOptions(
22
+ collections.namedtuple(
23
+ "TranscriptionOptions",
24
+ (
25
+ "task",
26
+ "beam_size",
27
+ "best_of",
28
+ "patience",
29
+ "length_penalty",
30
+ "log_prob_threshold",
31
+ "no_speech_threshold",
32
+ "compression_ratio_threshold",
33
+ "condition_on_previous_text",
34
+ "temperatures",
35
+ "initial_prompt",
36
+ "without_timestamps",
37
+ ),
38
+ )
39
+ ):
40
+ pass
41
+
42
+
43
+ class WhisperModel:
44
+ def __init__(
45
+ self,
46
+ model_path,
47
+ device="auto",
48
+ device_index=0,
49
+ compute_type="default",
50
+ cpu_threads=0,
51
+ num_workers=1,
52
+ ):
53
+ """Initializes the Whisper model.
54
+
55
+ Args:
56
+ model_path: Path to the converted model.
57
+ device: Device to use for computation ("cpu", "cuda", "auto").
58
+ device_index: Device ID to use.
59
+ The model can also be loaded on multiple GPUs by passing a list of IDs
60
+ (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
61
+ when transcribe() is called from multiple Python threads (see also num_workers).
62
+ compute_type: Type to use for computation.
63
+ See https://opennmt.net/CTranslate2/quantization.html.
64
+ cpu_threads: Number of threads to use when running on CPU (4 by default).
65
+ A non zero value overrides the OMP_NUM_THREADS environment variable.
66
+ num_workers: When transcribe() is called from multiple Python threads,
67
+ having multiple workers enables true parallelism when running the model
68
+ (concurrent calls to self.model.generate() will run in parallel).
69
+ This can improve the global throughput at the cost of increased memory usage.
70
+ """
71
+ self.model = ctranslate2.models.Whisper(
72
+ model_path,
73
+ device=device,
74
+ device_index=device_index,
75
+ compute_type=compute_type,
76
+ intra_threads=cpu_threads,
77
+ inter_threads=num_workers,
78
+ )
79
+
80
+ self.feature_extractor = FeatureExtractor()
81
+ self.tokenizer = tokenizers.Tokenizer.from_pretrained(
82
+ "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
83
+ )
84
+ self.eot_id = self.tokenizer.token_to_id("<|endoftext|>")
85
+ self.timestamp_begin_id = self.tokenizer.token_to_id("<|notimestamps|>") + 1
86
+ self.input_stride = 2
87
+ self.time_precision = 0.02
88
+ self.max_length = 448
89
+
90
+ def transcribe(
91
+ self,
92
+ audio,
93
+ language=None,
94
+ task="transcribe",
95
+ beam_size=5,
96
+ best_of=5,
97
+ patience=1,
98
+ length_penalty=1,
99
+ temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
100
+ compression_ratio_threshold=2.4,
101
+ log_prob_threshold=-1.0,
102
+ no_speech_threshold=0.6,
103
+ condition_on_previous_text=True,
104
+ initial_prompt=None,
105
+ without_timestamps=False,
106
+ ):
107
+ """Transcribes an input file.
108
+
109
+ Arguments:
110
+ audio: Union[str, np.ndarray], shape = (*)
111
+ The path to audio or a NumPy array containing the audio waveform in 16 kHz mono
112
+ language: The language spoken in the audio. It should be a language code such
113
+ as "en" or "fr". If not set, the language will be detected in the first 30 seconds
114
+ of audio.
115
+ task: Task to execute (transcribe or translate).
116
+ beam_size: Beam size to use for decoding.
117
+ best_of: Number of candidates when sampling with non-zero temperature.
118
+ patience: Beam search patience factor.
119
+ length_penalty: Exponential length penalty constant.
120
+ temperature: Temperature for sampling. It can be a tuple of temperatures,
121
+ which will be successively used upon failures according to either
122
+ `compression_ratio_threshold` or `log_prob_threshold`.
123
+ compression_ratio_threshold: If the gzip compression ratio is above this value,
124
+ treat as failed.
125
+ log_prob_threshold: If the average log probability over sampled tokens is
126
+ below this value, treat as failed.
127
+ no_speech_threshold: If the no_speech probability is higher than this value AND
128
+ the average log probability over sampled tokens is below `log_prob_threshold`,
129
+ consider the segment as silent.
130
+ condition_on_previous_text: If True, the previous output of the model is provided
131
+ as a prompt for the next window; disabling may make the text inconsistent across
132
+ windows, but the model becomes less prone to getting stuck in a failure loop,
133
+ such as repetition looping or timestamps going out of sync.
134
+ initial_prompt: Optional text to provide as a prompt for the first window.
135
+ without_timestamps: Only sample text tokens.
136
+
137
+ Returns:
138
+ A tuple with:
139
+
140
+ - a generator over transcribed segments
141
+ - an instance of AudioInfo
142
+ """
143
+ if isinstance(audio, str):
144
+ from faster_whisper.audio import decode_audio
145
+ audio = decode_audio(
146
+ audio, sampling_rate=self.feature_extractor.sampling_rate
147
+ )
148
+ features = self.feature_extractor(audio)
149
+
150
+ if language is None:
151
+ if not self.model.is_multilingual:
152
+ language = "en"
153
+ language_probability = 1
154
+ else:
155
+ segment = self.get_segment(features)
156
+ input = self.get_input(segment)
157
+ results = self.model.detect_language(input)
158
+ language_token, language_probability = results[0][0]
159
+ language = language_token[2:-2]
160
+ else:
161
+ if self.tokenizer.token_to_id("<|%s|>" % language) is None:
162
+ raise ValueError("%s is not a valid language code" % language)
163
+ language_probability = 1
164
+
165
+ options = TranscriptionOptions(
166
+ task=task,
167
+ beam_size=beam_size,
168
+ best_of=best_of,
169
+ patience=patience,
170
+ length_penalty=length_penalty,
171
+ log_prob_threshold=log_prob_threshold,
172
+ no_speech_threshold=no_speech_threshold,
173
+ compression_ratio_threshold=compression_ratio_threshold,
174
+ condition_on_previous_text=condition_on_previous_text,
175
+ temperatures=(
176
+ temperature if isinstance(temperature, (list, tuple)) else [temperature]
177
+ ),
178
+ initial_prompt=initial_prompt,
179
+ without_timestamps=without_timestamps,
180
+ )
181
+
182
+ segments = self.generate_segments(features, language, options)
183
+
184
+ audio_info = AudioInfo(
185
+ language=language,
186
+ language_probability=language_probability,
187
+ )
188
+
189
+ return segments, audio_info
190
+
191
+ def generate_segments(self, features, language, options):
192
+ tokenized_segments = self.generate_tokenized_segments(
193
+ features, language, options
194
+ )
195
+
196
+ for start, end, tokens in tokenized_segments:
197
+ text = self.decode_text_tokens(tokens)
198
+ if not text.strip():
199
+ continue
200
+
201
+ yield Segment(
202
+ start=start,
203
+ end=end,
204
+ text=text,
205
+ )
206
+
207
+ def generate_tokenized_segments(self, features, language, options):
208
+ num_frames = features.shape[-1]
209
+ offset = 0
210
+ all_tokens = []
211
+ prompt_reset_since = 0
212
+
213
+ if options.initial_prompt is not None:
214
+ initial_prompt = " " + options.initial_prompt.strip()
215
+ initial_prompt_tokens = self.tokenizer.encode(
216
+ initial_prompt, add_special_tokens=False
217
+ )
218
+ all_tokens.extend(initial_prompt_tokens.ids)
219
+
220
+ while offset < num_frames:
221
+ time_offset = offset * self.feature_extractor.time_per_frame
222
+ segment = self.get_segment(features, offset)
223
+ segment_duration = segment.shape[-1] * self.feature_extractor.time_per_frame
224
+
225
+ previous_tokens = all_tokens[prompt_reset_since:]
226
+ prompt = self.get_prompt(
227
+ language,
228
+ previous_tokens,
229
+ task=options.task,
230
+ without_timestamps=options.without_timestamps,
231
+ )
232
+
233
+ result, avg_log_prob, temperature = self.generate_with_fallback(
234
+ segment, prompt, options
235
+ )
236
+
237
+ if (
238
+ result.no_speech_prob > options.no_speech_threshold
239
+ and avg_log_prob < options.log_prob_threshold
240
+ ):
241
+ offset += segment.shape[-1]
242
+ continue
243
+
244
+ tokens = result.sequences_ids[0]
245
+
246
+ consecutive_timestamps = [
247
+ i
248
+ for i in range(len(tokens))
249
+ if i > 0
250
+ and tokens[i] >= self.timestamp_begin_id
251
+ and tokens[i - 1] >= self.timestamp_begin_id
252
+ ]
253
+
254
+ if len(consecutive_timestamps) > 0:
255
+ last_slice = 0
256
+ for i, current_slice in enumerate(consecutive_timestamps):
257
+ sliced_tokens = tokens[last_slice:current_slice]
258
+ start_timestamp_position = (
259
+ sliced_tokens[0] - self.timestamp_begin_id
260
+ )
261
+ end_timestamp_position = sliced_tokens[-1] - self.timestamp_begin_id
262
+ start_time = (
263
+ time_offset + start_timestamp_position * self.time_precision
264
+ )
265
+ end_time = (
266
+ time_offset + end_timestamp_position * self.time_precision
267
+ )
268
+
269
+ last_in_window = i + 1 == len(consecutive_timestamps)
270
+
271
+ # Include the last timestamp so that all tokens are included in a segment.
272
+ if last_in_window:
273
+ sliced_tokens.append(tokens[current_slice])
274
+
275
+ yield start_time, end_time, sliced_tokens
276
+ last_slice = current_slice
277
+
278
+ last_timestamp_position = (
279
+ tokens[last_slice - 1] - self.timestamp_begin_id
280
+ )
281
+ offset += last_timestamp_position * self.input_stride
282
+ all_tokens.extend(tokens[: last_slice + 1])
283
+
284
+ else:
285
+ duration = segment_duration
286
+ timestamps = [
287
+ token for token in tokens if token >= self.timestamp_begin_id
288
+ ]
289
+ if len(timestamps) > 0 and timestamps[-1] != self.timestamp_begin_id:
290
+ last_timestamp_position = timestamps[-1] - self.timestamp_begin_id
291
+ duration = last_timestamp_position * self.time_precision
292
+
293
+ yield time_offset, time_offset + duration, tokens
294
+
295
+ offset += segment.shape[-1]
296
+ all_tokens.extend(tokens)
297
+
298
+ if not options.condition_on_previous_text or temperature > 0.5:
299
+ prompt_reset_since = len(all_tokens)
300
+
301
+ def decode_text_tokens(self, tokens):
302
+ text_tokens = [token for token in tokens if token < self.eot_id]
303
+ return self.tokenizer.decode(text_tokens)
304
+
305
+ def generate_with_fallback(self, segment, prompt, options):
306
+ features = self.get_input(segment)
307
+ result = None
308
+ avg_log_prob = None
309
+ final_temperature = None
310
+
311
+ for temperature in options.temperatures:
312
+ if temperature > 0:
313
+ kwargs = {
314
+ "beam_size": 1,
315
+ "num_hypotheses": options.best_of,
316
+ "sampling_topk": 0,
317
+ "sampling_temperature": temperature,
318
+ }
319
+ else:
320
+ kwargs = {
321
+ "beam_size": options.beam_size,
322
+ "patience": options.patience,
323
+ }
324
+
325
+ final_temperature = temperature
326
+ result = self.model.generate(
327
+ features,
328
+ [prompt],
329
+ length_penalty=options.length_penalty,
330
+ max_length=self.max_length,
331
+ return_scores=True,
332
+ return_no_speech_prob=True,
333
+ **kwargs,
334
+ )[0]
335
+
336
+ tokens = result.sequences_ids[0]
337
+
338
+ # Recover the average log prob from the returned score.
339
+ seq_len = len(tokens)
340
+ cum_log_prob = result.scores[0] * (seq_len**options.length_penalty)
341
+ avg_log_prob = cum_log_prob / (seq_len + 1)
342
+
343
+ text = self.decode_text_tokens(tokens).strip()
344
+ compression_ratio = get_compression_ratio(text)
345
+
346
+ if (
347
+ compression_ratio <= options.compression_ratio_threshold
348
+ and avg_log_prob >= options.log_prob_threshold
349
+ ):
350
+ break
351
+
352
+ return result, avg_log_prob, final_temperature
353
+
354
+ def get_prompt(
355
+ self,
356
+ language,
357
+ previous_tokens,
358
+ task="transcribe",
359
+ without_timestamps=False,
360
+ ):
361
+ prompt = []
362
+
363
+ if previous_tokens:
364
+ prompt.append(self.tokenizer.token_to_id("<|startofprev|>"))
365
+ prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :])
366
+
367
+ prompt.append(self.tokenizer.token_to_id("<|startoftranscript|>"))
368
+
369
+ if self.model.is_multilingual:
370
+ prompt.extend(
371
+ [
372
+ self.tokenizer.token_to_id("<|%s|>" % language),
373
+ self.tokenizer.token_to_id("<|%s|>" % task),
374
+ ]
375
+ )
376
+
377
+ if without_timestamps:
378
+ prompt.append(self.tokenizer.token_to_id("<|notimestamps|>"))
379
+
380
+ return prompt
381
+
382
+ def get_segment(self, features, offset=0):
383
+ if offset > 0:
384
+ features = features[:, offset:]
385
+
386
+ num_frames = features.shape[-1]
387
+ required_num_frames = self.feature_extractor.nb_max_frames
388
+
389
+ if num_frames > required_num_frames:
390
+ features = features[:, :required_num_frames]
391
+ elif num_frames < required_num_frames:
392
+ pad_widths = [(0, 0), (0, required_num_frames - num_frames)]
393
+ features = np.pad(features, pad_widths)
394
+
395
+ features = np.ascontiguousarray(features)
396
+ return features
397
+
398
+ def get_input(self, segment):
399
+ segment = np.expand_dims(segment, 0)
400
+ segment = ctranslate2.StorageView.from_array(segment)
401
+ return segment
402
+
403
+
404
+ def get_compression_ratio(text):
405
+ text_bytes = text.encode("utf-8")
406
+ return len(text_bytes) / len(zlib.compress(text_bytes))
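A minimal usage sketch for this module (the model directory and audio file are placeholders; the directory is assumed to hold a CTranslate2-converted Whisper model):

```python
from faster_whisper import WhisperModel
from faster_whisper.audio import decode_audio

model = WhisperModel("whisper-large-v2-ct2/", device="auto", compute_type="default")

audio = decode_audio("example.wav", sampling_rate=16000)
segments, info = model.transcribe(audio, task="transcribe", beam_size=5)

print("Detected language:", info.language, info.language_probability)
for segment in segments:  # segments is a generator; decoding runs as you iterate
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```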
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ numpy
2
+ tqdm
3
+ more-itertools
4
+ --extra-index-url https://download.pytorch.org/whl/cu113
5
+ torch
6
+ transformers>=4.19.0
7
+ ffmpeg-python==0.2.0
8
+ git+https://github.com/openai/whisper.git
9
+ streamlink
translator.py ADDED
@@ -0,0 +1,276 @@
1
+ import argparse
2
+ import sys
3
+ import signal
4
+ from datetime import datetime
5
+
6
+ import ffmpeg
7
+ import numpy as np
8
+ import whisper
9
+ from whisper.audio import SAMPLE_RATE
10
+
11
+
12
+ class RingBuffer:
13
+ def __init__(self, size):
14
+ self.size = size
15
+ self.data = []
16
+ self.full = False
17
+ self.cur = 0
18
+
19
+ def append(self, x):
20
+ if self.size <= 0:
21
+ return
22
+ if self.full:
23
+ self.data[self.cur] = x
24
+ self.cur = (self.cur + 1) % self.size
25
+ else:
26
+ self.data.append(x)
27
+ if len(self.data) == self.size:
28
+ self.full = True
29
+
30
+ def get_all(self):
31
+ """ Get all elements in chronological order from oldest to newest. """
32
+ all_data = []
33
+ for i in range(len(self.data)):
34
+ idx = (i + self.cur) % self.size
35
+ all_data.append(self.data[idx])
36
+ return all_data
37
+
38
+ def has_repetition(self):
39
+ prev = None
40
+ for elem in self.data:
41
+ if elem == prev:
42
+ return True
43
+ prev = elem
44
+ return False
45
+
46
+ def clear(self):
47
+ self.data = []
48
+ self.full = False
49
+ self.cur = 0
50
+
51
+
52
+ def open_stream(stream, direct_url, preferred_quality):
53
+ if direct_url:
54
+ try:
55
+ process = (
56
+ ffmpeg.input(stream, loglevel="panic")
57
+ .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
58
+ .run_async(pipe_stdout=True)
59
+ )
60
+ except ffmpeg.Error as e:
61
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
62
+
63
+ return process, None
64
+
65
+ import streamlink
66
+ import subprocess
67
+ import threading
68
+ stream_options = streamlink.streams(stream)
69
+ if not stream_options:
70
+ print("No playable streams found on this URL:", stream)
71
+ sys.exit(0)
72
+
73
+ option = None
74
+ for quality in [preferred_quality, 'audio_only', 'audio_mp4a', 'audio_opus', 'best']:
75
+ if quality in stream_options:
76
+ option = quality
77
+ break
78
+ if option is None:
79
+ # Fallback
80
+ option = next(iter(stream_options.values()))
81
+
82
+ def writer(streamlink_proc, ffmpeg_proc):
83
+ while (not streamlink_proc.poll()) and (not ffmpeg_proc.poll()):
84
+ try:
85
+ chunk = streamlink_proc.stdout.read(1024)
86
+ ffmpeg_proc.stdin.write(chunk)
87
+ except (BrokenPipeError, OSError):
88
+ pass
89
+
90
+ cmd = ['streamlink', stream, option, "-O"]
91
+ streamlink_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
92
+
93
+ try:
94
+ ffmpeg_process = (
95
+ ffmpeg.input("pipe:", loglevel="panic")
96
+ .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
97
+ .run_async(pipe_stdin=True, pipe_stdout=True)
98
+ )
99
+ except ffmpeg.Error as e:
100
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
101
+
102
+ thread = threading.Thread(target=writer, args=(streamlink_process, ffmpeg_process))
103
+ thread.start()
104
+ return ffmpeg_process, streamlink_process
105
+
106
+
107
+ def main(url, model="small", language=None, interval=5, history_buffer_size=0, preferred_quality="audio_only",
108
+ use_vad=True, direct_url=False, faster_whisper_args=None, **decode_options):
109
+
110
+ n_bytes = interval * SAMPLE_RATE * 2 # Factor 2 comes from reading the int16 stream as bytes
111
+ audio_buffer = RingBuffer((history_buffer_size // interval) + 1)
112
+ previous_text = RingBuffer(history_buffer_size // interval)
113
+
114
+ print("Loading model...")
115
+ if faster_whisper_args:
116
+ from faster_whisper import WhisperModel
117
+ model = WhisperModel(faster_whisper_args["model_path"],
118
+ device=faster_whisper_args["device"],
119
+ compute_type=faster_whisper_args["compute_type"])
120
+ else:
121
+ model = whisper.load_model(model)
122
+
123
+ if use_vad:
124
+ from vad import VAD
125
+ vad = VAD()
126
+
127
+ print("Opening stream...")
128
+ ffmpeg_process, streamlink_process = open_stream(url, direct_url, preferred_quality)
129
+
130
+ def handler(signum, frame):
131
+ ffmpeg_process.kill()
132
+ if streamlink_process:
133
+ streamlink_process.kill()
134
+ sys.exit(0)
135
+
136
+ signal.signal(signal.SIGINT, handler)
137
+
138
+ try:
139
+ while ffmpeg_process.poll() is None:
140
+ # Read audio from ffmpeg stream
141
+ in_bytes = ffmpeg_process.stdout.read(n_bytes)
142
+ if not in_bytes:
143
+ break
144
+
145
+ audio = np.frombuffer(in_bytes, np.int16).flatten().astype(np.float32) / 32768.0
146
+ if use_vad and vad.no_speech(audio):
147
+ print(f'{datetime.now().strftime("%H:%M:%S")}')
148
+ continue
149
+ audio_buffer.append(audio)
150
+
151
+ # Decode the audio
152
+ clear_buffers = False
153
+ if faster_whisper_args:
154
+ segments, info = model.transcribe(audio,
155
+ language=language,
156
+ **decode_options)
157
+
158
+ decoded_language = "" if language else "(" + info.language + ")"
159
+ decoded_text = ""
160
+ previous_segment = ""
161
+ for segment in segments:
162
+ if segment.text != previous_segment:
163
+ decoded_text += segment.text
164
+ previous_segment = segment.text
165
+
166
+ new_prefix = decoded_text
167
+
168
+ else:
169
+ result = model.transcribe(np.concatenate(audio_buffer.get_all()),
170
+ prefix="".join(previous_text.get_all()),
171
+ language=language,
172
+ without_timestamps=True,
173
+ **decode_options)
174
+
175
+ decoded_language = "" if language else "(" + result.get("language") + ")"
176
+ decoded_text = result.get("text")
177
+ new_prefix = ""
178
+ for segment in result["segments"]:
179
+ if segment["temperature"] < 0.5 and segment["no_speech_prob"] < 0.6:
180
+ new_prefix += segment["text"]
181
+ else:
182
+ # Clear history if the translation is unreliable, otherwise prompting on this leads to
183
+ # repetition and getting stuck.
184
+ clear_buffers = True
185
+
186
+ previous_text.append(new_prefix)
187
+
188
+ if clear_buffers or previous_text.has_repetition():
189
+ audio_buffer.clear()
190
+ previous_text.clear()
191
+
192
+ print(f'{datetime.now().strftime("%H:%M:%S")} {decoded_language} {decoded_text}')
193
+
194
+ print("Stream ended")
195
+ finally:
196
+ ffmpeg_process.kill()
197
+ if streamlink_process:
198
+ streamlink_process.kill()
199
+
200
+
201
+ def cli():
202
+ parser = argparse.ArgumentParser(description="Parameters for translator.py")
203
+ parser.add_argument('URL', type=str, help='Stream website and channel name, e.g. twitch.tv/forsen')
204
+ parser.add_argument('--model', type=str,
205
+ choices=['tiny', 'tiny.en', 'small', 'small.en', 'medium', 'medium.en', 'large'],
206
+ default='small',
207
+ help='Model to be used for generating audio transcription. Smaller models are faster and use '
208
+ 'less VRAM, but are also less accurate. .en models are more accurate but only work on '
209
+ 'English audio.')
210
+ parser.add_argument('--task', type=str, choices=['transcribe', 'translate'], default='transcribe',
211
+ help='Whether to transcribe the audio (keep original language) or translate to English.')
212
+ parser.add_argument('--language', type=str, default='Chinese',
213
+ help='Language spoken in the stream. Set to "auto" to auto-detect the spoken language. '
214
+ 'See https://github.com/openai/whisper for available languages.')
215
+ parser.add_argument('--interval', type=int, default=5,
216
+ help='Interval between calls to the language model in seconds.')
217
+ parser.add_argument('--history_buffer_size', type=int, default=0,
218
+ help='Seconds of previous audio/text to use for conditioning the model. Set to 0 to just use '
219
+ 'audio from the last interval. Note that this can easily lead to repetition/loops if the '
220
+ 'chosen language/model settings do not produce good results to begin with.')
221
+ parser.add_argument('--beam_size', type=int, default=5,
222
+ help='Number of beams in beam search. Set to 0 to use greedy algorithm instead.')
223
+ parser.add_argument('--best_of', type=int, default=5,
224
+ help='Number of candidates when sampling with non-zero temperature.')
225
+ parser.add_argument('--preferred_quality', type=str, default='worst',
226
+ help='Preferred stream quality option. "best" and "worst" should always be available. Type '
227
+ '"streamlink URL" in the console to see quality options for your URL.')
228
+ parser.add_argument('--disable_vad', action='store_true',
229
+ help='Set this flag to disable additional voice activity detection by Silero VAD.')
230
+ parser.add_argument('--direct_url', action='store_true',
231
+ help='Set this flag to pass the URL directly to ffmpeg. Otherwise, streamlink is used to '
232
+ 'obtain the stream URL.')
233
+ parser.add_argument('--use_faster_whisper', action='store_true',
234
+ help='Set this flag to use faster-whisper implementation instead of the original OpenAI '
235
+ 'implementation.')
236
+ parser.add_argument('--faster_whisper_model_path', type=str, default='whisper-large-v2-ct2/',
237
+ help='Path to a directory containing a Whisper model in the CTranslate2 format.')
238
+ parser.add_argument('--faster_whisper_device', type=str, choices=['cuda', 'cpu', 'auto'], default='cuda',
239
+ help='Set the device to run faster-whisper on.')
240
+ parser.add_argument('--faster_whisper_compute_type', type=str, choices=['int8', 'int8_float16', 'int16', 'float16'],
241
+ default='float16',
242
+ help='Set the quantization type for faster-whisper. See '
243
+ 'https://opennmt.net/CTranslate2/quantization.html for more info.')
244
+
245
+ args = parser.parse_args().__dict__
246
+ url = args.pop("URL")
247
+ args["use_vad"] = not args.pop("disable_vad")
248
+ use_faster_whisper = args.pop("use_faster_whisper")
249
+ faster_whisper_args = dict()
250
+ faster_whisper_args["model_path"] = args.pop("faster_whisper_model_path")
251
+ faster_whisper_args["device"] = args.pop("faster_whisper_device")
252
+ faster_whisper_args["compute_type"] = args.pop("faster_whisper_compute_type")
253
+
254
+ if args['model'].endswith('.en'):
255
+ if args['model'] == 'large.en':
256
+ print("There is no large version of the .en models, please choose from {tiny.en, small.en, medium.en}")
257
+ sys.exit(0)
258
+ if args['language'] != 'English' and args['language'] != 'en':
259
+ if args['language'] == 'auto':
260
+ print("Using .en model, setting language from auto to English")
261
+ args['language'] = 'en'
262
+ else:
263
+ print("A .en model cannot be used to transcribe non-English audio, please choose a non-.en model")
264
+ sys.exit(0)
265
+
266
+ if args['language'] == 'auto':
267
+ args['language'] = None
268
+
269
+ if args['beam_size'] == 0:
270
+ args['beam_size'] = None
271
+
272
+ main(url, faster_whisper_args=faster_whisper_args if use_faster_whisper else None, **args)
273
+
274
+
275
+ if __name__ == '__main__':
276
+ cli()
vad.py ADDED
@@ -0,0 +1,133 @@
1
+ import torch
2
+ import warnings
3
+
4
+ warnings.filterwarnings("ignore")
5
+
6
+
7
+ class VAD:
8
+ def __init__(self):
9
+ self.model = init_jit_model("silero_vad.jit")
10
+
11
+ def no_speech(self, audio):
12
+ speech = get_speech_timestamps(torch.Tensor(audio), self.model, return_seconds=True)
13
+ # print(speech)
14
+ return len(speech) == 0
15
+
16
+
17
+ def init_jit_model(model_path: str,
18
+ device=torch.device('cpu')):
19
+ torch.set_grad_enabled(False)
20
+ model = torch.jit.load(model_path, map_location=device)
21
+ model.eval()
22
+ return model
23
+
24
+
25
+ def get_speech_timestamps(audio: torch.Tensor,
26
+ model,
27
+ threshold: float = 0.5,
28
+ sampling_rate: int = 16000,
29
+ min_speech_duration_ms: int = 250,
30
+ min_silence_duration_ms: int = 100,
31
+ window_size_samples: int = 1536,
32
+ speech_pad_ms: int = 30,
33
+ return_seconds: bool = False):
34
+ """
35
+ From https://github.com/snakers4/silero-vad/blob/master/utils_vad.py
36
+
37
+ This method is used for splitting long audios into speech chunks using silero VAD
38
+ Parameters
39
+ ----------
40
+ audio: torch.Tensor
41
+ One dimensional float torch.Tensor, other types are cast to torch if possible
42
+ model: preloaded .jit silero VAD model
43
+ threshold: float (default - 0.5)
44
+ Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value
45
+ are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is
46
+ pretty good for most datasets.
47
+ sampling_rate: int (default - 16000)
48
+ Currently silero VAD models support 8000 and 16000 sample rates
49
+ min_speech_duration_ms: int (default - 250 milliseconds)
50
+ Final speech chunks shorter than min_speech_duration_ms are thrown out
51
+ min_silence_duration_ms: int (default - 100 milliseconds)
52
+ At the end of each speech chunk, wait for min_silence_duration_ms before separating it
53
+ window_size_samples: int (default - 1536 samples)
54
+ Audio chunks of window_size_samples size are fed to the silero VAD model.
55
+ WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768
56
+ samples for 8000 sample rate. Values other than these may affect model performance!
57
+ speech_pad_ms: int (default - 30 milliseconds)
58
+ Final speech chunks are padded by speech_pad_ms each side
59
+ return_seconds: bool (default - False)
60
+ whether to return timestamps in seconds (default - samples)
61
+ Returns
62
+ ----------
63
+ speeches: list of dicts
64
+ list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
65
+ """
66
+ model.reset_states()
67
+ min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
68
+ min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
69
+ speech_pad_samples = sampling_rate * speech_pad_ms / 1000
70
+
71
+ audio_length_samples = len(audio)
72
+
73
+ speech_probs = []
74
+ for current_start_sample in range(0, audio_length_samples, window_size_samples):
75
+ chunk = audio[current_start_sample: current_start_sample + window_size_samples]
76
+ if len(chunk) < window_size_samples:
77
+ chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
78
+ speech_prob = model(chunk, sampling_rate).item()
79
+ speech_probs.append(speech_prob)
80
+
81
+ triggered = False
82
+ speeches = []
83
+ current_speech = {}
84
+ neg_threshold = threshold - 0.15
85
+ temp_end = 0
86
+
87
+ for i, speech_prob in enumerate(speech_probs):
88
+ if (speech_prob >= threshold) and temp_end:
89
+ temp_end = 0
90
+
91
+ if (speech_prob >= threshold) and not triggered:
92
+ triggered = True
93
+ current_speech['start'] = window_size_samples * i
94
+ continue
95
+
96
+ if (speech_prob < neg_threshold) and triggered:
97
+ if not temp_end:
98
+ temp_end = window_size_samples * i
99
+ if (window_size_samples * i) - temp_end < min_silence_samples:
100
+ continue
101
+ else:
102
+ current_speech['end'] = temp_end
103
+ if (current_speech['end'] - current_speech['start']) > min_speech_samples:
104
+ speeches.append(current_speech)
105
+ temp_end = 0
106
+ current_speech = {}
107
+ triggered = False
108
+ continue
109
+
110
+ if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
111
+ current_speech['end'] = audio_length_samples
112
+ speeches.append(current_speech)
113
+
114
+ for i, speech in enumerate(speeches):
115
+ if i == 0:
116
+ speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
117
+ if i != len(speeches) - 1:
118
+ silence_duration = speeches[i + 1]['start'] - speech['end']
119
+ if silence_duration < 2 * speech_pad_samples:
120
+ speech['end'] += int(silence_duration // 2)
121
+ speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - silence_duration // 2))
122
+ else:
123
+ speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
124
+ speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - speech_pad_samples))
125
+ else:
126
+ speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
127
+
128
+ if return_seconds:
129
+ for speech_dict in speeches:
130
+ speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
131
+ speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)
132
+
133
+ return speeches
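A minimal sketch mirroring how translator.py calls this module (assumes the silero_vad.jit weights are present in the working directory; one interval of synthetic silence stands in for stream audio):

```python
import numpy as np
from vad import VAD

vad = VAD()  # loads silero_vad.jit via torch.jit.load

# One 5-second interval of 16 kHz mono float32 audio, as read from the ffmpeg pipe.
audio = np.zeros(5 * 16000, dtype=np.float32)

if vad.no_speech(audio):
    print("No speech detected in this interval, skipping transcription.")
```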