- .gitattributes +1 -0
- .gitignore +129 -0
- LICENSE +21 -0
- README.md +48 -0
- faster_whisper/__init__.py +1 -0
- faster_whisper/audio.py +36 -0
- faster_whisper/feature_extractor.py +163 -0
- faster_whisper/transcribe.py +406 -0
- requirements.txt +9 -0
- translator.py +276 -0
- vad.py +133 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
*.jit filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 fortypercnt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,48 @@
# stream-translator
Command line utility to transcribe or translate audio from livestreams in real time. Uses [streamlink](https://github.com/streamlink/streamlink) to
get livestream URLs from various services and OpenAI's [whisper](https://github.com/openai/whisper) for transcription/translation.
This script is inspired by [audioWhisper](https://github.com/Awexander/audioWhisper), which transcribes/translates desktop audio.

## Prerequisites

1. [**Install ffmpeg and add it to your PATH**](https://www.thewindowsclub.com/how-to-install-ffmpeg-on-windows-10#:~:text=Click%20New%20and%20type%20the,Click%20OK%20to%20apply%20changes.)
2. [**Install CUDA on your system.**](https://developer.nvidia.com/cuda-downloads) If you installed a version of CUDA other than 11.3,
change cu113 in requirements.txt accordingly. You can check the installed CUDA version with ```nvcc --version```.

## Setup

1. Set up a virtual environment.
2. ```git clone https://github.com/fortypercnt/stream-translator.git```
3. ```pip install -r requirements.txt```
4. Make sure that pytorch is installed with CUDA support. Whisper will probably not run in real time on a CPU.

## Command-line usage

```python translator.py URL --flags```

By default, the URL can be of the form ```twitch.tv/forsen```, and streamlink is used to obtain the .m3u8 link which is passed to ffmpeg.
See [streamlink plugins](https://streamlink.github.io/plugins.html) for info on all supported sites.


| --flags | Default Value | Description |
|:-------------------------------:|:---------------------:|:-----------:|
| `--model` | small | Select model size. See [here](https://github.com/openai/whisper#available-models-and-languages) for available models. |
| `--task` | translate | Whether to transcribe the audio (keep original language) or translate to English. |
| `--language` | auto | Language spoken in the stream. See [here](https://github.com/openai/whisper#available-models-and-languages) for available languages. |
| `--interval` | 5 | Interval between calls to the language model in seconds. |
| `--history_buffer_size` | 0 | Seconds of previous audio/text to use for conditioning the model. Set to 0 to just use audio from the last interval. Note that this can easily lead to repetition/loops if the chosen language/model settings do not produce good results to begin with. |
| `--beam_size` | 5 | Number of beams in beam search. Set to 0 to use the greedy algorithm instead (faster but less accurate). |
| `--best_of` | 5 | Number of candidates when sampling with non-zero temperature. |
| `--preferred_quality` | audio_only | Preferred stream quality option. "best" and "worst" should always be available. Type "streamlink URL" in the console to see quality options for your URL. |
| `--disable_vad` | | Set this flag to disable additional voice activity detection by Silero VAD. |
| `--direct_url` | | Set this flag to pass the URL directly to ffmpeg. Otherwise, streamlink is used to obtain the stream URL. |
| `--use_faster_whisper` | | Set this flag to use the faster-whisper implementation instead of the original OpenAI implementation. |
| `--faster_whisper_model_path` | whisper-large-v2-ct2/ | Path to a directory containing a Whisper model in the CTranslate2 format. |
| `--faster_whisper_device` | cuda | Set the device to run faster-whisper on. |
| `--faster_whisper_compute_type` | float16 | Set the quantization type for faster-whisper. See [here](https://opennmt.net/CTranslate2/quantization.html) for more info. |

## Using faster-whisper

faster-whisper provides significant performance upgrades over the original OpenAI implementation (~4x faster, ~2x less memory).
To use it, follow the instructions [here](https://github.com/guillaumekln/faster-whisper#installation) to install faster-whisper and convert your models to the CTranslate2 format.
Then you can run the CLI with --use_faster_whisper and set --faster_whisper_model_path to the location of your converted model, as in the example below.
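
Example invocation (usage note, not part of the committed README): this runs the translator against a Twitch channel with the faster-whisper backend, using the default model path from the table above; substitute the path to your own converted model.

```python translator.py twitch.tv/forsen --use_faster_whisper --faster_whisper_model_path whisper-large-v2-ct2/```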
faster_whisper/__init__.py
ADDED
@@ -0,0 +1 @@
from faster_whisper.transcribe import WhisperModel
faster_whisper/audio.py
ADDED
@@ -0,0 +1,36 @@
import av
import numpy as np


def decode_audio(input_file, sampling_rate=16000):
    """Decodes the audio.

    Args:
      input_file: Path to the input file or a file-like object.
      sampling_rate: Resample the audio to this sample rate.

    Returns:
      A float32 Numpy array.
    """
    fifo = av.audio.fifo.AudioFifo()
    resampler = av.audio.resampler.AudioResampler(
        format="s16",
        layout="mono",
        rate=sampling_rate,
    )

    with av.open(input_file) as container:
        # Decode and resample each audio frame.
        for frame in container.decode(audio=0):
            frame.pts = None
            for new_frame in resampler.resample(frame):
                fifo.write(new_frame)

        # Flush the resampler.
        for new_frame in resampler.resample(None):
            fifo.write(new_frame)

    frame = fifo.read()

    # Convert s16 back to f32.
    return frame.to_ndarray().flatten().astype(np.float32) / 32768.0
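
Usage note (not part of the commit): a minimal sketch of calling `decode_audio` on its own; `sample.wav` is a placeholder path to any audio file ffmpeg/PyAV can open.

```python
# Decode a local file to 16 kHz mono float32 ("sample.wav" is a hypothetical path).
import numpy as np

from faster_whisper.audio import decode_audio

waveform = decode_audio("sample.wav", sampling_rate=16000)
print(waveform.dtype, waveform.shape, float(np.abs(waveform).max()))
```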
faster_whisper/feature_extractor.py
ADDED
@@ -0,0 +1,163 @@
import numpy as np


# Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py
class FeatureExtractor:
    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        hop_length=160,
        chunk_length=30,
        n_fft=400,
    ):
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.n_samples = chunk_length * sampling_rate
        self.nb_max_frames = self.n_samples // hop_length
        self.time_per_frame = hop_length / sampling_rate
        self.sampling_rate = sampling_rate
        self.mel_filters = self.get_mel_filters(
            sampling_rate, n_fft, n_mels=feature_size
        )

    def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
        # Initialize the weights
        n_mels = int(n_mels)
        weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

        # Center freqs of each FFT bin
        fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)

        # 'Center freqs' of mel bands - uniformly spaced between limits
        min_mel = 0.0
        max_mel = 45.245640471924965

        mels = np.linspace(min_mel, max_mel, n_mels + 2)

        mels = np.asanyarray(mels)

        # Fill in the linear scale
        f_min = 0.0
        f_sp = 200.0 / 3
        freqs = f_min + f_sp * mels

        # And now the nonlinear scale
        min_log_hz = 1000.0  # beginning of log region (Hz)
        min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
        logstep = np.log(6.4) / 27.0  # step size for log region

        # If we have vector data, vectorize
        log_t = mels >= min_log_mel
        freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))

        mel_f = freqs

        fdiff = np.diff(mel_f)
        ramps = np.subtract.outer(mel_f, fftfreqs)

        for i in range(n_mels):
            # lower and upper slopes for all bins
            lower = -ramps[i] / fdiff[i]
            upper = ramps[i + 2] / fdiff[i + 1]

            # .. then intersect them with each other and zero
            weights[i] = np.maximum(0, np.minimum(lower, upper))

        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
        weights *= enorm[:, np.newaxis]

        return weights

    def fram_wave(self, waveform, center=True):
        """
        Transform a raw waveform into a list of smaller waveforms.
        The window length defines how much of the signal is
        contained in each frame (smaller waveform), while the hop length defines the step
        between the beginning of each new frame.
        Centering is done by reflecting the waveform which is first centered around
        `frame_idx * hop_length`.
        """
        frames = []
        for i in range(0, waveform.shape[0] + 1, self.hop_length):
            half_window = (self.n_fft - 1) // 2 + 1
            if center:
                start = i - half_window if i > half_window else 0
                end = (
                    i + half_window
                    if i < waveform.shape[0] - half_window
                    else waveform.shape[0]
                )

                frame = waveform[start:end]

                if start == 0:
                    padd_width = (-i + half_window, 0)
                    frame = np.pad(frame, pad_width=padd_width, mode="reflect")

                elif end == waveform.shape[0]:
                    padd_width = (0, (i - waveform.shape[0] + half_window))
                    frame = np.pad(frame, pad_width=padd_width, mode="reflect")

            else:
                frame = waveform[i : i + self.n_fft]
                frame_width = frame.shape[0]
                if frame_width < waveform.shape[0]:
                    frame = np.lib.pad(
                        frame,
                        pad_width=(0, self.n_fft - frame_width),
                        mode="constant",
                        constant_values=0,
                    )

            frames.append(frame)
        return np.stack(frames, 0)

    def stft(self, frames, window):
        """
        Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal.
        Should give the same results as `torch.stft`.
        """
        frame_size = frames.shape[1]
        fft_size = self.n_fft

        if fft_size is None:
            fft_size = frame_size

        if fft_size < frame_size:
            raise ValueError("FFT size must be greater than or equal to the frame size")
        # number of FFT bins to store
        num_fft_bins = (fft_size >> 1) + 1

        data = np.empty((len(frames), num_fft_bins), dtype=np.complex64)
        fft_signal = np.zeros(fft_size)

        for f, frame in enumerate(frames):
            if window is not None:
                np.multiply(frame, window, out=fft_signal[:frame_size])
            else:
                fft_signal[:frame_size] = frame
            data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
        return data.T

    def __call__(self, waveform):
        """
        Compute the log-Mel spectrogram of the provided audio; gives similar results
        to whisper's original torch implementation with 1e-5 tolerance.
        """
        window = np.hanning(self.n_fft + 1)[:-1]

        frames = self.fram_wave(waveform)
        stft = self.stft(frames, window=window)
        magnitudes = np.abs(stft[:, :-1]) ** 2

        filters = self.mel_filters
        mel_spec = filters @ magnitudes

        log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
        log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
        log_spec = (log_spec + 4.0) / 4.0

        return log_spec
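
Usage note (not part of the commit): a small sketch of running the feature extractor end to end on synthetic audio, just to show the expected input/output shapes.

```python
# Compute a log-Mel spectrogram for one 30 s window of fake audio.
import numpy as np

from faster_whisper.feature_extractor import FeatureExtractor

extractor = FeatureExtractor()  # 80 mel bins, 16 kHz sampling rate, 10 ms hop
audio = 0.01 * np.random.randn(extractor.n_samples).astype(np.float32)  # synthetic 30 s clip
features = extractor(audio)
print(features.shape)  # (80, n_frames); values are roughly normalized by the (x + 4) / 4 step
```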
faster_whisper/transcribe.py
ADDED
@@ -0,0 +1,406 @@
import collections
import zlib

import ctranslate2
import numpy as np
import tokenizers

from faster_whisper.feature_extractor import FeatureExtractor


class Segment(collections.namedtuple("Segment", ("start", "end", "text"))):
    pass


class AudioInfo(
    collections.namedtuple("AudioInfo", ("language", "language_probability"))
):
    pass


class TranscriptionOptions(
    collections.namedtuple(
        "TranscriptionOptions",
        (
            "task",
            "beam_size",
            "best_of",
            "patience",
            "length_penalty",
            "log_prob_threshold",
            "no_speech_threshold",
            "compression_ratio_threshold",
            "condition_on_previous_text",
            "temperatures",
            "initial_prompt",
            "without_timestamps",
        ),
    )
):
    pass


class WhisperModel:
    def __init__(
        self,
        model_path,
        device="auto",
        device_index=0,
        compute_type="default",
        cpu_threads=0,
        num_workers=1,
    ):
        """Initializes the Whisper model.

        Args:
          model_path: Path to the converted model.
          device: Device to use for computation ("cpu", "cuda", "auto").
          device_index: Device ID to use.
            The model can also be loaded on multiple GPUs by passing a list of IDs
            (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
            when transcribe() is called from multiple Python threads (see also num_workers).
          compute_type: Type to use for computation.
            See https://opennmt.net/CTranslate2/quantization.html.
          cpu_threads: Number of threads to use when running on CPU (4 by default).
            A non zero value overrides the OMP_NUM_THREADS environment variable.
          num_workers: When transcribe() is called from multiple Python threads,
            having multiple workers enables true parallelism when running the model
            (concurrent calls to self.model.generate() will run in parallel).
            This can improve the global throughput at the cost of increased memory usage.
        """
        self.model = ctranslate2.models.Whisper(
            model_path,
            device=device,
            device_index=device_index,
            compute_type=compute_type,
            intra_threads=cpu_threads,
            inter_threads=num_workers,
        )

        self.feature_extractor = FeatureExtractor()
        self.tokenizer = tokenizers.Tokenizer.from_pretrained(
            "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
        )
        self.eot_id = self.tokenizer.token_to_id("<|endoftext|>")
        self.timestamp_begin_id = self.tokenizer.token_to_id("<|notimestamps|>") + 1
        self.input_stride = 2
        self.time_precision = 0.02
        self.max_length = 448

    def transcribe(
        self,
        audio,
        language=None,
        task="transcribe",
        beam_size=5,
        best_of=5,
        patience=1,
        length_penalty=1,
        temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        compression_ratio_threshold=2.4,
        log_prob_threshold=-1.0,
        no_speech_threshold=0.6,
        condition_on_previous_text=True,
        initial_prompt=None,
        without_timestamps=False,
    ):
        """Transcribes an input file.

        Arguments:
          audio: Union[str, np.ndarray], shape = (*)
            The path to audio or a NumPy array containing the audio waveform in 16 kHz mono.
          language: The language spoken in the audio. It should be a language code such
            as "en" or "fr". If not set, the language will be detected in the first 30 seconds
            of audio.
          task: Task to execute (transcribe or translate).
          beam_size: Beam size to use for decoding.
          best_of: Number of candidates when sampling with non-zero temperature.
          patience: Beam search patience factor.
          length_penalty: Exponential length penalty constant.
          temperature: Temperature for sampling. It can be a tuple of temperatures,
            which will be successively used upon failures according to either
            `compression_ratio_threshold` or `log_prob_threshold`.
          compression_ratio_threshold: If the gzip compression ratio is above this value,
            treat as failed.
          log_prob_threshold: If the average log probability over sampled tokens is
            below this value, treat as failed.
          no_speech_threshold: If the no_speech probability is higher than this value AND
            the average log probability over sampled tokens is below `log_prob_threshold`,
            consider the segment as silent.
          condition_on_previous_text: If True, the previous output of the model is provided
            as a prompt for the next window; disabling may make the text inconsistent across
            windows, but the model becomes less prone to getting stuck in a failure loop,
            such as repetition looping or timestamps going out of sync.
          initial_prompt: Optional text to provide as a prompt for the first window.
          without_timestamps: Only sample text tokens.

        Returns:
          A tuple with:

            - a generator over transcribed segments
            - an instance of AudioInfo
        """
        if isinstance(audio, str):
            from faster_whisper.audio import decode_audio

            audio = decode_audio(
                audio, sampling_rate=self.feature_extractor.sampling_rate
            )
        features = self.feature_extractor(audio)

        if language is None:
            if not self.model.is_multilingual:
                language = "en"
                language_probability = 1
            else:
                segment = self.get_segment(features)
                input = self.get_input(segment)
                results = self.model.detect_language(input)
                language_token, language_probability = results[0][0]
                language = language_token[2:-2]
        else:
            if self.tokenizer.token_to_id("<|%s|>" % language) is None:
                raise ValueError("%s is not a valid language code" % language)
            language_probability = 1

        options = TranscriptionOptions(
            task=task,
            beam_size=beam_size,
            best_of=best_of,
            patience=patience,
            length_penalty=length_penalty,
            log_prob_threshold=log_prob_threshold,
            no_speech_threshold=no_speech_threshold,
            compression_ratio_threshold=compression_ratio_threshold,
            condition_on_previous_text=condition_on_previous_text,
            temperatures=(
                temperature if isinstance(temperature, (list, tuple)) else [temperature]
            ),
            initial_prompt=initial_prompt,
            without_timestamps=without_timestamps,
        )

        segments = self.generate_segments(features, language, options)

        audio_info = AudioInfo(
            language=language,
            language_probability=language_probability,
        )

        return segments, audio_info

    def generate_segments(self, features, language, options):
        tokenized_segments = self.generate_tokenized_segments(
            features, language, options
        )

        for start, end, tokens in tokenized_segments:
            text = self.decode_text_tokens(tokens)
            if not text.strip():
                continue

            yield Segment(
                start=start,
                end=end,
                text=text,
            )

    def generate_tokenized_segments(self, features, language, options):
        num_frames = features.shape[-1]
        offset = 0
        all_tokens = []
        prompt_reset_since = 0

        if options.initial_prompt is not None:
            initial_prompt = " " + options.initial_prompt.strip()
            initial_prompt_tokens = self.tokenizer.encode(
                initial_prompt, add_special_tokens=False
            )
            all_tokens.extend(initial_prompt_tokens.ids)

        while offset < num_frames:
            time_offset = offset * self.feature_extractor.time_per_frame
            segment = self.get_segment(features, offset)
            segment_duration = segment.shape[-1] * self.feature_extractor.time_per_frame

            previous_tokens = all_tokens[prompt_reset_since:]
            prompt = self.get_prompt(
                language,
                previous_tokens,
                task=options.task,
                without_timestamps=options.without_timestamps,
            )

            result, avg_log_prob, temperature = self.generate_with_fallback(
                segment, prompt, options
            )

            if (
                result.no_speech_prob > options.no_speech_threshold
                and avg_log_prob < options.log_prob_threshold
            ):
                offset += segment.shape[-1]
                continue

            tokens = result.sequences_ids[0]

            consecutive_timestamps = [
                i
                for i in range(len(tokens))
                if i > 0
                and tokens[i] >= self.timestamp_begin_id
                and tokens[i - 1] >= self.timestamp_begin_id
            ]

            if len(consecutive_timestamps) > 0:
                last_slice = 0
                for i, current_slice in enumerate(consecutive_timestamps):
                    sliced_tokens = tokens[last_slice:current_slice]
                    start_timestamp_position = (
                        sliced_tokens[0] - self.timestamp_begin_id
                    )
                    end_timestamp_position = sliced_tokens[-1] - self.timestamp_begin_id
                    start_time = (
                        time_offset + start_timestamp_position * self.time_precision
                    )
                    end_time = (
                        time_offset + end_timestamp_position * self.time_precision
                    )

                    last_in_window = i + 1 == len(consecutive_timestamps)

                    # Include the last timestamp so that all tokens are included in a segment.
                    if last_in_window:
                        sliced_tokens.append(tokens[current_slice])

                    yield start_time, end_time, sliced_tokens
                    last_slice = current_slice

                last_timestamp_position = (
                    tokens[last_slice - 1] - self.timestamp_begin_id
                )
                offset += last_timestamp_position * self.input_stride
                all_tokens.extend(tokens[: last_slice + 1])

            else:
                duration = segment_duration
                timestamps = [
                    token for token in tokens if token >= self.timestamp_begin_id
                ]
                if len(timestamps) > 0 and timestamps[-1] != self.timestamp_begin_id:
                    last_timestamp_position = timestamps[-1] - self.timestamp_begin_id
                    duration = last_timestamp_position * self.time_precision

                yield time_offset, time_offset + duration, tokens

                offset += segment.shape[-1]
                all_tokens.extend(tokens)

            if not options.condition_on_previous_text or temperature > 0.5:
                prompt_reset_since = len(all_tokens)

    def decode_text_tokens(self, tokens):
        text_tokens = [token for token in tokens if token < self.eot_id]
        return self.tokenizer.decode(text_tokens)

    def generate_with_fallback(self, segment, prompt, options):
        features = self.get_input(segment)
        result = None
        avg_log_prob = None
        final_temperature = None

        for temperature in options.temperatures:
            if temperature > 0:
                kwargs = {
                    "beam_size": 1,
                    "num_hypotheses": options.best_of,
                    "sampling_topk": 0,
                    "sampling_temperature": temperature,
                }
            else:
                kwargs = {
                    "beam_size": options.beam_size,
                    "patience": options.patience,
                }

            final_temperature = temperature
            result = self.model.generate(
                features,
                [prompt],
                length_penalty=options.length_penalty,
                max_length=self.max_length,
                return_scores=True,
                return_no_speech_prob=True,
                **kwargs,
            )[0]

            tokens = result.sequences_ids[0]

            # Recover the average log prob from the returned score.
            seq_len = len(tokens)
            cum_log_prob = result.scores[0] * (seq_len**options.length_penalty)
            avg_log_prob = cum_log_prob / (seq_len + 1)

            text = self.decode_text_tokens(tokens).strip()
            compression_ratio = get_compression_ratio(text)

            if (
                compression_ratio <= options.compression_ratio_threshold
                and avg_log_prob >= options.log_prob_threshold
            ):
                break

        return result, avg_log_prob, final_temperature

    def get_prompt(
        self,
        language,
        previous_tokens,
        task="transcribe",
        without_timestamps=False,
    ):
        prompt = []

        if previous_tokens:
            prompt.append(self.tokenizer.token_to_id("<|startofprev|>"))
            prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :])

        prompt.append(self.tokenizer.token_to_id("<|startoftranscript|>"))

        if self.model.is_multilingual:
            prompt.extend(
                [
                    self.tokenizer.token_to_id("<|%s|>" % language),
                    self.tokenizer.token_to_id("<|%s|>" % task),
                ]
            )

        if without_timestamps:
            prompt.append(self.tokenizer.token_to_id("<|notimestamps|>"))

        return prompt

    def get_segment(self, features, offset=0):
        if offset > 0:
            features = features[:, offset:]

        num_frames = features.shape[-1]
        required_num_frames = self.feature_extractor.nb_max_frames

        if num_frames > required_num_frames:
            features = features[:, :required_num_frames]
        elif num_frames < required_num_frames:
            pad_widths = [(0, 0), (0, required_num_frames - num_frames)]
            features = np.pad(features, pad_widths)

        features = np.ascontiguousarray(features)
        return features

    def get_input(self, segment):
        segment = np.expand_dims(segment, 0)
        segment = ctranslate2.StorageView.from_array(segment)
        return segment


def get_compression_ratio(text):
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))
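
Usage note (not part of the commit): a sketch of driving WhisperModel directly, assuming a converted CTranslate2 model directory (here the placeholder whisper-large-v2-ct2/) and a placeholder audio file path.

```python
# Transcribe a file with the CTranslate2-backed model defined above.
# "whisper-large-v2-ct2/" and "sample.wav" are placeholder paths.
from faster_whisper import WhisperModel

model = WhisperModel("whisper-large-v2-ct2/", device="cuda", compute_type="float16")
segments, info = model.transcribe("sample.wav", task="translate", beam_size=5)

print("Detected language:", info.language, info.language_probability)
for segment in segments:  # segments is a generator; decoding happens lazily here
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```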
requirements.txt
ADDED
@@ -0,0 +1,9 @@
numpy
tqdm
more-itertools
--extra-index-url https://download.pytorch.org/whl/cu113
torch
transformers>=4.19.0
ffmpeg-python==0.2.0
git+https://github.com/openai/whisper.git
streamlink
translator.py
ADDED
@@ -0,0 +1,276 @@
import argparse
import sys
import signal
from datetime import datetime

import ffmpeg
import numpy as np
import whisper
from whisper.audio import SAMPLE_RATE


class RingBuffer:
    def __init__(self, size):
        self.size = size
        self.data = []
        self.full = False
        self.cur = 0

    def append(self, x):
        if self.size <= 0:
            return
        if self.full:
            self.data[self.cur] = x
            self.cur = (self.cur + 1) % self.size
        else:
            self.data.append(x)
            if len(self.data) == self.size:
                self.full = True

    def get_all(self):
        """Get all elements in chronological order from oldest to newest."""
        all_data = []
        for i in range(len(self.data)):
            idx = (i + self.cur) % self.size
            all_data.append(self.data[idx])
        return all_data

    def has_repetition(self):
        prev = None
        for elem in self.data:
            if elem == prev:
                return True
            prev = elem
        return False

    def clear(self):
        self.data = []
        self.full = False
        self.cur = 0


def open_stream(stream, direct_url, preferred_quality):
    if direct_url:
        try:
            process = (
                ffmpeg.input(stream, loglevel="panic")
                .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
                .run_async(pipe_stdout=True)
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

        return process, None

    import streamlink
    import subprocess
    import threading
    stream_options = streamlink.streams(stream)
    if not stream_options:
        print("No playable streams found on this URL:", stream)
        sys.exit(0)

    option = None
    for quality in [preferred_quality, 'audio_only', 'audio_mp4a', 'audio_opus', 'best']:
        if quality in stream_options:
            option = quality
            break
    if option is None:
        # Fallback
        option = next(iter(stream_options.values()))

    def writer(streamlink_proc, ffmpeg_proc):
        while (not streamlink_proc.poll()) and (not ffmpeg_proc.poll()):
            try:
                chunk = streamlink_proc.stdout.read(1024)
                ffmpeg_proc.stdin.write(chunk)
            except (BrokenPipeError, OSError):
                pass

    cmd = ['streamlink', stream, option, "-O"]
    streamlink_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)

    try:
        ffmpeg_process = (
            ffmpeg.input("pipe:", loglevel="panic")
            .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
            .run_async(pipe_stdin=True, pipe_stdout=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    thread = threading.Thread(target=writer, args=(streamlink_process, ffmpeg_process))
    thread.start()
    return ffmpeg_process, streamlink_process


def main(url, model="small", language=None, interval=5, history_buffer_size=0, preferred_quality="audio_only",
         use_vad=True, direct_url=False, faster_whisper_args=None, **decode_options):

    n_bytes = interval * SAMPLE_RATE * 2  # Factor 2 comes from reading the int16 stream as bytes
    audio_buffer = RingBuffer((history_buffer_size // interval) + 1)
    previous_text = RingBuffer(history_buffer_size // interval)

    print("Loading model...")
    if faster_whisper_args:
        from faster_whisper import WhisperModel
        model = WhisperModel(faster_whisper_args["model_path"],
                             device=faster_whisper_args["device"],
                             compute_type=faster_whisper_args["compute_type"])
    else:
        model = whisper.load_model(model)

    if use_vad:
        from vad import VAD
        vad = VAD()

    print("Opening stream...")
    ffmpeg_process, streamlink_process = open_stream(url, direct_url, preferred_quality)

    def handler(signum, frame):
        ffmpeg_process.kill()
        if streamlink_process:
            streamlink_process.kill()
        sys.exit(0)

    signal.signal(signal.SIGINT, handler)

    try:
        while ffmpeg_process.poll() is None:
            # Read audio from ffmpeg stream
            in_bytes = ffmpeg_process.stdout.read(n_bytes)
            if not in_bytes:
                break

            audio = np.frombuffer(in_bytes, np.int16).flatten().astype(np.float32) / 32768.0
            if use_vad and vad.no_speech(audio):
                print(f'{datetime.now().strftime("%H:%M:%S")}')
                continue
            audio_buffer.append(audio)

            # Decode the audio
            clear_buffers = False
            if faster_whisper_args:
                segments, info = model.transcribe(audio,
                                                  language=language,
                                                  **decode_options)

                decoded_language = "" if language else "(" + info.language + ")"
                decoded_text = ""
                previous_segment = ""
                for segment in segments:
                    if segment.text != previous_segment:
                        decoded_text += segment.text
                        previous_segment = segment.text

                new_prefix = decoded_text

            else:
                result = model.transcribe(np.concatenate(audio_buffer.get_all()),
                                          prefix="".join(previous_text.get_all()),
                                          language=language,
                                          without_timestamps=True,
                                          **decode_options)

                decoded_language = "" if language else "(" + result.get("language") + ")"
                decoded_text = result.get("text")
                new_prefix = ""
                for segment in result["segments"]:
                    if segment["temperature"] < 0.5 and segment["no_speech_prob"] < 0.6:
                        new_prefix += segment["text"]
                    else:
                        # Clear history if the translation is unreliable, otherwise prompting on this leads to
                        # repetition and getting stuck.
                        clear_buffers = True

            previous_text.append(new_prefix)

            if clear_buffers or previous_text.has_repetition():
                audio_buffer.clear()
                previous_text.clear()

            print(f'{datetime.now().strftime("%H:%M:%S")} {decoded_language} {decoded_text}')

        print("Stream ended")
    finally:
        ffmpeg_process.kill()
        if streamlink_process:
            streamlink_process.kill()


def cli():
    parser = argparse.ArgumentParser(description="Parameters for translator.py")
    parser.add_argument('URL', type=str, help='Stream website and channel name, e.g. twitch.tv/forsen')
    parser.add_argument('--model', type=str,
                        choices=['tiny', 'tiny.en', 'small', 'small.en', 'medium', 'medium.en', 'large'],
                        default='small',
                        help='Model to be used for generating audio transcription. Smaller models are faster and use '
                             'less VRAM, but are also less accurate. .en models are more accurate but only work on '
                             'English audio.')
    parser.add_argument('--task', type=str, choices=['transcribe', 'translate'], default='transcribe',
                        help='Whether to transcribe the audio (keep original language) or translate to English.')
    parser.add_argument('--language', type=str, default='Chinese',
                        help='Language spoken in the stream. Default option is to auto detect the spoken language. '
                             'See https://github.com/openai/whisper for available languages.')
    parser.add_argument('--interval', type=int, default=5,
                        help='Interval between calls to the language model in seconds.')
    parser.add_argument('--history_buffer_size', type=int, default=0,
                        help='Seconds of previous audio/text to use for conditioning the model. Set to 0 to just use '
                             'audio from the last interval. Note that this can easily lead to repetition/loops if the '
                             'chosen language/model settings do not produce good results to begin with.')
    parser.add_argument('--beam_size', type=int, default=5,
                        help='Number of beams in beam search. Set to 0 to use the greedy algorithm instead.')
    parser.add_argument('--best_of', type=int, default=5,
                        help='Number of candidates when sampling with non-zero temperature.')
    parser.add_argument('--preferred_quality', type=str, default='worst',
                        help='Preferred stream quality option. "best" and "worst" should always be available. Type '
                             '"streamlink URL" in the console to see quality options for your URL.')
    parser.add_argument('--disable_vad', action='store_true',
                        help='Set this flag to disable additional voice activity detection by Silero VAD.')
    parser.add_argument('--direct_url', action='store_true',
                        help='Set this flag to pass the URL directly to ffmpeg. Otherwise, streamlink is used to '
                             'obtain the stream URL.')
    parser.add_argument('--use_faster_whisper', action='store_true',
                        help='Set this flag to use the faster-whisper implementation instead of the original OpenAI '
                             'implementation.')
    parser.add_argument('--faster_whisper_model_path', type=str, default='whisper-large-v2-ct2/',
                        help='Path to a directory containing a Whisper model in the CTranslate2 format.')
    parser.add_argument('--faster_whisper_device', type=str, choices=['cuda', 'cpu', 'auto'], default='cuda',
                        help='Set the device to run faster-whisper on.')
    parser.add_argument('--faster_whisper_compute_type', type=str, choices=['int8', 'int8_float16', 'int16', 'float16'],
                        default='float16',
                        help='Set the quantization type for faster-whisper. See '
                             'https://opennmt.net/CTranslate2/quantization.html for more info.')

    args = parser.parse_args().__dict__
    url = args.pop("URL")
    args["use_vad"] = not args.pop("disable_vad")
    use_faster_whisper = args.pop("use_faster_whisper")
    faster_whisper_args = dict()
    faster_whisper_args["model_path"] = args.pop("faster_whisper_model_path")
    faster_whisper_args["device"] = args.pop("faster_whisper_device")
    faster_whisper_args["compute_type"] = args.pop("faster_whisper_compute_type")

    if args['model'].endswith('.en'):
        if args['model'] == 'large.en':
            print("English model does not have large model, please choose from {tiny.en, small.en, medium.en}")
            sys.exit(0)
        if args['language'] != 'English' and args['language'] != 'en':
            if args['language'] == 'auto':
                print("Using .en model, setting language from auto to English")
                args['language'] = 'en'
            else:
                print("English model cannot be used to detect non english language, please choose a non .en model")
                sys.exit(0)

    if args['language'] == 'auto':
        args['language'] = None

    if args['beam_size'] == 0:
        args['beam_size'] = None

    main(url, faster_whisper_args=faster_whisper_args if use_faster_whisper else None, **args)


if __name__ == '__main__':
    cli()
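
Usage note (not part of the commit): the same pipeline can be driven from Python by importing main(); the keyword arguments below mirror the CLI options above, and extra keywords are forwarded to Whisper's decoder. The call blocks and prints transcriptions until the stream ends or the process is interrupted.

```python
# Run the streaming loop programmatically instead of via the CLI.
from translator import main

main(
    "twitch.tv/forsen",        # example URL from the argparse help above
    model="small",
    language=None,             # None means auto-detect
    interval=5,
    history_buffer_size=0,
    use_vad=True,
    direct_url=False,
    faster_whisper_args=None,  # use the original OpenAI whisper implementation
    task="transcribe",         # forwarded to whisper's decoder via **decode_options
    beam_size=5,
    best_of=5,
)
```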
vad.py
ADDED
@@ -0,0 +1,133 @@
import torch
import warnings

warnings.filterwarnings("ignore")


class VAD:
    def __init__(self):
        self.model = init_jit_model("silero_vad.jit")

    def no_speech(self, audio):
        speech = get_speech_timestamps(torch.Tensor(audio), self.model, return_seconds=True)
        # print(speech)
        return len(speech) == 0


def init_jit_model(model_path: str,
                   device=torch.device('cpu')):
    torch.set_grad_enabled(False)
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    return model


def get_speech_timestamps(audio: torch.Tensor,
                          model,
                          threshold: float = 0.5,
                          sampling_rate: int = 16000,
                          min_speech_duration_ms: int = 250,
                          min_silence_duration_ms: int = 100,
                          window_size_samples: int = 1536,
                          speech_pad_ms: int = 30,
                          return_seconds: bool = False):
    """
    From https://github.com/snakers4/silero-vad/blob/master/utils_vad.py

    This method is used for splitting long audios into speech chunks using silero VAD
    Parameters
    ----------
    audio: torch.Tensor
        One dimensional float torch.Tensor, other types are cast to torch if possible
    model: preloaded .jit silero VAD model
    threshold: float (default - 0.5)
        Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value
        are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is
        pretty good for most datasets.
    sampling_rate: int (default - 16000)
        Currently silero VAD models support 8000 and 16000 sample rates
    min_speech_duration_ms: int (default - 250 milliseconds)
        Final speech chunks shorter than min_speech_duration_ms are thrown out
    min_silence_duration_ms: int (default - 100 milliseconds)
        At the end of each speech chunk, wait for min_silence_duration_ms before separating it
    window_size_samples: int (default - 1536 samples)
        Audio chunks of window_size_samples size are fed to the silero VAD model.
        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768
        samples for 8000 sample rate. Values other than these may affect model performance!!
    speech_pad_ms: int (default - 30 milliseconds)
        Final speech chunks are padded by speech_pad_ms on each side
    return_seconds: bool (default - False)
        whether to return timestamps in seconds (default - samples)
    Returns
    ----------
    speeches: list of dicts
        list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
    """
    model.reset_states()
    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
    speech_pad_samples = sampling_rate * speech_pad_ms / 1000

    audio_length_samples = len(audio)

    speech_probs = []
    for current_start_sample in range(0, audio_length_samples, window_size_samples):
        chunk = audio[current_start_sample: current_start_sample + window_size_samples]
        if len(chunk) < window_size_samples:
            chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
        speech_prob = model(chunk, sampling_rate).item()
        speech_probs.append(speech_prob)

    triggered = False
    speeches = []
    current_speech = {}
    neg_threshold = threshold - 0.15
    temp_end = 0

    for i, speech_prob in enumerate(speech_probs):
        if (speech_prob >= threshold) and temp_end:
            temp_end = 0

        if (speech_prob >= threshold) and not triggered:
            triggered = True
            current_speech['start'] = window_size_samples * i
            continue

        if (speech_prob < neg_threshold) and triggered:
            if not temp_end:
                temp_end = window_size_samples * i
            if (window_size_samples * i) - temp_end < min_silence_samples:
                continue
            else:
                current_speech['end'] = temp_end
                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                    speeches.append(current_speech)
                temp_end = 0
                current_speech = {}
                triggered = False
                continue

    if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
        current_speech['end'] = audio_length_samples
        speeches.append(current_speech)

    for i, speech in enumerate(speeches):
        if i == 0:
            speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
        if i != len(speeches) - 1:
            silence_duration = speeches[i + 1]['start'] - speech['end']
            if silence_duration < 2 * speech_pad_samples:
                speech['end'] += int(silence_duration // 2)
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - silence_duration // 2))
            else:
                speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - speech_pad_samples))
        else:
            speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))

    if return_seconds:
        for speech_dict in speeches:
            speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
            speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)

    return speeches
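
Usage note (not part of the commit): a small sketch of exercising the VAD helper on its own; it assumes the silero_vad.jit file referenced in VAD.__init__ is present in the working directory.

```python
# Check the VAD on a chunk of 16 kHz float32 audio.
import numpy as np

from vad import VAD

vad = VAD()                                       # loads silero_vad.jit from the current directory
silence = np.zeros(16000 * 5, dtype=np.float32)   # five seconds of silence
print(vad.no_speech(silence))                     # expected: True (no speech detected)
```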