admin321 committed on
Commit fd6a8dc · 1 Parent(s): 10cc38f
.gitattributes ADDED
@@ -0,0 +1 @@
1
+ *.jit filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,129 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 fortypercnt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,48 @@
1
+ # stream-translator
2
+ Command-line utility to transcribe or translate audio from livestreams in real time. Uses [streamlink](https://github.com/streamlink/streamlink) to
3
+ get livestream URLs from various services and OpenAI's [whisper](https://github.com/openai/whisper) for transcription/translation.
4
+ This script is inspired by [audioWhisper](https://github.com/Awexander/audioWhisper) which transcribes/translates desktop audio.
5
+
6
+ ## Prerequisites
7
+
8
+ 1. [**Install and add ffmpeg to your PATH**](https://www.thewindowsclub.com/how-to-install-ffmpeg-on-windows-10#:~:text=Click%20New%20and%20type%20the,Click%20OK%20to%20apply%20changes.)
9
+ 2. [**Install CUDA on your system.**](https://developer.nvidia.com/cuda-downloads) If you installed a CUDA version other than 11.3,
10
+ change cu113 in requirements.txt accordingly (see the example below). You can check the installed CUDA version with ```nvcc --version```.
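For illustration only (assuming CUDA 11.7 is installed; the exact wheel index depends on your CUDA version), the extra index line in requirements.txt would change to something like:

```
--extra-index-url https://download.pytorch.org/whl/cu117
```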
11
+
12
+ ## Setup
13
+
14
+ 1. Set up a virtual environment.
15
+ 2. ```git clone https://github.com/fortypercnt/stream-translator.git```
16
+ 3. ```pip install -r requirements.txt```
17
+ 4. Make sure that PyTorch is installed with CUDA support (a quick check is sketched below). Whisper will probably not run in real time on a CPU.
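A quick way to verify the CUDA build of PyTorch is active (a minimal check, not part of the original setup steps):

```
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```

If this prints `False`, PyTorch was likely installed without CUDA support; reinstalling it against the cu113 extra index listed in requirements.txt should fix that.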
18
+
19
+ ## Command-line usage
20
+
21
+ ```python translator.py URL --flags```
22
+
23
+ By default, the URL can be of the form ```twitch.tv/forsen``` and streamlink is used to obtain the .m3u8 link which is passed to ffmpeg.
24
+ See [streamlink plugins](https://streamlink.github.io/plugins.html) for info on all supported sites.
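For illustration, a typical invocation might look like the following (the channel name and flag values are placeholders, not recommendations):

```
python translator.py twitch.tv/forsen --model medium --task translate --language Japanese --interval 5
```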
25
+
26
+
27
+ | --flags | Default Value | Description |
28
+ |:-------------------------------:|:---------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
29
+ | `--model` | small | Select model size. See [here](https://github.com/openai/whisper#available-models-and-languages) for available models. |
30
+ | `--task` | translate | Whether to transcribe the audio (keep original language) or translate to English. |
31
+ | `--language` | auto | Language spoken in the stream. See [here](https://github.com/openai/whisper#available-models-and-languages) for available languages. |
32
+ | `--interval` | 5 | Interval between calls to the language model in seconds. |
33
+ | `--history_buffer_size` | 0 | Seconds of previous audio/text to use for conditioning the model. Set to 0 to just use audio from the last interval. Note that this can easily lead to repetition/loops if the chosen language/model settings do not produce good results to begin with. |
34
+ | `--beam_size` | 5 | Number of beams in beam search. Set to 0 to use the greedy algorithm instead (faster but less accurate). |
35
+ | `--best_of` | 5 | Number of candidates when sampling with non-zero temperature. |
36
+ | `--preferred_quality` | audio_only | Preferred stream quality option. "best" and "worst" should always be available. Type "streamlink URL" in the console to see quality options for your URL. |
37
+ | `--disable_vad` | | Set this flag to disable additional voice activity detection by Silero VAD. |
38
+ | `--direct_url` | | Set this flag to pass the URL directly to ffmpeg. Otherwise, streamlink is used to obtain the stream URL. |
39
+ | `--use_faster_whisper` | | Set this flag to use the faster_whisper implementation instead of the original OpenAI implementation. |
40
+ | `--faster_whisper_model_path` | whisper-large-v2-ct2/ | Path to a directory containing a Whisper model in the CTranslate2 format. |
41
+ | `--faster_whisper_device` | cuda | Set the device to run faster-whisper on. |
42
+ | `--faster_whisper_compute_type` | float16 | Set the quantization type for faster_whisper. See [here](https://opennmt.net/CTranslate2/quantization.html) for more info. |
43
+
44
+ ## Using faster-whisper
45
+
46
+ faster-whisper provides significant performance upgrades over the original OpenAI implementation (~ 4x faster, ~ 2x less memory).
47
+ To use it, follow the instructions [here](https://github.com/guillaumekln/faster-whisper#installation) to install faster-whisper and convert your models to CTranslate2 format.
48
+ Then you can run the CLI with --use_faster_whisper and set --faster_whisper_model_path to the location of your converted model.
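As a sketch of the full workflow (the converter command follows the faster-whisper/CTranslate2 docs and may differ between versions, so treat it as an assumption and check their README for the exact flags):

```
ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 --quantization float16
python translator.py twitch.tv/forsen --use_faster_whisper --faster_whisper_model_path whisper-large-v2-ct2/ --faster_whisper_compute_type float16
```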
faster_whisper/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from faster_whisper.transcribe import WhisperModel
faster_whisper/audio.py ADDED
@@ -0,0 +1,36 @@
1
+ import av
2
+ import numpy as np
3
+
4
+
5
+ def decode_audio(input_file, sampling_rate=16000):
6
+ """Decodes the audio.
7
+
8
+ Args:
9
+ input_file: Path to the input file or a file-like object.
10
+ sampling_rate: Resample the audio to this sample rate.
11
+
12
+ Returns:
13
+ A float32 Numpy array.
14
+ """
15
+ fifo = av.audio.fifo.AudioFifo()
16
+ resampler = av.audio.resampler.AudioResampler(
17
+ format="s16",
18
+ layout="mono",
19
+ rate=sampling_rate,
20
+ )
21
+
22
+ with av.open(input_file) as container:
23
+ # Decode and resample each audio frame.
24
+ for frame in container.decode(audio=0):
25
+ frame.pts = None
26
+ for new_frame in resampler.resample(frame):
27
+ fifo.write(new_frame)
28
+
29
+ # Flush the resampler.
30
+ for new_frame in resampler.resample(None):
31
+ fifo.write(new_frame)
32
+
33
+ frame = fifo.read()
34
+
35
+ # Convert s16 back to f32.
36
+ return frame.to_ndarray().flatten().astype(np.float32) / 32768.0
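A minimal usage sketch for this helper (the file name is a placeholder; any ffmpeg-readable input should work since PyAV wraps ffmpeg):

```python
from faster_whisper.audio import decode_audio

# Decode and resample a local file to 16 kHz mono float32 samples.
samples = decode_audio("example.wav", sampling_rate=16000)
print(samples.shape, samples.dtype)  # (n_samples,), float32
```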
faster_whisper/feature_extractor.py ADDED
@@ -0,0 +1,163 @@
1
+ import numpy as np
2
+
3
+
4
+ # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py
5
+ class FeatureExtractor:
6
+ def __init__(
7
+ self,
8
+ feature_size=80,
9
+ sampling_rate=16000,
10
+ hop_length=160,
11
+ chunk_length=30,
12
+ n_fft=400,
13
+ ):
14
+ self.n_fft = n_fft
15
+ self.hop_length = hop_length
16
+ self.chunk_length = chunk_length
17
+ self.n_samples = chunk_length * sampling_rate
18
+ self.nb_max_frames = self.n_samples // hop_length
19
+ self.time_per_frame = hop_length / sampling_rate
20
+ self.sampling_rate = sampling_rate
21
+ self.mel_filters = self.get_mel_filters(
22
+ sampling_rate, n_fft, n_mels=feature_size
23
+ )
24
+
25
+ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
26
+ # Initialize the weights
27
+ n_mels = int(n_mels)
28
+ weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
29
+
30
+ # Center freqs of each FFT bin
31
+ fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
32
+
33
+ # 'Center freqs' of mel bands - uniformly spaced between limits
34
+ min_mel = 0.0
35
+ max_mel = 45.245640471924965
36
+
37
+ mels = np.linspace(min_mel, max_mel, n_mels + 2)
38
+
39
+ mels = np.asanyarray(mels)
40
+
41
+ # Fill in the linear scale
42
+ f_min = 0.0
43
+ f_sp = 200.0 / 3
44
+ freqs = f_min + f_sp * mels
45
+
46
+ # And now the nonlinear scale
47
+ min_log_hz = 1000.0 # beginning of log region (Hz)
48
+ min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
49
+ logstep = np.log(6.4) / 27.0 # step size for log region
50
+
51
+ # If we have vector data, vectorize
52
+ log_t = mels >= min_log_mel
53
+ freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
54
+
55
+ mel_f = freqs
56
+
57
+ fdiff = np.diff(mel_f)
58
+ ramps = np.subtract.outer(mel_f, fftfreqs)
59
+
60
+ for i in range(n_mels):
61
+ # lower and upper slopes for all bins
62
+ lower = -ramps[i] / fdiff[i]
63
+ upper = ramps[i + 2] / fdiff[i + 1]
64
+
65
+ # .. then intersect them with each other and zero
66
+ weights[i] = np.maximum(0, np.minimum(lower, upper))
67
+
68
+ # Slaney-style mel is scaled to be approx constant energy per channel
69
+ enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
70
+ weights *= enorm[:, np.newaxis]
71
+
72
+ return weights
73
+
74
+ def fram_wave(self, waveform, center=True):
75
+ """
76
+ Transform a raw waveform into a list of smaller waveforms.
77
+ The window length defines how much of the signal is
78
+ contained in each frame (a smaller waveform), while the hop length defines the step
79
+ between the beginning of each new frame.
80
+ Centering is done by reflecting the waveform which is first centered around
81
+ `frame_idx * hop_length`.
82
+ """
83
+ frames = []
84
+ for i in range(0, waveform.shape[0] + 1, self.hop_length):
85
+ half_window = (self.n_fft - 1) // 2 + 1
86
+ if center:
87
+ start = i - half_window if i > half_window else 0
88
+ end = (
89
+ i + half_window
90
+ if i < waveform.shape[0] - half_window
91
+ else waveform.shape[0]
92
+ )
93
+
94
+ frame = waveform[start:end]
95
+
96
+ if start == 0:
97
+ padd_width = (-i + half_window, 0)
98
+ frame = np.pad(frame, pad_width=padd_width, mode="reflect")
99
+
100
+ elif end == waveform.shape[0]:
101
+ padd_width = (0, (i - waveform.shape[0] + half_window))
102
+ frame = np.pad(frame, pad_width=padd_width, mode="reflect")
103
+
104
+ else:
105
+ frame = waveform[i : i + self.n_fft]
106
+ frame_width = frame.shape[0]
107
+ if frame_width < waveform.shape[0]:
108
+ frame = np.lib.pad(
109
+ frame,
110
+ pad_width=(0, self.n_fft - frame_width),
111
+ mode="constant",
112
+ constant_values=0,
113
+ )
114
+
115
+ frames.append(frame)
116
+ return np.stack(frames, 0)
117
+
118
+ def stft(self, frames, window):
119
+ """
120
+ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal.
121
+ Should give the same results as `torch.stft`.
122
+ """
123
+ frame_size = frames.shape[1]
124
+ fft_size = self.n_fft
125
+
126
+ if fft_size is None:
127
+ fft_size = frame_size
128
+
129
+ if fft_size < frame_size:
130
+ raise ValueError("FFT size must be greater than or equal to the frame size")
131
+ # number of FFT bins to store
132
+ num_fft_bins = (fft_size >> 1) + 1
133
+
134
+ data = np.empty((len(frames), num_fft_bins), dtype=np.complex64)
135
+ fft_signal = np.zeros(fft_size)
136
+
137
+ for f, frame in enumerate(frames):
138
+ if window is not None:
139
+ np.multiply(frame, window, out=fft_signal[:frame_size])
140
+ else:
141
+ fft_signal[:frame_size] = frame
142
+ data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
143
+ return data.T
144
+
145
+ def __call__(self, waveform):
146
+ """
147
+ Compute the log-Mel spectrogram of the provided audio; gives results similar to
148
+ whisper's original torch implementation, within 1e-5 tolerance.
149
+ """
150
+ window = np.hanning(self.n_fft + 1)[:-1]
151
+
152
+ frames = self.fram_wave(waveform)
153
+ stft = self.stft(frames, window=window)
154
+ magnitudes = np.abs(stft[:, :-1]) ** 2
155
+
156
+ filters = self.mel_filters
157
+ mel_spec = filters @ magnitudes
158
+
159
+ log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
160
+ log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
161
+ log_spec = (log_spec + 4.0) / 4.0
162
+
163
+ return log_spec
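A minimal sketch of how this extractor is consumed (one second of synthetic silence stands in for real audio):

```python
import numpy as np
from faster_whisper.feature_extractor import FeatureExtractor

extractor = FeatureExtractor()  # defaults: 80 mel bins, 16 kHz, 30 s chunks
waveform = np.zeros(extractor.sampling_rate, dtype=np.float32)  # 1 s of silence
log_mel = extractor(waveform)
print(log_mel.shape)  # (80, n_frames) log-Mel features
```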
faster_whisper/transcribe.py ADDED
@@ -0,0 +1,406 @@
1
+ import collections
2
+ import zlib
3
+
4
+ import ctranslate2
5
+ import numpy as np
6
+ import tokenizers
7
+
8
+ from faster_whisper.feature_extractor import FeatureExtractor
9
+
10
+
11
+ class Segment(collections.namedtuple("Segment", ("start", "end", "text"))):
12
+ pass
13
+
14
+
15
+ class AudioInfo(
16
+ collections.namedtuple("AudioInfo", ("language", "language_probability"))
17
+ ):
18
+ pass
19
+
20
+
21
+ class TranscriptionOptions(
22
+ collections.namedtuple(
23
+ "TranscriptionOptions",
24
+ (
25
+ "task",
26
+ "beam_size",
27
+ "best_of",
28
+ "patience",
29
+ "length_penalty",
30
+ "log_prob_threshold",
31
+ "no_speech_threshold",
32
+ "compression_ratio_threshold",
33
+ "condition_on_previous_text",
34
+ "temperatures",
35
+ "initial_prompt",
36
+ "without_timestamps",
37
+ ),
38
+ )
39
+ ):
40
+ pass
41
+
42
+
43
+ class WhisperModel:
44
+ def __init__(
45
+ self,
46
+ model_path,
47
+ device="auto",
48
+ device_index=0,
49
+ compute_type="default",
50
+ cpu_threads=0,
51
+ num_workers=1,
52
+ ):
53
+ """Initializes the Whisper model.
54
+
55
+ Args:
56
+ model_path: Path to the converted model.
57
+ device: Device to use for computation ("cpu", "cuda", "auto").
58
+ device_index: Device ID to use.
59
+ The model can also be loaded on multiple GPUs by passing a list of IDs
60
+ (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
61
+ when transcribe() is called from multiple Python threads (see also num_workers).
62
+ compute_type: Type to use for computation.
63
+ See https://opennmt.net/CTranslate2/quantization.html.
64
+ cpu_threads: Number of threads to use when running on CPU (4 by default).
65
+ A non zero value overrides the OMP_NUM_THREADS environment variable.
66
+ num_workers: When transcribe() is called from multiple Python threads,
67
+ having multiple workers enables true parallelism when running the model
68
+ (concurrent calls to self.model.generate() will run in parallel).
69
+ This can improve the global throughput at the cost of increased memory usage.
70
+ """
71
+ self.model = ctranslate2.models.Whisper(
72
+ model_path,
73
+ device=device,
74
+ device_index=device_index,
75
+ compute_type=compute_type,
76
+ intra_threads=cpu_threads,
77
+ inter_threads=num_workers,
78
+ )
79
+
80
+ self.feature_extractor = FeatureExtractor()
81
+ self.tokenizer = tokenizers.Tokenizer.from_pretrained(
82
+ "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
83
+ )
84
+ self.eot_id = self.tokenizer.token_to_id("<|endoftext|>")
85
+ self.timestamp_begin_id = self.tokenizer.token_to_id("<|notimestamps|>") + 1
86
+ self.input_stride = 2
87
+ self.time_precision = 0.02
88
+ self.max_length = 448
89
+
90
+ def transcribe(
91
+ self,
92
+ audio,
93
+ language=None,
94
+ task="transcribe",
95
+ beam_size=5,
96
+ best_of=5,
97
+ patience=1,
98
+ length_penalty=1,
99
+ temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
100
+ compression_ratio_threshold=2.4,
101
+ log_prob_threshold=-1.0,
102
+ no_speech_threshold=0.6,
103
+ condition_on_previous_text=True,
104
+ initial_prompt=None,
105
+ without_timestamps=False,
106
+ ):
107
+ """Transcribes an input file.
108
+
109
+ Arguments:
110
+ audio: Union[str, np.ndarray], shape = (*)
111
+ The path to audio or a NumPy array containing the audio waveform in 16 kHz mono
112
+ language: The language spoken in the audio. It should be a language code such
113
+ as "en" or "fr". If not set, the language will be detected in the first 30 seconds
114
+ of audio.
115
+ task: Task to execute (transcribe or translate).
116
+ beam_size: Beam size to use for decoding.
117
+ best_of: Number of candidates when sampling with non-zero temperature.
118
+ patience: Beam search patience factor.
119
+ length_penalty: Exponential length penalty constant.
120
+ temperature: Temperature for sampling. It can be a tuple of temperatures,
121
+ which will be successively used upon failures according to either
122
+ `compression_ratio_threshold` or `log_prob_threshold`.
123
+ compression_ratio_threshold: If the gzip compression ratio is above this value,
124
+ treat as failed.
125
+ log_prob_threshold: If the average log probability over sampled tokens is
126
+ below this value, treat as failed.
127
+ no_speech_threshold: If the no_speech probability is higher than this value AND
128
+ the average log probability over sampled tokens is below `log_prob_threshold`,
129
+ consider the segment as silent.
130
+ condition_on_previous_text: If True, the previous output of the model is provided
131
+ as a prompt for the next window; disabling may make the text inconsistent across
132
+ windows, but the model becomes less prone to getting stuck in a failure loop,
133
+ such as repetition looping or timestamps going out of sync.
134
+ initial_prompt: Optional text to provide as a prompt for the first window.
135
+ without_timestamps: Only sample text tokens.
136
+
137
+ Returns:
138
+ A tuple with:
139
+
140
+ - a generator over transcribed segments
141
+ - an instance of AudioInfo
142
+ """
143
+ if isinstance(audio, str):
144
+ from faster_whisper.audio import decode_audio
145
+ audio = decode_audio(
146
+ audio, sampling_rate=self.feature_extractor.sampling_rate
147
+ )
148
+ features = self.feature_extractor(audio)
149
+
150
+ if language is None:
151
+ if not self.model.is_multilingual:
152
+ language = "en"
153
+ language_probability = 1
154
+ else:
155
+ segment = self.get_segment(features)
156
+ input = self.get_input(segment)
157
+ results = self.model.detect_language(input)
158
+ language_token, language_probability = results[0][0]
159
+ language = language_token[2:-2]
160
+ else:
161
+ if self.tokenizer.token_to_id("<|%s|>" % language) is None:
162
+ raise ValueError("%s is not a valid language code" % language)
163
+ language_probability = 1
164
+
165
+ options = TranscriptionOptions(
166
+ task=task,
167
+ beam_size=beam_size,
168
+ best_of=best_of,
169
+ patience=patience,
170
+ length_penalty=length_penalty,
171
+ log_prob_threshold=log_prob_threshold,
172
+ no_speech_threshold=no_speech_threshold,
173
+ compression_ratio_threshold=compression_ratio_threshold,
174
+ condition_on_previous_text=condition_on_previous_text,
175
+ temperatures=(
176
+ temperature if isinstance(temperature, (list, tuple)) else [temperature]
177
+ ),
178
+ initial_prompt=initial_prompt,
179
+ without_timestamps=without_timestamps,
180
+ )
181
+
182
+ segments = self.generate_segments(features, language, options)
183
+
184
+ audio_info = AudioInfo(
185
+ language=language,
186
+ language_probability=language_probability,
187
+ )
188
+
189
+ return segments, audio_info
190
+
191
+ def generate_segments(self, features, language, options):
192
+ tokenized_segments = self.generate_tokenized_segments(
193
+ features, language, options
194
+ )
195
+
196
+ for start, end, tokens in tokenized_segments:
197
+ text = self.decode_text_tokens(tokens)
198
+ if not text.strip():
199
+ continue
200
+
201
+ yield Segment(
202
+ start=start,
203
+ end=end,
204
+ text=text,
205
+ )
206
+
207
+ def generate_tokenized_segments(self, features, language, options):
208
+ num_frames = features.shape[-1]
209
+ offset = 0
210
+ all_tokens = []
211
+ prompt_reset_since = 0
212
+
213
+ if options.initial_prompt is not None:
214
+ initial_prompt = " " + options.initial_prompt.strip()
215
+ initial_prompt_tokens = self.tokenizer.encode(
216
+ initial_prompt, add_special_tokens=False
217
+ )
218
+ all_tokens.extend(initial_prompt_tokens.ids)
219
+
220
+ while offset < num_frames:
221
+ time_offset = offset * self.feature_extractor.time_per_frame
222
+ segment = self.get_segment(features, offset)
223
+ segment_duration = segment.shape[-1] * self.feature_extractor.time_per_frame
224
+
225
+ previous_tokens = all_tokens[prompt_reset_since:]
226
+ prompt = self.get_prompt(
227
+ language,
228
+ previous_tokens,
229
+ task=options.task,
230
+ without_timestamps=options.without_timestamps,
231
+ )
232
+
233
+ result, avg_log_prob, temperature = self.generate_with_fallback(
234
+ segment, prompt, options
235
+ )
236
+
237
+ if (
238
+ result.no_speech_prob > options.no_speech_threshold
239
+ and avg_log_prob < options.log_prob_threshold
240
+ ):
241
+ offset += segment.shape[-1]
242
+ continue
243
+
244
+ tokens = result.sequences_ids[0]
245
+
246
+ consecutive_timestamps = [
247
+ i
248
+ for i in range(len(tokens))
249
+ if i > 0
250
+ and tokens[i] >= self.timestamp_begin_id
251
+ and tokens[i - 1] >= self.timestamp_begin_id
252
+ ]
253
+
254
+ if len(consecutive_timestamps) > 0:
255
+ last_slice = 0
256
+ for i, current_slice in enumerate(consecutive_timestamps):
257
+ sliced_tokens = tokens[last_slice:current_slice]
258
+ start_timestamp_position = (
259
+ sliced_tokens[0] - self.timestamp_begin_id
260
+ )
261
+ end_timestamp_position = sliced_tokens[-1] - self.timestamp_begin_id
262
+ start_time = (
263
+ time_offset + start_timestamp_position * self.time_precision
264
+ )
265
+ end_time = (
266
+ time_offset + end_timestamp_position * self.time_precision
267
+ )
268
+
269
+ last_in_window = i + 1 == len(consecutive_timestamps)
270
+
271
+ # Include the last timestamp so that all tokens are included in a segment.
272
+ if last_in_window:
273
+ sliced_tokens.append(tokens[current_slice])
274
+
275
+ yield start_time, end_time, sliced_tokens
276
+ last_slice = current_slice
277
+
278
+ last_timestamp_position = (
279
+ tokens[last_slice - 1] - self.timestamp_begin_id
280
+ )
281
+ offset += last_timestamp_position * self.input_stride
282
+ all_tokens.extend(tokens[: last_slice + 1])
283
+
284
+ else:
285
+ duration = segment_duration
286
+ timestamps = [
287
+ token for token in tokens if token >= self.timestamp_begin_id
288
+ ]
289
+ if len(timestamps) > 0 and timestamps[-1] != self.timestamp_begin_id:
290
+ last_timestamp_position = timestamps[-1] - self.timestamp_begin_id
291
+ duration = last_timestamp_position * self.time_precision
292
+
293
+ yield time_offset, time_offset + duration, tokens
294
+
295
+ offset += segment.shape[-1]
296
+ all_tokens.extend(tokens)
297
+
298
+ if not options.condition_on_previous_text or temperature > 0.5:
299
+ prompt_reset_since = len(all_tokens)
300
+
301
+ def decode_text_tokens(self, tokens):
302
+ text_tokens = [token for token in tokens if token < self.eot_id]
303
+ return self.tokenizer.decode(text_tokens)
304
+
305
+ def generate_with_fallback(self, segment, prompt, options):
306
+ features = self.get_input(segment)
307
+ result = None
308
+ avg_log_prob = None
309
+ final_temperature = None
310
+
311
+ for temperature in options.temperatures:
312
+ if temperature > 0:
313
+ kwargs = {
314
+ "beam_size": 1,
315
+ "num_hypotheses": options.best_of,
316
+ "sampling_topk": 0,
317
+ "sampling_temperature": temperature,
318
+ }
319
+ else:
320
+ kwargs = {
321
+ "beam_size": options.beam_size,
322
+ "patience": options.patience,
323
+ }
324
+
325
+ final_temperature = temperature
326
+ result = self.model.generate(
327
+ features,
328
+ [prompt],
329
+ length_penalty=options.length_penalty,
330
+ max_length=self.max_length,
331
+ return_scores=True,
332
+ return_no_speech_prob=True,
333
+ **kwargs,
334
+ )[0]
335
+
336
+ tokens = result.sequences_ids[0]
337
+
338
+ # Recover the average log prob from the returned score.
339
+ seq_len = len(tokens)
340
+ cum_log_prob = result.scores[0] * (seq_len**options.length_penalty)
341
+ avg_log_prob = cum_log_prob / (seq_len + 1)
342
+
343
+ text = self.decode_text_tokens(tokens).strip()
344
+ compression_ratio = get_compression_ratio(text)
345
+
346
+ if (
347
+ compression_ratio <= options.compression_ratio_threshold
348
+ and avg_log_prob >= options.log_prob_threshold
349
+ ):
350
+ break
351
+
352
+ return result, avg_log_prob, final_temperature
353
+
354
+ def get_prompt(
355
+ self,
356
+ language,
357
+ previous_tokens,
358
+ task="transcribe",
359
+ without_timestamps=False,
360
+ ):
361
+ prompt = []
362
+
363
+ if previous_tokens:
364
+ prompt.append(self.tokenizer.token_to_id("<|startofprev|>"))
365
+ prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :])
366
+
367
+ prompt.append(self.tokenizer.token_to_id("<|startoftranscript|>"))
368
+
369
+ if self.model.is_multilingual:
370
+ prompt.extend(
371
+ [
372
+ self.tokenizer.token_to_id("<|%s|>" % language),
373
+ self.tokenizer.token_to_id("<|%s|>" % task),
374
+ ]
375
+ )
376
+
377
+ if without_timestamps:
378
+ prompt.append(self.tokenizer.token_to_id("<|notimestamps|>"))
379
+
380
+ return prompt
381
+
382
+ def get_segment(self, features, offset=0):
383
+ if offset > 0:
384
+ features = features[:, offset:]
385
+
386
+ num_frames = features.shape[-1]
387
+ required_num_frames = self.feature_extractor.nb_max_frames
388
+
389
+ if num_frames > required_num_frames:
390
+ features = features[:, :required_num_frames]
391
+ elif num_frames < required_num_frames:
392
+ pad_widths = [(0, 0), (0, required_num_frames - num_frames)]
393
+ features = np.pad(features, pad_widths)
394
+
395
+ features = np.ascontiguousarray(features)
396
+ return features
397
+
398
+ def get_input(self, segment):
399
+ segment = np.expand_dims(segment, 0)
400
+ segment = ctranslate2.StorageView.from_array(segment)
401
+ return segment
402
+
403
+
404
+ def get_compression_ratio(text):
405
+ text_bytes = text.encode("utf-8")
406
+ return len(text_bytes) / len(zlib.compress(text_bytes))
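A minimal usage sketch for this module (the model directory and audio file are placeholders; the directory is assumed to hold a CTranslate2-converted Whisper model):

```python
from faster_whisper import WhisperModel
from faster_whisper.audio import decode_audio

model = WhisperModel("whisper-large-v2-ct2/", device="auto", compute_type="default")

audio = decode_audio("example.wav", sampling_rate=16000)
segments, info = model.transcribe(audio, task="transcribe", beam_size=5)

print("Detected language:", info.language, info.language_probability)
for segment in segments:  # segments is a generator; decoding runs as you iterate
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```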
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ numpy
2
+ tqdm
3
+ more-itertools
4
+ --extra-index-url https://download.pytorch.org/whl/cu113
5
+ torch
6
+ transformers>=4.19.0
7
+ ffmpeg-python==0.2.0
8
+ git+https://github.com/openai/whisper.git
9
+ streamlink
translator.py ADDED
@@ -0,0 +1,276 @@
1
+ import argparse
2
+ import sys
3
+ import signal
4
+ from datetime import datetime
5
+
6
+ import ffmpeg
7
+ import numpy as np
8
+ import whisper
9
+ from whisper.audio import SAMPLE_RATE
10
+
11
+
12
+ class RingBuffer:
13
+ def __init__(self, size):
14
+ self.size = size
15
+ self.data = []
16
+ self.full = False
17
+ self.cur = 0
18
+
19
+ def append(self, x):
20
+ if self.size <= 0:
21
+ return
22
+ if self.full:
23
+ self.data[self.cur] = x
24
+ self.cur = (self.cur + 1) % self.size
25
+ else:
26
+ self.data.append(x)
27
+ if len(self.data) == self.size:
28
+ self.full = True
29
+
30
+ def get_all(self):
31
+ """ Get all elements in chronological order from oldest to newest. """
32
+ all_data = []
33
+ for i in range(len(self.data)):
34
+ idx = (i + self.cur) % self.size
35
+ all_data.append(self.data[idx])
36
+ return all_data
37
+
38
+ def has_repetition(self):
39
+ prev = None
40
+ for elem in self.data:
41
+ if elem == prev:
42
+ return True
43
+ prev = elem
44
+ return False
45
+
46
+ def clear(self):
47
+ self.data = []
48
+ self.full = False
49
+ self.cur = 0
50
+
51
+
52
+ def open_stream(stream, direct_url, preferred_quality):
53
+ if direct_url:
54
+ try:
55
+ process = (
56
+ ffmpeg.input(stream, loglevel="panic")
57
+ .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
58
+ .run_async(pipe_stdout=True)
59
+ )
60
+ except ffmpeg.Error as e:
61
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
62
+
63
+ return process, None
64
+
65
+ import streamlink
66
+ import subprocess
67
+ import threading
68
+ stream_options = streamlink.streams(stream)
69
+ if not stream_options:
70
+ print("No playable streams found on this URL:", stream)
71
+ sys.exit(0)
72
+
73
+ option = None
74
+ for quality in [preferred_quality, 'audio_only', 'audio_mp4a', 'audio_opus', 'best']:
75
+ if quality in stream_options:
76
+ option = quality
77
+ break
78
+ if option is None:
79
+ # Fallback
80
+ option = next(iter(stream_options.values()))
81
+
82
+ def writer(streamlink_proc, ffmpeg_proc):
83
+ while (not streamlink_proc.poll()) and (not ffmpeg_proc.poll()):
84
+ try:
85
+ chunk = streamlink_proc.stdout.read(1024)
86
+ ffmpeg_proc.stdin.write(chunk)
87
+ except (BrokenPipeError, OSError):
88
+ pass
89
+
90
+ cmd = ['streamlink', stream, option, "-O"]
91
+ streamlink_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
92
+
93
+ try:
94
+ ffmpeg_process = (
95
+ ffmpeg.input("pipe:", loglevel="panic")
96
+ .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
97
+ .run_async(pipe_stdin=True, pipe_stdout=True)
98
+ )
99
+ except ffmpeg.Error as e:
100
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
101
+
102
+ thread = threading.Thread(target=writer, args=(streamlink_process, ffmpeg_process))
103
+ thread.start()
104
+ return ffmpeg_process, streamlink_process
105
+
106
+
107
+ def main(url, model="small", language=None, interval=5, history_buffer_size=0, preferred_quality="audio_only",
108
+ use_vad=True, direct_url=False, faster_whisper_args=None, **decode_options):
109
+
110
+ n_bytes = interval * SAMPLE_RATE * 2 # Factor 2 comes from reading the int16 stream as bytes
111
+ audio_buffer = RingBuffer((history_buffer_size // interval) + 1)
112
+ previous_text = RingBuffer(history_buffer_size // interval)
113
+
114
+ print("Loading model...")
115
+ if faster_whisper_args:
116
+ from faster_whisper import WhisperModel
117
+ model = WhisperModel(faster_whisper_args["model_path"],
118
+ device=faster_whisper_args["device"],
119
+ compute_type=faster_whisper_args["compute_type"])
120
+ else:
121
+ model = whisper.load_model(model)
122
+
123
+ if use_vad:
124
+ from vad import VAD
125
+ vad = VAD()
126
+
127
+ print("Opening stream...")
128
+ ffmpeg_process, streamlink_process = open_stream(url, direct_url, preferred_quality)
129
+
130
+ def handler(signum, frame):
131
+ ffmpeg_process.kill()
132
+ if streamlink_process:
133
+ streamlink_process.kill()
134
+ sys.exit(0)
135
+
136
+ signal.signal(signal.SIGINT, handler)
137
+
138
+ try:
139
+ while ffmpeg_process.poll() is None:
140
+ # Read audio from ffmpeg stream
141
+ in_bytes = ffmpeg_process.stdout.read(n_bytes)
142
+ if not in_bytes:
143
+ break
144
+
145
+ audio = np.frombuffer(in_bytes, np.int16).flatten().astype(np.float32) / 32768.0
146
+ if use_vad and vad.no_speech(audio):
147
+ print(f'{datetime.now().strftime("%H:%M:%S")}')
148
+ continue
149
+ audio_buffer.append(audio)
150
+
151
+ # Decode the audio
152
+ clear_buffers = False
153
+ if faster_whisper_args:
154
+ segments, info = model.transcribe(audio,
155
+ language=language,
156
+ **decode_options)
157
+
158
+ decoded_language = "" if language else "(" + info.language + ")"
159
+ decoded_text = ""
160
+ previous_segment = ""
161
+ for segment in segments:
162
+ if segment.text != previous_segment:
163
+ decoded_text += segment.text
164
+ previous_segment = segment.text
165
+
166
+ new_prefix = decoded_text
167
+
168
+ else:
169
+ result = model.transcribe(np.concatenate(audio_buffer.get_all()),
170
+ prefix="".join(previous_text.get_all()),
171
+ language=language,
172
+ without_timestamps=True,
173
+ **decode_options)
174
+
175
+ decoded_language = "" if language else "(" + result.get("language") + ")"
176
+ decoded_text = result.get("text")
177
+ new_prefix = ""
178
+ for segment in result["segments"]:
179
+ if segment["temperature"] < 0.5 and segment["no_speech_prob"] < 0.6:
180
+ new_prefix += segment["text"]
181
+ else:
182
+ # Clear history if the translation is unreliable, otherwise prompting on this leads to
183
+ # repetition and getting stuck.
184
+ clear_buffers = True
185
+
186
+ previous_text.append(new_prefix)
187
+
188
+ if clear_buffers or previous_text.has_repetition():
189
+ audio_buffer.clear()
190
+ previous_text.clear()
191
+
192
+ print(f'{datetime.now().strftime("%H:%M:%S")} {decoded_language} {decoded_text}')
193
+
194
+ print("Stream ended")
195
+ finally:
196
+ ffmpeg_process.kill()
197
+ if streamlink_process:
198
+ streamlink_process.kill()
199
+
200
+
201
+ def cli():
202
+ parser = argparse.ArgumentParser(description="Parameters for translator.py")
203
+ parser.add_argument('URL', type=str, help='Stream website and channel name, e.g. twitch.tv/forsen')
204
+ parser.add_argument('--model', type=str,
205
+ choices=['tiny', 'tiny.en', 'small', 'small.en', 'medium', 'medium.en', 'large'],
206
+ default='small',
207
+ help='Model to be used for generating audio transcription. Smaller models are faster and use '
208
+ 'less VRAM, but are also less accurate. .en models are more accurate but only work on '
209
+ 'English audio.')
210
+ parser.add_argument('--task', type=str, choices=['transcribe', 'translate'], default='transcribe',
211
+ help='Whether to transcribe the audio (keep original language) or translate to English.')
212
+ parser.add_argument('--language', type=str, default='Chinese',
213
+ help='Language spoken in the stream. Set to "auto" to auto-detect the spoken language. '
214
+ 'See https://github.com/openai/whisper for available languages.')
215
+ parser.add_argument('--interval', type=int, default=5,
216
+ help='Interval between calls to the language model in seconds.')
217
+ parser.add_argument('--history_buffer_size', type=int, default=0,
218
+ help='Seconds of previous audio/text to use for conditioning the model. Set to 0 to just use '
219
+ 'audio from the last interval. Note that this can easily lead to repetition/loops if the '
220
+ 'chosen language/model settings do not produce good results to begin with.')
221
+ parser.add_argument('--beam_size', type=int, default=5,
222
+ help='Number of beams in beam search. Set to 0 to use greedy algorithm instead.')
223
+ parser.add_argument('--best_of', type=int, default=5,
224
+ help='Number of candidates when sampling with non-zero temperature.')
225
+ parser.add_argument('--preferred_quality', type=str, default='worst',
226
+ help='Preferred stream quality option. "best" and "worst" should always be available. Type '
227
+ '"streamlink URL" in the console to see quality options for your URL.')
228
+ parser.add_argument('--disable_vad', action='store_true',
229
+ help='Set this flag to disable additional voice activity detection by Silero VAD.')
230
+ parser.add_argument('--direct_url', action='store_true',
231
+ help='Set this flag to pass the URL directly to ffmpeg. Otherwise, streamlink is used to '
232
+ 'obtain the stream URL.')
233
+ parser.add_argument('--use_faster_whisper', action='store_true',
234
+ help='Set this flag to use faster-whisper implementation instead of the original OpenAI '
235
+ 'implementation.')
236
+ parser.add_argument('--faster_whisper_model_path', type=str, default='whisper-large-v2-ct2/',
237
+ help='Path to a directory containing a Whisper model in the CTranslate2 format.')
238
+ parser.add_argument('--faster_whisper_device', type=str, choices=['cuda', 'cpu', 'auto'], default='cuda',
239
+ help='Set the device to run faster-whisper on.')
240
+ parser.add_argument('--faster_whisper_compute_type', type=str, choices=['int8', 'int8_float16', 'int16', 'float16'],
241
+ default='float16',
242
+ help='Set the quantization type for faster-whisper. See '
243
+ 'https://opennmt.net/CTranslate2/quantization.html for more info.')
244
+
245
+ args = parser.parse_args().__dict__
246
+ url = args.pop("URL")
247
+ args["use_vad"] = not args.pop("disable_vad")
248
+ use_faster_whisper = args.pop("use_faster_whisper")
249
+ faster_whisper_args = dict()
250
+ faster_whisper_args["model_path"] = args.pop("faster_whisper_model_path")
251
+ faster_whisper_args["device"] = args.pop("faster_whisper_device")
252
+ faster_whisper_args["compute_type"] = args.pop("faster_whisper_compute_type")
253
+
254
+ if args['model'].endswith('.en'):
255
+ if args['model'] == 'large.en':
256
+ print("There is no large version of the .en models, please choose from {tiny.en, small.en, medium.en}")
257
+ sys.exit(0)
258
+ if args['language'] != 'English' and args['language'] != 'en':
259
+ if args['language'] == 'auto':
260
+ print("Using .en model, setting language from auto to English")
261
+ args['language'] = 'en'
262
+ else:
263
+ print("A .en model cannot be used to transcribe non-English audio, please choose a non-.en model")
264
+ sys.exit(0)
265
+
266
+ if args['language'] == 'auto':
267
+ args['language'] = None
268
+
269
+ if args['beam_size'] == 0:
270
+ args['beam_size'] = None
271
+
272
+ main(url, faster_whisper_args=faster_whisper_args if use_faster_whisper else None, **args)
273
+
274
+
275
+ if __name__ == '__main__':
276
+ cli()
vad.py ADDED
@@ -0,0 +1,133 @@
1
+ import torch
2
+ import warnings
3
+
4
+ warnings.filterwarnings("ignore")
5
+
6
+
7
+ class VAD:
8
+ def __init__(self):
9
+ self.model = init_jit_model("silero_vad.jit")
10
+
11
+ def no_speech(self, audio):
12
+ speech = get_speech_timestamps(torch.Tensor(audio), self.model, return_seconds=True)
13
+ # print(speech)
14
+ return len(speech) == 0
15
+
16
+
17
+ def init_jit_model(model_path: str,
18
+ device=torch.device('cpu')):
19
+ torch.set_grad_enabled(False)
20
+ model = torch.jit.load(model_path, map_location=device)
21
+ model.eval()
22
+ return model
23
+
24
+
25
+ def get_speech_timestamps(audio: torch.Tensor,
26
+ model,
27
+ threshold: float = 0.5,
28
+ sampling_rate: int = 16000,
29
+ min_speech_duration_ms: int = 250,
30
+ min_silence_duration_ms: int = 100,
31
+ window_size_samples: int = 1536,
32
+ speech_pad_ms: int = 30,
33
+ return_seconds: bool = False):
34
+ """
35
+ From https://github.com/snakers4/silero-vad/blob/master/utils_vad.py
36
+
37
+ This method is used for splitting long audios into speech chunks using silero VAD
38
+ Parameters
39
+ ----------
40
+ audio: torch.Tensor
41
+ One dimensional float torch.Tensor, other types are cast to torch if possible
42
+ model: preloaded .jit silero VAD model
43
+ threshold: float (default - 0.5)
44
+ Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value
45
+ are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is
46
+ pretty good for most datasets.
47
+ sampling_rate: int (default - 16000)
48
+ Currently silero VAD models support 8000 and 16000 sample rates
49
+ min_speech_duration_ms: int (default - 250 milliseconds)
50
+ Final speech chunks shorter than min_speech_duration_ms are thrown out
51
+ min_silence_duration_ms: int (default - 100 milliseconds)
52
+ At the end of each speech chunk, wait for min_silence_duration_ms before separating it
53
+ window_size_samples: int (default - 1536 samples)
54
+ Audio chunks of window_size_samples size are fed to the silero VAD model.
55
+ WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768
56
+ samples for 8000 sample rate. Values other than these may affect model performance!
57
+ speech_pad_ms: int (default - 30 milliseconds)
58
+ Final speech chunks are padded by speech_pad_ms each side
59
+ return_seconds: bool (default - False)
60
+ whether to return timestamps in seconds (default - samples)
61
+ Returns
62
+ ----------
63
+ speeches: list of dicts
64
+ list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
65
+ """
66
+ model.reset_states()
67
+ min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
68
+ min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
69
+ speech_pad_samples = sampling_rate * speech_pad_ms / 1000
70
+
71
+ audio_length_samples = len(audio)
72
+
73
+ speech_probs = []
74
+ for current_start_sample in range(0, audio_length_samples, window_size_samples):
75
+ chunk = audio[current_start_sample: current_start_sample + window_size_samples]
76
+ if len(chunk) < window_size_samples:
77
+ chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
78
+ speech_prob = model(chunk, sampling_rate).item()
79
+ speech_probs.append(speech_prob)
80
+
81
+ triggered = False
82
+ speeches = []
83
+ current_speech = {}
84
+ neg_threshold = threshold - 0.15
85
+ temp_end = 0
86
+
87
+ for i, speech_prob in enumerate(speech_probs):
88
+ if (speech_prob >= threshold) and temp_end:
89
+ temp_end = 0
90
+
91
+ if (speech_prob >= threshold) and not triggered:
92
+ triggered = True
93
+ current_speech['start'] = window_size_samples * i
94
+ continue
95
+
96
+ if (speech_prob < neg_threshold) and triggered:
97
+ if not temp_end:
98
+ temp_end = window_size_samples * i
99
+ if (window_size_samples * i) - temp_end < min_silence_samples:
100
+ continue
101
+ else:
102
+ current_speech['end'] = temp_end
103
+ if (current_speech['end'] - current_speech['start']) > min_speech_samples:
104
+ speeches.append(current_speech)
105
+ temp_end = 0
106
+ current_speech = {}
107
+ triggered = False
108
+ continue
109
+
110
+ if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
111
+ current_speech['end'] = audio_length_samples
112
+ speeches.append(current_speech)
113
+
114
+ for i, speech in enumerate(speeches):
115
+ if i == 0:
116
+ speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
117
+ if i != len(speeches) - 1:
118
+ silence_duration = speeches[i + 1]['start'] - speech['end']
119
+ if silence_duration < 2 * speech_pad_samples:
120
+ speech['end'] += int(silence_duration // 2)
121
+ speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - silence_duration // 2))
122
+ else:
123
+ speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
124
+ speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - speech_pad_samples))
125
+ else:
126
+ speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
127
+
128
+ if return_seconds:
129
+ for speech_dict in speeches:
130
+ speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
131
+ speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)
132
+
133
+ return speeches
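A minimal sketch mirroring how translator.py calls this module (assumes the silero_vad.jit weights are present in the working directory; one interval of synthetic silence stands in for stream audio):

```python
import numpy as np
from vad import VAD

vad = VAD()  # loads silero_vad.jit via torch.jit.load

# One 5-second interval of 16 kHz mono float32 audio, as read from the ffmpeg pipe.
audio = np.zeros(5 * 16000, dtype=np.float32)

if vad.no_speech(audio):
    print("No speech detected in this interval, skipping transcription.")
```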