FAMILIA committed
Commit bd77f79 · 1 Parent(s): a2af5c8

Add application file

Files changed (17)
  1. .gitignore +5 -0
  2. README.md +58 -4
  3. app-local.py +3 -0
  4. app-network.py +3 -0
  5. app-shared.py +3 -0
  6. app.py +256 -0
  7. cli.py +110 -0
  8. dockerfile +20 -0
  9. docs/options.md +78 -0
  10. requirements.txt +6 -0
  11. src/__init__.py +0 -0
  12. src/download.py +72 -0
  13. src/segments.py +55 -0
  14. src/utils.py +115 -0
  15. src/vad.py +477 -0
  16. tests/segments_test.py +48 -0
  17. tests/vad_test.py +66 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ flagged/
+ *.py[cod]
+ *$py.class
README.md CHANGED
@@ -1,12 +1,66 @@
  ---
- title: Vozparatexto
  emoji: ⚡
- colorFrom: green
- colorTo: red
  sdk: gradio
- sdk_version: 3.10.1
  app_file: app.py
  pinned: false
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

  ---
+ title: Voz para Texto
  emoji: ⚡
+ colorFrom: pink
+ colorTo: purple
  sdk: gradio
+ sdk_version: 3.3.1
  app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Running Locally
+
+ To run this program locally, first install Python 3.9+ and Git. Then install PyTorch 1.10.1+ and all the other dependencies:
+ ```
+ pip install -r requirements.txt
+ ```
+
+ Finally, run the full version (no audio length restrictions) of the app:
+ ```
+ python app-local.py
+ ```
+
+ You can also run the CLI interface, which is similar to Whisper's own CLI but also supports the following additional arguments:
+ ```
+ python cli.py \
+ [--vad {none,silero-vad,silero-vad-skip-gaps,silero-vad-expand-into-gaps,periodic-vad}] \
+ [--vad_merge_window VAD_MERGE_WINDOW] \
+ [--vad_max_merge_size VAD_MAX_MERGE_SIZE] \
+ [--vad_padding VAD_PADDING] \
+ [--vad_prompt_window VAD_PROMPT_WINDOW]
+ ```
+ You may also use URLs in addition to file paths as input.
+ ```
+ python cli.py --model large --vad silero-vad --language Japanese "https://www.youtube.com/watch?v=4cICErqqRSM"
+ ```
+
+ # Docker
+
+ To run it in Docker, first install Docker and, optionally, the NVIDIA Container Toolkit in order to use the GPU. Then
+ check out this repository and build an image:
+ ```
+ sudo docker build -t whisper-webui:1 .
+ ```
+
+ You can then start the WebUI with GPU support like so:
+ ```
+ sudo docker run -d --gpus=all -p 7860:7860 whisper-webui:1
+ ```
+
+ Leave out "--gpus=all" if you don't have access to a GPU with enough memory, and are fine with running it on the CPU only:
+ ```
+ sudo docker run -d -p 7860:7860 whisper-webui:1
+ ```
+
+ ## Caching
+
+ Note that the models themselves are currently not included in the Docker images, and will be downloaded on demand.
+ To avoid this, bind the directory /root/.cache/whisper to some directory on the host (for instance /home/administrator/.cache/whisper), where you can (optionally)
+ prepopulate the directory with the different Whisper models.
+ ```
+ sudo docker run -d --gpus=all -p 7860:7860 --mount type=bind,source=/home/administrator/.cache/whisper,target=/root/.cache/whisper whisper-webui:1
+ ```
app-local.py ADDED
@@ -0,0 +1,3 @@
+ # Run the app with no audio file restrictions
+ from app import create_ui
+ create_ui(-1)
app-network.py ADDED
@@ -0,0 +1,3 @@
+ # Run the app with no audio file restrictions, and make it available on the network
+ from app import create_ui
+ create_ui(-1, server_name="0.0.0.0")
app-shared.py ADDED
@@ -0,0 +1,3 @@
+ # Run the app with no audio file restrictions, and create a public Gradio share link
+ from app import create_ui
+ create_ui(-1, share=True)
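The three launcher scripts above differ only in the arguments they pass to create_ui (defined in app.py below). Other combinations follow the same pattern; for example, a hypothetical launcher that keeps the default 600-second limit while exposing the app on the local network:

```python
# Hypothetical variant: keep a 600 s audio limit, but bind to all interfaces
# so other machines on the network can reach the UI.
from app import create_ui

create_ui(600, server_name="0.0.0.0")
```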
app.py ADDED
@@ -0,0 +1,256 @@
1
+ from typing import Iterator
2
+
3
+ from io import StringIO
4
+ import os
5
+ import pathlib
6
+ import tempfile
7
+
8
+ # External programs
9
+ import whisper
10
+ import ffmpeg
11
+
12
+ # UI
13
+ import gradio as gr
14
+
15
+ from src.download import ExceededMaximumDuration, download_url
16
+ from src.utils import slugify, write_srt, write_vtt
17
+ from src.vad import NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
18
+
19
+ # Limitations (set to -1 to disable)
20
+ DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
21
+
22
+ # Whether or not to automatically delete all uploaded files, to save disk space
23
+ DELETE_UPLOADED_FILES = True
24
+
25
+ # Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourselves
26
+ MAX_FILE_PREFIX_LENGTH = 17
27
+
28
+ LANGUAGES = [
29
+ "English", "Chinese", "German", "Spanish", "Russian", "Korean",
30
+ "French", "Japanese", "Portuguese", "Turkish", "Polish", "Catalan",
31
+ "Dutch", "Arabic", "Swedish", "Italian", "Indonesian", "Hindi",
32
+ "Finnish", "Vietnamese", "Hebrew", "Ukrainian", "Greek", "Malay",
33
+ "Czech", "Romanian", "Danish", "Hungarian", "Tamil", "Norwegian",
34
+ "Thai", "Urdu", "Croatian", "Bulgarian", "Lithuanian", "Latin",
35
+ "Maori", "Malayalam", "Welsh", "Slovak", "Telugu", "Persian",
36
+ "Latvian", "Bengali", "Serbian", "Azerbaijani", "Slovenian",
37
+ "Kannada", "Estonian", "Macedonian", "Breton", "Basque", "Icelandic",
38
+ "Armenian", "Nepali", "Mongolian", "Bosnian", "Kazakh", "Albanian",
39
+ "Swahili", "Galician", "Marathi", "Punjabi", "Sinhala", "Khmer",
40
+ "Shona", "Yoruba", "Somali", "Afrikaans", "Occitan", "Georgian",
41
+ "Belarusian", "Tajik", "Sindhi", "Gujarati", "Amharic", "Yiddish",
42
+ "Lao", "Uzbek", "Faroese", "Haitian Creole", "Pashto", "Turkmen",
43
+ "Nynorsk", "Maltese", "Sanskrit", "Luxembourgish", "Myanmar", "Tibetan",
44
+ "Tagalog", "Malagasy", "Assamese", "Tatar", "Hawaiian", "Lingala",
45
+ "Hausa", "Bashkir", "Javanese", "Sundanese"
46
+ ]
47
+
48
+ class WhisperTranscriber:
49
+ def __init__(self, inputAudioMaxDuration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, deleteUploadedFiles: bool = DELETE_UPLOADED_FILES):
50
+ self.model_cache = dict()
51
+
52
+ self.vad_model = None
53
+ self.inputAudioMaxDuration = inputAudioMaxDuration
54
+ self.deleteUploadedFiles = deleteUploadedFiles
55
+
56
+ def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
57
+ try:
58
+ source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
59
+
60
+ try:
61
+ selectedLanguage = languageName.lower() if len(languageName) > 0 else None
62
+ selectedModel = modelName if modelName is not None else "base"
63
+
64
+ model = self.model_cache.get(selectedModel, None)
65
+
66
+ if not model:
67
+ model = whisper.load_model(selectedModel)
68
+ self.model_cache[selectedModel] = model
69
+
70
+ # Execute whisper
71
+ result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
72
+
73
+ # Write result
74
+ downloadDirectory = tempfile.mkdtemp()
75
+
76
+ filePrefix = slugify(sourceName, allow_unicode=True)
77
+ download, text, vtt = self.write_result(result, filePrefix, downloadDirectory)
78
+
79
+ return download, text, vtt
80
+
81
+ finally:
82
+ # Cleanup source
83
+ if self.deleteUploadedFiles:
84
+ print("Deleting source file " + source)
85
+ os.remove(source)
86
+
87
+ except ExceededMaximumDuration as e:
88
+ return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
89
+
90
+ def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
91
+ vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
92
+
93
+ initial_prompt = decodeOptions.pop('initial_prompt', None)
94
+
95
+ if ('task' in decodeOptions):
96
+ task = decodeOptions.pop('task')
97
+
98
+ # Callable for processing an audio file
99
+ whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio, \
100
+ language=language if language else detected_language, task=task, \
101
+ initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt, \
102
+ **decodeOptions)
103
+
104
+ # The results
105
+ if (vad == 'silero-vad'):
106
+ # Silero VAD where non-speech gaps are transcribed
107
+ process_gaps = self._create_silero_config(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
108
+ result = self.vad_model.transcribe(audio_path, whisperCallable, process_gaps)
109
+ elif (vad == 'silero-vad-skip-gaps'):
110
+ # Silero VAD where non-speech gaps are simply ignored
111
+ skip_gaps = self._create_silero_config(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
112
+ result = self.vad_model.transcribe(audio_path, whisperCallable, skip_gaps)
113
+ elif (vad == 'silero-vad-expand-into-gaps'):
114
+ # Use Silero VAD where speech-segments are expanded into non-speech gaps
115
+ expand_gaps = self._create_silero_config(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
116
+ result = self.vad_model.transcribe(audio_path, whisperCallable, expand_gaps)
117
+ elif (vad == 'periodic-vad'):
118
+ # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
119
+ # it may create a break in the middle of a sentence, causing some artifacts.
120
+ periodic_vad = VadPeriodicTranscription()
121
+ result = periodic_vad.transcribe(audio_path, whisperCallable, PeriodicTranscriptionConfig(periodic_duration=vadMaxMergeSize, max_prompt_window=vadPromptWindow))
122
+ else:
123
+ # Default VAD
124
+ result = whisperCallable(audio_path, 0, None, None)
125
+
126
+ return result
127
+
128
+ def _concat_prompt(self, prompt1, prompt2):
129
+ if (prompt1 is None):
130
+ return prompt2
131
+ elif (prompt2 is None):
132
+ return prompt1
133
+ else:
134
+ return prompt1 + " " + prompt2
135
+
136
+ def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
137
+ # Use Silero VAD
138
+ if (self.vad_model is None):
139
+ self.vad_model = VadSileroTranscription()
140
+
141
+ config = TranscriptionConfig(non_speech_strategy = non_speech_strategy,
142
+ max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
143
+ segment_padding_left=vadPadding, segment_padding_right=vadPadding,
144
+ max_prompt_window=vadPromptWindow)
145
+
146
+ return config
147
+
148
+ def write_result(self, result: dict, source_name: str, output_dir: str):
149
+ if not os.path.exists(output_dir):
150
+ os.makedirs(output_dir)
151
+
152
+ text = result["text"]
153
+ language = result["language"]
154
+ languageMaxLineWidth = self.__get_max_line_width(language)
155
+
156
+ print("Max line width " + str(languageMaxLineWidth))
157
+ vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth)
158
+ srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth)
159
+
160
+ output_files = []
161
+ output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
162
+ output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"));
163
+ output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"));
164
+
165
+ return output_files, text, vtt
166
+
167
+ def clear_cache(self):
168
+ self.model_cache = dict()
169
+ self.vad_model = None
170
+
171
+ def __get_source(self, urlData, uploadFile, microphoneData):
172
+ if urlData:
173
+ # Download from YouTube
174
+ source = download_url(urlData, self.inputAudioMaxDuration)[0]
175
+ else:
176
+ # File input
177
+ source = uploadFile if uploadFile is not None else microphoneData
178
+
179
+ if self.inputAudioMaxDuration > 0:
180
+ # Calculate audio length
181
+ audioDuration = ffmpeg.probe(source)["format"]["duration"]
182
+
183
+ if float(audioDuration) > self.inputAudioMaxDuration:
184
+ raise ExceededMaximumDuration(videoDuration=audioDuration, maxDuration=self.inputAudioMaxDuration, message="Video is too long")
185
+
186
+ file_path = pathlib.Path(source)
187
+ sourceName = file_path.stem[:MAX_FILE_PREFIX_LENGTH] + file_path.suffix
188
+
189
+ return source, sourceName
190
+
191
+ def __get_max_line_width(self, language: str) -> int:
192
+ if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
193
+ # Chinese characters and kana are wider, so limit line length to 40 characters
194
+ return 40
195
+ else:
196
+ # TODO: Add more languages
197
+ # 80 latin characters should fit on a 1080p/720p screen
198
+ return 80
199
+
200
+ def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
201
+ segmentStream = StringIO()
202
+
203
+ if format == 'vtt':
204
+ write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
205
+ elif format == 'srt':
206
+ write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
207
+ else:
208
+ raise Exception("Unknown format " + format)
209
+
210
+ segmentStream.seek(0)
211
+ return segmentStream.read()
212
+
213
+ def __create_file(self, text: str, directory: str, fileName: str) -> str:
214
+ # Write the text to a file
215
+ with open(os.path.join(directory, fileName), 'w+', encoding="utf-8") as file:
216
+ file.write(text)
217
+
218
+ return file.name
219
+
220
+
221
+ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
222
+ ui = WhisperTranscriber(inputAudioMaxDuration)
223
+
224
+ ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
225
+ ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
226
+ ui_description += " as well as speech translation and language identification. "
227
+
228
+ ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
229
+
230
+ if inputAudioMaxDuration > 0:
231
+ ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
232
+
233
+ ui_article = "Read the [documentation here](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"
234
+
235
+ demo = gr.Interface(fn=ui.transcribe_webui, description=ui_description, article=ui_article, inputs=[
236
+ gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
237
+ gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
238
+ gr.Text(label="URL (YouTube, etc.)"),
239
+ gr.Audio(source="upload", type="filepath", label="Upload Audio"),
240
+ gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
241
+ gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
242
+ gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
243
+ gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
244
+ gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
245
+ gr.Number(label="VAD - Padding (s)", precision=None, value=1),
246
+ gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3)
247
+ ], outputs=[
248
+ gr.File(label="Download"),
249
+ gr.Text(label="Transcription"),
250
+ gr.Text(label="Segments")
251
+ ])
252
+
253
+ demo.launch(share=share, server_name=server_name)
254
+
255
+ if __name__ == '__main__':
256
+ create_ui(DEFAULT_INPUT_AUDIO_MAX_DURATION)
cli.py ADDED
@@ -0,0 +1,110 @@
1
+ import argparse
2
+ import os
3
+ import pathlib
4
+ from urllib.parse import urlparse
5
+ import warnings
6
+ import numpy as np
7
+
8
+ import whisper
9
+
10
+ import torch
11
+ from app import LANGUAGES, WhisperTranscriber
12
+ from src.download import download_url
13
+
14
+ from src.utils import optional_float, optional_int, str2bool
15
+
16
+
17
+ def cli():
18
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
19
+ parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
20
+ parser.add_argument("--model", default="small", choices=["tiny", "base", "small", "medium", "large"], help="name of the Whisper model to use")
21
+ parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
22
+ parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
23
+ parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
24
+ parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
25
+
26
+ parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
27
+ parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")
28
+
29
+ parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
30
+ parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
31
+ parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
32
+ parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
33
+ parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
34
+
35
+ parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
36
+ parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
37
+ parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
38
+ parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
39
+ parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple lengt normalization by default")
40
+
41
+ parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
42
+ parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
43
+ parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
44
+ parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
45
+
46
+ parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
47
+ parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
48
+ parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
49
+ parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
50
+
51
+ args = parser.parse_args().__dict__
52
+ model_name: str = args.pop("model")
53
+ model_dir: str = args.pop("model_dir")
54
+ output_dir: str = args.pop("output_dir")
55
+ device: str = args.pop("device")
56
+ os.makedirs(output_dir, exist_ok=True)
57
+
58
+ if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
59
+ warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
60
+ args["language"] = "en"
61
+
62
+ temperature = args.pop("temperature")
63
+ temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
64
+ if temperature_increment_on_fallback is not None:
65
+ temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
66
+ else:
67
+ temperature = [temperature]
68
+
69
+ vad = args.pop("vad")
70
+ vad_merge_window = args.pop("vad_merge_window")
71
+ vad_max_merge_size = args.pop("vad_max_merge_size")
72
+ vad_padding = args.pop("vad_padding")
73
+ vad_prompt_window = args.pop("vad_prompt_window")
74
+
75
+ model = whisper.load_model(model_name, device=device, download_root=model_dir)
76
+ transcriber = WhisperTranscriber(deleteUploadedFiles=False)
77
+
78
+ for audio_path in args.pop("audio"):
79
+ sources = []
80
+
81
+ # Detect URL and download the audio
82
+ if (uri_validator(audio_path)):
83
+ # Download from YouTube/URL directly
84
+ for source_path in download_url(audio_path, maxDuration=-1, destinationDirectory=output_dir, playlistItems=None):
85
+ source_name = os.path.basename(source_path)
86
+ sources.append({ "path": source_path, "name": source_name })
87
+ else:
88
+ sources.append({ "path": audio_path, "name": os.path.basename(audio_path) })
89
+
90
+ for source in sources:
91
+ source_path = source["path"]
92
+ source_name = source["name"]
93
+
94
+ result = transcriber.transcribe_file(model, source_path, temperature=temperature,
95
+ vad=vad, vadMergeWindow=vad_merge_window, vadMaxMergeSize=vad_max_merge_size,
96
+ vadPadding=vad_padding, vadPromptWindow=vad_prompt_window, **args)
97
+
98
+ transcriber.write_result(result, source_name, output_dir)
99
+
100
+ transcriber.clear_cache()
101
+
102
+ def uri_validator(x):
103
+ try:
104
+ result = urlparse(x)
105
+ return all([result.scheme, result.netloc])
106
+ except:
107
+ return False
108
+
109
+ if __name__ == '__main__':
110
+ cli()
dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM huggingface/transformers-pytorch-gpu
+ EXPOSE 7860
+
+ ADD . /opt/whisper-webui/
+
+ # Latest version of transformers-pytorch-gpu seems to lack tk.
+ # Further, pip install fails, so we must upgrade pip first.
+ RUN apt-get -y install python3-tk
+ RUN python3 -m pip install --upgrade pip &&\
+ python3 -m pip install -r /opt/whisper-webui/requirements.txt
+
+ # Note: Models will be downloaded on demand to the directory /root/.cache/whisper.
+ # You can also bind this directory in the container to somewhere on the host.
+
+ # To be able to see logs in real time
+ ENV PYTHONUNBUFFERED=1
+
+ WORKDIR /opt/whisper-webui/
+ ENTRYPOINT ["python3"]
+ CMD ["app-network.py"]
docs/options.md ADDED
@@ -0,0 +1,78 @@
+ # Options
+ To transcribe or translate an audio file, you can either paste a URL from a website (all [websites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)
+ supported by YT-DLP will work, including YouTube), upload an audio file (choose "All Files (*.*)"
+ in the file selector to select any file type, including video files), or use the microphone.
+
+ For longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option.
+
+ ## Model
+ Select the model that Whisper will use to transcribe the audio:
+
+ | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
+ |--------|------------|--------------------|--------------------|---------------|----------------|
+ | tiny | 39 M | tiny.en | tiny | ~1 GB | ~32x |
+ | base | 74 M | base.en | base | ~1 GB | ~16x |
+ | small | 244 M | small.en | small | ~2 GB | ~6x |
+ | medium | 769 M | medium.en | medium | ~5 GB | ~2x |
+ | large | 1550 M | N/A | large | ~10 GB | 1x |
+
+ ## Language
+
+ Select the language, or leave it empty for Whisper to automatically detect it.
+
+ Note that if the selected language and the language in the audio differ, Whisper may start to translate the audio to the selected
+ language. For instance, if the audio is in English but you select Japanese, the model may translate the audio to Japanese.
+
+ ## Inputs
+ The options "URL (YouTube, etc.)", "Upload Audio" and "Microphone Input" allow you to send an audio input to the model.
+
+ Note that the UI will only process the first valid input - i.e. if you both enter a URL and upload an audio file, it will only process
+ the URL.
+
+ ## Task
+ Select the task - either "transcribe" to transcribe the audio to text, or "translate" to translate it to English.
+
+ ## VAD
+ Using a VAD will improve the timing accuracy of each transcribed line, as well as prevent Whisper from getting into an infinite
+ loop of detecting the same sentence over and over again. The downside is that this may come at a cost to text accuracy, especially
+ with regards to unique words or names that appear in the audio. You can compensate for this by increasing the prompt window.
+
+ Note that English is very well handled by Whisper, and it's less susceptible to issues surrounding bad timings and infinite loops.
+ So you may only need to use a VAD for other languages, such as Japanese, or when the audio is very long (a sketch of how these
+ modes map to the classes in src/vad.py follows the list below).
+
+ * none
+   * Run Whisper on the entire audio input.
+ * silero-vad
+   * Use Silero VAD to detect sections that contain speech, and run Whisper independently on each section. Whisper is also run
+     on the gaps between each speech section, by either expanding the section up to the max merge size, or running Whisper independently
+     on the non-speech section.
+ * silero-vad-expand-into-gaps
+   * Use Silero VAD to detect sections that contain speech, and run Whisper independently on each section. Each speech section will be expanded
+     such that it covers any adjacent non-speech sections. For instance, if an audio file of one minute contains the speech sections
+     00:00 - 00:10 (A) and 00:30 - 00:40 (B), section (A) will be expanded to 00:00 - 00:30, and (B) will be expanded to 00:30 - 01:00.
+ * silero-vad-skip-gaps
+   * As above, but sections that don't contain speech according to Silero will be skipped. This will be slightly faster, but
+     may cause dialogue to be skipped.
+ * periodic-vad
+   * Create sections of speech every 'VAD - Max Merge Size' seconds. This is very fast and simple, but will potentially break
+     a sentence or word in two.
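A minimal sketch of how these modes map onto the classes added in this commit (mirroring transcribe_file in app.py; the model size, file name and timing values are placeholders):

```python
# "silero-vad" corresponds to NonSpeechStrategy.CREATE_SEGMENT,
# "silero-vad-skip-gaps" to SKIP, "silero-vad-expand-into-gaps" to EXPAND_SEGMENT,
# and "periodic-vad" uses VadPeriodicTranscription instead of Silero.
import whisper
from src.vad import NonSpeechStrategy, TranscriptionConfig, VadSileroTranscription

model = whisper.load_model("base")

# whisperCallable(audio, segment_index, prompt, detected_language) -> Whisper result dict
whisperCallable = lambda audio, segment_index, prompt, detected_language: \
    model.transcribe(audio, language=detected_language, initial_prompt=prompt)

config = TranscriptionConfig(non_speech_strategy=NonSpeechStrategy.CREATE_SEGMENT,
                             max_silent_period=5, max_merge_size=30,
                             segment_padding_left=1, segment_padding_right=1,
                             max_prompt_window=3)

result = VadSileroTranscription().transcribe("audio.mp3", whisperCallable, config)
print(result["text"])
```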
+
+ ## VAD - Merge Window (s)
+ If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.
+
+ ## VAD - Max Merge Size (s)
+ Disables merging of adjacent speech sections if the merged section would be longer than this number of seconds.
+
+ ## VAD - Padding (s)
+ The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
+ larger than zero ensures that Whisper is more likely to correctly transcribe a sentence at the beginning of
+ a speech section. However, this also increases the probability of Whisper assigning the wrong timestamp
+ to each transcribed line. The default value is 1 second.
+
+ ## VAD - Prompt Window (s)
+ The text of a detected line will be included as a prompt to the next speech section, if the speech section starts at most this
+ number of seconds after the line has finished. For instance, if a line ends at 10:00, and the next speech section starts at
+ 10:04, the line's text will be included if the prompt window is 4 seconds or more (10:04 - 10:00 = 4 seconds).
+
+ Note that detected lines in gaps between speech sections will not be included in the prompt
+ (if silero-vad or silero-vad-expand-into-gaps is used).
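The merge window, max merge size and padding options are implemented by merge_timestamps in src/segments.py (added below). A small worked example with made-up timestamps:

```python
# The first two sections are 3 s apart (within the 5 s merge window) and are
# merged; the third starts 10 s later and becomes a new section. Every
# resulting section is then padded by 1 s on each side.
from src.segments import merge_timestamps

sections = [
    {'start': 10.0, 'end': 20.0},
    {'start': 23.0, 'end': 27.0},
    {'start': 37.0, 'end': 40.0},
]

merged = merge_timestamps(sections, merge_window=5, max_merge_size=30,
                          padding_left=1, padding_right=1)
print(merged)  # [{'start': 9.0, 'end': 28.0}, {'start': 36.0, 'end': 41.0}]
```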
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ git+https://github.com/openai/whisper.git
+ transformers
+ ffmpeg-python==0.2.0
+ gradio
+ yt-dlp
+ torchaudio
src/__init__.py ADDED
File without changes
src/download.py ADDED
@@ -0,0 +1,72 @@
1
+ from tempfile import mkdtemp
2
+ from typing import List
3
+ from yt_dlp import YoutubeDL
4
+
5
+ import yt_dlp
6
+ from yt_dlp.postprocessor import PostProcessor
7
+
8
+ class FilenameCollectorPP(PostProcessor):
9
+ def __init__(self):
10
+ super(FilenameCollectorPP, self).__init__(None)
11
+ self.filenames = []
12
+
13
+ def run(self, information):
14
+ self.filenames.append(information["filepath"])
15
+ return [], information
16
+
17
+ def download_url(url: str, maxDuration: int = None, destinationDirectory: str = None, playlistItems: str = "1") -> List[str]:
18
+ try:
19
+ return _perform_download(url, maxDuration=maxDuration, outputTemplate=None, destinationDirectory=destinationDirectory, playlistItems=playlistItems)
20
+ except yt_dlp.utils.DownloadError as e:
21
+ # In case of an OS error, try again with a different output template
22
+ if e.msg and e.msg.find("[Errno 36] File name too long") >= 0:
23
+ return _perform_download(url, maxDuration=maxDuration, outputTemplate="%(title).10s %(id)s.%(ext)s")
24
+ pass
25
+
26
+ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = None, destinationDirectory: str = None, playlistItems: str = "1"):
27
+ # Create a temporary directory to store the downloaded files
28
+ if destinationDirectory is None:
29
+ destinationDirectory = mkdtemp()
30
+
31
+ ydl_opts = {
32
+ "format": "bestaudio/best",
33
+ 'paths': {
34
+ 'home': destinationDirectory
35
+ }
36
+ }
37
+ if (playlistItems):
38
+ ydl_opts['playlist_items'] = playlistItems
39
+
40
+ # Add output template if specified
41
+ if outputTemplate:
42
+ ydl_opts['outtmpl'] = outputTemplate
43
+
44
+ filename_collector = FilenameCollectorPP()
45
+
46
+ with YoutubeDL(ydl_opts) as ydl:
47
+ if maxDuration and maxDuration > 0:
48
+ info = ydl.extract_info(url, download=False)
49
+ duration = info['duration']
50
+
51
+ if duration >= maxDuration:
52
+ raise ExceededMaximumDuration(videoDuration=duration, maxDuration=maxDuration, message="Video is too long")
53
+
54
+ ydl.add_post_processor(filename_collector)
55
+ ydl.download([url])
56
+
57
+ if len(filename_collector.filenames) <= 0:
58
+ raise Exception("Cannot download " + url)
59
+
60
+ result = []
61
+
62
+ for filename in filename_collector.filenames:
63
+ result.append(filename)
64
+ print("Downloaded " + filename)
65
+
66
+ return result
67
+
68
+ class ExceededMaximumDuration(Exception):
69
+ def __init__(self, videoDuration, maxDuration, message):
70
+ self.videoDuration = videoDuration
71
+ self.maxDuration = maxDuration
72
+ super().__init__(message)
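A short usage sketch for download_url (the URL is the one from the README; the 600-second limit matches the hosted demo's default):

```python
# Download the audio of a single video (playlistItems defaults to "1") and
# enforce a maximum duration, as the web UI does for remote inputs.
from src.download import ExceededMaximumDuration, download_url

try:
    files = download_url("https://www.youtube.com/watch?v=4cICErqqRSM", maxDuration=600)
    print("Downloaded:", files)
except ExceededMaximumDuration as e:
    print(f"Video is too long: {e.videoDuration}s exceeds the {e.maxDuration}s limit")
```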
src/segments.py ADDED
@@ -0,0 +1,55 @@
1
+ from typing import Any, Dict, List
2
+
3
+ import copy
4
+
5
+ def merge_timestamps(timestamps: List[Dict[str, Any]], merge_window: float = 5, max_merge_size: float = 30, padding_left: float = 1, padding_right: float = 1):
6
+ result = []
7
+
8
+ if len(timestamps) == 0:
9
+ return result
10
+ if max_merge_size is None:
11
+ return timestamps
12
+
13
+ if padding_left is None:
14
+ padding_left = 0
15
+ if padding_right is None:
16
+ padding_right = 0
17
+
18
+ processed_time = 0
19
+ current_segment = None
20
+
21
+ for i in range(len(timestamps)):
22
+ next_segment = timestamps[i]
23
+
24
+ delta = next_segment['start'] - processed_time
25
+
26
+ # Note that segments can still be longer than the max merge size, they just won't be merged in that case
27
+ if current_segment is None or (merge_window is not None and delta > merge_window) \
28
+ or next_segment['end'] - current_segment['start'] > max_merge_size:
29
+ # Finish the current segment
30
+ if current_segment is not None:
31
+ # Add right padding
32
+ finish_padding = min(padding_right, delta / 2) if delta < padding_left + padding_right else padding_right
33
+ current_segment['end'] += finish_padding
34
+ delta -= finish_padding
35
+
36
+ result.append(current_segment)
37
+
38
+ # Start a new segment
39
+ current_segment = copy.deepcopy(next_segment)
40
+
41
+ # Pad the segment
42
+ current_segment['start'] = current_segment['start'] - min(padding_left, delta)
43
+ processed_time = current_segment['end']
44
+
45
+ else:
46
+ # Merge the segment
47
+ current_segment['end'] = next_segment['end']
48
+ processed_time = current_segment['end']
49
+
50
+ # Add the last segment
51
+ if current_segment is not None:
52
+ current_segment['end'] += padding_right
53
+ result.append(current_segment)
54
+
55
+ return result
src/utils.py ADDED
@@ -0,0 +1,115 @@
1
+ import textwrap
2
+ import unicodedata
3
+ import re
4
+
5
+ import zlib
6
+ from typing import Iterator, TextIO
7
+
8
+
9
+ def exact_div(x, y):
10
+ assert x % y == 0
11
+ return x // y
12
+
13
+
14
+ def str2bool(string):
15
+ str2val = {"True": True, "False": False}
16
+ if string in str2val:
17
+ return str2val[string]
18
+ else:
19
+ raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
20
+
21
+
22
+ def optional_int(string):
23
+ return None if string == "None" else int(string)
24
+
25
+
26
+ def optional_float(string):
27
+ return None if string == "None" else float(string)
28
+
29
+
30
+ def compression_ratio(text) -> float:
31
+ return len(text) / len(zlib.compress(text.encode("utf-8")))
32
+
33
+
34
+ def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
35
+ assert seconds >= 0, "non-negative timestamp expected"
36
+ milliseconds = round(seconds * 1000.0)
37
+
38
+ hours = milliseconds // 3_600_000
39
+ milliseconds -= hours * 3_600_000
40
+
41
+ minutes = milliseconds // 60_000
42
+ milliseconds -= minutes * 60_000
43
+
44
+ seconds = milliseconds // 1_000
45
+ milliseconds -= seconds * 1_000
46
+
47
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
48
+ return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"
49
+
50
+
51
+ def write_txt(transcript: Iterator[dict], file: TextIO):
52
+ for segment in transcript:
53
+ print(segment['text'].strip(), file=file, flush=True)
54
+
55
+
56
+ def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
57
+ print("WEBVTT\n", file=file)
58
+ for segment in transcript:
59
+ text = process_text(segment['text'], maxLineWidth).replace('-->', '->')
60
+
61
+ print(
62
+ f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
63
+ f"{text}\n",
64
+ file=file,
65
+ flush=True,
66
+ )
67
+
68
+
69
+ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
70
+ """
71
+ Write a transcript to a file in SRT format.
72
+ Example usage:
73
+ from pathlib import Path
74
+ from whisper.utils import write_srt
75
+ result = transcribe(model, audio_path, temperature=temperature, **args)
76
+ # save SRT
77
+ audio_basename = Path(audio_path).stem
78
+ with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
79
+ write_srt(result["segments"], file=srt)
80
+ """
81
+ for i, segment in enumerate(transcript, start=1):
82
+ text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')
83
+
84
+ # write srt lines
85
+ print(
86
+ f"{i}\n"
87
+ f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
88
+ f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
89
+ f"{text}\n",
90
+ file=file,
91
+ flush=True,
92
+ )
93
+
94
+ def process_text(text: str, maxLineWidth=None):
95
+ if (maxLineWidth is None or maxLineWidth < 0):
96
+ return text
97
+
98
+ lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
99
+ return '\n'.join(lines)
100
+
101
+ def slugify(value, allow_unicode=False):
102
+ """
103
+ Taken from https://github.com/django/django/blob/master/django/utils/text.py
104
+ Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
105
+ dashes to single dashes. Remove characters that aren't alphanumerics,
106
+ underscores, or hyphens. Convert to lowercase. Also strip leading and
107
+ trailing whitespace, dashes, and underscores.
108
+ """
109
+ value = str(value)
110
+ if allow_unicode:
111
+ value = unicodedata.normalize('NFKC', value)
112
+ else:
113
+ value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
114
+ value = re.sub(r'[^\w\s-]', '', value.lower())
115
+ return re.sub(r'[-\s]+', '-', value).strip('-_')
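Quick examples for the two helpers the app uses when writing output files (expected outputs worked out from the code above):

```python
from src.utils import format_timestamp, slugify

# SRT-style timestamp with a comma as the fractional separator
print(format_timestamp(3661.5, always_include_hours=True, fractionalSeperator=','))
# -> 01:01:01,500

# slugify is used to build safe file prefixes for the generated subtitles
print(slugify("Hello, World!"))  # -> hello-world
```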
src/vad.py ADDED
@@ -0,0 +1,477 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections import Counter, deque
3
+
4
+ from typing import Any, Deque, Iterator, List, Dict
5
+
6
+ from pprint import pprint
7
+
8
+ from src.segments import merge_timestamps
9
+
10
+ # Workaround for https://github.com/tensorflow/tensorflow/issues/48797
11
+ try:
12
+ import tensorflow as tf
13
+ except ModuleNotFoundError:
14
+ # Error handling
15
+ pass
16
+
17
+ import torch
18
+
19
+ import ffmpeg
20
+ import numpy as np
21
+
22
+ from src.utils import format_timestamp
23
+ from enum import Enum
24
+
25
+ class NonSpeechStrategy(Enum):
26
+ """
27
+ Ignore non-speech segments.
28
+ """
29
+ SKIP = 1
30
+ """
31
+ Just treat non-speech segments as speech.
32
+ """
33
+ CREATE_SEGMENT = 2
34
+ """
35
+ Expand speech segments into subsequent non-speech segments.
36
+ """
37
+ EXPAND_SEGMENT = 3
38
+
39
+ # Defaults for Silero
40
+ SPEECH_TRESHOLD = 0.3
41
+
42
+ # Minimum size of segments to process
43
+ MIN_SEGMENT_DURATION = 1
44
+
45
+ # The maximum time for texts from old segments to be used in the next segment
46
+ MAX_PROMPT_WINDOW = 0 # seconds (0 = disabled)
47
+ PROMPT_NO_SPEECH_PROB = 0.1 # Do not pass the text from segments with a no speech probability higher than this
48
+
49
+ VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
50
+
51
+ class TranscriptionConfig(ABC):
52
+ def __init__(self, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
53
+ segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
54
+ max_merge_size: float = None, max_prompt_window: float = None):
55
+ self.non_speech_strategy = non_speech_strategy
56
+ self.segment_padding_left = segment_padding_left
57
+ self.segment_padding_right = segment_padding_right
58
+ self.max_silent_period = max_silent_period
59
+ self.max_merge_size = max_merge_size
60
+ self.max_prompt_window = max_prompt_window
61
+
62
+ class PeriodicTranscriptionConfig(TranscriptionConfig):
63
+ def __init__(self, periodic_duration: float, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
64
+ segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
65
+ max_merge_size: float = None, max_prompt_window: float = None):
66
+ super().__init__(non_speech_strategy, segment_padding_left, segment_padding_right, max_silent_period, max_merge_size, max_prompt_window)
67
+ self.periodic_duration = periodic_duration
68
+
69
+ class AbstractTranscription(ABC):
70
+ def __init__(self, sampling_rate: int = 16000):
71
+ self.sampling_rate = sampling_rate
72
+
73
+ def get_audio_segment(self, str, start_time: str = None, duration: str = None):
74
+ return load_audio(str, self.sampling_rate, start_time, duration)
75
+
76
+ @abstractmethod
77
+ def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
78
+ """
79
+ Get the start and end timestamps of the sections that should be transcribed by this VAD method.
80
+
81
+ Parameters
82
+ ----------
83
+ audio: str
84
+ The audio file.
85
+ config: TranscriptionConfig
86
+ The transcription configuration.
87
+
88
+ Returns
89
+ -------
90
+ A list of start and end timestamps, in fractional seconds.
91
+ """
92
+ return
93
+
94
+ def transcribe(self, audio: str, whisperCallable, config: TranscriptionConfig):
95
+ """
96
+ Transcribe the given audio file.
97
+
98
+ Parameters
99
+ ----------
100
+ audio: str
101
+ The audio file.
102
+
103
+ whisperCallable: Callable[[Union[str, np.ndarray, torch.Tensor], int, str, str], dict[str, Union[dict, Any]]]
104
+ The callback that is used to invoke Whisper on an audio file/buffer. The first parameter is the audio file/buffer,
105
+ the second is the segment index, the third is an optional text prompt, and the last is the currently detected language. The return value is the result of the Whisper call.
106
+
107
+ Returns
108
+ -------
109
+ The complete transcription result (text, segments and language), with segment timestamps adjusted to the full audio.
110
+ """
111
+
112
+ # get speech timestamps from full audio file
113
+ seconds_timestamps = self.get_transcribe_timestamps(audio, config)
114
+
115
+ #for seconds_timestamp in seconds_timestamps:
116
+ # print("VAD timestamp ", format_timestamp(seconds_timestamp['start']), " to ", format_timestamp(seconds_timestamp['end']))
117
+
118
+ merged = merge_timestamps(seconds_timestamps, config.max_silent_period, config.max_merge_size, config.segment_padding_left, config.segment_padding_right)
119
+
120
+ # A deque of transcribed segments that is passed to the next segment as a prompt
121
+ prompt_window = deque()
122
+
123
+ print("Timestamps:")
124
+ pprint(merged)
125
+
126
+ if config.non_speech_strategy != NonSpeechStrategy.SKIP:
127
+ max_audio_duration = get_audio_duration(audio)
128
+
129
+ # Expand segments to include the gaps between them
130
+ if (config.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
131
+ # When we have a prompt window, we create speech segments between each segment if we exceed the merge size
132
+ merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=config.max_merge_size)
133
+ elif config.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
134
+ # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
135
+ merged = self.expand_gaps(merged, total_duration=max_audio_duration)
136
+ else:
137
+ raise Exception("Unknown non-speech strategy: " + str(config.non_speech_strategy))
138
+
139
+ print("Transcribing non-speech:")
140
+ pprint(merged)
141
+
142
+ result = {
143
+ 'text': "",
144
+ 'segments': [],
145
+ 'language': ""
146
+ }
147
+ languageCounter = Counter()
148
+ detected_language = None
149
+
150
+ segment_index = -1
151
+
152
+ # For each time segment, run whisper
153
+ for segment in merged:
154
+ segment_index += 1
155
+ segment_start = segment['start']
156
+ segment_end = segment['end']
157
+ segment_expand_amount = segment.get('expand_amount', 0)
158
+ segment_gap = segment.get('gap', False)
159
+
160
+ segment_duration = segment_end - segment_start
161
+
162
+ if segment_duration < MIN_SEGMENT_DURATION:
163
+ continue;
164
+
165
+ # Audio to run on Whisper
166
+ segment_audio = self.get_audio_segment(audio, start_time = str(segment_start), duration = str(segment_duration))
167
+ # Previous segments to use as a prompt
168
+ segment_prompt = ' '.join([segment['text'] for segment in prompt_window]) if len(prompt_window) > 0 else None
169
+
170
+ # Detected language
171
+ detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
172
+
173
+ print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
174
+ segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
175
+ segment_result = whisperCallable(segment_audio, segment_index, segment_prompt, detected_language)
176
+
177
+ adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
178
+
179
+ # Propagate expand amount to the segments
180
+ if (segment_expand_amount > 0):
181
+ segment_without_expansion = segment_duration - segment_expand_amount
182
+
183
+ for adjusted_segment in adjusted_segments:
184
+ adjusted_segment_end = adjusted_segment['end']
185
+
186
+ # Add expand amount if the segment got expanded
187
+ if (adjusted_segment_end > segment_without_expansion):
188
+ adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
189
+
190
+ # Append to output
191
+ result['text'] += segment_result['text']
192
+ result['segments'].extend(adjusted_segments)
193
+
194
+ # Increment detected language
195
+ if not segment_gap:
196
+ languageCounter[segment_result['language']] += 1
197
+
198
+ # Update prompt window
199
+ self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)
200
+
201
+ if detected_language is not None:
202
+ result['language'] = detected_language
203
+
204
+ return result
205
+
206
+ def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float, segment_gap: bool, config: TranscriptionConfig):
207
+ if (config.max_prompt_window is not None and config.max_prompt_window > 0):
208
+ # Add segments to the current prompt window (unless it is a speech gap)
209
+ if not segment_gap:
210
+ for segment in adjusted_segments:
211
+ if segment.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB:
212
+ prompt_window.append(segment)
213
+
214
+ while (len(prompt_window) > 0):
215
+ first_end_time = prompt_window[0].get('end', 0)
216
+ # Time expanded in the segments should be discounted from the prompt window
217
+ first_expand_time = prompt_window[0].get('expand_amount', 0)
218
+
219
+ if (first_end_time - first_expand_time < segment_end - config.max_prompt_window):
220
+ prompt_window.popleft()
221
+ else:
222
+ break
223
+
224
+ def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
225
+ result = []
226
+ last_end_time = 0
227
+
228
+ for segment in segments:
229
+ segment_start = float(segment['start'])
230
+ segment_end = float(segment['end'])
231
+
232
+ if (last_end_time != segment_start):
233
+ delta = segment_start - last_end_time
234
+
235
+ if (min_gap_length is None or delta >= min_gap_length):
236
+ result.append( { 'start': last_end_time, 'end': segment_start, 'gap': True } )
237
+
238
+ last_end_time = segment_end
239
+ result.append(segment)
240
+
241
+ # Also include total duration if specified
242
+ if (total_duration is not None and last_end_time < total_duration):
243
+ delta = total_duration - segment_start
244
+
245
+ if (min_gap_length is None or delta >= min_gap_length):
246
+ result.append( { 'start': last_end_time, 'end': total_duration, 'gap': True } )
247
+
248
+ return result
249
+
250
+ # Expand the end time of each segment to the start of the next segment
251
+ def expand_gaps(self, segments: List[Dict[str, Any]], total_duration: float):
252
+ result = []
253
+
254
+ if len(segments) == 0:
255
+ return result
256
+
257
+ # Add gap at the beginning if needed
258
+ if (segments[0]['start'] > 0):
259
+ result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )
260
+
261
+ for i in range(len(segments) - 1):
262
+ current_segment = segments[i]
263
+ next_segment = segments[i + 1]
264
+
265
+ delta = next_segment['start'] - current_segment['end']
266
+
267
+ # Expand if the gap actually exists
268
+ if (delta >= 0):
269
+ current_segment = current_segment.copy()
270
+ current_segment['expand_amount'] = delta
271
+ current_segment['end'] = next_segment['start']
272
+
273
+ result.append(current_segment)
274
+
275
+ # Add last segment
276
+ last_segment = segments[-1]
277
+ result.append(last_segment)
278
+
279
+ # Also include total duration if specified
280
+ if (total_duration is not None):
281
+ last_segment = result[-1]
282
+
283
+ if (last_segment['end'] < total_duration):
284
+ last_segment = last_segment.copy()
285
+ last_segment['end'] = total_duration
286
+ result[-1] = last_segment
287
+
288
+ return result
289
+
290
+ def fill_gaps(self, segments: List[Dict[str, Any]], total_duration: float, max_expand_size: float = None):
291
+ result = []
292
+
293
+ if len(segments) == 0:
294
+ return result
295
+
296
+ # Add gap at the beginning if needed
297
+ if (segments[0]['start'] > 0):
298
+ result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )
299
+
300
+ for i in range(len(segments) - 1):
301
+ expanded = False
302
+ current_segment = segments[i]
303
+ next_segment = segments[i + 1]
304
+
305
+ delta = next_segment['start'] - current_segment['end']
306
+
307
+ if (max_expand_size is not None and delta <= max_expand_size):
308
+ # Just expand the current segment
309
+ current_segment = current_segment.copy()
310
+ current_segment['expand_amount'] = delta
311
+ current_segment['end'] = next_segment['start']
312
+ expanded = True
313
+
314
+ result.append(current_segment)
315
+
316
+ # Add a gap to the next segment if needed
317
+ if (delta >= 0 and not expanded):
318
+ result.append({ 'start': current_segment['end'], 'end': next_segment['start'], 'gap': True } )
319
+
320
+ # Add last segment
321
+ last_segment = segments[-1]
322
+ result.append(last_segment)
323
+
324
+ # Also include total duration if specified
325
+ if (total_duration is not None):
326
+ last_segment = result[-1]
327
+
328
+ delta = total_duration - last_segment['end']
329
+
330
+ if (delta > 0):
331
+ if (max_expand_size is not None and delta <= max_expand_size):
332
+ # Expand the last segment
333
+ last_segment = last_segment.copy()
334
+ last_segment['expand_amount'] = delta
335
+ last_segment['end'] = total_duration
336
+ result[-1] = last_segment
337
+ else:
338
+ result.append({ 'start': last_segment['end'], 'end': total_duration, 'gap': True } )
339
+
340
+ return result
341
+
342
+ def adjust_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
343
+ result = []
344
+
345
+ for segment in segments:
346
+ segment_start = float(segment['start'])
347
+ segment_end = float(segment['end'])
348
+
349
+ # Filter segments?
350
+ if (max_source_time is not None):
351
+ if (segment_start > max_source_time):
352
+ continue
353
+ segment_end = min(max_source_time, segment_end)
354
+
355
+ new_segment = segment.copy()
356
+
357
+ # Add to start and end
358
+ new_segment['start'] = segment_start + adjust_seconds
359
+ new_segment['end'] = segment_end + adjust_seconds
360
+ result.append(new_segment)
361
+ return result
362
+
363
+ def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
364
+ result = []
365
+
366
+ for entry in timestamps:
367
+ start = entry['start']
368
+ end = entry['end']
369
+
370
+ result.append({
371
+ 'start': start * factor,
372
+ 'end': end * factor
373
+ })
374
+ return result
375
+
376
+ class VadSileroTranscription(AbstractTranscription):
377
+ def __init__(self, sampling_rate: int = 16000):
378
+ super().__init__(sampling_rate=sampling_rate)
379
+
380
+ self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
381
+ (self.get_speech_timestamps, _, _, _, _) = utils
382
+
383
+
384
+ def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
385
+ audio_duration = get_audio_duration(audio)
386
+ result = []
387
+
388
+ # Divide processing of the audio into chunks
389
+ chunk_start = 0.0
390
+
391
+ while (chunk_start < audio_duration):
392
+ chunk_duration = min(audio_duration - chunk_start, VAD_MAX_PROCESSING_CHUNK)
393
+
394
+ print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
395
+ wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
396
+
397
+ sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
398
+ seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
399
+ adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
400
+
401
+ #pprint(adjusted)
402
+
403
+ result.extend(adjusted)
404
+ chunk_start += chunk_duration
405
+
406
+ return result
407
+
408
+ # A very simple VAD that just marks every N seconds as speech
409
+ class VadPeriodicTranscription(AbstractTranscription):
410
+ def __init__(self, sampling_rate: int = 16000):
411
+ super().__init__(sampling_rate=sampling_rate)
412
+
413
+ def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig):
414
+ # Get duration in seconds
415
+ audio_duration = get_audio_duration(audio)
416
+ result = []
417
+
418
+ # Generate a timestamp every N seconds
419
+ start_timestamp = 0
420
+
421
+ while (start_timestamp < audio_duration):
422
+ end_timestamp = min(start_timestamp + config.periodic_duration, audio_duration)
423
+ segment_duration = end_timestamp - start_timestamp
424
+
425
+ # Minimum duration is 1 second
426
+ if (segment_duration >= 1):
427
+ result.append( { 'start': start_timestamp, 'end': end_timestamp } )
428
+
429
+ start_timestamp = end_timestamp
430
+
431
+ return result
432
+
433
+ def get_audio_duration(file: str):
434
+ return float(ffmpeg.probe(file)["format"]["duration"])
435
+
436
+ def load_audio(file: str, sample_rate: int = 16000,
437
+ start_time: str = None, duration: str = None):
438
+ """
439
+ Open an audio file and read as mono waveform, resampling as necessary
440
+
441
+ Parameters
442
+ ----------
443
+ file: str
444
+ The audio file to open
445
+
446
+ sr: int
447
+ The sample rate to resample the audio if necessary
448
+
449
+ start_time: str
450
+ The start time, using the standard FFMPEG time duration syntax, or None to disable.
451
+
452
+ duration: str
453
+ The duration, using the standard FFMPEG time duration syntax, or None to disable.
454
+
455
+ Returns
456
+ -------
457
+ A NumPy array containing the audio waveform, in float32 dtype.
458
+ """
459
+ try:
460
+ inputArgs = {'threads': 0}
461
+
462
+ if (start_time is not None):
463
+ inputArgs['ss'] = start_time
464
+ if (duration is not None):
465
+ inputArgs['t'] = duration
466
+
467
+ # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
468
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
469
+ out, _ = (
470
+ ffmpeg.input(file, **inputArgs)
471
+ .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sample_rate)
472
+ .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
473
+ )
474
+ except ffmpeg.Error as e:
475
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}")
476
+
477
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
tests/segments_test.py ADDED
@@ -0,0 +1,48 @@
1
+ import sys
2
+ import unittest
3
+
4
+ sys.path.append('../whisper-webui')
5
+
6
+ from src.segments import merge_timestamps
7
+
8
+ class TestSegments(unittest.TestCase):
9
+ def __init__(self, *args, **kwargs):
10
+ super(TestSegments, self).__init__(*args, **kwargs)
11
+
12
+ def test_merge_segments(self):
13
+ segments = [
14
+ {'start': 10.0, 'end': 20.0},
15
+ {'start': 22.0, 'end': 27.0},
16
+ {'start': 31.0, 'end': 35.0},
17
+ {'start': 45.0, 'end': 60.0},
18
+ {'start': 61.0, 'end': 65.0},
19
+ {'start': 68.0, 'end': 98.0},
20
+ {'start': 100.0, 'end': 102.0},
21
+ {'start': 110.0, 'end': 112.0}
22
+ ]
23
+
24
+ result = merge_timestamps(segments, merge_window=5, max_merge_size=30, padding_left=1, padding_right=1)
25
+
26
+ self.assertListEqual(result, [
27
+ {'start': 9.0, 'end': 36.0},
28
+ {'start': 44.0, 'end': 66.0},
29
+ {'start': 67.0, 'end': 99.0},
30
+ {'start': 99.0, 'end': 103.0},
31
+ {'start': 109.0, 'end': 113.0}
32
+ ])
33
+
34
+ def test_overlap_next(self):
35
+ segments = [
36
+ {'start': 5.0, 'end': 39.182},
37
+ {'start': 39.986, 'end': 40.814}
38
+ ]
39
+
40
+ result = merge_timestamps(segments, merge_window=5, max_merge_size=30, padding_left=1, padding_right=1)
41
+
42
+ self.assertListEqual(result, [
43
+ {'start': 4.0, 'end': 39.584},
44
+ {'start': 39.584, 'end': 41.814}
45
+ ])
46
+
47
+ if __name__ == '__main__':
48
+ unittest.main()
tests/vad_test.py ADDED
@@ -0,0 +1,66 @@
1
+ import pprint
2
+ import unittest
3
+ import numpy as np
4
+ import sys
5
+
6
+ sys.path.append('../whisper-webui')
7
+
8
+ from src.vad import AbstractTranscription, TranscriptionConfig, VadSileroTranscription
9
+
10
+ class TestVad(unittest.TestCase):
11
+ def __init__(self, *args, **kwargs):
12
+ super(TestVad, self).__init__(*args, **kwargs)
13
+ self.transcribe_calls = []
14
+
15
+ def test_transcript(self):
16
+ mock = MockVadTranscription()
17
+
18
+ self.transcribe_calls.clear()
19
+ result = mock.transcribe("mock", lambda audio, segment_index, prompt, detected_language : self.transcribe_segments(audio), TranscriptionConfig())
20
+
21
+ self.assertListEqual(self.transcribe_calls, [
22
+ [30, 30],
23
+ [100, 100]
24
+ ])
25
+
26
+ self.assertListEqual(result['segments'],
27
+ [{'end': 50.0, 'start': 40.0, 'text': 'Hello world '},
28
+ {'end': 120.0, 'start': 110.0, 'text': 'Hello world '}]
29
+ )
30
+
31
+ def transcribe_segments(self, segment):
32
+ self.transcribe_calls.append(segment.tolist())
33
+
34
+ # Dummy text
35
+ return {
36
+ 'text': "Hello world ",
37
+ 'segments': [
38
+ {
39
+ "start": 10.0,
40
+ "end": 20.0,
41
+ "text": "Hello world "
42
+ }
43
+ ],
44
+ 'language': ""
45
+ }
46
+
47
+ class MockVadTranscription(AbstractTranscription):
48
+ def __init__(self):
49
+ super().__init__()
50
+
51
+ def get_audio_segment(self, str, start_time: str = None, duration: str = None):
52
+ start_time_seconds = float(start_time.removesuffix("s"))
53
+ duration_seconds = float(duration.removesuffix("s"))
54
+
55
+ # For mocking, this just returns a simple numpy array
56
+ return np.array([start_time_seconds, duration_seconds], dtype=np.float64)
57
+
58
+ def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
59
+ result = []
60
+
61
+ result.append( { 'start': 30, 'end': 60 } )
62
+ result.append( { 'start': 100, 'end': 200 } )
63
+ return result
64
+
65
+ if __name__ == '__main__':
66
+ unittest.main()