Spaces:

thewh1teagle
/

whisper-heb-ipa

Running

App Files Files

xet

Community

thewh1teagle committed 29 days ago

Commit

24a8315

0 Parent(s):

latest

Browse files

Files changed (6) hide show

.gitattributes +1 -0
Dockerfile +24 -0
README.md +10 -0
app.py +155 -0
example1.wav +3 -0
requirements.txt +441 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.wav filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,24 @@

+# Base image: uv preinstalled on Python 3.12 / Debian bookworm-slim (per the image tag)
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
+# Create a non-root account (UID 1000, with a home directory) that the app will run as
+RUN useradd -m -u 1000 user
+# Install git and ffmpeg, then remove the apt package lists so they are not kept in the layer
+# NOTE(review): ffmpeg is presumably required by pydub in app.py to decode uploaded audio - confirm
+RUN apt-get update && apt-get install -y git ffmpeg && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Use system Python environment (recommended for containers)
+ENV UV_SYSTEM_PYTHON=1
+# Copy only requirements.txt first so the dependency layer can be reused when just the app code changes
+COPY requirements.txt .
+RUN uv pip install --no-cache -r requirements.txt
+# Copy the rest of the repository (app.py, example1.wav, ...) into /app
+COPY . .
+# Setup user
+# Hand /app over to the non-root user before switching to it
+RUN chown -R user:user /app
+USER user
+# NOTE(review): HOME is pointed at /app, presumably so per-user caches land in a directory the user owns - confirm
+ENV HOME=/app
+# Port the Gradio server in app.py listens on (server_port=7860)
+EXPOSE 7860
+# Start the demo with the thewh1teagle/whisper-heb-ipa checkpoint instead of app.py's default model
+CMD ["uv", "run", "app.py", "--model", "thewh1teagle/whisper-heb-ipa"]

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: Transcribe Hebrew Speech into IPA
+emoji: 🎙️
+colorFrom: red
+colorTo: blue
+sdk: docker
+sdk_version: "5.45.0"
+app_file: app.py
+pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,155 @@

+"""
+Usage:
+wget https://github.com/thewh1teagle/phonikud-chatterbox/releases/download/asset-files-v1/female1.wav -O example1.wav
+# Run with the default model (openai/whisper-small)
+uv run app.py
+# Or run with a local checkpoint
+uv run app.py --model ./whisper-heb-ipa/checkpoint-600
+# Or explicitly with Whisper small
+uv run app.py --model openai/whisper-small
+# Or with thewh1teagle/whisper-heb-ipa
+uv run app.py --model thewh1teagle/whisper-heb-ipa
+"""
+import torch
+from transformers import pipeline
+import gradio as gr
+import argparse
+from pydub import AudioSegment
+from pydub.effects import normalize
+import tempfile
+import os
+def main():
+    parser = argparse.ArgumentParser(description="Whisper Transcription Demo")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="openai/whisper-small",
+        help="Model name or path for Whisper (default: openai/whisper-small)"
+    )
+    args = parser.parse_args()
+    MODEL_NAME = args.model
+    BATCH_SIZE = 8
+    device = 0 if torch.cuda.is_available() else "cpu"
+    pipe = pipeline(
+        task="automatic-speech-recognition",
+        model=MODEL_NAME,
+        chunk_length_s=30,
+        device=device,
+    )
+    def normalize_audio(file_path):
+        """Normalize audio using pydub to improve transcription quality."""
+        try:
+            # Load audio file
+            audio = AudioSegment.from_file(file_path)
+            # Normalize the audio (adjusts volume to optimal level)
+            normalized_audio = normalize(audio)
+            # Create a temporary file for the normalized audio
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                normalized_audio.export(temp_file.name, format="wav")
+                return temp_file.name
+        except Exception as e:
+            print(f"Warning: Audio normalization failed: {e}")
+            # Return original file if normalization fails
+            return file_path
+    def transcribe(file, task):
+        # Normalize the audio before transcription
+        normalized_file = normalize_audio(file)
+        try:
+            outputs = pipe(normalized_file, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
+            text = outputs["text"]
+            return text
+        finally:
+            # Clean up temporary normalized file if it was created
+            if normalized_file != file and os.path.exists(normalized_file):
+                try:
+                    os.unlink(normalized_file)
+                except Exception as e:
+                    print(f"Warning: Could not delete temporary file {normalized_file}: {e}")
+    demo = gr.Blocks(
+        css="""
+        .large-textbox textarea {
+            font-size: 20px !important;
+            line-height: 1.6 !important;
+        }
+        """
+    )
+    mic_transcribe = gr.Interface(
+        fn=transcribe,
+        inputs=[
+            gr.Audio(sources=["microphone", "upload"], type="filepath"),
+            gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        ],
+        outputs=gr.Textbox(
+            label="Transcription",
+            lines=6,
+            max_lines=15,
+            min_width=400,
+            show_copy_button=True,
+            placeholder="Transcribed text will appear here...",
+            elem_classes=["large-textbox"]
+        ),
+        theme="huggingface",
+        title="Whisper Demo: Transcribe Audio",
+        description=(
+            "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
+            f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+            " of arbitrary length."
+        ),
+        allow_flagging="never",
+    )
+    file_transcribe = gr.Interface(
+        fn=transcribe,
+        inputs=[
+            gr.Audio(sources=["upload"], label="Audio file", type="filepath"),
+            gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        ],
+        outputs=gr.Textbox(
+            label="Transcription",
+            lines=6,
+            max_lines=15,
+            min_width=400,
+            show_copy_button=True,
+            placeholder="Transcribed text will appear here...",
+            elem_classes=["large-textbox"]
+        ),
+        theme="huggingface",
+        title="Whisper Demo: Transcribe Audio",
+        description=(
+            "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
+            f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+            " of arbitrary length."
+        ),
+        examples=[
+            ["./example1.wav", "transcribe"],
+        ],
+        cache_examples=True,
+        allow_flagging="never",
+    )
+    with demo:
+        gr.TabbedInterface([file_transcribe, mic_transcribe], ["Transcribe Audio File", "Transcribe Microphone"])
+    demo.launch(server_name="0.0.0.0", server_port=7860)
+if __name__ == "__main__":
+    main()

example1.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0ec1ccd0339360929e254fd1d530154f9f307b1fc5af2878726f6ebf245673d
+size 976974

requirements.txt ADDED Viewed

	@@ -0,0 +1,441 @@

+# This file was autogenerated by uv via the following command:
+#    uv export --no-hashes --no-emit-project
+absl-py==2.3.1
+    # via tensorboard
+accelerate==1.10.1
+    # via whisper-heb-ipa
+aiofiles==24.1.0
+    # via gradio
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.12.15
+    # via fsspec
+aiosignal==1.4.0
+    # via aiohttp
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.10.0
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+attrs==25.3.0
+    # via aiohttp
+audioop-lts==0.2.2 ; python_full_version >= '3.13'
+    # via
+    #   gradio
+    #   standard-aifc
+    #   standard-sunau
+audioread==3.0.1
+    # via librosa
+brotli==1.1.0
+    # via gradio
+certifi==2025.8.3
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+    #   sentry-sdk
+cffi==2.0.0
+    # via soundfile
+charset-normalizer==3.4.3
+    # via requests
+click==8.2.1
+    # via
+    #   jiwer
+    #   typer
+    #   uvicorn
+    #   wandb
+colorama==0.4.6 ; sys_platform == 'win32'
+    # via
+    #   click
+    #   tqdm
+datasets==4.1.0
+    # via
+    #   evaluate
+    #   whisper-heb-ipa
+decorator==5.2.1
+    # via librosa
+dill==0.4.0
+    # via
+    #   datasets
+    #   evaluate
+    #   multiprocess
+evaluate==0.4.5
+    # via whisper-heb-ipa
+fastapi==0.116.2
+    # via gradio
+ffmpy==0.6.1
+    # via gradio
+filelock==3.19.1
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   torch
+    #   transformers
+frozenlist==1.7.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2025.9.0
+    # via
+    #   datasets
+    #   evaluate
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gitdb==4.0.12
+    # via gitpython
+gitpython==3.1.45
+    # via wandb
+gradio==5.46.0
+    # via whisper-heb-ipa
+gradio-client==1.13.0
+    # via gradio
+groovy==0.1.2
+    # via gradio
+grpcio==1.75.0
+    # via tensorboard
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-xet==1.1.10 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   safehttpx
+huggingface-hub==0.35.0
+    # via
+    #   accelerate
+    #   datasets
+    #   evaluate
+    #   gradio
+    #   gradio-client
+    #   tokenizers
+    #   transformers
+    #   whisper-heb-ipa
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+jiwer==4.0.0
+    # via whisper-heb-ipa
+joblib==1.5.2
+    # via
+    #   librosa
+    #   scikit-learn
+lazy-loader==0.4
+    # via librosa
+librosa==0.11.0
+    # via whisper-heb-ipa
+llvmlite==0.44.0
+    # via numba
+markdown==3.9
+    # via tensorboard
+markdown-it-py==4.0.0
+    # via rich
+markupsafe==3.0.2
+    # via
+    #   gradio
+    #   jinja2
+    #   werkzeug
+mdurl==0.1.2
+    # via markdown-it-py
+more-itertools==10.8.0
+    # via openai-whisper
+mpmath==1.3.0
+    # via sympy
+msgpack==1.1.1
+    # via librosa
+multidict==6.6.4
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.16
+    # via
+    #   datasets
+    #   evaluate
+networkx==3.5
+    # via torch
+numba==0.61.2
+    # via
+    #   librosa
+    #   openai-whisper
+numpy==2.2.6
+    # via
+    #   accelerate
+    #   datasets
+    #   evaluate
+    #   gradio
+    #   librosa
+    #   numba
+    #   openai-whisper
+    #   pandas
+    #   scikit-learn
+    #   scipy
+    #   soundfile
+    #   soxr
+    #   tensorboard
+    #   transformers
+nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nccl-cu12==2.27.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+openai-whisper==20250625
+    # via whisper-heb-ipa
+orjson==3.11.3
+    # via gradio
+packaging==25.0
+    # via
+    #   accelerate
+    #   datasets
+    #   evaluate
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   lazy-loader
+    #   pooch
+    #   tensorboard
+    #   transformers
+    #   wandb
+pandas==2.3.2
+    # via
+    #   datasets
+    #   evaluate
+    #   gradio
+    #   whisper-heb-ipa
+pillow==11.3.0
+    # via
+    #   gradio
+    #   tensorboard
+platformdirs==4.4.0
+    # via
+    #   pooch
+    #   wandb
+pooch==1.8.2
+    # via librosa
+propcache==0.3.2
+    # via
+    #   aiohttp
+    #   yarl
+protobuf==6.32.1
+    # via
+    #   tensorboard
+    #   wandb
+psutil==7.0.0
+    # via accelerate
+pyarrow==21.0.0
+    # via datasets
+pycparser==2.23 ; implementation_name != 'PyPy'
+    # via cffi
+pydantic==2.11.9
+    # via
+    #   fastapi
+    #   gradio
+    #   wandb
+pydantic-core==2.33.2
+    # via pydantic
+pydub==0.25.1
+    # via
+    #   gradio
+    #   whisper-heb-ipa
+pygments==2.19.2
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-multipart==0.0.20
+    # via gradio
+pytz==2025.2
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   accelerate
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+    #   wandb
+rapidfuzz==3.14.1
+    # via jiwer
+regex==2025.9.1
+    # via
+    #   tiktoken
+    #   transformers
+requests==2.32.5
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   pooch
+    #   tiktoken
+    #   transformers
+    #   wandb
+rich==14.1.0
+    # via typer
+ruff==0.13.0
+    # via gradio
+safehttpx==0.1.6
+    # via gradio
+safetensors==0.6.2
+    # via
+    #   accelerate
+    #   transformers
+scikit-learn==1.7.2
+    # via librosa
+scipy==1.16.2
+    # via
+    #   librosa
+    #   scikit-learn
+semantic-version==2.10.0
+    # via gradio
+sentry-sdk==2.38.0
+    # via wandb
+setuptools==80.9.0
+    # via
+    #   tensorboard
+    #   torch
+    #   triton
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+smmap==5.0.2
+    # via gitdb
+sniffio==1.3.1
+    # via anyio
+soundfile==0.13.1
+    # via librosa
+soxr==1.0.0
+    # via librosa
+standard-aifc==3.13.0 ; python_full_version >= '3.13'
+    # via librosa
+standard-chunk==3.13.0 ; python_full_version >= '3.13'
+    # via standard-aifc
+standard-sunau==3.13.0 ; python_full_version >= '3.13'
+    # via librosa
+starlette==0.48.0
+    # via
+    #   fastapi
+    #   gradio
+sympy==1.14.0
+    # via torch
+tensorboard==2.20.0
+    # via whisper-heb-ipa
+tensorboard-data-server==0.7.2
+    # via tensorboard
+threadpoolctl==3.6.0
+    # via scikit-learn
+tiktoken==0.11.0
+    # via openai-whisper
+tokenizers==0.22.0
+    # via transformers
+tomlkit==0.13.3
+    # via gradio
+torch==2.8.0
+    # via
+    #   accelerate
+    #   datasets
+    #   openai-whisper
+    #   torchaudio
+torchaudio==2.8.0
+    # via whisper-heb-ipa
+torchcodec==0.7.0
+    # via datasets
+tqdm==4.67.1
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   openai-whisper
+    #   transformers
+transformers==4.56.1
+    # via whisper-heb-ipa
+triton==3.4.0 ; (platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'linux2'
+    # via
+    #   openai-whisper
+    #   torch
+typer==0.17.4
+    # via gradio
+typing-extensions==4.15.0
+    # via
+    #   aiosignal
+    #   anyio
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   grpcio
+    #   huggingface-hub
+    #   librosa
+    #   pydantic
+    #   pydantic-core
+    #   starlette
+    #   torch
+    #   typer
+    #   typing-inspection
+    #   wandb
+typing-inspection==0.4.1
+    # via pydantic
+tzdata==2025.2
+    # via pandas
+urllib3==2.5.0
+    # via
+    #   requests
+    #   sentry-sdk
+uvicorn==0.35.0
+    # via gradio
+wandb==0.21.4
+    # via whisper-heb-ipa
+websockets==15.0.1
+    # via gradio-client
+werkzeug==3.1.3
+    # via tensorboard
+xxhash==3.5.0
+    # via
+    #   datasets
+    #   evaluate
+yarl==1.20.1
+    # via aiohttp