hysts HF Staff commited on
Commit
9ee5547
·
1 Parent(s): 74fd95b
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: check-executables-have-shebangs
6
+ - id: check-json
7
+ - id: check-merge-conflict
8
+ - id: check-shebang-scripts-are-executable
9
+ - id: check-toml
10
+ - id: check-yaml
11
+ - id: end-of-file-fixer
12
+ - id: mixed-line-ending
13
+ args: ["--fix=lf"]
14
+ - id: requirements-txt-fixer
15
+ - id: trailing-whitespace
16
+ - repo: https://github.com/astral-sh/ruff-pre-commit
17
+ rev: v0.11.11
18
+ hooks:
19
+ - id: ruff-check
20
+ args: ["--fix"]
21
+ - id: ruff-format
22
+ - repo: https://github.com/pre-commit/mirrors-mypy
23
+ rev: v1.15.0
24
+ hooks:
25
+ - id: mypy
26
+ args: ["--ignore-missing-imports"]
27
+ additional_dependencies:
28
+ [
29
+ "types-python-slugify",
30
+ "types-pytz",
31
+ "types-PyYAML",
32
+ "types-requests",
33
+ ]
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "recommendations": [
3
+ "ms-python.python",
4
+ "charliermarsh.ruff",
5
+ "streetsidesoftware.code-spell-checker",
6
+ "tamasfe.even-better-toml"
7
+ ]
8
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "editor.formatOnSave": true,
3
+ "files.insertFinalNewline": false,
4
+ "[python]": {
5
+ "editor.defaultFormatter": "charliermarsh.ruff",
6
+ "editor.formatOnType": true,
7
+ "editor.codeActionsOnSave": {
8
+ "source.fixAll.ruff": "explicit",
9
+ "source.organizeImports": "explicit"
10
+ }
11
+ },
12
+ "[jupyter]": {
13
+ "files.insertFinalNewline": false
14
+ },
15
+ "notebook.output.scrolling": true,
16
+ "notebook.formatOnSave.enabled": true
17
+ }
README.md CHANGED
@@ -4,9 +4,9 @@ emoji: 🔊
4
  colorFrom: indigo
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.26.0
8
  app_file: app.py
9
  license: cc0-1.0
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: indigo
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.31.0
8
  app_file: app.py
9
  license: cc0-1.0
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,314 +1,99 @@
1
- # Thank you to the authors of seewav for dedicating it into the public domain.
2
- # This program is also dedicated into the public domain.
3
- # You may use it, at your choice, under the Unlicense, CC0, or WTFPL license.
4
- # Enjoy!
5
-
6
- # Mostly from: https://github.com/adefossez/seewav
7
- # Original author: adefossez
8
-
9
-
10
- import math
11
  import tempfile
12
- from pathlib import Path
13
- import subprocess
14
- import cairo
15
- import numpy as np
16
- import gradio as gr
17
- from pydub import AudioSegment
18
-
19
-
20
- def read_audio(audio, seek=None, duration=None):
21
- """
22
- Read the `audio` file, starting at `seek` (or 0) seconds for `duration` (or all) seconds.
23
- Returns `float[channels, samples]`.
24
- """
25
-
26
- audio_segment = AudioSegment.from_file(audio)
27
- channels = audio_segment.channels
28
- samplerate = audio_segment.frame_rate
29
-
30
- if seek is not None:
31
- seek_ms = int(seek * 1000)
32
- audio_segment = audio_segment[seek_ms:]
33
-
34
- if duration is not None:
35
- duration_ms = int(duration * 1000)
36
- audio_segment = audio_segment[:duration_ms]
37
-
38
- samples = audio_segment.get_array_of_samples()
39
- wav = np.array(samples, dtype=np.float32)
40
- return wav.reshape(channels, -1), samplerate
41
-
42
-
43
- def sigmoid(x):
44
- return 1 / (1 + np.exp(-x))
45
-
46
-
47
- def envelope(wav, window, stride):
48
- """
49
- Extract the envelope of the waveform `wav` (float[samples]), using average pooling
50
- with `window` samples and the given `stride`.
51
- """
52
- # pos = np.pad(np.maximum(wav, 0), window // 2)
53
- wav = np.pad(wav, window // 2)
54
- out = []
55
- for off in range(0, len(wav) - window, stride):
56
- frame = wav[off : off + window]
57
- out.append(np.maximum(frame, 0).mean())
58
- out = np.array(out)
59
- # Some form of audio compressor based on the sigmoid.
60
- out = 1.9 * (sigmoid(2.5 * out) - 0.5)
61
- return out
62
-
63
-
64
- def draw_env(envs, out, fg_colors, bg_color, size):
65
- """
66
- Internal function, draw a single frame (two frames for stereo) using cairo and save
67
- it to the `out` file as png. envs is a list of envelopes over channels, each env
68
- is a float[bars] representing the height of the envelope to draw. Each entry will
69
- be represented by a bar.
70
- """
71
- surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
72
- ctx = cairo.Context(surface)
73
- ctx.scale(*size)
74
-
75
- ctx.set_source_rgb(*bg_color)
76
- ctx.rectangle(0, 0, 1, 1)
77
- ctx.fill()
78
-
79
- K = len(envs) # Number of waves to draw (waves are stacked vertically)
80
- T = len(envs[0]) # Numbert of time steps
81
- pad_ratio = 0.1 # spacing ratio between 2 bars
82
- width = 1.0 / (T * (1 + 2 * pad_ratio))
83
- pad = pad_ratio * width
84
- delta = 2 * pad + width
85
-
86
- ctx.set_line_width(width)
87
- for step in range(T):
88
- for i in range(K):
89
- half = 0.5 * envs[i][step] # (semi-)height of the bar
90
- half /= K # as we stack K waves vertically
91
- midrule = (1 + 2 * i) / (2 * K) # midrule of i-th wave
92
- ctx.set_source_rgb(*fg_colors[i])
93
- ctx.move_to(pad + step * delta, midrule - half)
94
- ctx.line_to(pad + step * delta, midrule)
95
- ctx.stroke()
96
- ctx.set_source_rgba(*fg_colors[i], 0.8)
97
- ctx.move_to(pad + step * delta, midrule)
98
- ctx.line_to(pad + step * delta, midrule + 0.9 * half)
99
- ctx.stroke()
100
-
101
- surface.write_to_png(out)
102
-
103
-
104
- def interpole(x1, y1, x2, y2, x):
105
- return y1 + (y2 - y1) * (x - x1) / (x2 - x1)
106
-
107
-
108
- def visualize(
109
- progress,
110
- audio,
111
- tmp,
112
- out,
113
- seek=None,
114
- duration=None,
115
- rate=60,
116
- bars=50,
117
- speed=4,
118
- time=0.4,
119
- oversample=3,
120
- fg_color=(0.2, 0.2, 0.2),
121
- fg_color2=(0.5, 0.3, 0.6),
122
- bg_color=(1, 1, 1),
123
- size=(400, 400),
124
- stereo=False,
125
- ):
126
- """
127
- Generate the visualisation for the `audio` file, using a `tmp` folder and saving the final
128
- video in `out`.
129
- `seek` and `durations` gives the extract location if any.
130
- `rate` is the framerate of the output video.
131
-
132
- `bars` is the number of bars in the animation.
133
- `speed` is the base speed of transition. Depending on volume, actual speed will vary
134
- between 0.5 and 2 times it.
135
- `time` amount of audio shown at once on a frame.
136
- `oversample` higher values will lead to more frequent changes.
137
- `fg_color` is the rgb color to use for the foreground.
138
- `fg_color2` is the rgb color to use for the second wav if stereo is set.
139
- `bg_color` is the rgb color to use for the background.
140
- `size` is the `(width, height)` in pixels to generate.
141
- `stereo` is whether to create 2 waves.
142
- """
143
- try:
144
- wav, sr = read_audio(audio, seek=seek, duration=duration)
145
- except (IOError, ValueError) as err:
146
- raise gr.Error(err)
147
- # wavs is a list of wav over channels
148
- wavs = []
149
- if stereo:
150
- assert wav.shape[0] == 2, "stereo requires stereo audio file"
151
- wavs.append(wav[0])
152
- wavs.append(wav[1])
153
- else:
154
- wav = wav.mean(0)
155
- wavs.append(wav)
156
-
157
- for i, wav in enumerate(wavs):
158
- wavs[i] = wav / wav.std()
159
-
160
- window = int(sr * time / bars)
161
- stride = int(window / oversample)
162
- # envs is a list of env over channels
163
- envs = []
164
- for wav in wavs:
165
- env = envelope(wav, window, stride)
166
- env = np.pad(env, (bars // 2, 2 * bars))
167
- envs.append(env)
168
-
169
- duration = len(wavs[0]) / sr
170
- frames = int(rate * duration)
171
- smooth = np.hanning(bars)
172
-
173
- gr.Info("Generating the frames...")
174
- for idx in progress(range(frames)):
175
- pos = (((idx / rate)) * sr) / stride / bars
176
- off = int(pos)
177
- loc = pos - off
178
- denvs = []
179
- for env in envs:
180
- env1 = env[off * bars : (off + 1) * bars]
181
- env2 = env[(off + 1) * bars : (off + 2) * bars]
182
-
183
- # we want loud parts to be updated faster
184
- maxvol = math.log10(1e-4 + env2.max()) * 10
185
- speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
186
- w = sigmoid(speed * speedup * (loc - 0.5))
187
- denv = (1 - w) * env1 + w * env2
188
- denv *= smooth
189
- denvs.append(denv)
190
- draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)
191
- gr.Info("Encoding the animation video...")
192
- subprocess.run([
193
- "ffmpeg", "-y", "-loglevel", "panic", "-r",
194
- str(rate), "-f", "image2", "-s", f"{size[0]}x{size[1]}", "-i", "%06d.png", "-i", audio, "-c:a", "aac", "-vcodec", "libx264", "-crf", "10", "-pix_fmt", "yuv420p",
195
- out.resolve()
196
- ], check=True, cwd=tmp)
197
- return out
198
-
199
 
 
200
 
201
- def parse_color(colorstr):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  """
203
- Given a comma separated rgb(a) colors, returns a 4-tuple of float.
204
- """
205
- try:
206
- r, g, b = [float(i) for i in colorstr.split(",")]
207
- return r, g, b
208
- except ValueError:
209
- raise gr.Error(
210
- "Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order"
211
- )
212
-
213
-
214
- def hex_to_rgb(hex_color):
215
- hex_color = hex_color.lstrip('#')
216
- r = int(hex_color[0:2], 16) / 255.0
217
- g = int(hex_color[2:4], 16) / 255.0
218
- b = int(hex_color[4:6], 16) / 255.0
219
- return (r, g, b)
220
-
221
- def do_viz(
222
- inp_aud,
223
- inp_bgcolor,
224
- inp_color1,
225
- inp_nbars,
226
- inp_vidw,
227
- inp_vidh,
228
- progress=gr.Progress(),
229
- ):
230
- with tempfile.TemporaryDirectory() as tmp, tempfile.NamedTemporaryFile(
231
- suffix=".mp4",
232
- delete=False
233
- ) as out:
234
  return visualize(
235
- progress.tqdm,
236
- inp_aud,
237
- Path(tmp),
238
- Path(out.name),
239
- bars=inp_nbars,
240
- fg_color=hex_to_rgb(inp_color1),
241
- bg_color=hex_to_rgb(inp_bgcolor),
242
- size=(inp_vidw, inp_vidh),
243
  )
244
 
245
 
246
- import gradio as gr
247
-
248
- ABOUT = """
249
- # seewav GUI
250
-
251
- > Have an audio clip but need a video (e.g. for X/Twitter)?
252
-
253
- **Convert audio into a nice video!**
254
-
255
- An online graphical user interface for [seewav](https://github.com/adefossez/seewav).
256
-
257
- Enjoy!
258
- """
259
- with gr.Blocks() as demo:
260
  gr.Markdown(ABOUT)
261
  with gr.Row():
262
  with gr.Column():
263
- inp_aud = gr.Audio(type='filepath')
264
- with gr.Group():
265
- inp_color1 = gr.ColorPicker(
266
- label="Color",
267
- info="Color of the top waveform",
268
- value="#00237E",
269
- interactive=True,
270
- )
271
- inp_bgcolor = gr.ColorPicker(
272
- label="Background Color",
273
- info="Color of the background",
274
- value="#000000",
275
- interactive=True,
276
- )
277
  with gr.Accordion("Advanced Configuration", open=False):
278
- inp_nbars = gr.Slider(
279
- label="Num. Bars",
280
- value=50,
281
- interactive=True,
282
  minimum=5,
283
  maximum=1500,
 
 
284
  )
285
- inp_vidw = gr.Slider(
286
  label="Video Width",
287
- value=400,
288
- interactive=True,
289
  minimum=100,
290
  maximum=3000,
 
 
291
  )
292
- inp_vidh = gr.Slider(
293
  label="Video Height",
294
- value=400,
295
- interactive=True,
296
  minimum=100,
297
  maximum=3000,
 
 
298
  )
299
- inp_go = gr.Button("Visualize", variant="primary")
300
  with gr.Column():
301
- out_vid = gr.Video(interactive=False)
302
- inp_go.click(
303
- do_viz,
304
- inputs=[
305
- inp_aud,
306
- inp_bgcolor,
307
- inp_color1,
308
- inp_nbars,
309
- inp_vidw,
310
- inp_vidh,
311
- ],
312
- outputs=[out_vid],
313
- )
314
- demo.queue(api_open=True, default_concurrency_limit=20).launch(show_api=True)
 
 
 
 
 
 
1
+ import pathlib
 
 
 
 
 
 
 
 
 
2
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ import gradio as gr
5
 
6
+ from utils import hex_to_rgb, visualize
7
+
8
+ ABOUT = "# [seewav](https://github.com/adefossez/seewav)"
9
+
10
+
11
def run(
    audio_file: str,
    wave_color: str = "#00237E",
    background_color: str = "#000000",
    num_bars: int = 50,
    video_width: int = 400,
    video_height: int = 300,
    progress: gr.Progress = gr.Progress(track_tqdm=True),  # noqa: ARG001, B008
) -> str:
    """Render a waveform video for an audio file via seewav.

    The audio is decoded, its envelope is drawn as animated bars, and the
    frames are encoded into an MP4 alongside the original audio track.

    Args:
        audio_file (str): Path to the input audio file (e.g., WAV or MP3).
        wave_color (str, optional): Hex color of the waveform bars. Defaults to "#00237E".
        background_color (str, optional): Hex color of the background. Defaults to "#000000".
        num_bars (int, optional): Number of bars in the visualization. Defaults to 50.
        video_width (int, optional): Output video width in pixels. Defaults to 400.
        video_height (int, optional): Output video height in pixels. Defaults to 300.
        progress (gr.Progress, optional): Gradio progress tracker; forwarded
            automatically from tqdm, not meant to be passed by callers.

    Returns:
        str: Path to the generated MP4 file.
    """
    # Frames are rendered into a throwaway directory; the MP4 itself must
    # outlive this function (Gradio serves it), hence delete=False.
    with tempfile.TemporaryDirectory() as frames_dir:
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
            return visualize(
                audio_file,
                pathlib.Path(frames_dir),
                pathlib.Path(video_file.name),
                bars=num_bars,
                fg_color=hex_to_rgb(wave_color),
                bg_color=hex_to_rgb(background_color),
                size=(video_width, video_height),
            )
48
 
49
 
50
# Build the Gradio UI: input controls on the left, the rendered video on the right.
with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown(ABOUT)
    with gr.Row():
        with gr.Column():
            # Input audio; "filepath" hands `run` a path string, not raw samples.
            audio_file = gr.Audio(type="filepath")
            with gr.Accordion("Advanced Configuration", open=False):
                wave_color = gr.ColorPicker(label="Waveform Color", value="#00237E")
                background_color = gr.ColorPicker(label="Background Color", value="#000000")
                num_bars = gr.Slider(
                    label="Number of Bars",
                    minimum=5,
                    maximum=1500,
                    step=5,
                    value=50,
                )
                video_width = gr.Slider(
                    label="Video Width",
                    minimum=100,
                    maximum=3000,
                    step=10,
                    value=400,
                )
                video_height = gr.Slider(
                    label="Video Height",
                    minimum=100,
                    maximum=3000,
                    step=10,
                    value=300,
                )
            run_button = gr.Button(variant="primary")
        with gr.Column():
            # Output-only player for the generated MP4.
            video = gr.Video(interactive=False)

    # Clicking the example runs `run` with all other parameters at their defaults.
    gr.Examples(examples=["assets/sample.wav"], fn=run, inputs=audio_file, outputs=video)

    run_button.click(
        fn=run,
        inputs=[
            audio_file,
            wave_color,
            background_color,
            num_bars,
            video_width,
            video_height,
        ],
        outputs=video,
    )

if __name__ == "__main__":
    # mcp_server=True additionally exposes `run` as an MCP tool.
    demo.launch(mcp_server=True)
assets/sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd3186d0ca643fa0742dc349829c17edf101b4ffe8410ea871d8f0c2768a237f
3
+ size 452444
packages.txt CHANGED
@@ -1 +1 @@
1
- ffmpeg
 
1
+ ffmpeg
pyproject.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "seewav-gui"
3
+ version = "0.1.0"
4
+ description = "Gradio web UI that turns audio files into waveform videos with seewav"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "gradio[mcp]>=5.31.0",
9
+ "pycairo>=1.28.0",
10
+ ]
11
+
12
+ [tool.ruff]
13
+ line-length = 119
14
+ exclude = ["utils.py"]
15
+
16
+ [tool.ruff.lint]
17
+ select = ["ALL"]
18
+ ignore = [
19
+ "COM812", # missing-trailing-comma
20
+ "D203", # one-blank-line-before-class
21
+ "D213", # multi-line-summary-second-line
22
+ "E501", # line-too-long
23
+ "SIM117", # multiple-with-statements
24
+ #
25
+ "D100", # undocumented-public-module
26
+ "D101", # undocumented-public-class
27
+ "D102", # undocumented-public-method
28
+ "D103", # undocumented-public-function
29
+ "D104", # undocumented-public-package
30
+ "D105", # undocumented-magic-method
31
+ "D107", # undocumented-public-init
32
+ "EM101", # raw-string-in-exception
33
+ "FBT001", # boolean-type-hint-positional-argument
34
+ "FBT002", # boolean-default-value-positional-argument
35
+ "PD901", # pandas-df-variable-name
36
+ "PGH003", # blanket-type-ignore
37
+ "PLR0913", # too-many-arguments
38
+ "PLR0915", # too-many-statements
39
+ "TRY003", # raise-vanilla-args
40
+ ]
41
+ unfixable = [
42
+ "F401", # unused-import
43
+ ]
44
+
45
+ [tool.ruff.lint.pydocstyle]
46
+ convention = "google"
47
+
48
+ [tool.ruff.lint.per-file-ignores]
49
+ "*.ipynb" = ["T201", "T203"]
50
+
51
+ [tool.ruff.format]
52
+ docstring-code-format = true
requirements.txt CHANGED
@@ -1,6 +1,182 @@
1
- numpy
2
- pycairo
3
- tqdm
4
- pydub
5
- ffmpeg-python
6
- opencv-python
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile pyproject.toml -o requirements.txt
3
+ aiofiles==24.1.0
4
+ # via gradio
5
+ annotated-types==0.7.0
6
+ # via pydantic
7
+ anyio==4.9.0
8
+ # via
9
+ # gradio
10
+ # httpx
11
+ # mcp
12
+ # sse-starlette
13
+ # starlette
14
+ certifi==2025.4.26
15
+ # via
16
+ # httpcore
17
+ # httpx
18
+ # requests
19
+ charset-normalizer==3.4.2
20
+ # via requests
21
+ click==8.2.1
22
+ # via
23
+ # typer
24
+ # uvicorn
25
+ exceptiongroup==1.3.0
26
+ # via anyio
27
+ fastapi==0.115.12
28
+ # via gradio
29
+ ffmpy==0.5.0
30
+ # via gradio
31
+ filelock==3.18.0
32
+ # via huggingface-hub
33
+ fsspec==2025.5.1
34
+ # via
35
+ # gradio-client
36
+ # huggingface-hub
37
+ gradio==5.31.0
38
+ # via seewav-gui (pyproject.toml)
39
+ gradio-client==1.10.1
40
+ # via gradio
41
+ groovy==0.1.2
42
+ # via gradio
43
+ h11==0.16.0
44
+ # via
45
+ # httpcore
46
+ # uvicorn
47
+ hf-xet==1.1.2
48
+ # via huggingface-hub
49
+ httpcore==1.0.9
50
+ # via httpx
51
+ httpx==0.28.1
52
+ # via
53
+ # gradio
54
+ # gradio-client
55
+ # mcp
56
+ # safehttpx
57
+ httpx-sse==0.4.0
58
+ # via mcp
59
+ huggingface-hub==0.32.3
60
+ # via
61
+ # gradio
62
+ # gradio-client
63
+ idna==3.10
64
+ # via
65
+ # anyio
66
+ # httpx
67
+ # requests
68
+ jinja2==3.1.6
69
+ # via gradio
70
+ markdown-it-py==3.0.0
71
+ # via rich
72
+ markupsafe==3.0.2
73
+ # via
74
+ # gradio
75
+ # jinja2
76
+ mcp==1.9.0
77
+ # via gradio
78
+ mdurl==0.1.2
79
+ # via markdown-it-py
80
+ numpy==2.2.6
81
+ # via
82
+ # gradio
83
+ # pandas
84
+ orjson==3.10.18
85
+ # via gradio
86
+ packaging==25.0
87
+ # via
88
+ # gradio
89
+ # gradio-client
90
+ # huggingface-hub
91
+ pandas==2.2.3
92
+ # via gradio
93
+ pillow==11.2.1
94
+ # via gradio
95
+ pycairo==1.28.0
96
+ # via seewav-gui (pyproject.toml)
97
+ pydantic==2.11.5
98
+ # via
99
+ # fastapi
100
+ # gradio
101
+ # mcp
102
+ # pydantic-settings
103
+ pydantic-core==2.33.2
104
+ # via pydantic
105
+ pydantic-settings==2.9.1
106
+ # via mcp
107
+ pydub==0.25.1
108
+ # via gradio
109
+ pygments==2.19.1
110
+ # via rich
111
+ python-dateutil==2.9.0.post0
112
+ # via pandas
113
+ python-dotenv==1.1.0
114
+ # via pydantic-settings
115
+ python-multipart==0.0.20
116
+ # via
117
+ # gradio
118
+ # mcp
119
+ pytz==2025.2
120
+ # via pandas
121
+ pyyaml==6.0.2
122
+ # via
123
+ # gradio
124
+ # huggingface-hub
125
+ requests==2.32.3
126
+ # via huggingface-hub
127
+ rich==14.0.0
128
+ # via typer
129
+ ruff==0.11.12
130
+ # via gradio
131
+ safehttpx==0.1.6
132
+ # via gradio
133
+ semantic-version==2.10.0
134
+ # via gradio
135
+ shellingham==1.5.4
136
+ # via typer
137
+ six==1.17.0
138
+ # via python-dateutil
139
+ sniffio==1.3.1
140
+ # via anyio
141
+ sse-starlette==2.3.5
142
+ # via mcp
143
+ starlette==0.46.2
144
+ # via
145
+ # fastapi
146
+ # gradio
147
+ # mcp
148
+ # sse-starlette
149
+ tomlkit==0.13.2
150
+ # via gradio
151
+ tqdm==4.67.1
152
+ # via huggingface-hub
153
+ typer==0.16.0
154
+ # via gradio
155
+ typing-extensions==4.13.2
156
+ # via
157
+ # anyio
158
+ # exceptiongroup
159
+ # fastapi
160
+ # gradio
161
+ # gradio-client
162
+ # huggingface-hub
163
+ # pydantic
164
+ # pydantic-core
165
+ # rich
166
+ # typer
167
+ # typing-inspection
168
+ # uvicorn
169
+ typing-inspection==0.4.1
170
+ # via
171
+ # pydantic
172
+ # pydantic-settings
173
+ tzdata==2025.2
174
+ # via pandas
175
+ urllib3==2.4.0
176
+ # via requests
177
+ uvicorn==0.34.2
178
+ # via
179
+ # gradio
180
+ # mcp
181
+ websockets==15.0.1
182
+ # via gradio-client
style.css ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ h1 {
2
+ text-align: center;
3
+ display: block;
4
+ }
utils.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Thank you to the authors of seewav for dedicating it into the public domain.
2
+ # This program is also dedicated into the public domain.
3
+ # You may use it, at your choice, under the Unlicense, CC0, or WTFPL license.
4
+ # Enjoy!
5
+
6
+ # Mostly from: https://github.com/adefossez/seewav
7
+ # Original author: adefossez
8
+
9
+
10
+ import math
11
+ import subprocess
12
+
13
+ import cairo
14
+ import gradio as gr
15
+ import numpy as np
16
+ import tqdm
17
+ from pydub import AudioSegment
18
+
19
+
20
def read_audio(audio, seek=None, duration=None):
    """Read the `audio` file, starting at `seek` (or 0) seconds for `duration` (or all) seconds.

    Args:
        audio: Path to an audio file in any format ffmpeg/pydub can decode.
        seek (float | None): Start offset in seconds; None starts at 0.
        duration (float | None): Length to keep in seconds; None keeps everything.

    Returns:
        tuple: `(wav, samplerate)` where `wav` is `float32[channels, samples]`.
    """
    audio_segment = AudioSegment.from_file(audio)
    channels = audio_segment.channels
    samplerate = audio_segment.frame_rate

    if seek is not None:
        seek_ms = int(seek * 1000)
        audio_segment = audio_segment[seek_ms:]

    if duration is not None:
        duration_ms = int(duration * 1000)
        audio_segment = audio_segment[:duration_ms]

    samples = audio_segment.get_array_of_samples()
    wav = np.array(samples, dtype=np.float32)
    # pydub returns interleaved samples (L R L R ...), i.e. frame-major order.
    # De-interleave with reshape(-1, channels).T; the previous
    # reshape(channels, -1) split the stream into contiguous halves, which is
    # wrong for any multi-channel input (mono is unaffected).
    return wav.reshape(-1, channels).T, samplerate
39
+
40
+
41
def sigmoid(x):
    """Logistic function: squashes any real input into the open interval (0, 1)."""
    return np.reciprocal(1.0 + np.exp(-x))
43
+
44
+
45
def envelope(wav, window, stride):
    """Average-pool the positive part of `wav` (float[samples]) into an envelope.

    Windows of `window` samples are advanced by `stride` samples over the
    half-padded signal; the pooled means are then soft-compressed so quiet
    passages stay visible without loud ones saturating.
    """
    padded = np.pad(wav, window // 2)
    pooled = np.array(
        [
            np.maximum(padded[start : start + window], 0).mean()
            for start in range(0, len(padded) - window, stride)
        ]
    )
    # Sigmoid-based soft compressor (same curve as the original seewav code).
    return 1.9 * (sigmoid(2.5 * pooled) - 0.5)
59
+
60
+
61
def draw_env(envs, out, fg_colors, bg_color, size):
    """Internal function, draw a single frame (two frames for stereo) using cairo and save
    it to the `out` file as png. envs is a list of envelopes over channels, each env
    is a float[bars] representing the height of the envelope to draw. Each entry will
    be represented by a bar.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
    ctx = cairo.Context(surface)
    # Scale to a unit coordinate system so all drawing below is in [0, 1] x [0, 1].
    ctx.scale(*size)

    # Fill the whole frame with the background color first.
    ctx.set_source_rgb(*bg_color)
    ctx.rectangle(0, 0, 1, 1)
    ctx.fill()

    K = len(envs)  # Number of waves to draw (waves are stacked vertically)
    T = len(envs[0])  # Numbert of time steps
    pad_ratio = 0.1  # spacing ratio between 2 bars
    width = 1.0 / (T * (1 + 2 * pad_ratio))
    pad = pad_ratio * width
    delta = 2 * pad + width

    ctx.set_line_width(width)
    for step in range(T):
        for i in range(K):
            half = 0.5 * envs[i][step]  # (semi-)height of the bar
            half /= K  # as we stack K waves vertically
            midrule = (1 + 2 * i) / (2 * K)  # midrule of i-th wave
            # Upper half of the bar: fully opaque foreground color.
            ctx.set_source_rgb(*fg_colors[i])
            ctx.move_to(pad + step * delta, midrule - half)
            ctx.line_to(pad + step * delta, midrule)
            ctx.stroke()
            # Lower half: slightly shorter (0.9x) and translucent, as a reflection.
            ctx.set_source_rgba(*fg_colors[i], 0.8)
            ctx.move_to(pad + step * delta, midrule)
            ctx.line_to(pad + step * delta, midrule + 0.9 * half)
            ctx.stroke()

    surface.write_to_png(out)
98
+
99
+
100
def interpole(x1, y1, x2, y2, x):
    """Evaluate, at `x`, the straight line passing through (x1, y1) and (x2, y2)."""
    slope = (y2 - y1) / (x2 - x1)
    return y1 + slope * (x - x1)
102
+
103
+
104
def visualize(
    audio,
    tmp,
    out,
    seek=None,
    duration=None,
    rate=60,
    bars=50,
    speed=4,
    time=0.4,
    oversample=3,
    fg_color=(0.2, 0.2, 0.2),
    fg_color2=(0.5, 0.3, 0.6),
    bg_color=(1, 1, 1),
    size=(400, 400),
    stereo=False,
):
    """Generate the visualisation for the `audio` file, using a `tmp` folder and saving the final
    video in `out`.

    `seek` and `duration` give the extract location if any.
    `rate` is the framerate of the output video.
    `bars` is the number of bars in the animation.
    `speed` is the base speed of transition. Depending on volume, actual speed will vary
    between 0.5 and 2 times it.
    `time` amount of audio shown at once on a frame.
    `oversample` higher values will lead to more frequent changes.
    `fg_color` is the rgb color to use for the foreground.
    `fg_color2` is the rgb color to use for the second wav if stereo is set.
    `bg_color` is the rgb color to use for the background.
    `size` is the `(width, height)` in pixels to generate.
    `stereo` is whether to create 2 waves.

    Returns `out` (the path of the encoded video). Raises `gr.Error` if the
    audio cannot be decoded.
    """
    try:
        wav, sr = read_audio(audio, seek=seek, duration=duration)
    except (OSError, ValueError) as err:
        raise gr.Error(err)
    # wavs is a list of wav over channels
    wavs = []
    if stereo:
        assert wav.shape[0] == 2, "stereo requires stereo audio file"
        wavs.append(wav[0])
        wavs.append(wav[1])
    else:
        # Mix all channels down to mono.
        wav = wav.mean(0)
        wavs.append(wav)

    # Normalize each channel to unit standard deviation so the bar heights
    # are independent of the recording's loudness.
    for i, wav in enumerate(wavs):
        wavs[i] = wav / wav.std()

    window = int(sr * time / bars)
    stride = int(window / oversample)
    # envs is a list of env over channels
    envs = []
    for wav in wavs:
        env = envelope(wav, window, stride)
        # Pad so the per-frame slicing below never runs off the end.
        env = np.pad(env, (bars // 2, 2 * bars))
        envs.append(env)

    duration = len(wavs[0]) / sr
    frames = int(rate * duration)
    # Hann window tapers the bars toward the frame edges.
    smooth = np.hanning(bars)

    # Render one PNG per video frame into `tmp`, cross-fading between
    # consecutive envelope windows.
    for idx in tqdm.tqdm(range(frames)):
        pos = ((idx / rate) * sr) / stride / bars
        off = int(pos)
        loc = pos - off
        denvs = []
        for env in envs:
            env1 = env[off * bars : (off + 1) * bars]
            env2 = env[(off + 1) * bars : (off + 2) * bars]

            # we want loud parts to be updated faster
            maxvol = math.log10(1e-4 + env2.max()) * 10
            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
            w = sigmoid(speed * speedup * (loc - 0.5))
            denv = (1 - w) * env1 + w * env2
            denv *= smooth
            denvs.append(denv)
        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)
    # Stitch the PNG sequence and the original audio into an H.264 MP4.
    # cwd=tmp lets the "%06d.png" pattern resolve against the frame folder.
    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-loglevel",
            "panic",
            "-r",
            str(rate),
            "-f",
            "image2",
            "-s",
            f"{size[0]}x{size[1]}",
            "-i",
            "%06d.png",
            "-i",
            audio,
            "-c:a",
            "aac",
            "-vcodec",
            "libx264",
            "-crf",
            "10",
            "-pix_fmt",
            "yuv420p",
            out.resolve(),
        ],
        check=True,
        cwd=tmp,
    )
    return out
214
+
215
+
216
def parse_color(colorstr):
    """Parse a comma separated "r,g,b" string into a 3-tuple of floats.

    Args:
        colorstr (str): Three floats separated by commas, e.g. "0.2,0.3,0.4",
            in rgb order.

    Returns:
        tuple[float, float, float]: The (r, g, b) components.

    Raises:
        gr.Error: If the string is not exactly three comma-separated floats.
    """
    try:
        r, g, b = (float(part) for part in colorstr.split(","))
        return r, g, b
    except ValueError:
        # `from None` drops the uninteresting float()/unpacking traceback.
        raise gr.Error("Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order") from None
223
+
224
+
225
def hex_to_rgb(hex_color):
    """Convert a "#RRGGBB" hex color string to an (r, g, b) tuple of floats in [0, 1]."""
    digits = hex_color.lstrip("#")
    pairs = (digits[0:2], digits[2:4], digits[4:6])
    return tuple(int(pair, 16) / 255.0 for pair in pairs)
uv.lock ADDED
The diff for this file is too large to render. See raw diff