hysts HF Staff commited on
Commit
9ee5547
·
1 Parent(s): 74fd95b
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: check-executables-have-shebangs
6
+ - id: check-json
7
+ - id: check-merge-conflict
8
+ - id: check-shebang-scripts-are-executable
9
+ - id: check-toml
10
+ - id: check-yaml
11
+ - id: end-of-file-fixer
12
+ - id: mixed-line-ending
13
+ args: ["--fix=lf"]
14
+ - id: requirements-txt-fixer
15
+ - id: trailing-whitespace
16
+ - repo: https://github.com/astral-sh/ruff-pre-commit
17
+ rev: v0.11.11
18
+ hooks:
19
+ - id: ruff-check
20
+ args: ["--fix"]
21
+ - id: ruff-format
22
+ - repo: https://github.com/pre-commit/mirrors-mypy
23
+ rev: v1.15.0
24
+ hooks:
25
+ - id: mypy
26
+ args: ["--ignore-missing-imports"]
27
+ additional_dependencies:
28
+ [
29
+ "types-python-slugify",
30
+ "types-pytz",
31
+ "types-PyYAML",
32
+ "types-requests",
33
+ ]
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "recommendations": [
3
+ "ms-python.python",
4
+ "charliermarsh.ruff",
5
+ "streetsidesoftware.code-spell-checker",
6
+ "tamasfe.even-better-toml"
7
+ ]
8
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "editor.formatOnSave": true,
3
+ "files.insertFinalNewline": false,
4
+ "[python]": {
5
+ "editor.defaultFormatter": "charliermarsh.ruff",
6
+ "editor.formatOnType": true,
7
+ "editor.codeActionsOnSave": {
8
+ "source.fixAll.ruff": "explicit",
9
+ "source.organizeImports": "explicit"
10
+ }
11
+ },
12
+ "[jupyter]": {
13
+ "files.insertFinalNewline": false
14
+ },
15
+ "notebook.output.scrolling": true,
16
+ "notebook.formatOnSave.enabled": true
17
+ }
README.md CHANGED
@@ -4,9 +4,9 @@ emoji: 🔊
4
  colorFrom: indigo
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.26.0
8
  app_file: app.py
9
  license: cc0-1.0
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: indigo
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.31.0
8
  app_file: app.py
9
  license: cc0-1.0
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,314 +1,99 @@
1
- # Thank you to the authors of seewav for dedicating it into the public domain.
2
- # This program is also dedicated into the public domain.
3
- # You may use it, at your choice, under the Unlicense, CC0, or WTFPL license.
4
- # Enjoy!
5
-
6
- # Mostly from: https://github.com/adefossez/seewav
7
- # Original author: adefossez
8
-
9
-
10
- import math
11
  import tempfile
12
- from pathlib import Path
13
- import subprocess
14
- import cairo
15
- import numpy as np
16
- import gradio as gr
17
- from pydub import AudioSegment
18
-
19
-
20
- def read_audio(audio, seek=None, duration=None):
21
- """
22
- Read the `audio` file, starting at `seek` (or 0) seconds for `duration` (or all) seconds.
23
- Returns `float[channels, samples]`.
24
- """
25
-
26
- audio_segment = AudioSegment.from_file(audio)
27
- channels = audio_segment.channels
28
- samplerate = audio_segment.frame_rate
29
-
30
- if seek is not None:
31
- seek_ms = int(seek * 1000)
32
- audio_segment = audio_segment[seek_ms:]
33
-
34
- if duration is not None:
35
- duration_ms = int(duration * 1000)
36
- audio_segment = audio_segment[:duration_ms]
37
-
38
- samples = audio_segment.get_array_of_samples()
39
- wav = np.array(samples, dtype=np.float32)
40
- return wav.reshape(channels, -1), samplerate
41
-
42
-
43
- def sigmoid(x):
44
- return 1 / (1 + np.exp(-x))
45
-
46
-
47
- def envelope(wav, window, stride):
48
- """
49
- Extract the envelope of the waveform `wav` (float[samples]), using average pooling
50
- with `window` samples and the given `stride`.
51
- """
52
- # pos = np.pad(np.maximum(wav, 0), window // 2)
53
- wav = np.pad(wav, window // 2)
54
- out = []
55
- for off in range(0, len(wav) - window, stride):
56
- frame = wav[off : off + window]
57
- out.append(np.maximum(frame, 0).mean())
58
- out = np.array(out)
59
- # Some form of audio compressor based on the sigmoid.
60
- out = 1.9 * (sigmoid(2.5 * out) - 0.5)
61
- return out
62
-
63
-
64
- def draw_env(envs, out, fg_colors, bg_color, size):
65
- """
66
- Internal function, draw a single frame (two frames for stereo) using cairo and save
67
- it to the `out` file as png. envs is a list of envelopes over channels, each env
68
- is a float[bars] representing the height of the envelope to draw. Each entry will
69
- be represented by a bar.
70
- """
71
- surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
72
- ctx = cairo.Context(surface)
73
- ctx.scale(*size)
74
-
75
- ctx.set_source_rgb(*bg_color)
76
- ctx.rectangle(0, 0, 1, 1)
77
- ctx.fill()
78
-
79
- K = len(envs) # Number of waves to draw (waves are stacked vertically)
80
- T = len(envs[0]) # Numbert of time steps
81
- pad_ratio = 0.1 # spacing ratio between 2 bars
82
- width = 1.0 / (T * (1 + 2 * pad_ratio))
83
- pad = pad_ratio * width
84
- delta = 2 * pad + width
85
-
86
- ctx.set_line_width(width)
87
- for step in range(T):
88
- for i in range(K):
89
- half = 0.5 * envs[i][step] # (semi-)height of the bar
90
- half /= K # as we stack K waves vertically
91
- midrule = (1 + 2 * i) / (2 * K) # midrule of i-th wave
92
- ctx.set_source_rgb(*fg_colors[i])
93
- ctx.move_to(pad + step * delta, midrule - half)
94
- ctx.line_to(pad + step * delta, midrule)
95
- ctx.stroke()
96
- ctx.set_source_rgba(*fg_colors[i], 0.8)
97
- ctx.move_to(pad + step * delta, midrule)
98
- ctx.line_to(pad + step * delta, midrule + 0.9 * half)
99
- ctx.stroke()
100
-
101
- surface.write_to_png(out)
102
-
103
-
104
- def interpole(x1, y1, x2, y2, x):
105
- return y1 + (y2 - y1) * (x - x1) / (x2 - x1)
106
-
107
-
108
- def visualize(
109
- progress,
110
- audio,
111
- tmp,
112
- out,
113
- seek=None,
114
- duration=None,
115
- rate=60,
116
- bars=50,
117
- speed=4,
118
- time=0.4,
119
- oversample=3,
120
- fg_color=(0.2, 0.2, 0.2),
121
- fg_color2=(0.5, 0.3, 0.6),
122
- bg_color=(1, 1, 1),
123
- size=(400, 400),
124
- stereo=False,
125
- ):
126
- """
127
- Generate the visualisation for the `audio` file, using a `tmp` folder and saving the final
128
- video in `out`.
129
- `seek` and `durations` gives the extract location if any.
130
- `rate` is the framerate of the output video.
131
-
132
- `bars` is the number of bars in the animation.
133
- `speed` is the base speed of transition. Depending on volume, actual speed will vary
134
- between 0.5 and 2 times it.
135
- `time` amount of audio shown at once on a frame.
136
- `oversample` higher values will lead to more frequent changes.
137
- `fg_color` is the rgb color to use for the foreground.
138
- `fg_color2` is the rgb color to use for the second wav if stereo is set.
139
- `bg_color` is the rgb color to use for the background.
140
- `size` is the `(width, height)` in pixels to generate.
141
- `stereo` is whether to create 2 waves.
142
- """
143
- try:
144
- wav, sr = read_audio(audio, seek=seek, duration=duration)
145
- except (IOError, ValueError) as err:
146
- raise gr.Error(err)
147
- # wavs is a list of wav over channels
148
- wavs = []
149
- if stereo:
150
- assert wav.shape[0] == 2, "stereo requires stereo audio file"
151
- wavs.append(wav[0])
152
- wavs.append(wav[1])
153
- else:
154
- wav = wav.mean(0)
155
- wavs.append(wav)
156
-
157
- for i, wav in enumerate(wavs):
158
- wavs[i] = wav / wav.std()
159
-
160
- window = int(sr * time / bars)
161
- stride = int(window / oversample)
162
- # envs is a list of env over channels
163
- envs = []
164
- for wav in wavs:
165
- env = envelope(wav, window, stride)
166
- env = np.pad(env, (bars // 2, 2 * bars))
167
- envs.append(env)
168
-
169
- duration = len(wavs[0]) / sr
170
- frames = int(rate * duration)
171
- smooth = np.hanning(bars)
172
-
173
- gr.Info("Generating the frames...")
174
- for idx in progress(range(frames)):
175
- pos = (((idx / rate)) * sr) / stride / bars
176
- off = int(pos)
177
- loc = pos - off
178
- denvs = []
179
- for env in envs:
180
- env1 = env[off * bars : (off + 1) * bars]
181
- env2 = env[(off + 1) * bars : (off + 2) * bars]
182
-
183
- # we want loud parts to be updated faster
184
- maxvol = math.log10(1e-4 + env2.max()) * 10
185
- speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
186
- w = sigmoid(speed * speedup * (loc - 0.5))
187
- denv = (1 - w) * env1 + w * env2
188
- denv *= smooth
189
- denvs.append(denv)
190
- draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)
191
- gr.Info("Encoding the animation video...")
192
- subprocess.run([
193
- "ffmpeg", "-y", "-loglevel", "panic", "-r",
194
- str(rate), "-f", "image2", "-s", f"{size[0]}x{size[1]}", "-i", "%06d.png", "-i", audio, "-c:a", "aac", "-vcodec", "libx264", "-crf", "10", "-pix_fmt", "yuv420p",
195
- out.resolve()
196
- ], check=True, cwd=tmp)
197
- return out
198
-
199
 
 
200
 
201
- def parse_color(colorstr):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  """
203
- Given a comma separated rgb(a) colors, returns a 4-tuple of float.
204
- """
205
- try:
206
- r, g, b = [float(i) for i in colorstr.split(",")]
207
- return r, g, b
208
- except ValueError:
209
- raise gr.Error(
210
- "Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order"
211
- )
212
-
213
-
214
- def hex_to_rgb(hex_color):
215
- hex_color = hex_color.lstrip('#')
216
- r = int(hex_color[0:2], 16) / 255.0
217
- g = int(hex_color[2:4], 16) / 255.0
218
- b = int(hex_color[4:6], 16) / 255.0
219
- return (r, g, b)
220
-
221
- def do_viz(
222
- inp_aud,
223
- inp_bgcolor,
224
- inp_color1,
225
- inp_nbars,
226
- inp_vidw,
227
- inp_vidh,
228
- progress=gr.Progress(),
229
- ):
230
- with tempfile.TemporaryDirectory() as tmp, tempfile.NamedTemporaryFile(
231
- suffix=".mp4",
232
- delete=False
233
- ) as out:
234
  return visualize(
235
- progress.tqdm,
236
- inp_aud,
237
- Path(tmp),
238
- Path(out.name),
239
- bars=inp_nbars,
240
- fg_color=hex_to_rgb(inp_color1),
241
- bg_color=hex_to_rgb(inp_bgcolor),
242
- size=(inp_vidw, inp_vidh),
243
  )
244
 
245
 
246
- import gradio as gr
247
-
248
- ABOUT = """
249
- # seewav GUI
250
-
251
- > Have an audio clip but need a video (e.g. for X/Twitter)?
252
-
253
- **Convert audio into a nice video!**
254
-
255
- An online graphical user interface for [seewav](https://github.com/adefossez/seewav).
256
-
257
- Enjoy!
258
- """
259
- with gr.Blocks() as demo:
260
  gr.Markdown(ABOUT)
261
  with gr.Row():
262
  with gr.Column():
263
- inp_aud = gr.Audio(type='filepath')
264
- with gr.Group():
265
- inp_color1 = gr.ColorPicker(
266
- label="Color",
267
- info="Color of the top waveform",
268
- value="#00237E",
269
- interactive=True,
270
- )
271
- inp_bgcolor = gr.ColorPicker(
272
- label="Background Color",
273
- info="Color of the background",
274
- value="#000000",
275
- interactive=True,
276
- )
277
  with gr.Accordion("Advanced Configuration", open=False):
278
- inp_nbars = gr.Slider(
279
- label="Num. Bars",
280
- value=50,
281
- interactive=True,
282
  minimum=5,
283
  maximum=1500,
 
 
284
  )
285
- inp_vidw = gr.Slider(
286
  label="Video Width",
287
- value=400,
288
- interactive=True,
289
  minimum=100,
290
  maximum=3000,
 
 
291
  )
292
- inp_vidh = gr.Slider(
293
  label="Video Height",
294
- value=400,
295
- interactive=True,
296
  minimum=100,
297
  maximum=3000,
 
 
298
  )
299
- inp_go = gr.Button("Visualize", variant="primary")
300
  with gr.Column():
301
- out_vid = gr.Video(interactive=False)
302
- inp_go.click(
303
- do_viz,
304
- inputs=[
305
- inp_aud,
306
- inp_bgcolor,
307
- inp_color1,
308
- inp_nbars,
309
- inp_vidw,
310
- inp_vidh,
311
- ],
312
- outputs=[out_vid],
313
- )
314
- demo.queue(api_open=True, default_concurrency_limit=20).launch(show_api=True)
 
 
 
 
 
 
1
+ import pathlib
 
 
 
 
 
 
 
 
 
2
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ import gradio as gr
5
 
6
+ from utils import hex_to_rgb, visualize
7
+
8
+ ABOUT = "# [seewav](https://github.com/adefossez/seewav)"
9
+
10
+
11
def run(
    audio_file: str,
    wave_color: str = "#00237E",
    background_color: str = "#000000",
    num_bars: int = 50,
    video_width: int = 400,
    video_height: int = 300,
    progress: gr.Progress = gr.Progress(track_tqdm=True),  # noqa: ARG001, B008
) -> str:
    """Render a waveform video for an audio file via seewav.

    The audio is decoded, its envelope is drawn as animated bars, and the
    frames are encoded into an MP4 alongside the original audio track.

    Args:
        audio_file (str): Path to the input audio file (e.g., WAV or MP3).
        wave_color (str, optional): Hex color of the waveform bars. Defaults to "#00237E".
        background_color (str, optional): Hex color of the background. Defaults to "#000000".
        num_bars (int, optional): Number of bars in the visualization. Defaults to 50.
        video_width (int, optional): Output video width in pixels. Defaults to 400.
        video_height (int, optional): Output video height in pixels. Defaults to 300.
        progress (gr.Progress, optional): Gradio progress tracker; forwarded
            automatically from tqdm, not meant to be passed by callers.

    Returns:
        str: Path to the generated MP4 file.
    """
    # Frames are rendered into a throwaway directory; the MP4 itself must
    # outlive this function (Gradio serves it), hence delete=False.
    with tempfile.TemporaryDirectory() as frames_dir:
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
            return visualize(
                audio_file,
                pathlib.Path(frames_dir),
                pathlib.Path(video_file.name),
                bars=num_bars,
                fg_color=hex_to_rgb(wave_color),
                bg_color=hex_to_rgb(background_color),
                size=(video_width, video_height),
            )
48
 
49
 
50
# Build the Gradio UI: input controls on the left, the rendered video on the right.
with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown(ABOUT)
    with gr.Row():
        with gr.Column():
            # Input audio; "filepath" hands `run` a path string, not raw samples.
            audio_file = gr.Audio(type="filepath")
            with gr.Accordion("Advanced Configuration", open=False):
                wave_color = gr.ColorPicker(label="Waveform Color", value="#00237E")
                background_color = gr.ColorPicker(label="Background Color", value="#000000")
                num_bars = gr.Slider(
                    label="Number of Bars",
                    minimum=5,
                    maximum=1500,
                    step=5,
                    value=50,
                )
                video_width = gr.Slider(
                    label="Video Width",
                    minimum=100,
                    maximum=3000,
                    step=10,
                    value=400,
                )
                video_height = gr.Slider(
                    label="Video Height",
                    minimum=100,
                    maximum=3000,
                    step=10,
                    value=300,
                )
            run_button = gr.Button(variant="primary")
        with gr.Column():
            # Output-only player for the generated MP4.
            video = gr.Video(interactive=False)

    # Clicking the example runs `run` with all other parameters at their defaults.
    gr.Examples(examples=["assets/sample.wav"], fn=run, inputs=audio_file, outputs=video)

    run_button.click(
        fn=run,
        inputs=[
            audio_file,
            wave_color,
            background_color,
            num_bars,
            video_width,
            video_height,
        ],
        outputs=video,
    )

if __name__ == "__main__":
    # mcp_server=True additionally exposes `run` as an MCP tool.
    demo.launch(mcp_server=True)
assets/sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd3186d0ca643fa0742dc349829c17edf101b4ffe8410ea871d8f0c2768a237f
3
+ size 452444
packages.txt CHANGED
@@ -1 +1 @@
1
- ffmpeg
 
1
+ ffmpeg
pyproject.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "seewav-gui"
3
+ version = "0.1.0"
4
+ description = "Gradio web UI that turns audio files into waveform videos with seewav"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "gradio[mcp]>=5.31.0",
9
+ "pycairo>=1.28.0",
10
+ ]
11
+
12
+ [tool.ruff]
13
+ line-length = 119
14
+ exclude = ["utils.py"]
15
+
16
+ [tool.ruff.lint]
17
+ select = ["ALL"]
18
+ ignore = [
19
+ "COM812", # missing-trailing-comma
20
+ "D203", # one-blank-line-before-class
21
+ "D213", # multi-line-summary-second-line
22
+ "E501", # line-too-long
23
+ "SIM117", # multiple-with-statements
24
+ #
25
+ "D100", # undocumented-public-module
26
+ "D101", # undocumented-public-class
27
+ "D102", # undocumented-public-method
28
+ "D103", # undocumented-public-function
29
+ "D104", # undocumented-public-package
30
+ "D105", # undocumented-magic-method
31
+ "D107", # undocumented-public-init
32
+ "EM101", # raw-string-in-exception
33
+ "FBT001", # boolean-type-hint-positional-argument
34
+ "FBT002", # boolean-default-value-positional-argument
35
+ "PD901", # pandas-df-variable-name
36
+ "PGH003", # blanket-type-ignore
37
+ "PLR0913", # too-many-arguments
38
+ "PLR0915", # too-many-statements
39
+ "TRY003", # raise-vanilla-args
40
+ ]
41
+ unfixable = [
42
+ "F401", # unused-import
43
+ ]
44
+
45
+ [tool.ruff.lint.pydocstyle]
46
+ convention = "google"
47
+
48
+ [tool.ruff.lint.per-file-ignores]
49
+ "*.ipynb" = ["T201", "T203"]
50
+
51
+ [tool.ruff.format]
52
+ docstring-code-format = true
requirements.txt CHANGED
@@ -1,6 +1,182 @@
1
- numpy
2
- pycairo
3
- tqdm
4
- pydub
5
- ffmpeg-python
6
- opencv-python
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile pyproject.toml -o requirements.txt
3
+ aiofiles==24.1.0
4
+ # via gradio
5
+ annotated-types==0.7.0
6
+ # via pydantic
7
+ anyio==4.9.0
8
+ # via
9
+ # gradio
10
+ # httpx
11
+ # mcp
12
+ # sse-starlette
13
+ # starlette
14
+ certifi==2025.4.26
15
+ # via
16
+ # httpcore
17
+ # httpx
18
+ # requests
19
+ charset-normalizer==3.4.2
20
+ # via requests
21
+ click==8.2.1
22
+ # via
23
+ # typer
24
+ # uvicorn
25
+ exceptiongroup==1.3.0
26
+ # via anyio
27
+ fastapi==0.115.12
28
+ # via gradio
29
+ ffmpy==0.5.0
30
+ # via gradio
31
+ filelock==3.18.0
32
+ # via huggingface-hub
33
+ fsspec==2025.5.1
34
+ # via
35
+ # gradio-client
36
+ # huggingface-hub
37
+ gradio==5.31.0
38
+ # via seewav-gui (pyproject.toml)
39
+ gradio-client==1.10.1
40
+ # via gradio
41
+ groovy==0.1.2
42
+ # via gradio
43
+ h11==0.16.0
44
+ # via
45
+ # httpcore
46
+ # uvicorn
47
+ hf-xet==1.1.2
48
+ # via huggingface-hub
49
+ httpcore==1.0.9
50
+ # via httpx
51
+ httpx==0.28.1
52
+ # via
53
+ # gradio
54
+ # gradio-client
55
+ # mcp
56
+ # safehttpx
57
+ httpx-sse==0.4.0
58
+ # via mcp
59
+ huggingface-hub==0.32.3
60
+ # via
61
+ # gradio
62
+ # gradio-client
63
+ idna==3.10
64
+ # via
65
+ # anyio
66
+ # httpx
67
+ # requests
68
+ jinja2==3.1.6
69
+ # via gradio
70
+ markdown-it-py==3.0.0
71
+ # via rich
72
+ markupsafe==3.0.2
73
+ # via
74
+ # gradio
75
+ # jinja2
76
+ mcp==1.9.0
77
+ # via gradio
78
+ mdurl==0.1.2
79
+ # via markdown-it-py
80
+ numpy==2.2.6
81
+ # via
82
+ # gradio
83
+ # pandas
84
+ orjson==3.10.18
85
+ # via gradio
86
+ packaging==25.0
87
+ # via
88
+ # gradio
89
+ # gradio-client
90
+ # huggingface-hub
91
+ pandas==2.2.3
92
+ # via gradio
93
+ pillow==11.2.1
94
+ # via gradio
95
+ pycairo==1.28.0
96
+ # via seewav-gui (pyproject.toml)
97
+ pydantic==2.11.5
98
+ # via
99
+ # fastapi
100
+ # gradio
101
+ # mcp
102
+ # pydantic-settings
103
+ pydantic-core==2.33.2
104
+ # via pydantic
105
+ pydantic-settings==2.9.1
106
+ # via mcp
107
+ pydub==0.25.1
108
+ # via gradio
109
+ pygments==2.19.1
110
+ # via rich
111
+ python-dateutil==2.9.0.post0
112
+ # via pandas
113
+ python-dotenv==1.1.0
114
+ # via pydantic-settings
115
+ python-multipart==0.0.20
116
+ # via
117
+ # gradio
118
+ # mcp
119
+ pytz==2025.2
120
+ # via pandas
121
+ pyyaml==6.0.2
122
+ # via
123
+ # gradio
124
+ # huggingface-hub
125
+ requests==2.32.3
126
+ # via huggingface-hub
127
+ rich==14.0.0
128
+ # via typer
129
+ ruff==0.11.12
130
+ # via gradio
131
+ safehttpx==0.1.6
132
+ # via gradio
133
+ semantic-version==2.10.0
134
+ # via gradio
135
+ shellingham==1.5.4
136
+ # via typer
137
+ six==1.17.0
138
+ # via python-dateutil
139
+ sniffio==1.3.1
140
+ # via anyio
141
+ sse-starlette==2.3.5
142
+ # via mcp
143
+ starlette==0.46.2
144
+ # via
145
+ # fastapi
146
+ # gradio
147
+ # mcp
148
+ # sse-starlette
149
+ tomlkit==0.13.2
150
+ # via gradio
151
+ tqdm==4.67.1
152
+ # via huggingface-hub
153
+ typer==0.16.0
154
+ # via gradio
155
+ typing-extensions==4.13.2
156
+ # via
157
+ # anyio
158
+ # exceptiongroup
159
+ # fastapi
160
+ # gradio
161
+ # gradio-client
162
+ # huggingface-hub
163
+ # pydantic
164
+ # pydantic-core
165
+ # rich
166
+ # typer
167
+ # typing-inspection
168
+ # uvicorn
169
+ typing-inspection==0.4.1
170
+ # via
171
+ # pydantic
172
+ # pydantic-settings
173
+ tzdata==2025.2
174
+ # via pandas
175
+ urllib3==2.4.0
176
+ # via requests
177
+ uvicorn==0.34.2
178
+ # via
179
+ # gradio
180
+ # mcp
181
+ websockets==15.0.1
182
+ # via gradio-client
style.css ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ h1 {
2
+ text-align: center;
3
+ display: block;
4
+ }
utils.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Thank you to the authors of seewav for dedicating it into the public domain.
2
+ # This program is also dedicated into the public domain.
3
+ # You may use it, at your choice, under the Unlicense, CC0, or WTFPL license.
4
+ # Enjoy!
5
+
6
+ # Mostly from: https://github.com/adefossez/seewav
7
+ # Original author: adefossez
8
+
9
+
10
+ import math
11
+ import subprocess
12
+
13
+ import cairo
14
+ import gradio as gr
15
+ import numpy as np
16
+ import tqdm
17
+ from pydub import AudioSegment
18
+
19
+
20
def read_audio(audio, seek=None, duration=None):
    """Read the `audio` file, starting at `seek` (or 0) seconds for `duration` (or all) seconds.

    Args:
        audio: Path to an audio file in any format ffmpeg/pydub can decode.
        seek (float | None): Start offset in seconds; None starts at 0.
        duration (float | None): Length to keep in seconds; None keeps everything.

    Returns:
        tuple: `(wav, samplerate)` where `wav` is `float32[channels, samples]`.
    """
    audio_segment = AudioSegment.from_file(audio)
    channels = audio_segment.channels
    samplerate = audio_segment.frame_rate

    if seek is not None:
        seek_ms = int(seek * 1000)
        audio_segment = audio_segment[seek_ms:]

    if duration is not None:
        duration_ms = int(duration * 1000)
        audio_segment = audio_segment[:duration_ms]

    samples = audio_segment.get_array_of_samples()
    wav = np.array(samples, dtype=np.float32)
    # pydub returns interleaved samples (L R L R ...), i.e. frame-major order.
    # De-interleave with reshape(-1, channels).T; the previous
    # reshape(channels, -1) split the stream into contiguous halves, which is
    # wrong for any multi-channel input (mono is unaffected).
    return wav.reshape(-1, channels).T, samplerate
39
+
40
+
41
def sigmoid(x):
    """Logistic function: squashes any real input into the open interval (0, 1)."""
    return np.reciprocal(1.0 + np.exp(-x))
43
+
44
+
45
def envelope(wav, window, stride):
    """Average-pool the positive part of `wav` (float[samples]) into an envelope.

    Windows of `window` samples are advanced by `stride` samples over the
    half-padded signal; the pooled means are then soft-compressed so quiet
    passages stay visible without loud ones saturating.
    """
    padded = np.pad(wav, window // 2)
    pooled = np.array(
        [
            np.maximum(padded[start : start + window], 0).mean()
            for start in range(0, len(padded) - window, stride)
        ]
    )
    # Sigmoid-based soft compressor (same curve as the original seewav code).
    return 1.9 * (sigmoid(2.5 * pooled) - 0.5)
59
+
60
+
61
def draw_env(envs, out, fg_colors, bg_color, size):
    """Internal function, draw a single frame (two frames for stereo) using cairo and save
    it to the `out` file as png. envs is a list of envelopes over channels, each env
    is a float[bars] representing the height of the envelope to draw. Each entry will
    be represented by a bar.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
    ctx = cairo.Context(surface)
    # Scale to a unit coordinate system so all drawing below is in [0, 1] x [0, 1].
    ctx.scale(*size)

    # Fill the whole frame with the background color first.
    ctx.set_source_rgb(*bg_color)
    ctx.rectangle(0, 0, 1, 1)
    ctx.fill()

    K = len(envs)  # Number of waves to draw (waves are stacked vertically)
    T = len(envs[0])  # Numbert of time steps
    pad_ratio = 0.1  # spacing ratio between 2 bars
    width = 1.0 / (T * (1 + 2 * pad_ratio))
    pad = pad_ratio * width
    delta = 2 * pad + width

    ctx.set_line_width(width)
    for step in range(T):
        for i in range(K):
            half = 0.5 * envs[i][step]  # (semi-)height of the bar
            half /= K  # as we stack K waves vertically
            midrule = (1 + 2 * i) / (2 * K)  # midrule of i-th wave
            # Upper half of the bar: fully opaque foreground color.
            ctx.set_source_rgb(*fg_colors[i])
            ctx.move_to(pad + step * delta, midrule - half)
            ctx.line_to(pad + step * delta, midrule)
            ctx.stroke()
            # Lower half: slightly shorter (0.9x) and translucent, as a reflection.
            ctx.set_source_rgba(*fg_colors[i], 0.8)
            ctx.move_to(pad + step * delta, midrule)
            ctx.line_to(pad + step * delta, midrule + 0.9 * half)
            ctx.stroke()

    surface.write_to_png(out)
98
+
99
+
100
def interpole(x1, y1, x2, y2, x):
    """Evaluate, at `x`, the straight line passing through (x1, y1) and (x2, y2)."""
    slope = (y2 - y1) / (x2 - x1)
    return y1 + slope * (x - x1)
102
+
103
+
104
def visualize(
    audio,
    tmp,
    out,
    seek=None,
    duration=None,
    rate=60,
    bars=50,
    speed=4,
    time=0.4,
    oversample=3,
    fg_color=(0.2, 0.2, 0.2),
    fg_color2=(0.5, 0.3, 0.6),
    bg_color=(1, 1, 1),
    size=(400, 400),
    stereo=False,
):
    """Generate the visualisation for the `audio` file, using a `tmp` folder and saving the final
    video in `out`.

    `seek` and `duration` give the extract location if any.
    `rate` is the framerate of the output video.
    `bars` is the number of bars in the animation.
    `speed` is the base speed of transition. Depending on volume, actual speed will vary
    between 0.5 and 2 times it.
    `time` amount of audio shown at once on a frame.
    `oversample` higher values will lead to more frequent changes.
    `fg_color` is the rgb color to use for the foreground.
    `fg_color2` is the rgb color to use for the second wav if stereo is set.
    `bg_color` is the rgb color to use for the background.
    `size` is the `(width, height)` in pixels to generate.
    `stereo` is whether to create 2 waves.

    Returns `out` (the path of the encoded video). Raises `gr.Error` if the
    audio cannot be decoded.
    """
    try:
        wav, sr = read_audio(audio, seek=seek, duration=duration)
    except (OSError, ValueError) as err:
        raise gr.Error(err)
    # wavs is a list of wav over channels
    wavs = []
    if stereo:
        assert wav.shape[0] == 2, "stereo requires stereo audio file"
        wavs.append(wav[0])
        wavs.append(wav[1])
    else:
        # Mix all channels down to mono.
        wav = wav.mean(0)
        wavs.append(wav)

    # Normalize each channel to unit standard deviation so the bar heights
    # are independent of the recording's loudness.
    for i, wav in enumerate(wavs):
        wavs[i] = wav / wav.std()

    window = int(sr * time / bars)
    stride = int(window / oversample)
    # envs is a list of env over channels
    envs = []
    for wav in wavs:
        env = envelope(wav, window, stride)
        # Pad so the per-frame slicing below never runs off the end.
        env = np.pad(env, (bars // 2, 2 * bars))
        envs.append(env)

    duration = len(wavs[0]) / sr
    frames = int(rate * duration)
    # Hann window tapers the bars toward the frame edges.
    smooth = np.hanning(bars)

    # Render one PNG per video frame into `tmp`, cross-fading between
    # consecutive envelope windows.
    for idx in tqdm.tqdm(range(frames)):
        pos = ((idx / rate) * sr) / stride / bars
        off = int(pos)
        loc = pos - off
        denvs = []
        for env in envs:
            env1 = env[off * bars : (off + 1) * bars]
            env2 = env[(off + 1) * bars : (off + 2) * bars]

            # we want loud parts to be updated faster
            maxvol = math.log10(1e-4 + env2.max()) * 10
            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
            w = sigmoid(speed * speedup * (loc - 0.5))
            denv = (1 - w) * env1 + w * env2
            denv *= smooth
            denvs.append(denv)
        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)
    # Stitch the PNG sequence and the original audio into an H.264 MP4.
    # cwd=tmp lets the "%06d.png" pattern resolve against the frame folder.
    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-loglevel",
            "panic",
            "-r",
            str(rate),
            "-f",
            "image2",
            "-s",
            f"{size[0]}x{size[1]}",
            "-i",
            "%06d.png",
            "-i",
            audio,
            "-c:a",
            "aac",
            "-vcodec",
            "libx264",
            "-crf",
            "10",
            "-pix_fmt",
            "yuv420p",
            out.resolve(),
        ],
        check=True,
        cwd=tmp,
    )
    return out
214
+
215
+
216
def parse_color(colorstr):
    """Parse a comma separated "r,g,b" string into a 3-tuple of floats.

    Args:
        colorstr (str): Three floats separated by commas, e.g. "0.2,0.3,0.4",
            in rgb order.

    Returns:
        tuple[float, float, float]: The (r, g, b) components.

    Raises:
        gr.Error: If the string is not exactly three comma-separated floats.
    """
    try:
        r, g, b = (float(part) for part in colorstr.split(","))
        return r, g, b
    except ValueError:
        # `from None` drops the uninteresting float()/unpacking traceback.
        raise gr.Error("Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order") from None
223
+
224
+
225
def hex_to_rgb(hex_color):
    """Convert a "#RRGGBB" hex color string to an (r, g, b) tuple of floats in [0, 1]."""
    digits = hex_color.lstrip("#")
    pairs = (digits[0:2], digits[2:4], digits[4:6])
    return tuple(int(pair, 16) / 255.0 for pair in pairs)
uv.lock ADDED
The diff for this file is too large to render. See raw diff