ron-hf committed
Commit 4e64ba6 · 1 Parent(s): b9ec21e

Fix & update

.gitignore ADDED
@@ -0,0 +1,8 @@
+/.venv
+
+/*/__pycache__
+
+*.mp3
+*.wav
+
+*.pth
app.py CHANGED
@@ -1,67 +1,101 @@
-import os
-import argparse
 import gradio as gr
-from timeit import default_timer as timer
-import torch
 import numpy as np
-import pandas as pd
-from huggingface_hub import hf_hub_download
+import os
+import torch
+
+from timeit import default_timer as timer
+
 from model.bart import BartCaptionModel
 from utils.audio_utils import load_audio, STR_CH_FIRST
 
 if os.path.isfile("transfer.pth") == False:
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth",
+        "transfer.pth",
+    )
+
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav",
+        "folk.wav",
+    )
+
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3",
+        "electronic.mp3",
+    )
+
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav",
+        "orchestra.wav",
+    )
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']
-model = BartCaptionModel(max_length = 128)
-pretrained_object = torch.load('./transfer.pth', map_location='cpu')
-state_dict = pretrained_object['state_dict']
+example_list = ["folk.wav", "electronic.mp3", "orchestra.wav"]
+model = BartCaptionModel(max_length=128)
+pretrained_object = torch.load("./transfer.pth", map_location="cpu")
+state_dict = pretrained_object["state_dict"]
 model.load_state_dict(state_dict)
+
 if torch.cuda.is_available():
     torch.cuda.set_device(device)
     model = model.cuda(device)
+
 model.eval()
 
+
 def get_audio(audio_path, duration=10, target_sr=16000):
     n_samples = int(duration * target_sr)
+
     audio, sr = load_audio(
-        path= audio_path,
-        ch_format= STR_CH_FIRST,
-        sample_rate= target_sr,
-        downmix_to_mono= True,
+        path=audio_path,
+        ch_format=STR_CH_FIRST,
+        sample_rate=target_sr,
+        downmix_to_mono=True,
     )
+
     if len(audio.shape) == 2:
         audio = audio.mean(0, False)  # to mono
+
     input_size = int(n_samples)
+
     if audio.shape[-1] < input_size:  # pad sequence
         pad = np.zeros(input_size)
         pad[: audio.shape[-1]] = audio
         audio = pad
+
     ceil = int(audio.shape[-1] // n_samples)
-    audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
+
+    audio = torch.from_numpy(
+        np.stack(np.split(audio[: ceil * n_samples], ceil)).astype("float32")
+    )
+
     return audio
 
+
 def captioning(audio_path):
-    audio_tensor = get_audio(audio_path = audio_path)
+    audio_tensor = get_audio(audio_path=audio_path)
+
     if torch.cuda.is_available():
         audio_tensor = audio_tensor.to(device)
+
     with torch.no_grad():
         output = model.generate(
             samples=audio_tensor,
             num_beams=5,
         )
+
     inference = ""
+
     number_of_chunks = range(audio_tensor.shape[0])
+
     for chunk, text in zip(number_of_chunks, output):
         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
         inference += f"{time}\n{text} \n \n"
+
     return inference
 
+
 title = "Interactive demo: Music Captioning 🤖🎵"
 description = """
 <p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
@@ -73,16 +107,17 @@ description = """
 
 article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"
 
-
-demo = gr.Interface(fn=captioning,
-                    inputs=gr.Audio(type="filepath"),
-                    outputs=[
-                        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
-                    ],
-                    examples=example_list,
-                    title=title,
-                    description=description,
-                    article=article,
-                    cache_examples=False
-                    )
-demo.launch()
+demo = gr.Interface(
+    fn=captioning,
+    inputs=gr.Audio(type="filepath"),
+    outputs=[
+        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
+    ],
+    examples=example_list,
+    title=title,
+    description=description,
+    article=article,
+    cache_examples=False,
+)
+
+demo.launch()
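
For reference, the reformatted get_audio() turns an arbitrary-length file into a batch of fixed 10-second chunks, and captioning() then emits one caption per chunk with a [start:00-end:00] timestamp. Below is a minimal, self-contained sketch of just that chunking arithmetic; the 25-second random waveform is a made-up input, and only numpy and torch are needed, not the repo's model:

```python
# Sketch of the chunking scheme in get_audio(): pad the waveform up to one
# 10 s window at 16 kHz if it is too short, then split it into
# floor(len / n_samples) equal chunks, dropping any remainder.
import numpy as np
import torch

duration, target_sr = 10, 16000          # same defaults as get_audio()
n_samples = duration * target_sr         # 160,000 samples per chunk

audio = np.random.randn(25 * target_sr)  # hypothetical 25 s mono recording

if audio.shape[-1] < n_samples:          # pad sequence, as in app.py
    pad = np.zeros(n_samples)
    pad[: audio.shape[-1]] = audio
    audio = pad

ceil = int(audio.shape[-1] // n_samples)
chunks = torch.from_numpy(
    np.stack(np.split(audio[: ceil * n_samples], ceil)).astype("float32")
)
print(chunks.shape)  # torch.Size([2, 160000]): two 10 s chunks, last 5 s dropped
```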
model/__pycache__/bart.cpython-310.pyc CHANGED
Binary files a/model/__pycache__/bart.cpython-310.pyc and b/model/__pycache__/bart.cpython-310.pyc differ
 
model/__pycache__/modules.cpython-310.pyc CHANGED
Binary files a/model/__pycache__/modules.cpython-310.pyc and b/model/__pycache__/modules.cpython-310.pyc differ
 
requirements copy.txt ADDED
@@ -0,0 +1,75 @@
+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.7.0
+audioread==3.0.1
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+click==8.1.7
+decorator==5.1.1
+exceptiongroup==1.2.2
+fastapi==0.115.6
+ffmpy==0.4.0
+filelock==3.16.1
+fsspec==2024.10.0
+gradio==5.8.0
+gradio_client==1.5.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.26.5
+idna==3.10
+Jinja2==3.1.4
+joblib==1.4.2
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+msgpack==1.1.0
+numba==0.60.0
+numpy==1.26.4
+orjson==3.10.12
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+platformdirs==4.3.6
+pooch==1.8.2
+pycparser==2.22
+pydantic==2.10.3
+pydantic_core==2.27.1
+pydub==0.25.1
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.19
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+ruff==0.8.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-learn==1.6.0
+scipy==1.14.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.5.0.post1
+starlette==0.41.3
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+tomlkit==0.13.2
+torch==1.13.1
+torchaudio==0.13.1
+tqdm==4.67.1
+transformers==4.42.0
+typer==0.15.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.1
+websockets==14.1
requirements.txt CHANGED
@@ -1,8 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==1.13.1
 torchaudio==0.13.1
-transformers==4.26.1
-librosa >= 0.8
-pip>=23.2
-gradio_client==0.2.7
+transformers==4.42.0
+librosa>=0.8
+gradio==5.8.0
 numpy<2
utils/__pycache__/audio_utils.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/audio_utils.cpython-310.pyc and b/utils/__pycache__/audio_utils.cpython-310.pyc differ