si43-musiccap

Running

App Files Community

ron-hf committed on Dec 13, 2024

Commit

4e64ba6

1 Parent(s): b9ec21e

Fix & update

Browse files

Files changed (7) hide show

.gitignore +8 -0
app.py +67 -32
model/__pycache__/bart.cpython-310.pyc +0 -0
model/__pycache__/modules.cpython-310.pyc +0 -0
requirements copy.txt +75 -0
requirements.txt +3 -4
utils/__pycache__/audio_utils.cpython-310.pyc +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+/.venv
+/*/__pycache__
+*.mp3
+*.wav
+*.pth

app.py CHANGED Viewed

@@ -1,67 +1,101 @@
-import os
-import argparse
 import gradio as gr
-from timeit import default_timer as timer
-import torch
 import numpy as np
-import pandas as pd
-from huggingface_hub import hf_hub_download
 from model.bart import BartCaptionModel
 from utils.audio_utils import load_audio, STR_CH_FIRST
 if os.path.isfile("transfer.pth") == False:
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
-    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']
-model = BartCaptionModel(max_length = 128)
-pretrained_object = torch.load('./transfer.pth', map_location='cpu')
-state_dict = pretrained_object['state_dict']
 model.load_state_dict(state_dict)
 if torch.cuda.is_available():
     torch.cuda.set_device(device)
     model = model.cuda(device)
 model.eval()
 def get_audio(audio_path, duration=10, target_sr=16000):
     n_samples = int(duration * target_sr)
     audio, sr = load_audio(
-        path= audio_path,
-        ch_format= STR_CH_FIRST,
-        sample_rate= target_sr,
-        downmix_to_mono= True,
     )
     if len(audio.shape) == 2:
         audio = audio.mean(0, False)  # to mono
     input_size = int(n_samples)
     if audio.shape[-1] < input_size:  # pad sequence
         pad = np.zeros(input_size)
         pad[: audio.shape[-1]] = audio
         audio = pad
     ceil = int(audio.shape[-1] // n_samples)
-    audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
     return audio
 def captioning(audio_path):
-    audio_tensor = get_audio(audio_path = audio_path)
     if torch.cuda.is_available():
         audio_tensor = audio_tensor.to(device)
     with torch.no_grad():
         output = model.generate(
             samples=audio_tensor,
             num_beams=5,
         )
     inference = ""
     number_of_chunks = range(audio_tensor.shape[0])
     for chunk, text in zip(number_of_chunks, output):
         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
         inference += f"{time}\n{text} \n \n"
     return inference
 title = "Interactive demo: Music Captioning 🤖🎵"
 description = """
 <p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
@@ -73,16 +107,17 @@ description = """
 article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"
-demo = gr.Interface(fn=captioning,
-                    inputs=gr.Audio(type="filepath"),
-                    outputs=[
-                        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
-                        ],
-                    examples=example_list,
-                    title=title,
-                    description=description,
-                    article=article,
-                    cache_examples=False
-                    )
-demo.launch()

 import gradio as gr
 import numpy as np
+import os
+import torch
+from timeit import default_timer as timer
 from model.bart import BartCaptionModel
 from utils.audio_utils import load_audio, STR_CH_FIRST
 if os.path.isfile("transfer.pth") == False:
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth",
+        "transfer.pth",
+    )
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav",
+        "folk.wav",
+    )
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3",
+        "electronic.mp3",
+    )
+    torch.hub.download_url_to_file(
+        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav",
+        "orchestra.wav",
+    )
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+example_list = ["folk.wav", "electronic.mp3", "orchestra.wav"]
+model = BartCaptionModel(max_length=128)
+pretrained_object = torch.load("./transfer.pth", map_location="cpu")
+state_dict = pretrained_object["state_dict"]
 model.load_state_dict(state_dict)
 if torch.cuda.is_available():
     torch.cuda.set_device(device)
     model = model.cuda(device)
 model.eval()
 def get_audio(audio_path, duration=10, target_sr=16000):
     n_samples = int(duration * target_sr)
     audio, sr = load_audio(
+        path=audio_path,
+        ch_format=STR_CH_FIRST,
+        sample_rate=target_sr,
+        downmix_to_mono=True,
     )
     if len(audio.shape) == 2:
         audio = audio.mean(0, False)  # to mono
     input_size = int(n_samples)
     if audio.shape[-1] < input_size:  # pad sequence
         pad = np.zeros(input_size)
         pad[: audio.shape[-1]] = audio
         audio = pad
     ceil = int(audio.shape[-1] // n_samples)
+    audio = torch.from_numpy(
+        np.stack(np.split(audio[: ceil * n_samples], ceil)).astype("float32")
+    )
     return audio
 def captioning(audio_path):
+    audio_tensor = get_audio(audio_path=audio_path)
     if torch.cuda.is_available():
         audio_tensor = audio_tensor.to(device)
     with torch.no_grad():
         output = model.generate(
             samples=audio_tensor,
             num_beams=5,
         )
     inference = ""
     number_of_chunks = range(audio_tensor.shape[0])
     for chunk, text in zip(number_of_chunks, output):
         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
         inference += f"{time}\n{text} \n \n"
     return inference
 title = "Interactive demo: Music Captioning 🤖🎵"
 description = """
 <p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
 article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"
+demo = gr.Interface(
+    fn=captioning,
+    inputs=gr.Audio(type="filepath"),
+    outputs=[
+        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
+    ],
+    examples=example_list,
+    title=title,
+    description=description,
+    article=article,
+    cache_examples=False,
+)
+demo.launch()

model/__pycache__/bart.cpython-310.pyc CHANGED Viewed

Binary files a/model/__pycache__/bart.cpython-310.pyc and b/model/__pycache__/bart.cpython-310.pyc differ

model/__pycache__/modules.cpython-310.pyc CHANGED Viewed

Binary files a/model/__pycache__/modules.cpython-310.pyc and b/model/__pycache__/modules.cpython-310.pyc differ

requirements copy.txt ADDED Viewed

	@@ -0,0 +1,75 @@

+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.7.0
+audioread==3.0.1
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+click==8.1.7
+decorator==5.1.1
+exceptiongroup==1.2.2
+fastapi==0.115.6
+ffmpy==0.4.0
+filelock==3.16.1
+fsspec==2024.10.0
+gradio==5.8.0
+gradio_client==1.5.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.26.5
+idna==3.10
+Jinja2==3.1.4
+joblib==1.4.2
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+msgpack==1.1.0
+numba==0.60.0
+numpy==1.26.4
+orjson==3.10.12
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+platformdirs==4.3.6
+pooch==1.8.2
+pycparser==2.22
+pydantic==2.10.3
+pydantic_core==2.27.1
+pydub==0.25.1
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.19
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+ruff==0.8.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-learn==1.6.0
+scipy==1.14.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.5.0.post1
+starlette==0.41.3
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+tomlkit==0.13.2
+torch==1.13.1
+torchaudio==0.13.1
+tqdm==4.67.1
+transformers==4.42.0
+typer==0.15.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.1
+websockets==14.1

requirements.txt CHANGED Viewed

@@ -1,8 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==1.13.1
 torchaudio==0.13.1
-transformers==4.26.1
-librosa >= 0.8
-pip>=23.2
-gradio_client==0.2.7
 numpy<2

 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==1.13.1
 torchaudio==0.13.1
+transformers==4.42.0
+librosa>=0.8
+gradio==5.8.0
 numpy<2

utils/__pycache__/audio_utils.cpython-310.pyc CHANGED Viewed

Binary files a/utils/__pycache__/audio_utils.cpython-310.pyc and b/utils/__pycache__/audio_utils.cpython-310.pyc differ