import gradio as gr
import numpy as np
import os
import torch
from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST

# Download the pretrained checkpoint and example audio files on first run.
if not os.path.isfile("transfer.pth"):
    torch.hub.download_url_to_file(
        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth",
        "transfer.pth",
    )
    torch.hub.download_url_to_file(
        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav",
        "folk.wav",
    )
    torch.hub.download_url_to_file(
        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3",
        "electronic.mp3",
    )
    torch.hub.download_url_to_file(
        "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav",
        "orchestra.wav",
    )

device = "cuda" if torch.cuda.is_available() else "cpu"
example_list = ["folk.wav", "electronic.mp3", "orchestra.wav"]

# Load the LP-MusicCaps transfer model weights from the downloaded checkpoint.
model = BartCaptionModel(max_length=128)
pretrained_object = torch.load("./transfer.pth", map_location="cpu")
state_dict = pretrained_object["state_dict"]
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
model.eval()
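# model.eval() disables dropout at inference time; generation below also runs
# under torch.no_grad(), so no gradients are tracked during captioning.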

def get_audio(audio_path, duration=10, target_sr=16000):
    """Load audio, resample to 16 kHz mono, and split it into 10-second chunks."""
    n_samples = int(duration * target_sr)
    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    if len(audio.shape) == 2:
        audio = audio.mean(axis=0)  # downmix multi-channel audio to mono
    input_size = int(n_samples)
    if audio.shape[-1] < input_size:  # zero-pad clips shorter than one chunk
        pad = np.zeros(input_size)
        pad[: audio.shape[-1]] = audio
        audio = pad
    ceil = int(audio.shape[-1] // n_samples)
    audio = torch.from_numpy(
        np.stack(np.split(audio[: ceil * n_samples], ceil)).astype("float32")
    )
    return audio
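
# For example (with the defaults above), a 35-second file at 16 kHz gives
# 560,000 samples, so ceil = 3 and get_audio returns a (3, 160000) float32
# tensor of 10-second chunks; the trailing 5 seconds are dropped.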

def captioning(audio_path):
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)
    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )
    # Prefix each chunk's caption with its time range within the clip.
    inference = ""
    number_of_chunks = range(audio_tensor.shape[0])
    for chunk, text in zip(number_of_chunks, output):
        time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{time}\n{text}\n\n"
    return inference
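
# With three chunks, the returned string is formatted roughly as:
#   [0:00-10:00]
#   <caption for the first 10 seconds>
#   [10:00-20:00]
#   ...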

def load_css():
    with open("static/css/musicapp.css", "r") as file:
        css_content = file.read()
    return css_content

title = "Capabara - Interactive demo: Music Captioning 🤖🎵"
description = """
<p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
<p style='text-align: center'> SeungHeon Doh, Keunwoo Choi, Jongpil Lee, Juhan Nam, ISMIR 2023</p>
<p style='text-align: center'> <a href='https://arxiv.org/abs/2307.16372' target='_blank'>ArXiv</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>Code</a> | <a href='https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC' target='_blank'>Dataset</a> </p>
<p style='text-align: center'> To use it, simply upload your audio and click 'Submit', or click one of the examples to load it. Read more at the links below. </p>
<p style='text-align: center'> If you run into an error, please check the reference code: <a href='https://github.com/seungheondoh/lp-music-caps/blob/main/demo/app.py' target='_blank'>Demo</a>. </p>
"""
article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>GitHub</a></p>"

demo = gr.Interface(
    fn=captioning,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
    ],
    examples=example_list,
    title=title,
    theme=gr.themes.Default(font=[gr.themes.GoogleFont("Work Sans"), "sans-serif"]),
    description=description,
    article=article,
    cache_examples=False,
    css=load_css(),
)

demo.launch()