Spaces:

m-a-p
/

MERT-Music-Genre-Tagging-Prediction

Runtime error

App Files Files Community

Epsilon617 committed on May 18, 2023

Commit

826be26

1 Parent(s): 5247bff

add model inference codes

Browse files

Files changed (3) hide show

__pycache__/app.cpython-310.pyc +0 -0
app.py +25 -8
requirements.txt +88 -0

__pycache__/app.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -5,9 +5,21 @@ import torch
 from torch import nn
 import torchaudio
 import torchaudio.transforms as T
 # input cr: https://huggingface.co/spaces/thealphhamerc/audio-to-text/blob/main/app.py
 inputs = [gr.components.Audio(type="filepath", label="Add music audio file"),
           gr.inputs.Audio(source="microphone",optional=True, type="filepath"),
           ]
@@ -17,8 +29,8 @@ title = "Output the tags of a (music) audio"
 description = "An example of using MERT-95M-public to conduct music tagging."
 article = ""
 audio_examples = [
-    ["input/example-1.wav"],
-    ["input/example-2.wav"],
 ]
 # Load the model
@@ -26,13 +38,14 @@ model = AutoModel.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True
 # loading the corresponding preprocessor config
 processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public",trust_remote_code=True)
 def convert_audio(inputs, microphone):
     if (microphone is not None):
         inputs = microphone
     waveform, sample_rate = torchaudio.load(inputs)
     resample_rate = processor.sampling_rate
@@ -42,15 +55,19 @@ def convert_audio(inputs, microphone):
         resampler = T.Resample(sample_rate, resample_rate)
         waveform = resampler(waveform)
-    inputs = processor(waveform, sampling_rate=resample_rate, return_tensors="pt")
     with torch.no_grad():
-        outputs = model(**inputs, output_hidden_states=True)
     # take a look at the output shape, there are 13 layers of representation
     # each layer performs differently in different downstream tasks, you should choose empirically
-    all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
     # print(all_layer_hidden_states.shape) # [13 layer, Time steps, 768 feature_dim]
-    return str(all_layer_hidden_states.shape)
 # iface = gr.Interface(fn=convert_audio, inputs="audio", outputs="text")

 from torch import nn
 import torchaudio
 import torchaudio.transforms as T
+import logging
 # input cr: https://huggingface.co/spaces/thealphhamerc/audio-to-text/blob/main/app.py
+logger = logging.getLogger("whisper-jax-app")
+logger.setLevel(logging.INFO)
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+formatter = logging.Formatter(
+    "%(asctime)s;%(levelname)s;%(message)s", "%Y-%m-%d %H:%M:%S")
+ch.setFormatter(formatter)
+logger.addHandler(ch)
 inputs = [gr.components.Audio(type="filepath", label="Add music audio file"),
           gr.inputs.Audio(source="microphone",optional=True, type="filepath"),
           ]
 description = "An example of using MERT-95M-public to conduct music tagging."
 article = ""
 audio_examples = [
+    # ["input/example-1.wav"],
+    # ["input/example-2.wav"],
 ]
 # Load the model
 # loading the corresponding preprocessor config
 processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public",trust_remote_code=True)
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model.to(device)
 def convert_audio(inputs, microphone):
     if (microphone is not None):
         inputs = microphone
     waveform, sample_rate = torchaudio.load(inputs)
     resample_rate = processor.sampling_rate
         resampler = T.Resample(sample_rate, resample_rate)
         waveform = resampler(waveform)
+    waveform = waveform.view(-1,) # make it (n_sample, )
+    model_inputs = processor(waveform, sampling_rate=resample_rate, return_tensors="pt")
+    model_inputs.to(device)
     with torch.no_grad():
+        model_outputs = model(**model_inputs, output_hidden_states=True)
     # take a look at the output shape, there are 13 layers of representation
     # each layer performs differently in different downstream tasks, you should choose empirically
+    all_layer_hidden_states = torch.stack(model_outputs.hidden_states).squeeze()
     # print(all_layer_hidden_states.shape) # [13 layer, Time steps, 768 feature_dim]
+    # logger.warning(all_layer_hidden_states.shape)
+    return device + "  :" + str(all_layer_hidden_states.shape)
 # iface = gr.Interface(fn=convert_audio, inputs="audio", outputs="text")

requirements.txt ADDED Viewed

	@@ -0,0 +1,88 @@

+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==5.0.0
+anyio==3.6.2
+async-timeout==4.0.2
+attrs==23.1.0
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+cmake==3.26.3
+contourpy==1.0.7
+cycler==0.11.0
+fastapi==0.95.2
+ffmpy==0.3.0
+filelock==3.12.0
+fonttools==4.39.4
+frozenlist==1.3.3
+fsspec==2023.5.0
+gradio==3.31.0
+gradio_client==0.2.5
+h11==0.14.0
+httpcore==0.17.1
+httpx==0.24.0
+huggingface-hub==0.14.1
+idna==3.4
+Jinja2==3.1.2
+jsonschema==4.17.3
+kiwisolver==1.4.4
+linkify-it-py==2.0.2
+lit==16.0.5
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+networkx==3.1
+nnAudio==0.3.2
+numpy==1.24.3
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+orjson==3.8.12
+packaging==23.1
+pandas==2.0.1
+Pillow==9.5.0
+pydantic==1.10.7
+pydub==0.25.1
+Pygments==2.15.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+PyYAML==6.0
+regex==2023.5.5
+requests==2.30.0
+scipy==1.10.1
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.27.0
+sympy==1.12
+tokenizers==0.13.3
+toolz==0.12.0
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
+tqdm==4.65.0
+transformers==4.29.2
+triton==2.0.0
+typing_extensions==4.5.0
+tzdata==2023.3
+uc-micro-py==1.0.2
+urllib3==2.0.2
+uvicorn==0.22.0
+websockets==11.0.3
+yarl==1.9.2