ZhifengKong committed
Commit 92740f3 · 1 Parent(s): 15f9587
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 NVIDIA CORPORATION.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
LICENSE_OPT_IML.md ADDED
@@ -0,0 +1,65 @@
+ <h2 align="center"> OPT-IML 175B LICENSE AGREEMENT </h2>
+
+ This License Agreement (as may be amended in accordance with this License Agreement, **“License”**), between you, or your employer or other entity (if you are entering into this agreement on behalf of your employer or other entity) (**“Licensee”** or **“you”**) and Meta Platforms, Inc. (**“Meta”** or **“we”**) applies to your use of any computer program, algorithm, source code, object code, or software that is made available by Meta under this License (**“Software”**) and any specifications, manuals, documentation, and other written information provided by Meta related to the Software (**“Documentation”**).
+
+ **By clicking “I Accept” below or by using the Software, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to use the Software or Documentation (collectively, the “Software Products”), and you must immediately cease using the Software Products. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to Meta that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the Software Products on behalf of your employer or other entity.**
+ <br><br>
+ 1. **LICENSE GRANT**
+ <br><br>
+ a. Subject to your compliance with the Documentation and Sections 2, 3, and 5, Meta grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Meta’s copyright interests to reproduce, distribute, and create derivative works of the Software solely for your non-commercial research purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Meta’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License.
+ <br><br>
+ b. You may make a reasonable number of copies of the Documentation solely for use in connection with the license to the Software granted above.
+ <br><br>
+ c. The grant of rights expressly set forth in this Section 1 (License Grant) are the complete grant of rights to you in the Software Products, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Meta and its licensors reserve all rights not expressly granted by this License.
+ <br><br>
+ 2. **RESTRICTIONS**
+ <br><br>
+ You will not, and will not permit, assist or cause any third party to:
+ <br><br>
+ a. use, modify, copy, reproduce, create derivative works of, or distribute the Software Products (or any derivative works thereof, works incorporating the Software Products, or any data produced by the Software), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes or in the service of nuclear technology, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law, including accessing the Software Products from an embargoed country as prohibited by the U.S. government, and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing;
+ <br><br>
+ b. alter or remove copyright and other proprietary notices which appear on or in the Software Products;
+ <br><br>
+ c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Meta in connection with the Software, or to circumvent or remove any usage restrictions, or to enable functionality disabled by Meta; or
+ <br><br>
+ d. offer or impose any terms on the Software Products that alter, restrict, or are inconsistent with the terms of this License.
+ <br><br>
+ 3. **ATTRIBUTION**
+ <br><br>
+ Together with any copies of the Software Products (as well as derivative works thereof or works incorporating the Software Products) that you distribute, you must provide (i) a copy of this License, and (ii) the following attribution notice: “OPT-IML 175B is licensed under the OPT-175B license, Copyright (c) Meta Platforms, Inc. All Rights Reserved.”
+ <br><br>
+ 4. **DISCLAIMERS**
+ <br><br>
+ THE SOFTWARE PRODUCTS ARE PROVIDED “AS IS” and “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. META EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE SOFTWARE PRODUCTS, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. META MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE SOFTWARE PRODUCTS WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS.
+ <br><br>
+ 5. **LIMITATION OF LIABILITY**
+ <br><br>
+ TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL META BE LIABLE TO YOU (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF META HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE SOFTWARE PRODUCTS, THEIR CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, **“SOFTWARE MATERIALS”**) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE SOFTWARE MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A **“HIGH-RISK USE”**). IF YOU ELECT TO USE ANY OF THE SOFTWARE MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE SOFTWARE MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE.
+ <br><br>
+ 6. **INDEMNIFICATION**
+ <br><br>
+ You will indemnify, defend and hold harmless Meta and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the **“Meta Parties”**) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Meta Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, **“Claims”**) arising out of or related to: (a) your access to or use of the Software Products (as well as any results or data generated from such access or use), including any High-Risk Use (defined below); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Meta Parties of any such Claims, and cooperate with Meta Parties in defending such Claims. You will also grant the Meta Parties sole control of the defense or settlement, at Meta’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Meta or the other Meta Parties.
+ <br><br>
+ 7. **TERMINATION; SURVIVAL**
+ <br><br>
+ a. This License will automatically terminate upon any breach by you of the terms of this License.
+ <br><br>
+ b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you.
+ <br><br>
+ c. The following sections survive termination of this License: 2 (Restrictions), 3 (Attribution), 4 (Disclaimers), 5 (Limitation on Liability), 6 (Indemnification) 7 (Termination; Survival), 8 (Third Party Materials), 9 (Trademarks), 10 (Applicable Law; Dispute Resolution), and 11 (Miscellaneous).
+ <br><br>
+ 8. **THIRD PARTY MATERIALS**
+ <br><br>
+ The Software Products may contain third-party software or other components (including free and open source software) (all of the foregoing, **“Third Party Materials”**), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Meta does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk.
+ <br><br>
+ 9. **TRADEMARKS**
+ <br><br>
+ Licensee has not been granted any trademark license as part of this License and may not use any name or mark associated with Meta without the prior written permission of Meta, except to the extent necessary to make the reference required by the “ATTRIBUTION” section of this Agreement.
+ <br><br>
+ 10. **APPLICABLE LAW; DISPUTE RESOLUTION**
+ <br><br>
+ This License will be governed and construed under the laws of the State of California without regard to conflicts of law provisions. Any suit or proceeding arising out of or relating to this License will be brought in the federal or state courts, as applicable, in San Mateo County, California, and each party irrevocably submits to the jurisdiction and venue of such courts.
+ <br><br>
+ 11. **MISCELLANEOUS**
+ <br><br>
+ If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Meta to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the Documentation, contains the entire understanding between you and Meta regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Meta regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
app.py ADDED
@@ -0,0 +1,223 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION.
+ # Licensed under the MIT license.
+
+ import os
+ import yaml
+
+ import gradio as gr
+
+ import librosa
+ from pydub import AudioSegment
+ import soundfile as sf
+
+ import numpy as np
+ import torch
+ import laion_clap
+
+ from inference_utils import prepare_tokenizer, prepare_model, inference
+ from data import AudioTextDataProcessor
+
+
+ def load_laionclap():
+     model = laion_clap.CLAP_Module(enable_fusion=True, amodel='HTSAT-tiny').cuda()
+     model.load_ckpt(ckpt='630k-audioset-fusion-best.pt')
+     model.eval()
+     return model
+
+
+ def int16_to_float32(x):
+     return (x / 32767.0).astype(np.float32)
+
+
+ def float32_to_int16(x):
+     x = np.clip(x, a_min=-1., a_max=1.)
+     return (x * 32767.).astype(np.int16)
+
+
+ def load_audio(file_path, target_sr=44100, duration=33.25, start=0.0):
+     if file_path.endswith('.mp3'):
+         audio = AudioSegment.from_file(file_path)
+         if len(audio) > (start + duration) * 1000:
+             audio = audio[start * 1000:(start + duration) * 1000]
+
+         if audio.frame_rate != target_sr:
+             audio = audio.set_frame_rate(target_sr)
+
+         if audio.channels > 1:
+             audio = audio.set_channels(1)
+
+         data = np.array(audio.get_array_of_samples())
+         if audio.sample_width == 2:
+             data = data.astype(np.float32) / np.iinfo(np.int16).max
+         elif audio.sample_width == 4:
+             data = data.astype(np.float32) / np.iinfo(np.int32).max
+         else:
+             raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
+
+     else:
+         with sf.SoundFile(file_path) as audio:
+             original_sr = audio.samplerate
+             channels = audio.channels
+
+             max_frames = int((start + duration) * original_sr)
+
+             audio.seek(int(start * original_sr))
+             frames_to_read = min(max_frames, len(audio))
+             data = audio.read(frames_to_read)
+
+             if data.max() > 1 or data.min() < -1:
+                 data = data / max(abs(data.max()), abs(data.min()))
+
+         if original_sr != target_sr:
+             if channels == 1:
+                 data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
+             else:
+                 data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
+         else:
+             if channels != 1:
+                 data = data.T[0]
+
+     if data.min() >= 0:
+         data = 2 * data / abs(data.max()) - 1.0
+     else:
+         data = data / max(abs(data.max()), abs(data.min()))
+     return data
+
+
+ @torch.no_grad()
+ def compute_laionclap_text_audio_sim(audio_file, laionclap_model, outputs):
+     try:
+         data = load_audio(audio_file, target_sr=48000)
+
+     except Exception as e:
+         print(audio_file, 'unsuccessful due to', e)
+         return [0.0] * len(outputs)
+
+     audio_data = data.reshape(1, -1)
+     audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float().cuda()
+     audio_embed = laionclap_model.get_audio_embedding_from_data(x=audio_data_tensor, use_tensor=True)
+
+     text_embed = laionclap_model.get_text_embedding(outputs, use_tensor=True)
+
+     cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
+     cos_similarity = cos(audio_embed.repeat(text_embed.shape[0], 1), text_embed)
+     return cos_similarity.squeeze().cpu().numpy()
+
+
+ inference_kwargs = {
+     "do_sample": True,
+     "top_k": 50,
+     "top_p": 0.95,
+     "num_return_sequences": 10
+ }
+
+ config = yaml.load(open('chat.yaml'), Loader=yaml.FullLoader)
+ clap_config = config['clap_config']
+ model_config = config['model_config']
+
+ text_tokenizer = prepare_tokenizer(model_config)
+ DataProcessor = AudioTextDataProcessor(
+     data_root='./',
+     clap_config=clap_config,
+     tokenizer=text_tokenizer,
+     max_tokens=512,
+ )
+
+ laionclap_model = load_laionclap()
+
+ model = prepare_model(
+     model_config=model_config,
+     clap_config=clap_config,
+     checkpoint_path='chat.pt'
+ )
+
+
+ def inference_item(name, prompt):
+     item = {
+         'name': str(name),
+         'prefix': 'The task is dialog.',
+         'prompt': str(prompt)
+     }
+     processed_item = DataProcessor.process(item)
+
+     outputs = inference(
+         model, text_tokenizer, item, processed_item,
+         inference_kwargs,
+     )
+
+     laionclap_scores = compute_laionclap_text_audio_sim(
+         item["name"],
+         laionclap_model,
+         outputs
+     )
+
+     outputs_joint = [(output, score) for (output, score) in zip(outputs, laionclap_scores)]
+     outputs_joint.sort(key=lambda x: -x[1])
+
+     return outputs_joint[0][0]
+
+
+ with gr.Blocks(title="Audio Flamingo - Demo") as ui:
+
+     gr.HTML(
+         """
+         <div style="text-align: center; max-width: 900px; margin: 0 auto;">
+           <div
+             style="
+               display: inline-flex;
+               align-items: center;
+               gap: 0.8rem;
+               font-size: 1.5rem;
+             "
+           >
+             <h1 style="font-weight: 700; margin-bottom: 7px; line-height: normal;">
+               Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities
+             </h1>
+           </div>
+           <p style="margin-bottom: 10px; font-size: 125%">
+             <a href="https://arxiv.org/abs/2402.01831">[Paper]</a> <a href="https://github.com/NVIDIA/audio-flamingo">[Code]</a> <a href="https://audioflamingo.github.io/">[Demo]</a>
+           </p>
+         </div>
+         """
+     )
+     gr.HTML(
+         """
+         <div>
+           <h3>Model Overview</h3>
+           Audio Flamingo is an audio language model that can understand sounds beyond speech.
+           It can also answer questions about the sound in natural language.
+           Examples of questions include:
+           "Can you briefly describe what you hear in this audio?",
+           "What is the emotion conveyed in this music?",
+           "Where is this audio usually heard?",
+           or "What place is this music usually played at?".
+         </div>
+         """
+     )
+
+     name = gr.Textbox(
+         label="Audio file path (choose one from: audio/wav{1--6}.wav)",
+         value="audio/wav5.wav"
+     )
+     prompt = gr.Textbox(
+         label="Instruction",
+         value='Can you briefly describe what you hear in this audio?'
+     )
+
+     with gr.Row():
+         play_audio_button = gr.Button("Play Audio")
+         audio_output = gr.Audio(label="Playback")
+         play_audio_button.click(fn=lambda x: x, inputs=name, outputs=audio_output)
+
+     inference_button = gr.Button("Inference")
+
+     output_text = gr.Textbox(label="Audio Flamingo output")
+
+     inference_button.click(
+         fn=inference_item,
+         inputs=[name, prompt],
+         outputs=output_text
+     )
+
+ ui.queue()
+ ui.launch()
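Note on the flow in `inference_item` above: the model samples `num_return_sequences=10` candidate answers, scores each against the input audio with LAION-CLAP cosine similarity, and returns the best-scoring one. The sketch below isolates that reranking step; it is illustrative only, and `candidates`, `audio_embed`, and `text_embed` stand for the generated strings and the CLAP embeddings produced by the functions above (not names defined in `app.py`).

```python
import torch

def rerank_by_clap_similarity(candidates, audio_embed, text_embed):
    # candidates: list of N generated answer strings
    # audio_embed: (1, D) CLAP audio embedding; text_embed: (N, D) CLAP text embeddings
    cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    scores = cos(audio_embed.repeat(text_embed.shape[0], 1), text_embed)  # (N,)
    best = int(torch.argmax(scores).item())
    return candidates[best], scores
```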
audio/wav1.wav ADDED
Binary file (960 kB)
audio/wav2.wav ADDED
Binary file (960 kB)
audio/wav3.wav ADDED
Binary file (960 kB)
audio/wav4.wav ADDED
Binary file (441 kB)
audio/wav5.wav ADDED
Binary file (441 kB)
audio/wav6.wav ADDED
Binary file (441 kB)
chat.yaml ADDED
@@ -0,0 +1,23 @@
+ clap_config:
+   method: microsoft-clap
+   audio_embed_dim: 1024
+   config_root: ./ms_clap/src/configs
+   model_name: 'clapcap'
+   checkpoint: ./clapcap_weights_2023.pth
+   window_length: 7.0
+   window_overlap: 5.25
+   max_num_window: 16
+   max_num_fewshot: 4
+
+ model_config:
+   cache_dir: None
+   lang_encoder_path: facebook/opt-iml-max-1.3b
+   tokenizer_path: facebook/opt-iml-max-1.3b
+   cross_attn_every_n_layers: 1
+   audio_transformer_kwargs: {
+     n_head: 8,
+     n_layers: 3,
+     d_inner: 2048,
+     max_num_media: 128,
+     max_window_per_audio: 16,
+   }
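For orientation, the sliding-window settings above determine how much audio is fed to the CLAP encoder at once: with `window_length: 7.0`, `window_overlap: 5.25`, and `max_num_window: 16`, the covered span is `16 * (7.0 - 5.25) + 5.25 = 33.25` seconds, matching the `duration=33.25` default in `app.py`'s `load_audio`. A minimal sketch of that arithmetic, mirroring `compute_sliding_window` in `data.py`:

```python
# Maximum audio duration covered by the CLAP sliding windows (values from chat.yaml).
window_length = 7.0    # seconds per window
window_overlap = 5.25  # seconds of overlap between consecutive windows
max_num_window = 16

duration = max_num_window * (window_length - window_overlap) + window_overlap
print(duration)  # 33.25 seconds
```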
data.py ADDED
@@ -0,0 +1,243 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION.
+ # Licensed under the MIT license.
+
+ import functools
+ import io
+ import json
+ import math
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disable the tokenizer parallelism warning
+ import random
+ import re
+ import string
+ import subprocess
+ import sys
+ import yaml
+
+ import numpy as np
+
+ from collections import defaultdict
+ from copy import deepcopy
+ from dataclasses import dataclass
+ from functools import partial
+ from pydub import AudioSegment
+ from tqdm import tqdm
+
+ import torch
+ import torchvision
+ from torch.utils.data import DataLoader, Dataset, get_worker_info
+ from torch.utils.data.distributed import DistributedSampler
+
+
+ from transformers import AutoTokenizer
+
+ import librosa
+ import soundfile as sf
+
+
+ def int16_to_float32(x):
+     return (x / 32767.0).astype(np.float32)
+
+
+ def float32_to_int16(x):
+     x = np.clip(x, a_min=-1., a_max=1.)
+     return (x * 32767.).astype(np.int16)
+
+
+ class AudioTextDataProcessor:
+     def __init__(
+         self,
+         data_root: str,
+         clap_config: dict,
+         tokenizer,
+         max_tokens: int,
+         **kwargs
+     ):
+         self.data_root = data_root
+         self.clap_config = clap_config
+         self.tokenizer = tokenizer
+         self.tokenizer.padding_side = "right"
+         self.max_tokens = max_tokens
+
+     def get_num_windows(self, T, sr):
+         clap_config = self.clap_config
+         window_length = int(float(clap_config["window_length"]) * sr)
+         window_overlap = int(float(clap_config["window_overlap"]) * sr)
+         max_num_window = int(clap_config["max_num_window"])
+
+         num_windows = 1
+         if T <= window_length:
+             num_windows = 1
+             full_length = window_length
+         elif T >= (max_num_window * window_length - (max_num_window - 1) * window_overlap):
+             num_windows = max_num_window
+             full_length = (max_num_window * window_length - (max_num_window - 1) * window_overlap)
+         else:
+             num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
+             full_length = num_windows * window_length - (num_windows - 1) * window_overlap
+
+         return num_windows, full_length
+
+     def load_audio(self, file_path, target_sr=44100, duration=30.0, start=0.0):
+         if file_path.endswith('.mp3'):
+             audio = AudioSegment.from_file(file_path)
+             if len(audio) > (start + duration) * 1000:
+                 audio = audio[start * 1000:(start + duration) * 1000]
+
+             if audio.frame_rate != target_sr:
+                 audio = audio.set_frame_rate(target_sr)
+
+             if audio.channels > 1:
+                 audio = audio.set_channels(1)
+
+             data = np.array(audio.get_array_of_samples())
+             if audio.sample_width == 2:
+                 data = data.astype(np.float32) / np.iinfo(np.int16).max
+             elif audio.sample_width == 4:
+                 data = data.astype(np.float32) / np.iinfo(np.int32).max
+             else:
+                 raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
+
+         else:
+             with sf.SoundFile(file_path) as audio:
+                 original_sr = audio.samplerate
+                 channels = audio.channels
+
+                 max_frames = int((start + duration) * original_sr)
+
+                 audio.seek(int(start * original_sr))
+                 frames_to_read = min(max_frames, len(audio))
+                 data = audio.read(frames_to_read)
+
+                 if data.max() > 1 or data.min() < -1:
+                     data = data / max(abs(data.max()), abs(data.min()))
+
+             if original_sr != target_sr:
+                 if channels == 1:
+                     data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
+                 else:
+                     data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
+             else:
+                 if channels != 1:
+                     data = data.T[0]
+
+         if data.min() >= 0:
+             data = 2 * data / abs(data.max()) - 1.0
+         else:
+             data = data / max(abs(data.max()), abs(data.min()))
+
+         assert len(data.shape) == 1, data.shape
+         return data
+
+     def compute_sliding_window(self, audio_file, audio_start=0.0):
+         if type(audio_start) == str:
+             audio_start = float(audio_start)
+
+         clap_config = self.clap_config
+
+         if clap_config["method"] == 'laion-clap':
+             sr = 48000
+         elif clap_config["method"] == 'microsoft-clap':
+             sr = 44100
+         else:
+             raise NotImplementedError
+
+         window_length = int(float(clap_config["window_length"]) * sr)
+         window_overlap = int(float(clap_config["window_overlap"]) * sr)
+         max_num_window = int(clap_config["max_num_window"])
+         duration = max_num_window * (clap_config["window_length"] - clap_config["window_overlap"]) + clap_config["window_overlap"]
+
+         audio_data = self.load_audio(audio_file, sr, duration, audio_start)
+         T = len(audio_data)
+         num_windows, full_length = self.get_num_windows(T, sr)
+
+         if full_length > T:
+             audio_data = np.append(audio_data, np.zeros(full_length - T))
+         audio_data = audio_data.reshape(1, -1)
+         audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()
+
+         audio_clips = []
+         audio_embed_mask = torch.zeros(max_num_window)
+         for i in range(num_windows):
+             start = i * (window_length - window_overlap)
+             audio_clips.append(audio_data_tensor[:, start:start+window_length])
+             audio_embed_mask[i] = 1
+
+         assert sum(audio_embed_mask) == num_windows
+
+         if num_windows < max_num_window:
+             for _ in range(max_num_window - num_windows):
+                 audio_clips.append(torch.zeros_like(audio_clips[-1]))
+
+         audio_clips = torch.cat(audio_clips)  # (max_num_window, window_length * sr) cuda tensor
+
+         return audio_clips, audio_embed_mask
+
+     def preprocess_string_for_eval(self, x):
+         x = x.rstrip().lstrip()
+         x = x.lower()
+         return x
+
+     def process(self, item):
+         if type(item['name']) is str:
+             audio_files = [os.path.join(self.data_root, item['name'])]
+             audio_starts = [0 if 'audio_start' not in item else float(item['audio_start'])]
+         else:
+             audio_files = [os.path.join(self.data_root, name) for name in item['name']]
+             audio_starts = [0] * len(audio_files) if 'audio_start' not in item else item['audio_start']
+
+         audio_clips, audio_embed_mask = [], []
+         for audio_file, audio_start in zip(audio_files, audio_starts):
+             this_audio_clips, this_audio_embed_mask = self.compute_sliding_window(audio_file, audio_start)
+             audio_clips.append(this_audio_clips)
+             audio_embed_mask.append(this_audio_embed_mask)
+
+         audio_clips = torch.cat(audio_clips)
+         audio_embed_mask = torch.cat(audio_embed_mask)
+
+         correct_num_windows = int(self.clap_config["max_num_window"]) * int(self.clap_config["max_num_fewshot"])
+         if len(audio_clips) < correct_num_windows:
+             audio_clips = torch.cat([
+                 audio_clips,
+                 torch.zeros(correct_num_windows - len(audio_clips), audio_clips.shape[1])
+             ])
+             audio_embed_mask = torch.cat([
+                 audio_embed_mask,
+                 torch.zeros(correct_num_windows - len(audio_embed_mask))
+             ])
+
+         audio_clips.requires_grad = False
+         audio_embed_mask.requires_grad = False
+
+         assert type(item['name']) is str
+
+         # simple data - 1 audio, 1 text
+         if 'prompt' in item:
+             text_prompt = item['prompt'].lower()
+             prefix = item['prefix'].lower()  # the task is xxx.
+             sample = "{}{} <audio>{}\nanswer:{}".format(
+                 self.tokenizer.bos_token,
+                 self.preprocess_string_for_eval(prefix),
+                 self.preprocess_string_for_eval(text_prompt),
+                 self.tokenizer.sep_token
+             )
+
+         # dialog data - 1 audio, multiple text
+         elif 'dialogue' in item:
+             dialogue = item['dialogue']
+             prefix = item['prefix'].lower()  # the task is dialog.
+             sample = f"{self.tokenizer.bos_token}{prefix}<audio>"
+             for each_round in dialogue:
+                 sample = sample + f"user: {each_round['user']} \nassistant: {self.tokenizer.sep_token}"
+                 if 'assistant' in each_round:
+                     sample = sample + f"{each_round['assistant']}<|endofchunk|>{self.tokenizer.eos_token}\n"
+
+         text = self.tokenizer(
+             sample,
+             max_length=self.max_tokens*5,
+             padding="longest",
+             truncation="only_first",
+             return_tensors="pt"
+         )
+
+         return (item['name'], audio_clips, audio_embed_mask, text["input_ids"], text["attention_mask"])
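As a quick illustration of the windowing logic above, the sketch below applies the same formula as `get_num_windows` to a hypothetical 20-second clip at 44.1 kHz using the `chat.yaml` settings (7.0 s windows, 5.25 s overlap, up to 16 windows); the clip needs 9 windows and is zero-padded to the corresponding full length of 21 seconds. The numbers are illustrative, not from the repository.

```python
import numpy as np

# Same formula as AudioTextDataProcessor.get_num_windows, with chat.yaml settings.
sr = 44100                      # microsoft-clap sample rate
window_length = int(7.0 * sr)   # samples per window
window_overlap = int(5.25 * sr) # overlapping samples between windows
max_num_window = 16

T = int(20.0 * sr)              # a hypothetical 20-second clip
num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
full_length = num_windows * window_length - (num_windows - 1) * window_overlap

print(num_windows)       # 9
print(full_length / sr)  # 21.0 -> the clip is zero-padded to 21 seconds
```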
inference_utils.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION.
+ # Licensed under the MIT license.
+
+ import os
+ import string
+ import yaml
+ from copy import deepcopy
+
+ import torch
+ from transformers import AutoTokenizer, set_seed
+ set_seed(0)
+
+ from data import AudioTextDataProcessor
+ from src.factory import create_model_and_transforms
+
+
+ def prepare_tokenizer(model_config):
+     tokenizer_path = model_config['tokenizer_path']
+     cache_dir = model_config['cache_dir']
+     text_tokenizer = AutoTokenizer.from_pretrained(
+         tokenizer_path,
+         local_files_only=False,
+         trust_remote_code=True,
+         cache_dir=cache_dir,
+     )
+     text_tokenizer.add_special_tokens(
+         {"additional_special_tokens": ["<audio>", "<|endofchunk|>"]}
+     )
+     if text_tokenizer.pad_token is None:
+         text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
+     if text_tokenizer.sep_token is None:
+         text_tokenizer.add_special_tokens({"sep_token": "<SEP>"})
+     return text_tokenizer
+
+
+ def prepare_model(model_config, clap_config, checkpoint_path, device_id=0):
+     os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disable the tokenizer parallelism warning
+     model, tokenizer = create_model_and_transforms(
+         **model_config,
+         clap_config=clap_config,
+         use_local_files=False,
+         gradient_checkpointing=False,
+         freeze_lm_embeddings=False,
+     )
+     model.eval()
+     model = model.to(device_id)
+
+     checkpoint = torch.load(checkpoint_path, map_location="cpu")
+     model_state_dict = checkpoint["model_state_dict"]
+     model_state_dict = {k.replace("module.", ""): v for k, v in model_state_dict.items()}
+     model.load_state_dict(model_state_dict, False)
+
+     return model
+
+
+ def inference(model, tokenizer, item, processed_item, inference_kwargs, device_id=0):
+     filename, audio_clips, audio_embed_mask, input_ids, attention_mask = processed_item
+     audio_clips = audio_clips.to(device_id, dtype=None, non_blocking=True)
+     audio_embed_mask = audio_embed_mask.to(device_id, dtype=None, non_blocking=True)
+     input_ids = input_ids.to(device_id, dtype=None, non_blocking=True).squeeze()
+
+     media_token_id = tokenizer.encode("<audio>")[-1]
+     eoc_token_id = tokenizer.encode("<|endofchunk|>")[-1]
+     sep_token_id = tokenizer.sep_token_id
+     eos_token_id = tokenizer.eos_token_id
+
+     outputs = model.generate(
+         audio_x=audio_clips.unsqueeze(0),
+         audio_x_mask=audio_embed_mask.unsqueeze(0),
+         lang_x=input_ids.unsqueeze(0),
+         eos_token_id=eos_token_id,
+         max_new_tokens=128,
+         **inference_kwargs,
+     )
+
+     outputs_decoded = [
+         tokenizer.decode(output).split(tokenizer.sep_token)[-1].replace(tokenizer.eos_token, '').replace(tokenizer.pad_token, '').replace('<|endofchunk|>', '') for output in outputs
+     ]
+
+     return outputs_decoded
+
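A minimal sketch of how these helpers fit together, mirroring `app.py` above (the config file `chat.yaml`, the checkpoint `chat.pt`, and the sample audio path and prompt are the ones used by the demo; this is an illustration, not an additional file in the commit):

```python
import yaml

from data import AudioTextDataProcessor
from inference_utils import prepare_tokenizer, prepare_model, inference

config = yaml.load(open('chat.yaml'), Loader=yaml.FullLoader)

tokenizer = prepare_tokenizer(config['model_config'])
model = prepare_model(config['model_config'], config['clap_config'], checkpoint_path='chat.pt')

processor = AudioTextDataProcessor(
    data_root='./', clap_config=config['clap_config'],
    tokenizer=tokenizer, max_tokens=512,
)

item = {'name': 'audio/wav5.wav', 'prefix': 'The task is dialog.',
        'prompt': 'Can you briefly describe what you hear in this audio?'}

# Returns a list of decoded candidate answers (one per sampled sequence).
outputs = inference(model, tokenizer, item, processor.process(item),
                    {"do_sample": True, "top_k": 50, "top_p": 0.95, "num_return_sequences": 10})
print(outputs[0])
```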
ms_clap/.DS_Store ADDED
Binary file (6.15 kB)
ms_clap/.gitignore ADDED
@@ -0,0 +1,350 @@
+ ## Ignore Visual Studio temporary files, build results, and
+ ## files generated by popular Visual Studio add-ons.
+ ##
+ ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+
+ # User-specific files
+ *.rsuser
+ *.suo
+ *.user
+ *.userosscache
+ *.sln.docstates
+
+ # User-specific files (MonoDevelop/Xamarin Studio)
+ *.userprefs
+
+ # Mono auto generated files
+ mono_crash.*
+
+ # Build results
+ [Dd]ebug/
+ [Dd]ebugPublic/
+ [Rr]elease/
+ [Rr]eleases/
+ x64/
+ x86/
+ [Aa][Rr][Mm]/
+ [Aa][Rr][Mm]64/
+ bld/
+ [Bb]in/
+ [Oo]bj/
+ [Ll]og/
+ [Ll]ogs/
+
+ # Visual Studio 2015/2017 cache/options directory
+ .vs/
+ # Uncomment if you have tasks that create the project's static files in wwwroot
+ #wwwroot/
+
+ # Visual Studio 2017 auto generated files
+ Generated\ Files/
+
+ # MSTest test Results
+ [Tt]est[Rr]esult*/
+ [Bb]uild[Ll]og.*
+
+ # NUnit
+ *.VisualState.xml
+ TestResult.xml
+ nunit-*.xml
+
+ # Build Results of an ATL Project
+ [Dd]ebugPS/
+ [Rr]eleasePS/
+ dlldata.c
+
+ # Benchmark Results
+ BenchmarkDotNet.Artifacts/
+
+ # .NET Core
+ project.lock.json
+ project.fragment.lock.json
+ artifacts/
+
+ # StyleCop
+ StyleCopReport.xml
+
+ # Files built by Visual Studio
+ *_i.c
+ *_p.c
+ *_h.h
+ *.ilk
+ *.meta
+ *.obj
+ *.iobj
+ *.pch
+ *.pdb
+ *.ipdb
+ *.pgc
+ *.pgd
+ *.rsp
+ *.sbr
+ *.tlb
+ *.tli
+ *.tlh
+ *.tmp
+ *.tmp_proj
+ *_wpftmp.csproj
+ *.log
+ *.vspscc
+ *.vssscc
+ .builds
+ *.pidb
+ *.svclog
+ *.scc
+
+ # Chutzpah Test files
+ _Chutzpah*
+
+ # Visual C++ cache files
+ ipch/
+ *.aps
+ *.ncb
+ *.opendb
+ *.opensdf
+ *.sdf
+ *.cachefile
+ *.VC.db
+ *.VC.VC.opendb
+
+ # Visual Studio profiler
+ *.psess
+ *.vsp
+ *.vspx
+ *.sap
+
+ # Visual Studio Trace Files
+ *.e2e
+
+ # TFS 2012 Local Workspace
+ $tf/
+
+ # Guidance Automation Toolkit
+ *.gpState
+
+ # ReSharper is a .NET coding add-in
+ _ReSharper*/
+ *.[Rr]e[Ss]harper
+ *.DotSettings.user
+
+ # TeamCity is a build add-in
+ _TeamCity*
+
+ # DotCover is a Code Coverage Tool
+ *.dotCover
+
+ # AxoCover is a Code Coverage Tool
+ .axoCover/*
+ !.axoCover/settings.json
+
+ # Visual Studio code coverage results
+ *.coverage
+ *.coveragexml
+
+ # NCrunch
+ _NCrunch_*
+ .*crunch*.local.xml
+ nCrunchTemp_*
+
+ # MightyMoose
+ *.mm.*
+ AutoTest.Net/
+
+ # Web workbench (sass)
+ .sass-cache/
+
+ # Installshield output folder
+ [Ee]xpress/
+
+ # DocProject is a documentation generator add-in
+ DocProject/buildhelp/
+ DocProject/Help/*.HxT
+ DocProject/Help/*.HxC
+ DocProject/Help/*.hhc
+ DocProject/Help/*.hhk
+ DocProject/Help/*.hhp
+ DocProject/Help/Html2
+ DocProject/Help/html
+
+ # Click-Once directory
+ publish/
+
+ # Publish Web Output
+ *.[Pp]ublish.xml
+ *.azurePubxml
+ # Note: Comment the next line if you want to checkin your web deploy settings,
+ # but database connection strings (with potential passwords) will be unencrypted
+ *.pubxml
+ *.publishproj
+
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
+ # checkin your Azure Web App publish settings, but sensitive information contained
+ # in these scripts will be unencrypted
+ PublishScripts/
+
+ # NuGet Packages
+ *.nupkg
+ # NuGet Symbol Packages
+ *.snupkg
+ # The packages folder can be ignored because of Package Restore
+ **/[Pp]ackages/*
+ # except build/, which is used as an MSBuild target.
+ !**/[Pp]ackages/build/
+ # Uncomment if necessary however generally it will be regenerated when needed
+ #!**/[Pp]ackages/repositories.config
+ # NuGet v3's project.json files produces more ignorable files
+ *.nuget.props
+ *.nuget.targets
+
+ # Microsoft Azure Build Output
+ csx/
+ *.build.csdef
+
+ # Microsoft Azure Emulator
+ ecf/
+ rcf/
+
+ # Windows Store app package directories and files
+ AppPackages/
+ BundleArtifacts/
+ Package.StoreAssociation.xml
+ _pkginfo.txt
+ *.appx
+ *.appxbundle
+ *.appxupload
+
+ # Visual Studio cache files
+ # files ending in .cache can be ignored
+ *.[Cc]ache
+ # but keep track of directories ending in .cache
+ !?*.[Cc]ache/
+
+ # Others
+ ClientBin/
+ ~$*
+ *~
+ *.dbmdl
+ *.dbproj.schemaview
+ *.jfm
+ *.pfx
+ *.publishsettings
+ orleans.codegen.cs
+
+ # Including strong name files can present a security risk
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
+ #*.snk
+
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+ #bower_components/
+
+ # RIA/Silverlight projects
+ Generated_Code/
+
+ # Backup & report files from converting an old project file
+ # to a newer Visual Studio version. Backup files are not needed,
+ # because we have git ;-)
+ _UpgradeReport_Files/
+ Backup*/
+ UpgradeLog*.XML
+ UpgradeLog*.htm
+ ServiceFabricBackup/
+ *.rptproj.bak
+
+ # SQL Server files
+ *.mdf
+ *.ldf
+ *.ndf
+
+ # Business Intelligence projects
+ *.rdl.data
+ *.bim.layout
+ *.bim_*.settings
+ *.rptproj.rsuser
+ *- [Bb]ackup.rdl
+ *- [Bb]ackup ([0-9]).rdl
+ *- [Bb]ackup ([0-9][0-9]).rdl
+
+ # Microsoft Fakes
+ FakesAssemblies/
+
+ # GhostDoc plugin setting file
+ *.GhostDoc.xml
+
+ # Node.js Tools for Visual Studio
+ .ntvs_analysis.dat
+ node_modules/
+
+ # Visual Studio 6 build log
+ *.plg
+
+ # Visual Studio 6 workspace options file
+ *.opt
+
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+ *.vbw
+
+ # Visual Studio LightSwitch build output
+ **/*.HTMLClient/GeneratedArtifacts
+ **/*.DesktopClient/GeneratedArtifacts
+ **/*.DesktopClient/ModelManifest.xml
+ **/*.Server/GeneratedArtifacts
+ **/*.Server/ModelManifest.xml
+ _Pvt_Extensions
+
+ # Paket dependency manager
+ .paket/paket.exe
+ paket-files/
+
+ # FAKE - F# Make
+ .fake/
+
+ # CodeRush personal settings
+ .cr/personal
+
+ # Python Tools for Visual Studio (PTVS)
+ __pycache__/
+ *.pyc
+
+ # Cake - Uncomment if you are using it
+ # tools/**
+ # !tools/packages.config
+
+ # Tabs Studio
+ *.tss
+
+ # Telerik's JustMock configuration file
+ *.jmconfig
+
+ # BizTalk build output
+ *.btp.cs
+ *.btm.cs
+ *.odx.cs
+ *.xsd.cs
+
+ # OpenCover UI analysis results
+ OpenCover/
+
+ # Azure Stream Analytics local run output
+ ASALocalRun/
+
+ # MSBuild Binary and Structured Log
+ *.binlog
+
+ # NVidia Nsight GPU debugger configuration file
+ *.nvuser
+
+ # MFractors (Xamarin productivity tool) working folder
+ .mfractor/
+
+ # Local History for Visual Studio
+ .localhistory/
+
+ # BeatPulse healthcheck temp database
+ healthchecksdb
+
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
+ MigrationBackup/
+
+ # Ionide (cross platform F# VS Code tools) working folder
+ .ionide/
ms_clap/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
+ # Microsoft Open Source Code of Conduct
+
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+
+ Resources:
+
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+ - Contact [[email protected]](mailto:[email protected]) with questions or concerns
ms_clap/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) Microsoft Corporation.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE
ms_clap/README.md ADDED
@@ -0,0 +1,120 @@
+ ###### [Overview](#CLAP) | [Setup](#Setup) | [CLAP weights](#CLAP-weights) | [Usage](#Usage) | [Examples](#Examples) | [Citation](#Citation)
+
+ # CLAP
+
+ CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks, achieving SoTA in several of them, including classification, retrieval, and captioning.
+
+ <img width="832" alt="clap_diagrams" src="https://github.com/bmartin1/CLAP/assets/26778834/c5340a09-cc0c-4e41-ad5a-61546eaa824c">
+
+ ## Setup
+
+ Install the dependencies with `pip install -r requirements.txt` using Python 3 to get started.
+
+ If you have [conda](https://www.anaconda.com) installed, you can run the following:
+
+ ```shell
+ git clone https://github.com/microsoft/CLAP.git && \
+ cd CLAP && \
+ conda create -n clap python=3.10 && \
+ conda activate clap && \
+ pip install -r requirements.txt
+ ```
+
+ ## NEW CLAP weights
+ Download CLAP weights: versions _2022_, _2023_, and _clapcap_: [Pretrained Model \[Zenodo\]](https://zenodo.org/record/8378278)
+
+ _clapcap_ is the audio captioning model that uses the 2023 encoders.
+
+ ## Usage
+
+ - Zero-Shot Classification and Retrieval
+ ```python
+ # Load model (Choose between versions '2022' or '2023')
+ from src import CLAP
+
+ clap_model = CLAP("<PATH TO WEIGHTS>", version = '2023', use_cuda=False)
+
+ # Extract text embeddings (class_labels: List[str])
+ text_embeddings = clap_model.get_text_embeddings(class_labels)
+
+ # Extract audio embeddings (file_paths: List[str])
+ audio_embeddings = clap_model.get_audio_embeddings(file_paths)
+
+ # Compute similarity between audio and text embeddings
+ similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+ ```
+
+ - Audio Captioning
+ ```python
+ # Load model (Choose version 'clapcap')
+ from src import CLAP
+
+ clap_model = CLAP("<PATH TO WEIGHTS>", version = 'clapcap', use_cuda=False)
+
+ # Generate audio captions (file_paths: List[str])
+ captions = clap_model.generate_caption(file_paths)
+ ```
+
+ ## Examples
+ Take a look at `CLAP\src\` for usage examples.
+
+ To run Zero-Shot Classification on the ESC50 dataset, try the following:
+
+ ```bash
+ > cd src && python zero_shot_classification.py
+ ```
+ Output (version 2023)
+ ```bash
+ ESC50 Accuracy: 93.9%
+ ```
+
+ ## Citation
+
+ Kindly cite our work if you find it useful.
+
+ [CLAP: Learning Audio Concepts from Natural Language Supervision](https://ieeexplore.ieee.org/abstract/document/10095889)
+ ```
+ @inproceedings{CLAP2022,
+   title={Clap learning audio concepts from natural language supervision},
+   author={Elizalde, Benjamin and Deshmukh, Soham and Al Ismail, Mahmoud and Wang, Huaming},
+   booktitle={ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+   pages={1--5},
+   year={2023},
+   organization={IEEE}
+ }
+ ```
+
+ [Natural Language Supervision for General-Purpose Audio Representations](https://arxiv.org/abs/2309.05767)
+ ```
+ @misc{CLAP2023,
+   title={Natural Language Supervision for General-Purpose Audio Representations},
+   author={Benjamin Elizalde and Soham Deshmukh and Huaming Wang},
+   year={2023},
+   eprint={2309.05767},
+   archivePrefix={arXiv},
+   primaryClass={cs.SD},
+   url={https://arxiv.org/abs/2309.05767}
+ }
+ ```
+
+ ## Contributing
+
+ This project welcomes contributions and suggestions. Most contributions require you to agree to a
+ Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+ the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
+
+ When you submit a pull request, a CLA bot will automatically determine whether you need to provide
+ a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
+ provided by the bot. You will only need to do this once across all repos using our CLA.
+
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
+ contact [[email protected]](mailto:[email protected]) with any additional questions or comments.
+
+ ## Trademarks
+
+ This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
+ trademarks or logos is subject to and must follow
+ [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
+ Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
+ Any use of third-party trademarks or logos is subject to those third parties' policies.
ms_clap/SECURITY.md ADDED
@@ -0,0 +1,41 @@
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->
+
+ ## Security
+
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
+
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
+
+ ## Reporting Security Issues
+
+ **Please do not report security vulnerabilities through public GitHub issues.**
+
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
+
+ If you prefer to submit without logging in, send email to [[email protected]](mailto:[email protected]). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
+
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
+
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+ * Full paths of source file(s) related to the manifestation of the issue
+ * The location of the affected source code (tag/branch/commit or direct URL)
+ * Any special configuration required to reproduce the issue
+ * Step-by-step instructions to reproduce the issue
+ * Proof-of-concept or exploit code (if possible)
+ * Impact of the issue, including how an attacker might exploit the issue
+
+ This information will help us triage your report more quickly.
+
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
+
+ ## Preferred Languages
+
+ We prefer all communications to be in English.
+
+ ## Policy
+
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
+
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
ms_clap/SUPPORT.md ADDED
@@ -0,0 +1,25 @@
+ # TODO: The maintainer of this repo has not yet edited this file
+
+ **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
+
+ - **No CSS support:** Fill out this template with information about how to file issues and get help.
+ - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
+ - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
+
+ *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
+
+ # Support
+
+ ## How to file issues and get help
+
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
+ issues before filing new issues to avoid duplicates. For new issues, file your bug or
+ feature request as a new Issue.
+
+ For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
+ FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
+ CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
+
+ ## Microsoft Support Policy
+
+ Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
ms_clap/requirements.txt ADDED
@@ -0,0 +1,50 @@
+ appdirs==1.4.4
+ audioread==3.0.0
+ certifi==2022.12.7
+ cffi==1.15.1
+ charset-normalizer==3.0.1
+ colorama==0.4.6
+ decorator==5.1.1
+ filelock==3.9.0
+ flit_core==3.6.0
+ huggingface-hub==0.12.1
+ idna==3.4
+ importlib-metadata==6.0.0
+ importlib-resources==5.12.0
+ jaraco.classes==3.2.3
+ joblib==1.2.0
+ lazy_loader==0.1
+ librosa==0.10.0
+ llvmlite==0.39.1
+ mkl-service==2.4.0
+ more-itertools==9.0.0
+ msgpack==1.0.4
+ numba==0.56.4
+ numpy==1.23.5
+ packaging==23.0
+ pandas==1.4.2
+ pooch==1.6.0
+ pycparser==2.21
+ pywin32-ctypes==0.2.0
+ PyYAML==6.0
+ regex==2022.10.31
+ requests==2.28.2
+ scikit-learn==1.2.1
+ scipy==1.10.1
+ setuptools==65.6.3
+ six==1.16.0
+ soundfile==0.12.1
+ soxr==0.3.3
+ threadpoolctl==3.1.0
+ tokenizers==0.13.2
+ torch==1.13.1
+ torchaudio==0.13.1
+ torchlibrosa==0.1.0
+ torchvision==0.14.1
+ tqdm==4.64.1
+ transformers==4.26.1
+ typing_extensions==4.4.0
+ urllib3==1.26.14
+ wheel==0.38.4
+ wincertstore==0.2
+ zipp==3.14.0
ms_clap/src/.DS_Store ADDED
Binary file (6.15 kB)
ms_clap/src/CLAPWrapper.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ warnings.filterwarnings("ignore")
3
+ import random
4
+ import torchaudio
5
+ # from torch._six import string_classes
6
+ import collections
7
+ import re
8
+ import numpy as np
9
+ from transformers import AutoTokenizer, logging
10
+ try:
11
+ from models.clap import CLAP
12
+ from models.mapper import get_clapcap
13
+ except:
14
+ from .models.clap import CLAP
15
+ from .models.mapper import get_clapcap
16
+ import math
17
+ import torchaudio.transforms as T
18
+ import os
19
+ import torch
20
+ from importlib_resources import files
21
+ import argparse
22
+ import yaml
23
+ import sys
24
+ logging.set_verbosity_error()
25
+
26
+
27
+ class CLAPWrapper():
28
+ """
29
+ A class for interfacing CLAP model.
30
+ """
31
+
32
+ def __init__(self, model_fp, config_root, version, use_cuda=False):
33
+ self.supported_versions = ['2022', '2023', 'clapcap']
34
+ self.np_str_obj_array_pattern = re.compile(r'[SaUO]')
35
+ self.file_path = os.path.realpath(__file__)
36
+ self.default_collate_err_msg_format = (
37
+ "default_collate: batch must contain tensors, numpy arrays, numbers, "
38
+ "dicts or lists; found {}")
39
+ self.config_root = config_root
40
+ self.config_as_str = self.get_config_path(version)
41
+ self.model_fp = model_fp
42
+ self.use_cuda = use_cuda
43
+ self.version = version
44
+ if 'clapcap' in self.version:
45
+ self.clapcap, self.tokenizer, self.args = self.load_clapcap()
46
+ else:
47
+ self.clap, self.tokenizer, self.args = self.load_clap()
48
+
49
+ def get_config_path(self, version):
50
+ if version in self.supported_versions:
51
+ # config_root = /home/zkong/audio_flamingo/audio_flamingo_v1/microsoft_clap/src/configs
52
+ return f"{self.config_root}/config_{version}.yml"
53
+ else:
54
+ raise ValueError(f"The specific version is not supported. The supported versions are {str(self.supported_versions)}")
55
+
56
+ def read_config_as_args(self, config_path, args=None, is_config_str=False):
57
+ return_dict = {}
58
+
59
+ if config_path is not None:
60
+ if is_config_str:
61
+ yml_config = yaml.load(config_path, Loader=yaml.FullLoader)
62
+ else:
63
+ with open(config_path, "r") as f:
64
+ yml_config = yaml.load(f, Loader=yaml.FullLoader)
65
+
66
+ if args != None:
67
+ for k, v in yml_config.items():
68
+ if k in args.__dict__:
69
+ args.__dict__[k] = v
70
+ else:
71
+ sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k))
72
+ else:
73
+ for k, v in yml_config.items():
74
+ return_dict[k] = v
75
+
76
+ args = args if args != None else return_dict
77
+ return argparse.Namespace(**args)
78
+
79
+ def load_clap(self):
80
+ r"""Load CLAP model with args from config file"""
81
+
82
+ args = self.read_config_as_args(self.config_as_str, is_config_str=False)
83
+
84
+ if 'roberta' in args.text_model or 'clip' in args.text_model or 'gpt' in args.text_model:
85
+ self.token_keys = ['input_ids', 'attention_mask']
86
+ elif 'bert' in args.text_model:
87
+ self.token_keys = ['input_ids', 'token_type_ids', 'attention_mask']
88
+
89
+ clap = CLAP(
90
+ audioenc_name=args.audioenc_name,
91
+ sample_rate=args.sampling_rate,
92
+ window_size=args.window_size,
93
+ hop_size=args.hop_size,
94
+ mel_bins=args.mel_bins,
95
+ fmin=args.fmin,
96
+ fmax=args.fmax,
97
+ classes_num=args.num_classes,
98
+ out_emb=args.out_emb,
99
+ text_model=args.text_model,
100
+ transformer_embed_dim=args.transformer_embed_dim,
101
+ d_proj=args.d_proj
102
+ )
103
+
104
+ # Load pretrained weights for model
105
+ model_state_dict = torch.load(self.model_fp, map_location=torch.device('cpu'))['model']
106
+
107
+ # We unwrap the DDP model before saving. If a checkpoint is saved without unwrapping, the model needs to be unwrapped before `load_state_dict`:
108
+ # Reference link: https://discuss.pytorch.org/t/how-to-load-dataparallel-model-which-trained-using-multiple-gpus/146005
109
+ clap.load_state_dict(model_state_dict)
110
+
111
+ clap.eval() # set clap in eval mode
112
+ tokenizer = AutoTokenizer.from_pretrained(args.text_model)
113
+ if 'gpt' in args.text_model:
114
+ tokenizer.add_special_tokens({'pad_token': '!'})
115
+
116
+ if self.use_cuda and torch.cuda.is_available():
117
+ clap = clap.cuda()
118
+
119
+ return clap, tokenizer, args
120
+
121
+ def load_clapcap(self):
122
+ r"""Load CLAP model with args from config file"""
123
+
124
+ args = self.read_config_as_args(self.config_as_str, is_config_str=False)
125
+ args.prefix_dim = args.d_proj
126
+ text_model = args.text_model
127
+ args.text_model = args.text_decoder
128
+ args.cross_attention = True if 'cross' in args.clapcap_model.lower() else False
129
+
130
+ if 'roberta' in args.text_model or 'clip' in args.text_model or 'gpt' in args.text_model:
131
+ self.token_keys = ['input_ids', 'attention_mask']
132
+ elif 'bert' in args.text_model:
133
+ self.token_keys = ['input_ids', 'token_type_ids', 'attention_mask']
134
+
135
+ clap = CLAP(
136
+ audioenc_name=args.audioenc_name,
137
+ sample_rate=args.sampling_rate,
138
+ window_size=args.window_size,
139
+ hop_size=args.hop_size,
140
+ mel_bins=args.mel_bins,
141
+ fmin=args.fmin,
142
+ fmax=args.fmax,
143
+ classes_num=args.num_classes,
144
+ out_emb=args.out_emb,
145
+ text_model=text_model,
146
+ transformer_embed_dim=args.transformer_embed_dim,
147
+ d_proj=args.d_proj
148
+ )
149
+
150
+ clapcap = get_clapcap(args.clapcap_model)(clap, args.text_decoder, args.prefix_length, args.prefix_length_clip, args.prefix_dim,
151
+ args.num_layers, args.normalize_prefix, args.mapping_type, True, True)
152
+
153
+ model_state_dict = torch.load(self.model_fp, map_location=torch.device('cpu'))['model']
154
+ clapcap.load_state_dict(model_state_dict)
155
+
156
+ clapcap.eval() # set clapcap in eval mode
157
+ tokenizer = AutoTokenizer.from_pretrained(args.text_model)
158
+ if 'gpt' in args.text_model:
159
+ tokenizer.add_special_tokens({'pad_token': '!'})
160
+
161
+ if self.use_cuda and torch.cuda.is_available():
162
+ clapcap = clapcap.cuda()
163
+
164
+ return clapcap, tokenizer, args
165
+
166
+ def default_collate(self, batch):
167
+ r"""Puts each data field into a tensor with outer dimension batch size"""
168
+ elem = batch[0]
169
+ elem_type = type(elem)
170
+ if isinstance(elem, torch.Tensor):
171
+ out = None
172
+ if torch.utils.data.get_worker_info() is not None:
173
+ # If we're in a background process, concatenate directly into a
174
+ # shared memory tensor to avoid an extra copy
175
+ numel = sum([x.numel() for x in batch])
176
+ storage = elem.storage()._new_shared(numel)
177
+ out = elem.new(storage)
178
+ return torch.stack(batch, 0, out=out)
179
+ elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
180
+ and elem_type.__name__ != 'string_':
181
+ if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
182
+ # array of string classes and object
183
+ if self.np_str_obj_array_pattern.search(elem.dtype.str) is not None:
184
+ raise TypeError(
185
+ self.default_collate_err_msg_format.format(elem.dtype))
186
+
187
+ return self.default_collate([torch.as_tensor(b) for b in batch])
188
+ elif elem.shape == (): # scalars
189
+ return torch.as_tensor(batch)
190
+ elif isinstance(elem, float):
191
+ return torch.tensor(batch, dtype=torch.float64)
192
+ elif isinstance(elem, int):
193
+ return torch.tensor(batch)
194
+ # elif isinstance(elem, string_classes):
195
+ # return batch
196
+ elif isinstance(elem, collections.abc.Mapping):
197
+ return {key: self.default_collate([d[key] for d in batch]) for key in elem}
198
+ elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
199
+ return elem_type(*(self.default_collate(samples) for samples in zip(*batch)))
200
+ elif isinstance(elem, collections.abc.Sequence):
201
+ # check to make sure that the elements in batch have consistent size
202
+ it = iter(batch)
203
+ elem_size = len(next(it))
204
+ if not all(len(elem) == elem_size for elem in it):
205
+ raise RuntimeError(
206
+ 'each element in list of batch should be of equal size')
207
+ transposed = zip(*batch)
208
+ return [self.default_collate(samples) for samples in transposed]
209
+
210
+ raise TypeError(self.default_collate_err_msg_format.format(elem_type))
211
+
212
+ def read_audio(self, audio_path, resample=False):
213
+ r"""Loads audio file or array and returns a torch tensor"""
214
+ # Randomly sample a segment of audio_duration from the clip or pad to match duration
215
+ audio_time_series, sample_rate = torchaudio.load(audio_path)
216
+
217
+ resample_rate = self.args.sampling_rate
218
+ if resample:
219
+ resampler = T.Resample(sample_rate, resample_rate)
220
+ audio_time_series = resampler(audio_time_series)
221
+ return audio_time_series, sample_rate
222
+
223
+ def load_audio_into_tensor(self, audio_path, audio_duration, resample=False):
224
+ r"""Loads audio file and returns raw audio."""
225
+ # Randomly sample a segment of audio_duration from the clip or pad to match duration
226
+ audio_time_series, sample_rate = self.read_audio(audio_path, resample=False)
227
+ audio_time_series = audio_time_series.reshape(-1)
228
+
229
+ # audio_time_series is shorter than predefined audio duration,
230
+ # so audio_time_series is extended
231
+ if audio_duration*sample_rate >= audio_time_series.shape[0]:
232
+ repeat_factor = int(np.ceil((audio_duration*sample_rate) /
233
+ audio_time_series.shape[0]))
234
+ # Repeat audio_time_series by repeat_factor to match audio_duration
235
+ audio_time_series = audio_time_series.repeat(repeat_factor)
236
+ # remove excess part of audio_time_series
237
+ audio_time_series = audio_time_series[0:audio_duration*sample_rate]
238
+ else:
239
+ # audio_time_series is longer than predefined audio duration,
240
+ # so audio_time_series is trimmed
241
+ start_index = random.randrange(
242
+ audio_time_series.shape[0] - audio_duration*sample_rate)
243
+ audio_time_series = audio_time_series[start_index:start_index +
244
+ audio_duration*sample_rate]
245
+ return torch.FloatTensor(audio_time_series)
246
+
247
+ # modified by Kong
248
+ def load_audio_clip_into_tensor(self, audio_clip, audio_duration, resample=False):
249
+ r"""Loads audio clip and returns raw audio."""
250
+ # Randomly sample a segment of audio_duration from the clip or pad to match duration
251
+ sample_rate = 44100
252
+ audio_time_series = audio_clip.reshape(-1)
253
+
254
+ # audio_time_series is shorter than predefined audio duration,
255
+ # so audio_time_series is extended
256
+ assert audio_duration * sample_rate >= audio_time_series.shape[0], \
257
+ 'dur * sr = {} should be larger than len = {}'.format(audio_duration * sample_rate, audio_time_series.shape[0])
258
+ repeat_factor = int(np.ceil((audio_duration*sample_rate) /
259
+ audio_time_series.shape[0]))
260
+ # Repeat audio_time_series by repeat_factor to match audio_duration
261
+ audio_time_series = audio_time_series.repeat(repeat_factor)
262
+ # remove excess part of audio_time_series
263
+ audio_time_series = audio_time_series[0:audio_duration*sample_rate]
264
+
265
+ # return torch.FloatTensor(audio_time_series)
266
+ return audio_time_series # already on cuda device
267
+
268
+ def preprocess_audio(self, audio_files, resample):
269
+ r"""Load list of audio files and return raw audio"""
270
+ audio_tensors = []
271
+ for audio_file in audio_files:
272
+ audio_tensor = self.load_audio_into_tensor(
273
+ audio_file, self.args.duration, resample)
274
+ audio_tensor = audio_tensor.reshape(
275
+ 1, -1).cuda() if self.use_cuda and torch.cuda.is_available() else audio_tensor.reshape(1, -1)
276
+ audio_tensors.append(audio_tensor)
277
+ return self.default_collate(audio_tensors)
278
+
279
+ # modified by Kong
280
+ def preprocess_audio_clips(self, audio_clips, resample=False):
281
+ r"""Load list of audio clips and return raw audio"""
282
+ audio_tensors = []
283
+ for audio_clip in audio_clips:
284
+ audio_tensor = self.load_audio_clip_into_tensor(
285
+ audio_clip, self.args.duration, resample=False)
286
+ audio_tensor = audio_tensor.reshape(
287
+ 1, -1).cuda() if self.use_cuda and torch.cuda.is_available() else audio_tensor.reshape(1, -1)
288
+ audio_tensors.append(audio_tensor)
289
+ return self.default_collate(audio_tensors)
290
+
291
+ def preprocess_text(self, text_queries):
292
+ r"""Load list of class labels and return tokenized text"""
293
+ tokenized_texts = []
294
+ for ttext in text_queries:
295
+ if 'gpt' in self.args.text_model:
296
+ ttext = ttext + ' <|endoftext|>'
297
+ tok = self.tokenizer.encode_plus(
298
+ text=ttext, add_special_tokens=True, max_length=self.args.text_len, padding='max_length', return_tensors="pt")
299
+ for key in self.token_keys:
300
+ tok[key] = tok[key].reshape(-1).cuda() if self.use_cuda and torch.cuda.is_available() else tok[key].reshape(-1)
301
+ tokenized_texts.append(tok)
302
+ return self.default_collate(tokenized_texts)
303
+
304
+ def get_text_embeddings(self, class_labels):
305
+ r"""Load list of class labels and return text embeddings"""
306
+ preprocessed_text = self.preprocess_text(class_labels)
307
+ return self._get_text_embeddings(preprocessed_text)
308
+
309
+ def get_audio_embeddings(self, audio_files, resample):
310
+ r"""Load list of audio files and return a audio embeddings"""
311
+ preprocessed_audio = self.preprocess_audio(audio_files, resample)
312
+ return self._get_audio_embeddings(preprocessed_audio)
313
+
314
+ # modified by Kong
315
+ def get_audio_embeddings_from_clips(self, audio_clips, resample=False):
316
+ r"""Load list of audio files and return a audio embeddings"""
317
+ preprocessed_audio = self.preprocess_audio_clips(audio_clips, resample)
318
+ return self._get_audio_embeddings(preprocessed_audio)
319
+
320
+ def _get_text_embeddings(self, preprocessed_text):
321
+ r"""Load preprocessed text and return text embeddings"""
322
+ with torch.no_grad():
323
+ return self.clap.caption_encoder(preprocessed_text)
324
+
325
+ # modified by Kong
326
+ def _get_audio_embeddings(self, preprocessed_audio):
327
+ r"""Load preprocessed audio and return a audio embeddings"""
328
+ with torch.no_grad():
329
+ preprocessed_audio = preprocessed_audio.reshape(
330
+ preprocessed_audio.shape[0], preprocessed_audio.shape[2])
331
+ # Index [0] is the audio embedding; [1] has the output class probabilities
332
+ if 'clapcap' in self.version:
333
+ return self.clapcap.clap(preprocessed_audio)[0]
334
+ else:
335
+ return self.clap.audio_encoder(preprocessed_audio)[0]
336
+
337
+ def _generic_batch_inference(self, func, *args):
338
+ r"""Process audio and/or text per batch"""
339
+ input_tmp = args[0]
340
+ batch_size = args[-1]
341
+ # args[0] has audio_files, args[1] has class_labels
342
+ inputs = [args[0], args[1]] if len(args) == 3 else [args[0]]
343
+ args0_len = len(args[0])
344
+ # compute text_embeddings once for all the audio_files batches
345
+ if len(inputs) == 2:
346
+ text_embeddings = self.get_text_embeddings(args[1])
347
+ inputs = [args[0], args[1], text_embeddings]
348
+ dataset_idx = 0
349
+ for _ in range(math.ceil(args0_len/batch_size)):
350
+ next_batch_idx = dataset_idx + batch_size
351
+ # batch size is bigger than available audio/text items
352
+ if next_batch_idx >= args0_len:
353
+ inputs[0] = input_tmp[dataset_idx:]
354
+ return func(*tuple(inputs))
355
+ else:
356
+ inputs[0] = input_tmp[dataset_idx:next_batch_idx]
357
+ yield func(*tuple(inputs))
358
+ dataset_idx = next_batch_idx
359
+
360
+ def get_audio_embeddings_per_batch(self, audio_files, batch_size):
361
+ r"""Load preprocessed audio and return a audio embeddings per batch"""
362
+ return self._generic_batch_inference(self.get_audio_embeddings, audio_files, batch_size)
363
+
364
+ def get_text_embeddings_per_batch(self, class_labels, batch_size):
365
+ r"""Load preprocessed text and return text embeddings per batch"""
366
+ return self._generic_batch_inference(self.get_text_embeddings, class_labels, batch_size)
367
+
368
+ def compute_similarity(self, audio_embeddings, text_embeddings):
369
+ r"""Compute similarity between text and audio embeddings"""
370
+ audio_embeddings = audio_embeddings/torch.norm(audio_embeddings, dim=-1, keepdim=True)
371
+ text_embeddings = text_embeddings/torch.norm(text_embeddings, dim=-1, keepdim=True)
372
+
373
+ logit_scale = self.clap.logit_scale.exp()
374
+ similarity = logit_scale*text_embeddings @ audio_embeddings.T
375
+ return similarity.T
376
+
377
+ def classify_audio_files_per_batch(self, audio_files, class_labels, batch_size):
378
+ r"""Compute classification probabilities for each audio recording in a batch and each class label"""
379
+ return self._generic_batch_inference(self.classify_audio_files, audio_files, class_labels, batch_size)
380
+
381
+ def generate_caption(self, audio_files, resample=True, beam_size: int = 5, entry_length=67, temperature=1.):
382
+ r"""Generate audio captions for each audio recording in a batch"""
383
+ captions = []
384
+ audio_tensors = self.preprocess_audio(audio_files, resample)
385
+
386
+ with torch.no_grad():
387
+ prefix = self.clapcap.clap(audio_tensors.squeeze(1))[0]
388
+ if self.args.normalize_prefix:
389
+ prefix = prefix / prefix.norm(2, -1).reshape(-1,1)
390
+ prefix_embed = self.clapcap.clap_project(prefix).view(-1, self.args.prefix_length, self.clapcap.gpt.transformer.wte.weight.shape[1])
391
+
392
+ for i in range(len(audio_tensors)):
393
+ gen_caption = self._generate_beam(embed=prefix_embed[i].unsqueeze(0),\
394
+ beam_size=beam_size,\
395
+ entry_length=entry_length,\
396
+ temperature=temperature)[0]
397
+ captions.append(gen_caption.capitalize())
398
+ return captions
399
+
400
+ def _generate_beam(self, beam_size: int = 5, prompt=None, embed=None,
401
+ entry_length=67, temperature=1., stop_token: str = ' <|endoftext|>'):
402
+ r"""Generate captions by beam search decoding"""
403
+ self.clapcap.eval()
404
+ stop_token_index = self.tokenizer.encode(stop_token)[0]
405
+ tokens = None
406
+ scores = None
407
+ device = next(self.clapcap.parameters()).device
408
+ seq_lengths = torch.ones(beam_size, device=device)
409
+ is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
410
+ with torch.no_grad():
411
+ if embed is not None:
412
+ generated = embed
413
+ else:
414
+ if tokens is None:
415
+ tokens = torch.tensor(self.tokenizer.encode(prompt))
416
+ tokens = tokens.unsqueeze(0).to(device)
417
+ generated = self.clapcap.gpt.transformer.wte(tokens)
418
+ for i in range(entry_length):
419
+ outputs = self.clapcap.gpt(inputs_embeds=generated)
420
+ logits = outputs.logits
421
+ logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
422
+ logits = logits.softmax(-1).log()
423
+ if scores is None:
424
+ scores, next_tokens = logits.topk(beam_size, -1)
425
+ generated = generated.expand(beam_size, *generated.shape[1:])
426
+ next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
427
+ if tokens is None:
428
+ tokens = next_tokens
429
+ else:
430
+ tokens = tokens.expand(beam_size, *tokens.shape[1:])
431
+ tokens = torch.cat((tokens, next_tokens), dim=1)
432
+ else:
433
+ logits[is_stopped] = -float(np.inf)
434
+ logits[is_stopped, 0] = 0
435
+ scores_sum = scores[:, None] + logits
436
+ seq_lengths[~is_stopped] += 1
437
+ scores_sum_average = scores_sum / seq_lengths[:, None]
438
+ scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
439
+ next_tokens_source = next_tokens // scores_sum.shape[1]
440
+ seq_lengths = seq_lengths[next_tokens_source]
441
+ next_tokens = next_tokens % scores_sum.shape[1]
442
+ next_tokens = next_tokens.unsqueeze(1)
443
+ tokens = tokens[next_tokens_source]
444
+ tokens = torch.cat((tokens, next_tokens), dim=1)
445
+ generated = generated[next_tokens_source]
446
+ scores = scores_sum_average * seq_lengths
447
+ is_stopped = is_stopped[next_tokens_source]
448
+ next_token_embed = self.clapcap.gpt.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
449
+ generated = torch.cat((generated, next_token_embed), dim=1)
450
+ is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
451
+ if is_stopped.all():
452
+ break
453
+ scores = scores / seq_lengths
454
+ output_list = tokens.cpu().numpy()
455
+ output_texts = [self.tokenizer.decode(output[:int(length)]) for output, length in zip(output_list, seq_lengths)]
456
+ order = scores.argsort(descending=True)
457
+ output_texts = [output_texts[i] for i in order]
458
+ return output_texts
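For reference, a minimal usage sketch of the wrapper defined above. The checkpoint path, config directory, and audio file are placeholders; it assumes a 2023 CLAP checkpoint and the `configs` directory shipped in this commit.

```python
# Sketch: audio/text embeddings and similarity with CLAPWrapper (paths are placeholders).
from CLAPWrapper import CLAPWrapper

clap = CLAPWrapper(
    model_fp="CLAP_weights_2023.pth",    # placeholder checkpoint path
    config_root="ms_clap/src/configs",   # directory holding config_2023.yml
    version="2023",
    use_cuda=False,
)

class_labels = ["a dog barking", "rain falling", "a siren wailing"]
audio_files = ["example.wav"]            # placeholder audio path

text_emb = clap.get_text_embeddings(class_labels)
audio_emb = clap.get_audio_embeddings(audio_files, resample=True)

# compute_similarity returns one row per audio clip and one column per label.
similarity = clap.compute_similarity(audio_emb, text_emb)
print(similarity.softmax(dim=-1))
```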
ms_clap/src/__init__.py ADDED
File without changes
ms_clap/src/audio_captioning.py ADDED
@@ -0,0 +1,25 @@
1
+ """
2
+ This is an example using CLAPCAP for audio captioning.
3
+ """
4
+ from CLAPWrapper import CLAPWrapper
5
+
6
+ # Load and initialize CLAP
7
+ weights_path = "weights_path"
8
+ clap_model = CLAPWrapper(weights_path, config_root="config_root", version = 'clapcap', use_cuda=False)
9
+
10
+ #Load audio files
11
+ audio_files = ['audio_file']
12
+
13
+ # Generate captions for the recording
14
+ captions = clap_model.generate_caption(audio_files, resample=True, beam_size=5, entry_length=67, temperature=0.01)
15
+
16
+ # Print the result
17
+ for i in range(len(audio_files)):
18
+ print(f"Audio file: {audio_files[i]} \n")
19
+ print(f"Generated caption: {captions[i]} \n")
20
+
21
+ """
22
+ The output (the exact caption may vary):
23
+
24
+ The birds are singing in the trees.
25
+ """
ms_clap/src/configs/config_2022.yml ADDED
@@ -0,0 +1,26 @@
1
+ # TEXT ENCODER CONFIG
2
+ text_model: 'bert-base-uncased'
3
+ text_len: 100
4
+ transformer_embed_dim: 768
5
+ freeze_text_encoder_weights: True
6
+
7
+ # AUDIO ENCODER CONFIG
8
+ audioenc_name: 'Cnn14'
9
+ out_emb: 2048
10
+ sampling_rate: 44100
11
+ duration: 5
12
+ fmin: 50
13
+ fmax: 14000
14
+ n_fft: 1028
15
+ hop_size: 320
16
+ mel_bins: 64
17
+ window_size: 1024
18
+
19
+ # PROJECTION SPACE CONFIG
20
+ d_proj: 1024
21
+ temperature: 0.003
22
+
23
+ # TRAINING AND EVALUATION CONFIG
24
+ num_classes: 527
25
+ batch_size: 1024
26
+ demo: False
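These YAML files are consumed by `read_config_as_args` in `CLAPWrapper.py`, which flattens them into an `argparse.Namespace`. A standalone sketch of that conversion, assuming the file sits at the path shown:

```python
# Sketch: load config_2022.yml into a Namespace, mirroring read_config_as_args.
import argparse
import yaml

with open("ms_clap/src/configs/config_2022.yml", "r") as f:  # placeholder path
    cfg = yaml.load(f, Loader=yaml.FullLoader)

args = argparse.Namespace(**cfg)
print(args.text_model, args.sampling_rate, args.duration)  # bert-base-uncased 44100 5
```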
ms_clap/src/configs/config_2023.yml ADDED
@@ -0,0 +1,26 @@
1
+ # TEXT ENCODER CONFIG
2
+ text_model: 'gpt2'
3
+ text_len: 77
4
+ transformer_embed_dim: 768
5
+ freeze_text_encoder_weights: True
6
+
7
+ # AUDIO ENCODER CONFIG
8
+ audioenc_name: 'HTSAT'
9
+ out_emb: 768
10
+ sampling_rate: 44100
11
+ duration: 7
12
+ fmin: 50
13
+ fmax: 8000 #14000
14
+ n_fft: 1024 # 1028
15
+ hop_size: 320
16
+ mel_bins: 64
17
+ window_size: 1024
18
+
19
+ # PROJECTION SPACE CONFIG
20
+ d_proj: 1024
21
+ temperature: 0.003
22
+
23
+ # TRAINING AND EVALUATION CONFIG
24
+ num_classes: 527
25
+ batch_size: 1024
26
+ demo: False
ms_clap/src/configs/config_clapcap.yml ADDED
@@ -0,0 +1,34 @@
1
+ # TEXT ENCODER CONFIG
2
+ text_model: 'gpt2'
3
+ transformer_embed_dim: 768
4
+ freeze_text_encoder_weights: True
5
+
6
+ # AUDIO ENCODER CONFIG
7
+ audioenc_name: 'HTSAT'
8
+ out_emb: 768
9
+ sampling_rate: 44100
10
+ duration: 7
11
+ fmin: 50
12
+ fmax: 8000
13
+ n_fft: 1024
14
+ hop_size: 320
15
+ mel_bins: 64
16
+ window_size: 1024
17
+
18
+ # PROJECTION SPACE CONFIG
19
+ d_proj: 1024
20
+ temperature: 0.003
21
+
22
+ # TRAINING AND EVALUATION CONFIG
23
+ batch_size: 128
24
+ num_classes: 527
25
+
26
+ # CLAPCAP CONFIG
27
+ clapcap_model: 'ClapCaption'
28
+ text_decoder: 'gpt2'
29
+ prefix_length: 40
30
+ prefix_length_clip: 40
31
+ mapping_type: 'transformer'
32
+ num_layers: 8
33
+ normalize_prefix: True
34
+ freeze_gpt_weights: True
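In `generate_caption`, the CLAP audio embedding (`d_proj` wide) is mapped to `prefix_length` GPT-2 token embeddings before decoding. A shape-only sketch of what this config implies; the `nn.Linear` below is just a stand-in for the learned mapping network (`mapping_type: 'transformer'` in the real model).

```python
# Sketch of the prefix shapes implied by this config. The Linear is only a
# stand-in for clapcap.clap_project (a transformer mapper in the actual model).
import torch

batch_size, d_proj, prefix_length, gpt_hidden = 2, 1024, 40, 768  # 768 = gpt2 hidden size

audio_embedding = torch.randn(batch_size, d_proj)
clap_project = torch.nn.Linear(d_proj, prefix_length * gpt_hidden)

prefix_embed = clap_project(audio_embedding).view(-1, prefix_length, gpt_hidden)
print(prefix_embed.shape)  # torch.Size([2, 40, 768]), fed to GPT-2 as inputs_embeds
```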
ms_clap/src/esc50_dataset.py ADDED
@@ -0,0 +1,82 @@
1
+ from torch.utils.data import Dataset
2
+ from torchvision.datasets.utils import download_url
3
+ from tqdm import tqdm
4
+ import pandas as pd
5
+ import os
6
+ import torch.nn as nn
7
+ import torch
8
+
9
+ class AudioDataset(Dataset):
10
+ def __init__(self, root: str, download: bool = True):
11
+ self.root = os.path.expanduser(root)
12
+ if download:
13
+ self.download()
14
+
15
+ def __getitem__(self, index):
16
+ raise NotImplementedError
17
+
18
+ def download(self):
19
+ raise NotImplementedError
20
+
21
+ def __len__(self):
22
+ raise NotImplementedError
23
+
24
+
25
+ class ESC50(AudioDataset):
26
+ base_folder = 'ESC-50-master'
27
+ url = "https://github.com/karoldvl/ESC-50/archive/master.zip"
28
+ filename = "ESC-50-master.zip"
29
+ num_files_in_dir = 2000
30
+ audio_dir = 'audio'
31
+ label_col = 'category'
32
+ file_col = 'filename'
33
+ meta = {
34
+ 'filename': os.path.join('meta','esc50.csv'),
35
+ }
36
+
37
+ def __init__(self, root, reading_transformations: nn.Module = None, download: bool = True):
38
+ super().__init__(root)
39
+ self._load_meta()
40
+
41
+ self.targets, self.audio_paths = [], []
42
+ self.pre_transformations = reading_transformations
43
+ print("Loading audio files")
44
+ # self.df['filename'] = os.path.join(self.root, self.base_folder, self.audio_dir) + os.sep + self.df['filename']
45
+ self.df['category'] = self.df['category'].str.replace('_',' ')
46
+
47
+ for _, row in tqdm(self.df.iterrows()):
48
+ file_path = os.path.join(self.root, self.base_folder, self.audio_dir, row[self.file_col])
49
+ self.targets.append(row[self.label_col])
50
+ self.audio_paths.append(file_path)
51
+
52
+ def _load_meta(self):
53
+ path = os.path.join(self.root, self.base_folder, self.meta['filename'])
54
+
55
+ self.df = pd.read_csv(path)
56
+ self.class_to_idx = {}
57
+ self.classes = [x.replace('_',' ') for x in sorted(self.df[self.label_col].unique())]
58
+ for i, category in enumerate(self.classes):
59
+ self.class_to_idx[category] = i
60
+
61
+ def __getitem__(self, index):
62
+ """
63
+ Args:
64
+ index (int): Index
65
+ Returns:
66
+ tuple: (file_path, target, one_hot_target) where target is the class name and one_hot_target is its one-hot encoding.
67
+ """
68
+ file_path, target = self.audio_paths[index], self.targets[index]
69
+ idx = torch.tensor(self.class_to_idx[target])
70
+ one_hot_target = torch.zeros(len(self.classes)).scatter_(0, idx, 1).reshape(1,-1)
71
+ return file_path, target, one_hot_target
72
+
73
+ def __len__(self):
74
+ return len(self.audio_paths)
75
+
76
+ def download(self):
77
+ download_url(self.url, self.root, self.filename)
78
+
79
+ # extract file
80
+ from zipfile import ZipFile
81
+ with ZipFile(os.path.join(self.root, self.filename), 'r') as zip:
82
+ zip.extractall(path=self.root)
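A minimal usage sketch of the dataset class above; the `root` directory is a placeholder, and the first call downloads and extracts the ESC-50 archive (several hundred MB).

```python
# Sketch: build and index the ESC-50 dataset defined above (root is a placeholder).
from esc50_dataset import ESC50

dataset = ESC50(root="esc50_data", download=True)
print(len(dataset), "clips across", len(dataset.classes), "classes")

file_path, target, one_hot = dataset[0]
print(file_path, target, one_hot.shape)  # <root>/ESC-50-master/audio/..., e.g. 'dog', torch.Size([1, 50])
```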
ms_clap/src/models/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from . import clap
2
+ from . import audio
3
+ from . import htsat
4
+ from . import config
5
+ from . import pytorch_utils
6
+ from . import htsat
ms_clap/src/models/audio.py ADDED
@@ -0,0 +1,186 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
5
+
6
+ try:
7
+ from models.htsat import HTSATWrapper
8
+ except:
9
+ from .htsat import HTSATWrapper
10
+
11
+ def get_audio_encoder(name: str):
12
+ if name == "Cnn14":
13
+ return Cnn14
14
+ elif name == "HTSAT":
15
+ return HTSATWrapper
16
+ else:
17
+ raise Exception('The audio encoder name {} is incorrect or not supported'.format(name))
18
+
19
+
20
+ class ConvBlock(nn.Module):
21
+ def __init__(self, in_channels, out_channels):
22
+
23
+ super(ConvBlock, self).__init__()
24
+
25
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
26
+ out_channels=out_channels,
27
+ kernel_size=(3, 3), stride=(1, 1),
28
+ padding=(1, 1), bias=False)
29
+
30
+ self.conv2 = nn.Conv2d(in_channels=out_channels,
31
+ out_channels=out_channels,
32
+ kernel_size=(3, 3), stride=(1, 1),
33
+ padding=(1, 1), bias=False)
34
+
35
+ self.bn1 = nn.BatchNorm2d(out_channels)
36
+ self.bn2 = nn.BatchNorm2d(out_channels)
37
+
38
+
39
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
40
+
41
+ x = input
42
+ x = F.relu_(self.bn1(self.conv1(x)))
43
+ x = F.relu_(self.bn2(self.conv2(x)))
44
+ if pool_type == 'max':
45
+ x = F.max_pool2d(x, kernel_size=pool_size)
46
+ elif pool_type == 'avg':
47
+ x = F.avg_pool2d(x, kernel_size=pool_size)
48
+ elif pool_type == 'avg+max':
49
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
50
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
51
+ x = x1 + x2
52
+ else:
53
+ raise Exception('Incorrect argument!')
54
+
55
+ return x
56
+
57
+
58
+ class ConvBlock5x5(nn.Module):
59
+ def __init__(self, in_channels, out_channels):
60
+
61
+ super(ConvBlock5x5, self).__init__()
62
+
63
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
64
+ out_channels=out_channels,
65
+ kernel_size=(5, 5), stride=(1, 1),
66
+ padding=(2, 2), bias=False)
67
+
68
+ self.bn1 = nn.BatchNorm2d(out_channels)
69
+
70
+
71
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
72
+
73
+ x = input
74
+ x = F.relu_(self.bn1(self.conv1(x)))
75
+ if pool_type == 'max':
76
+ x = F.max_pool2d(x, kernel_size=pool_size)
77
+ elif pool_type == 'avg':
78
+ x = F.avg_pool2d(x, kernel_size=pool_size)
79
+ elif pool_type == 'avg+max':
80
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
81
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
82
+ x = x1 + x2
83
+ else:
84
+ raise Exception('Incorrect argument!')
85
+
86
+ return x
87
+
88
+
89
+ class AttBlock(nn.Module):
90
+ def __init__(self, n_in, n_out, activation='linear', temperature=1.):
91
+ super(AttBlock, self).__init__()
92
+
93
+ self.activation = activation
94
+ self.temperature = temperature
95
+ self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
96
+ self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
97
+
98
+ self.bn_att = nn.BatchNorm1d(n_out)
99
+
100
+ def forward(self, x):
101
+ # x: (n_samples, n_in, n_time)
102
+ norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
103
+ cla = self.nonlinear_transform(self.cla(x))
104
+ x = torch.sum(norm_att * cla, dim=2)
105
+ return x, norm_att, cla
106
+
107
+ def nonlinear_transform(self, x):
108
+ if self.activation == 'linear':
109
+ return x
110
+ elif self.activation == 'sigmoid':
111
+ return torch.sigmoid(x)
112
+
113
+
114
+ class Cnn14(nn.Module):
115
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
116
+ fmax, classes_num, out_emb):
117
+
118
+ super(Cnn14, self).__init__()
119
+
120
+ window = 'hann'
121
+ center = True
122
+ pad_mode = 'reflect'
123
+ ref = 1.0
124
+ amin = 1e-10
125
+ top_db = None
126
+
127
+ # Spectrogram extractor
128
+ self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
129
+ win_length=window_size, window=window, center=center, pad_mode=pad_mode,
130
+ freeze_parameters=True)
131
+
132
+ # Logmel feature extractor
133
+ self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
134
+ n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
135
+ freeze_parameters=True)
136
+
137
+ self.bn0 = nn.BatchNorm2d(64)
138
+
139
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
140
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
141
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
142
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
143
+ self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
144
+ self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
145
+
146
+ # out_emb is 2048 for best Cnn14
147
+ self.fc1 = nn.Linear(2048, out_emb, bias=True)
148
+ self.fc_audioset = nn.Linear(out_emb, classes_num, bias=True)
149
+
150
+ def forward(self, input, mixup_lambda=None):
151
+ """
152
+ Input: (batch_size, data_length)
153
+ """
154
+
155
+ x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
156
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
157
+
158
+ x = x.transpose(1, 3)
159
+ x = self.bn0(x)
160
+ x = x.transpose(1, 3)
161
+
162
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
163
+ x = F.dropout(x, p=0.2, training=self.training)
164
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
165
+ x = F.dropout(x, p=0.2, training=self.training)
166
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
167
+ x = F.dropout(x, p=0.2, training=self.training)
168
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
169
+ x = F.dropout(x, p=0.2, training=self.training)
170
+ x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
171
+ x = F.dropout(x, p=0.2, training=self.training)
172
+ x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
173
+ x = F.dropout(x, p=0.2, training=self.training)
174
+ x = torch.mean(x, dim=3)
175
+
176
+ (x1, _) = torch.max(x, dim=2)
177
+ x2 = torch.mean(x, dim=2)
178
+ x = x1 + x2
179
+ x = F.dropout(x, p=0.5, training=self.training)
180
+ x = F.relu_(self.fc1(x))
181
+ embedding = F.dropout(x, p=0.5, training=self.training)
182
+ clipwise_output = torch.sigmoid(self.fc_audioset(x))
183
+
184
+ output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}
185
+
186
+ return output_dict
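A quick shape check for the `Cnn14` encoder above, using the hyperparameters from `config_2022.yml`; the import path assumes the script is run from `ms_clap/src/models` with `torchlibrosa` installed.

```python
# Sketch: run Cnn14 on random audio and inspect the output shapes
# (hyperparameters follow config_2022.yml; input is 5 s of noise at 44.1 kHz).
import torch
from audio import Cnn14  # assumes the working directory is ms_clap/src/models

model = Cnn14(sample_rate=44100, window_size=1024, hop_size=320, mel_bins=64,
              fmin=50, fmax=14000, classes_num=527, out_emb=2048)
model.eval()

waveform = torch.randn(2, 44100 * 5)          # (batch_size, data_length)
with torch.no_grad():
    out = model(waveform)
print(out["embedding"].shape)                 # torch.Size([2, 2048])
print(out["clipwise_output"].shape)           # torch.Size([2, 527])
```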
ms_clap/src/models/clap.py ADDED
@@ -0,0 +1,109 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+ from transformers import AutoModel
6
+ from .audio import get_audio_encoder
7
+
8
+ class Projection(nn.Module):
9
+ def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None:
10
+ super().__init__()
11
+ self.linear1 = nn.Linear(d_in, d_out, bias=False)
12
+ self.linear2 = nn.Linear(d_out, d_out, bias=False)
13
+ self.layer_norm = nn.LayerNorm(d_out)
14
+ self.drop = nn.Dropout(p)
15
+
16
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
17
+ embed1 = self.linear1(x)
18
+ embed2 = self.drop(self.linear2(F.gelu(embed1)))
19
+ embeds = self.layer_norm(embed1 + embed2)
20
+ return embeds
21
+
22
+ class AudioEncoder(nn.Module):
23
+ def __init__(self, audioenc_name:str, d_in: int, d_out: int, sample_rate: int, window_size: int,
24
+ hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int) -> None:
25
+ super().__init__()
26
+
27
+ audio_encoder = get_audio_encoder(audioenc_name)
28
+
29
+ self.base = audio_encoder(
30
+ sample_rate, window_size,
31
+ hop_size, mel_bins, fmin, fmax,
32
+ classes_num, d_in)
33
+
34
+ self.projection = Projection(d_in, d_out)
35
+
36
+ def forward(self, x):
37
+ out_dict = self.base(x)
38
+ audio_features, audio_classification_output = out_dict['embedding'], out_dict['clipwise_output']
39
+ projected_vec = self.projection(audio_features)
40
+ return projected_vec, audio_classification_output
41
+
42
+ class TextEncoder(nn.Module):
43
+ def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None:
44
+ super().__init__()
45
+ self.text_model = text_model
46
+ self.base = AutoModel.from_pretrained(text_model)
47
+
48
+ if 'clip' in text_model:
49
+ self.clip_text_projection = self.base.text_projection
50
+ self.base = self.base.text_model
51
+ if 'base' in text_model:
52
+ transformer_embed_dim = 512
53
+
54
+ self.projection = Projection(transformer_embed_dim, d_out)
55
+
56
+ def forward(self, x):
57
+ if 'clip' in self.text_model:
58
+ pooled_output = self.base(**x)[1] # get pooled output
59
+ out = self.clip_text_projection(pooled_output) # get CLS token output
60
+ elif 'gpt' in self.text_model:
61
+ batch_size = x['input_ids'].shape[0]
62
+ hidden_states = self.base(**x)[0] # (batch_size=4, seq_len, 768)
63
+
64
+ sequence_lengths = torch.ne(x['input_ids'], 0).sum(-1) - 1 # tensor([13, 14, 18, 17])
65
+ out = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths] # [batch_size, 768] = [4, 768]
66
+ else:
67
+ out = self.base(**x)[0]
68
+ out = out[:, 0, :] # get CLS token output
69
+
70
+ projected_vec = self.projection(out)
71
+
72
+ return projected_vec
73
+
74
+ class CLAP(nn.Module):
75
+ def __init__(self,
76
+ # audio
77
+ audioenc_name: str,
78
+ sample_rate: int,
79
+ window_size: int,
80
+ hop_size: int,
81
+ mel_bins: int,
82
+ fmin: int,
83
+ fmax: int,
84
+ classes_num: int,
85
+ out_emb: int,
86
+ # text
87
+ text_model: str,
88
+ transformer_embed_dim: int,
89
+ # common
90
+ d_proj: int,
91
+ ):
92
+ super().__init__()
93
+
94
+
95
+ self.audio_encoder = AudioEncoder(
96
+ audioenc_name, out_emb, d_proj,
97
+ sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num)
98
+
99
+ self.caption_encoder = TextEncoder(
100
+ d_proj, text_model, transformer_embed_dim
101
+ )
102
+
103
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
104
+
105
+ def forward(self, audio, text):
106
+ audio_embed, _ = self.audio_encoder(audio)
107
+ caption_embed = self.caption_encoder(text)
108
+
109
+ return caption_embed, audio_embed, self.logit_scale.exp()
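For context, the forward pass above returns `(caption_embed, audio_embed, logit_scale)`, which is the usual input to a symmetric contrastive objective. A sketch of that objective follows; it is illustrative only and not the training code shipped with this commit.

```python
# Sketch: the symmetric contrastive loss typically built from CLAP's forward outputs.
# Illustrative only; the actual training loop is not part of this commit.
import torch
import torch.nn.functional as F

def clap_contrastive_loss(caption_embed, audio_embed, logit_scale):
    caption_embed = F.normalize(caption_embed, dim=-1)
    audio_embed = F.normalize(audio_embed, dim=-1)
    logits = logit_scale * audio_embed @ caption_embed.t()       # (B, B)
    targets = torch.arange(logits.shape[0], device=logits.device)
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

# Example with random embeddings in the d_proj=1024 projection space:
print(clap_contrastive_loss(torch.randn(4, 1024), torch.randn(4, 1024),
                            torch.tensor(1 / 0.07)).item())
```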
ms_clap/src/models/config.py ADDED
@@ -0,0 +1,128 @@
1
+ # Ke Chen
2
3
+ # HTS-AT: A HIERARCHICAL TOKEN-SEMANTIC AUDIO TRANSFORMER FOR SOUND CLASSIFICATION AND DETECTION
4
+ # The configuration for training the model
5
+
6
+ exp_name = "exp_htsat_pretrain" # the saved ckpt prefix name of the model
7
+ workspace = "/home/kechen/Research/HTSAT" # the folder of your code
8
+ dataset_path = "/home/Research/audioset" # the dataset path
9
+ desed_folder = "/home/Research/DESED" # the desed file
10
+
11
+ dataset_type = "audioset" # "audioset" "esc-50" "scv2"
12
+ index_type = "full_train" # only works for audioset
13
+ balanced_data = True # only works for audioset
14
+
15
+ loss_type = "clip_bce" #
16
+ # AudioSet & SCV2: "clip_bce" | ESC-50: "clip_ce"
17
+
18
+ # trained from a checkpoint, or evaluate a single model
19
+ resume_checkpoint = None
20
+ # "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_1.ckpt"
21
+
22
+ esc_fold = 0 # just for esc dataset, select the fold you need for evaluation and (+1) validation
23
+
24
+
25
+ debug = False
26
+
27
+ random_seed = 970131 # 19970318 970131 12412 127777 1009 34047
28
+ batch_size = 32 * 4 # batch size per GPU x GPU number , default is 32 x 4 = 128
29
+ learning_rate = 1e-3 # 1e-4 also workable
30
+ max_epoch = 100
31
+ num_workers = 3
32
+
33
+ lr_scheduler_epoch = [10,20,30]
34
+ lr_rate = [0.02, 0.05, 0.1]
35
+
36
+ # these data preparation optimizations do not bring many improvements, so deprecated
37
+ enable_token_label = False # token label
38
+ class_map_path = "class_hier_map.npy"
39
+ class_filter = None
40
+ retrieval_index = [15382, 9202, 130, 17618, 17157, 17516, 16356, 6165, 13992, 9238, 5550, 5733, 1914, 1600, 3450, 13735, 11108, 3762,
41
+ 9840, 11318, 8131, 4429, 16748, 4992, 16783, 12691, 4945, 8779, 2805, 9418, 2797, 14357, 5603, 212, 3852, 12666, 1338, 10269, 2388, 8260, 4293, 14454, 7677, 11253, 5060, 14938, 8840, 4542, 2627, 16336, 8992, 15496, 11140, 446, 6126, 10691, 8624, 10127, 9068, 16710, 10155, 14358, 7567, 5695, 2354, 8057, 17635, 133, 16183, 14535, 7248, 4560, 14429, 2463, 10773, 113, 2462, 9223, 4929, 14274, 4716, 17307, 4617, 2132, 11083, 1039, 1403, 9621, 13936, 2229, 2875, 17840, 9359, 13311, 9790, 13288, 4750, 17052, 8260, 14900]
42
+ token_label_range = [0.2,0.6]
43
+ enable_time_shift = False # shift time
44
+ enable_label_enhance = False # enhance hierarchical label
45
+ enable_repeat_mode = False # repeat the spectrogram / reshape the spectrogram
46
+
47
+
48
+
49
+ # for model's design
50
+ enable_tscam = True # enable the token-semantic layer
51
+
52
+ # for signal processing
53
+ sample_rate = 32000 # 16000 for scv2, 32000 for audioset and esc-50
54
+ clip_samples = sample_rate * 10 # audio_set 10-sec clip
55
+ window_size = 1024
56
+ hop_size = 320 # 160 for scv2, 320 for audioset and esc-50
57
+ mel_bins = 64
58
+ fmin = 50
59
+ fmax = 14000
60
+ shift_max = int(clip_samples * 0.5)
61
+
62
+ # for data collection
63
+ classes_num = 527 # esc: 50 | audioset: 527 | scv2: 35
64
+ patch_size = (25, 4) # deprecated
65
+ crop_size = None # int(clip_samples * 0.5) deprecated
66
+
67
+ # for htsat hyperparamater
68
+ htsat_window_size = 8
69
+ htsat_spec_size = 256
70
+ htsat_patch_size = 4
71
+ htsat_stride = (4, 4)
72
+ htsat_num_head = [4,8,16,32]
73
+ htsat_dim = 96
74
+ htsat_depth = [2,2,6,2]
75
+
76
+ swin_pretrain_path = None
77
+ # "/home/Research/model_backup/pretrain/swin_tiny_c24_patch4_window8_256.pth"
78
+
79
+ # Some Deprecated Optimization in the model design, check the model code for details
80
+ htsat_attn_heatmap = False
81
+ htsat_hier_output = False
82
+ htsat_use_max = False
83
+
84
+
85
+ # for ensemble test
86
+
87
+ ensemble_checkpoints = []
88
+ ensemble_strides = []
89
+
90
+
91
+ # weight average folder
92
+ wa_folder = "/home/version_0/checkpoints/"
93
+ # weight average output filename
94
+ wa_model_path = "HTSAT_AudioSet_Saved_x.ckpt"
95
+
96
+ esm_model_pathes = [
97
+ "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_1.ckpt",
98
+ "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_2.ckpt",
99
+ "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_3.ckpt",
100
+ "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_4.ckpt",
101
+ "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_5.ckpt",
102
+ "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_6.ckpt"
103
+ ]
104
+
105
+ # for framewise localization
106
+ heatmap_dir = "/home/Research/heatmap_output"
107
+ test_file = "htsat-test-ensemble"
108
+ fl_local = False # indicate if we need to use this dataset for the framewise detection
109
+ fl_dataset = "/home/Research/desed/desed_eval.npy"
110
+ fl_class_num = [
111
+ "Speech", "Frying", "Dishes", "Running_water",
112
+ "Blender", "Electric_shaver_toothbrush", "Alarm_bell_ringing",
113
+ "Cat", "Dog", "Vacuum_cleaner"
114
+ ]
115
+
116
+ # map 527 classes into 10 classes
117
+ fl_audioset_mapping = [
118
+ [0,1,2,3,4,5,6,7],
119
+ [366, 367, 368],
120
+ [364],
121
+ [288, 289, 290, 291, 292, 293, 294, 295, 296, 297],
122
+ [369],
123
+ [382],
124
+ [310, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402],
125
+ [81, 82, 83, 84, 85],
126
+ [74, 75, 76, 77, 78, 79],
127
+ [377]
128
+ ]
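The `fl_audioset_mapping` above groups AudioSet's 527 classes into the 10 DESED classes listed in `fl_class_num`. A small sketch of how such a mapping could be applied to framewise probabilities; taking the max over each group is an illustrative choice here, not taken from the upstream HTS-AT code.

```python
# Sketch: collapse 527-class AudioSet probabilities into the 10 DESED classes using
# fl_audioset_mapping. Max over each grouped class set is an illustrative choice.
import torch
import config  # this module; assumes the working directory is ms_clap/src/models

frames = torch.rand(8, 527)                     # (time_frames, audioset_classes)
collapsed = torch.stack(
    [frames[:, idxs].max(dim=1).values for idxs in config.fl_audioset_mapping],
    dim=1,
)                                               # (time_frames, 10)
print(dict(zip(config.fl_class_num, collapsed[0].tolist())))
```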
ms_clap/src/models/htsat.py ADDED
@@ -0,0 +1,956 @@
1
+ # Ke Chen
2
3
+ # HTS-AT: A HIERARCHICAL TOKEN-SEMANTIC AUDIO TRANSFORMER FOR SOUND CLASSIFICATION AND DETECTION
4
+ # Model Core
5
+ # The code below is based on and adapted from https://github.com/microsoft/Swin-Transformer
6
+ # Swin Transformer for Computer Vision: https://arxiv.org/pdf/2103.14030.pdf
7
+
8
+
9
+ import logging
10
+ import pdb
11
+ import math
12
+ import random
13
+ from numpy.core.fromnumeric import clip, reshape
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.utils.checkpoint as checkpoint
17
+
18
+ # import os
19
+ import sys
20
+ sys.path.append('/home/zkong/audio_flamingo/audio_flamingo_v1/v0.2/open_flamingo/my_ms_clap/models')
21
+
22
+ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
23
+ from torchlibrosa.augmentation import SpecAugmentation
24
+
25
+ from itertools import repeat
26
+ from typing import List
27
+ try:
28
+ from models.pytorch_utils import do_mixup, interpolate
29
+ import models.config as config
30
+ except:
31
+ from .pytorch_utils import do_mixup, interpolate
32
+ from . import config
33
+ # from CLAP_API.models.pytorch_utils import do_mixup, interpolate
34
+ # from CLAP_API.models import config
35
+
36
+ import torch.nn.functional as F
37
+ import collections.abc
38
+ import warnings
39
+
40
+ from torch.nn.init import _calculate_fan_in_and_fan_out
41
+
42
+ def _ntuple(n):
43
+ def parse(x):
44
+ if isinstance(x, collections.abc.Iterable):
45
+ return x
46
+ return tuple(repeat(x, n))
47
+ return parse
48
+
49
+ to_1tuple = _ntuple(1)
50
+ to_2tuple = _ntuple(2)
51
+ to_3tuple = _ntuple(3)
52
+ to_4tuple = _ntuple(4)
53
+ to_ntuple = _ntuple
54
+
55
+
56
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
57
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
58
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
59
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
60
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
61
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
62
+ 'survival rate' as the argument.
63
+ """
64
+ if drop_prob == 0. or not training:
65
+ return x
66
+ keep_prob = 1 - drop_prob
67
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
68
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
69
+ random_tensor.floor_() # binarize
70
+ output = x.div(keep_prob) * random_tensor
71
+ return output
72
+
73
+
74
+ class DropPath(nn.Module):
75
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
76
+ """
77
+ def __init__(self, drop_prob=None):
78
+ super(DropPath, self).__init__()
79
+ self.drop_prob = drop_prob
80
+
81
+ def forward(self, x):
82
+ return drop_path(x, self.drop_prob, self.training)
83
+
84
+ class PatchEmbed(nn.Module):
85
+ """ 2D Image to Patch Embedding
86
+ """
87
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, patch_stride = 16):
88
+ super().__init__()
89
+ img_size = to_2tuple(img_size)
90
+ patch_size = to_2tuple(patch_size)
91
+ patch_stride = to_2tuple(patch_stride)
92
+ self.img_size = img_size
93
+ self.patch_size = patch_size
94
+ self.patch_stride = patch_stride
95
+ self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1])
96
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
97
+ self.flatten = flatten
98
+ self.in_chans = in_chans
99
+ self.embed_dim = embed_dim
100
+
101
+ padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2)
102
+
103
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride, padding=padding)
104
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
105
+
106
+ def forward(self, x):
107
+ B, C, H, W = x.shape
108
+ assert H == self.img_size[0] and W == self.img_size[1], \
109
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
110
+ x = self.proj(x)
111
+ if self.flatten:
112
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
113
+ x = self.norm(x)
114
+ return x
115
+
116
+ class Mlp(nn.Module):
117
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks
118
+ """
119
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
120
+ super().__init__()
121
+ out_features = out_features or in_features
122
+ hidden_features = hidden_features or in_features
123
+ self.fc1 = nn.Linear(in_features, hidden_features)
124
+ self.act = act_layer()
125
+ self.fc2 = nn.Linear(hidden_features, out_features)
126
+ self.drop = nn.Dropout(drop)
127
+
128
+ def forward(self, x):
129
+ x = self.fc1(x)
130
+ x = self.act(x)
131
+ x = self.drop(x)
132
+ x = self.fc2(x)
133
+ x = self.drop(x)
134
+ return x
135
+
136
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
137
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
138
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
139
+ def norm_cdf(x):
140
+ # Computes standard normal cumulative distribution function
141
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
142
+
143
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
144
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
145
+ "The distribution of values may be incorrect.",
146
+ stacklevel=2)
147
+
148
+ with torch.no_grad():
149
+ # Values are generated by using a truncated uniform distribution and
150
+ # then using the inverse CDF for the normal distribution.
151
+ # Get upper and lower cdf values
152
+ l = norm_cdf((a - mean) / std)
153
+ u = norm_cdf((b - mean) / std)
154
+
155
+ # Uniformly fill tensor with values from [l, u], then translate to
156
+ # [2l-1, 2u-1].
157
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
158
+
159
+ # Use inverse cdf transform for normal distribution to get truncated
160
+ # standard normal
161
+ tensor.erfinv_()
162
+
163
+ # Transform to proper mean, std
164
+ tensor.mul_(std * math.sqrt(2.))
165
+ tensor.add_(mean)
166
+
167
+ # Clamp to ensure it's in the proper range
168
+ tensor.clamp_(min=a, max=b)
169
+ return tensor
170
+
171
+
172
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
173
+ # type: (Tensor, float, float, float, float) -> Tensor
174
+ r"""Fills the input Tensor with values drawn from a truncated
175
+ normal distribution. The values are effectively drawn from the
176
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
177
+ with values outside :math:`[a, b]` redrawn until they are within
178
+ the bounds. The method used for generating the random values works
179
+ best when :math:`a \leq \text{mean} \leq b`.
180
+ Args:
181
+ tensor: an n-dimensional `torch.Tensor`
182
+ mean: the mean of the normal distribution
183
+ std: the standard deviation of the normal distribution
184
+ a: the minimum cutoff value
185
+ b: the maximum cutoff value
186
+ Examples:
187
+ >>> w = torch.empty(3, 5)
188
+ >>> nn.init.trunc_normal_(w)
189
+ """
190
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
191
+
192
+
193
+ def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'):
194
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
195
+ if mode == 'fan_in':
196
+ denom = fan_in
197
+ elif mode == 'fan_out':
198
+ denom = fan_out
199
+ elif mode == 'fan_avg':
200
+ denom = (fan_in + fan_out) / 2
201
+
202
+ variance = scale / denom
203
+
204
+ if distribution == "truncated_normal":
205
+ # constant is stddev of standard normal truncated to (-2, 2)
206
+ trunc_normal_(tensor, std=math.sqrt(variance) / .87962566103423978)
207
+ elif distribution == "normal":
208
+ tensor.normal_(std=math.sqrt(variance))
209
+ elif distribution == "uniform":
210
+ bound = math.sqrt(3 * variance)
211
+ tensor.uniform_(-bound, bound)
212
+ else:
213
+ raise ValueError(f"invalid distribution {distribution}")
214
+
215
+
216
+ def lecun_normal_(tensor):
217
+ variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal')
218
+
219
+
220
+ # The code below is based on and adapted from https://github.com/microsoft/Swin-Transformer
221
+ # Swin Transformer for Computer Vision: https://arxiv.org/pdf/2103.14030.pdf
222
+
223
+ def window_partition(x, window_size):
224
+ """
225
+ Args:
226
+ x: (B, H, W, C)
227
+ window_size (int): window size
228
+ Returns:
229
+ windows: (num_windows*B, window_size, window_size, C)
230
+ """
231
+ B, H, W, C = x.shape
232
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
233
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
234
+ return windows
235
+
236
+
237
+ def window_reverse(windows, window_size, H, W):
238
+ """
239
+ Args:
240
+ windows: (num_windows*B, window_size, window_size, C)
241
+ window_size (int): Window size
242
+ H (int): Height of image
243
+ W (int): Width of image
244
+ Returns:
245
+ x: (B, H, W, C)
246
+ """
247
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
248
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
249
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
250
+ return x
251
+
252
+
253
+ class WindowAttention(nn.Module):
254
+ r""" Window based multi-head self attention (W-MSA) module with relative position bias.
255
+ It supports both of shifted and non-shifted window.
256
+ Args:
257
+ dim (int): Number of input channels.
258
+ window_size (tuple[int]): The height and width of the window.
259
+ num_heads (int): Number of attention heads.
260
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
261
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
262
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
263
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
264
+ """
265
+
266
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
267
+
268
+ super().__init__()
269
+ self.dim = dim
270
+ self.window_size = window_size # Wh, Ww
271
+ self.num_heads = num_heads
272
+ head_dim = dim // num_heads
273
+ self.scale = qk_scale or head_dim ** -0.5
274
+
275
+ # define a parameter table of relative position bias
276
+ self.relative_position_bias_table = nn.Parameter(
277
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
278
+
279
+ # get pair-wise relative position index for each token inside the window
280
+ coords_h = torch.arange(self.window_size[0])
281
+ coords_w = torch.arange(self.window_size[1])
282
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
283
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
284
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
285
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
286
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
287
+ relative_coords[:, :, 1] += self.window_size[1] - 1
288
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
289
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
290
+ self.register_buffer("relative_position_index", relative_position_index)
291
+
292
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
293
+ self.attn_drop = nn.Dropout(attn_drop)
294
+ self.proj = nn.Linear(dim, dim)
295
+ self.proj_drop = nn.Dropout(proj_drop)
296
+
297
+ trunc_normal_(self.relative_position_bias_table, std=.02)
298
+ self.softmax = nn.Softmax(dim=-1)
299
+
300
+ def forward(self, x, mask=None):
301
+ """
302
+ Args:
303
+ x: input features with shape of (num_windows*B, N, C)
304
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
305
+ """
306
+ B_, N, C = x.shape
307
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
308
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
309
+
310
+ q = q * self.scale
311
+ attn = (q @ k.transpose(-2, -1))
312
+
313
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
314
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
315
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
316
+ attn = attn + relative_position_bias.unsqueeze(0)
317
+
318
+ if mask is not None:
319
+ nW = mask.shape[0]
320
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
321
+ attn = attn.view(-1, self.num_heads, N, N)
322
+ attn = self.softmax(attn)
323
+ else:
324
+ attn = self.softmax(attn)
325
+
326
+ attn = self.attn_drop(attn)
327
+
328
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
329
+ x = self.proj(x)
330
+ x = self.proj_drop(x)
331
+ return x, attn
332
+
333
+ def extra_repr(self):
334
+ return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
335
+
336
+
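As a quick orientation, here is a minimal shape-check sketch for the `WindowAttention` module above (assuming the class is importable from this file; the 96 channels, 8x8 window, 4 heads, and batch of 4 windows are illustrative values, not ones fixed by this repository):

```python
import torch

# Hypothetical smoke test for WindowAttention as defined above.
attn_layer = WindowAttention(dim=96, window_size=(8, 8), num_heads=4)
x = torch.randn(4, 8 * 8, 96)       # (num_windows*B, N, C)
out, attn = attn_layer(x)           # this variant also returns the attention map
print(out.shape)                    # torch.Size([4, 64, 96])
print(attn.shape)                   # torch.Size([4, 4, 64, 64]) -> (B_, num_heads, N, N)
```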
337
+ # The model is built from Swin Transformer blocks, so pretrained Swin Transformer weights can be reused
338
+ class SwinTransformerBlock(nn.Module):
339
+ r""" Swin Transformer Block.
340
+ Args:
341
+ dim (int): Number of input channels.
342
+ input_resolution (tuple[int]): Input resolution.
343
+ num_heads (int): Number of attention heads.
344
+ window_size (int): Window size.
345
+ shift_size (int): Shift size for SW-MSA.
346
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
347
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
348
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
349
+ drop (float, optional): Dropout rate. Default: 0.0
350
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
351
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
352
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
353
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
354
+ """
355
+
356
+ def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
357
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
358
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_before_mlp='ln'):
359
+ super().__init__()
360
+ self.dim = dim
361
+ self.input_resolution = input_resolution
362
+ self.num_heads = num_heads
363
+ self.window_size = window_size
364
+ self.shift_size = shift_size
365
+ self.mlp_ratio = mlp_ratio
366
+ self.norm_before_mlp = norm_before_mlp
367
+ if min(self.input_resolution) <= self.window_size:
368
+ # if window size is larger than input resolution, we don't partition windows
369
+ self.shift_size = 0
370
+ self.window_size = min(self.input_resolution)
371
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
372
+
373
+ self.norm1 = norm_layer(dim)
374
+ self.attn = WindowAttention(
375
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
376
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
377
+
378
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
379
+ if self.norm_before_mlp == 'ln':
380
+ self.norm2 = nn.LayerNorm(dim)
381
+ elif self.norm_before_mlp == 'bn':
382
+ self.norm2 = lambda x: nn.BatchNorm1d(dim)(x.transpose(1, 2)).transpose(1, 2)
383
+ else:
384
+ raise NotImplementedError
385
+ mlp_hidden_dim = int(dim * mlp_ratio)
386
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
387
+
388
+ if self.shift_size > 0:
389
+ # calculate attention mask for SW-MSA
390
+ H, W = self.input_resolution
391
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
392
+ h_slices = (slice(0, -self.window_size),
393
+ slice(-self.window_size, -self.shift_size),
394
+ slice(-self.shift_size, None))
395
+ w_slices = (slice(0, -self.window_size),
396
+ slice(-self.window_size, -self.shift_size),
397
+ slice(-self.shift_size, None))
398
+ cnt = 0
399
+ for h in h_slices:
400
+ for w in w_slices:
401
+ img_mask[:, h, w, :] = cnt
402
+ cnt += 1
403
+
404
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
405
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
406
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
407
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
408
+ else:
409
+ attn_mask = None
410
+
411
+ self.register_buffer("attn_mask", attn_mask)
412
+
413
+ def forward(self, x):
414
+ # pdb.set_trace()
415
+ H, W = self.input_resolution
416
+ # print("H: ", H)
417
+ # print("W: ", W)
418
+ # pdb.set_trace()
419
+ B, L, C = x.shape
420
+ # assert L == H * W, "input feature has wrong size"
421
+
422
+ shortcut = x
423
+ x = self.norm1(x)
424
+ x = x.view(B, H, W, C)
425
+
426
+ # cyclic shift
427
+ if self.shift_size > 0:
428
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
429
+ else:
430
+ shifted_x = x
431
+
432
+ # partition windows
433
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
434
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
435
+
436
+ # W-MSA/SW-MSA
437
+ attn_windows, attn = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
438
+
439
+ # merge windows
440
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
441
+ shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
442
+
443
+ # reverse cyclic shift
444
+ if self.shift_size > 0:
445
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
446
+ else:
447
+ x = shifted_x
448
+ x = x.view(B, H * W, C)
449
+
450
+ # FFN
451
+ x = shortcut + self.drop_path(x)
452
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
453
+
454
+ return x, attn
455
+
456
+ def extra_repr(self):
457
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
458
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
459
+
460
+
461
+
462
+ class PatchMerging(nn.Module):
463
+ r""" Patch Merging Layer.
464
+ Args:
465
+ input_resolution (tuple[int]): Resolution of input feature.
466
+ dim (int): Number of input channels.
467
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
468
+ """
469
+
470
+ def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
471
+ super().__init__()
472
+ self.input_resolution = input_resolution
473
+ self.dim = dim
474
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
475
+ self.norm = norm_layer(4 * dim)
476
+
477
+ def forward(self, x):
478
+ """
479
+ x: B, H*W, C
480
+ """
481
+ H, W = self.input_resolution
482
+ B, L, C = x.shape
483
+ assert L == H * W, "input feature has wrong size"
484
+ assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
485
+
486
+ x = x.view(B, H, W, C)
487
+
488
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
489
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
490
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
491
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
492
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
493
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
494
+
495
+ x = self.norm(x)
496
+ x = self.reduction(x)
497
+
498
+ return x
499
+
500
+ def extra_repr(self):
501
+ return f"input_resolution={self.input_resolution}, dim={self.dim}"
502
+
503
+
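A brief sketch of what the patch-merging step above does to tensor shapes (the 16x16 resolution, batch of 2, and 96 channels are assumptions for illustration):

```python
import torch

# PatchMerging halves the spatial resolution and doubles the channel dimension.
merge = PatchMerging(input_resolution=(16, 16), dim=96)
tokens = torch.randn(2, 16 * 16, 96)     # (B, H*W, C)
merged = merge(tokens)
print(merged.shape)                      # torch.Size([2, 64, 192]) -> (B, H/2*W/2, 2*C)
```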
504
+ class BasicLayer(nn.Module):
505
+ """ A basic Swin Transformer layer for one stage.
506
+ Args:
507
+ dim (int): Number of input channels.
508
+ input_resolution (tuple[int]): Input resolution.
509
+ depth (int): Number of blocks.
510
+ num_heads (int): Number of attention heads.
511
+ window_size (int): Local window size.
512
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
513
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
514
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
515
+ drop (float, optional): Dropout rate. Default: 0.0
516
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
517
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
518
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
519
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
520
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
521
+ """
522
+
523
+ def __init__(self, dim, input_resolution, depth, num_heads, window_size,
524
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
525
+ drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
526
+ norm_before_mlp='ln'):
527
+
528
+ super().__init__()
529
+ self.dim = dim
530
+ self.input_resolution = input_resolution
531
+ self.depth = depth
532
+ self.use_checkpoint = use_checkpoint
533
+
534
+ # build blocks
535
+ self.blocks = nn.ModuleList([
536
+ SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
537
+ num_heads=num_heads, window_size=window_size,
538
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
539
+ mlp_ratio=mlp_ratio,
540
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
541
+ drop=drop, attn_drop=attn_drop,
542
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
543
+ norm_layer=norm_layer, norm_before_mlp=norm_before_mlp)
544
+ for i in range(depth)])
545
+
546
+ # patch merging layer
547
+ if downsample is not None:
548
+ self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
549
+ else:
550
+ self.downsample = None
551
+
552
+ def forward(self, x):
553
+ attns = []
554
+ for blk in self.blocks:
555
+ if self.use_checkpoint:
556
+ x, attn = checkpoint.checkpoint(blk, x)
557
+ else:
558
+ x, attn = blk(x)
559
+ if not self.training:
560
+ attns.append(attn.unsqueeze(0))
561
+ if self.downsample is not None:
562
+ x = self.downsample(x)
563
+ if not self.training:
564
+ attn = torch.cat(attns, dim = 0)
565
+ attn = torch.mean(attn, dim = 0)
566
+ return x, attn
567
+
568
+ def extra_repr(self):
569
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
570
+
571
+
572
+ # The Core of HTSAT
573
+ class HTSAT_Swin_Transformer(nn.Module):
574
+ r"""HTSAT based on the Swin Transformer
575
+ Args:
576
+ spec_size (int | tuple(int)): Input Spectrogram size. Default 256
577
+ patch_size (int | tuple(int)): Patch size. Default: 4
578
+ patch_stride (int | tuple(int)): Patch stride for the frequency and time axes. Default: 4
579
+ in_chans (int): Number of input image channels. Default: 1 (mono)
580
+ num_classes (int): Number of classes for classification head. Default: 527
581
+ embed_dim (int): Patch embedding dimension. Default: 96
582
+ depths (tuple(int)): Depth of each HTSAT-Swin Transformer layer.
583
+ num_heads (tuple(int)): Number of attention heads in different layers.
584
+ window_size (int): Window size. Default: 8
585
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
586
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
587
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
588
+ drop_rate (float): Dropout rate. Default: 0
589
+ attn_drop_rate (float): Attention dropout rate. Default: 0
590
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1
591
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
592
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
593
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True
594
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
595
+ config (module): The configuration Module from config.py
596
+ """
597
+
598
+ def __init__(self, spec_size=256, patch_size=4, patch_stride=(4,4),
599
+ in_chans=1, num_classes=527,
600
+ embed_dim=96, depths=[2, 2, 6, 2], num_heads=[4, 8, 16, 32],
601
+ window_size=8, mlp_ratio=4., qkv_bias=True, qk_scale=None,
602
+ drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
603
+ norm_layer=nn.LayerNorm,
604
+ ape=False, patch_norm=True,
605
+ use_checkpoint=False, norm_before_mlp='ln', config = None, **kwargs):
606
+ super(HTSAT_Swin_Transformer, self).__init__()
607
+
608
+ self.config = config
609
+ self.spec_size = spec_size
610
+ self.patch_stride = patch_stride
611
+ self.patch_size = patch_size
612
+ self.window_size = window_size
613
+ self.embed_dim = embed_dim
614
+ self.depths = depths
615
+ self.ape = ape
616
+ self.in_chans = in_chans
617
+ self.num_classes = num_classes
618
+ self.num_heads = num_heads
619
+ self.num_layers = len(self.depths)
620
+ self.num_features = int(self.embed_dim * 2 ** (self.num_layers - 1))
621
+
622
+ self.drop_rate = drop_rate
623
+ self.attn_drop_rate = attn_drop_rate
624
+ self.drop_path_rate = drop_path_rate
625
+
626
+ self.qkv_bias = qkv_bias
627
+ self.qk_scale = None
628
+
629
+ self.patch_norm = patch_norm
630
+ self.norm_layer = norm_layer if self.patch_norm else None
631
+ self.norm_before_mlp = norm_before_mlp
632
+ self.mlp_ratio = mlp_ratio
633
+
634
+ self.use_checkpoint = use_checkpoint
635
+
636
+ # process mel-spec ; used only once
637
+ self.freq_ratio = self.spec_size // self.config.mel_bins
638
+ window = 'hann'
639
+ center = True
640
+ pad_mode = 'reflect'
641
+ ref = 1.0
642
+ amin = 1e-10
643
+ top_db = None
644
+ self.interpolate_ratio = 32 # Downsampled ratio
645
+ # Spectrogram extractor
646
+ self.spectrogram_extractor = Spectrogram(n_fft=config.window_size, hop_length=config.hop_size,
647
+ win_length=config.window_size, window=window, center=center, pad_mode=pad_mode,
648
+ freeze_parameters=True)
649
+ # Logmel feature extractor
650
+ self.logmel_extractor = LogmelFilterBank(sr=config.sample_rate, n_fft=config.window_size,
651
+ n_mels=config.mel_bins, fmin=config.fmin, fmax=config.fmax, ref=ref, amin=amin, top_db=top_db,
652
+ freeze_parameters=True)
653
+ # Spec augmenter
654
+ self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
655
+ freq_drop_width=8, freq_stripes_num=2) # 2 2
656
+ self.bn0 = nn.BatchNorm2d(self.config.mel_bins)
657
+
658
+
659
+ # split spectrogram into non-overlapping patches
660
+ self.patch_embed = PatchEmbed(
661
+ img_size=self.spec_size, patch_size=self.patch_size, in_chans=self.in_chans,
662
+ embed_dim=self.embed_dim, norm_layer=self.norm_layer, patch_stride = patch_stride)
663
+
664
+ num_patches = self.patch_embed.num_patches
665
+ patches_resolution = self.patch_embed.grid_size
666
+ self.patches_resolution = patches_resolution
667
+
668
+ # absolute position embedding
669
+ if self.ape:
670
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, self.embed_dim))
671
+ trunc_normal_(self.absolute_pos_embed, std=.02)
672
+
673
+ self.pos_drop = nn.Dropout(p=self.drop_rate)
674
+
675
+ # stochastic depth
676
+ dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, sum(self.depths))] # stochastic depth decay rule
677
+
678
+ # build layers
679
+ self.layers = nn.ModuleList()
680
+ for i_layer in range(self.num_layers):
681
+ layer = BasicLayer(dim=int(self.embed_dim * 2 ** i_layer),
682
+ input_resolution=(patches_resolution[0] // (2 ** i_layer),
683
+ patches_resolution[1] // (2 ** i_layer)),
684
+ depth=self.depths[i_layer],
685
+ num_heads=self.num_heads[i_layer],
686
+ window_size=self.window_size,
687
+ mlp_ratio=self.mlp_ratio,
688
+ qkv_bias=self.qkv_bias, qk_scale=self.qk_scale,
689
+ drop=self.drop_rate, attn_drop=self.attn_drop_rate,
690
+ drop_path=dpr[sum(self.depths[:i_layer]):sum(self.depths[:i_layer + 1])],
691
+ norm_layer=self.norm_layer,
692
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
693
+ use_checkpoint=use_checkpoint,
694
+ norm_before_mlp=self.norm_before_mlp)
695
+ self.layers.append(layer)
696
+
697
+ self.norm = self.norm_layer(self.num_features)
698
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
699
+ self.maxpool = nn.AdaptiveMaxPool1d(1)
700
+
701
+ if self.config.enable_tscam:
702
+ SF = self.spec_size // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] // self.freq_ratio
703
+ self.tscam_conv = nn.Conv2d(
704
+ in_channels = self.num_features,
705
+ out_channels = self.num_classes,
706
+ kernel_size = (SF,3),
707
+ padding = (0,1)
708
+ )
709
+ self.head = nn.Linear(num_classes, num_classes)
710
+ else:
711
+ self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
712
+
713
+ self.apply(self._init_weights)
714
+
715
+ def _init_weights(self, m):
716
+ if isinstance(m, nn.Linear):
717
+ trunc_normal_(m.weight, std=.02)
718
+ if isinstance(m, nn.Linear) and m.bias is not None:
719
+ nn.init.constant_(m.bias, 0)
720
+ elif isinstance(m, nn.LayerNorm):
721
+ nn.init.constant_(m.bias, 0)
722
+ nn.init.constant_(m.weight, 1.0)
723
+
724
+ @torch.jit.ignore
725
+ def no_weight_decay(self):
726
+ return {'absolute_pos_embed'}
727
+
728
+ @torch.jit.ignore
729
+ def no_weight_decay_keywords(self):
730
+ return {'relative_position_bias_table'}
731
+
732
+ def forward_features(self, x):
733
+ frames_num = x.shape[2]
734
+ x = self.patch_embed(x)
735
+ if self.ape:
736
+ x = x + self.absolute_pos_embed
737
+ x = self.pos_drop(x)
738
+ for i, layer in enumerate(self.layers):
739
+ x, attn = layer(x)
740
+
741
+ if self.config.enable_tscam:
742
+ # for x
743
+ x = self.norm(x)
744
+ B, N, C = x.shape
745
+ SF = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0]
746
+ ST = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1]
747
+ x = x.permute(0,2,1).contiguous().reshape(B, C, SF, ST)
748
+ B, C, F, T = x.shape
749
+ # group 2D CNN
750
+ c_freq_bin = F // self.freq_ratio
751
+ x = x.reshape(B, C, F // c_freq_bin, c_freq_bin, T)
752
+ x = x.permute(0,1,3,2,4).contiguous().reshape(B, C, c_freq_bin, -1)
753
+
754
+ # get latent_output
755
+ latent_output = self.avgpool(torch.flatten(x,2))
756
+ latent_output = torch.flatten(latent_output, 1)
757
+
758
+ # display the attention map, if needed
759
+ if self.config.htsat_attn_heatmap:
760
+ # for attn
761
+ attn = torch.mean(attn, dim = 1)
762
+ attn = torch.mean(attn, dim = 1)
763
+ attn = attn.reshape(B, SF, ST)
764
+ c_freq_bin = SF // self.freq_ratio
765
+ attn = attn.reshape(B, SF // c_freq_bin, c_freq_bin, ST)
766
+ attn = attn.permute(0,2,1,3).contiguous().reshape(B, c_freq_bin, -1)
767
+ attn = attn.mean(dim = 1)
768
+ attn_max = torch.max(attn, dim = 1, keepdim = True)[0]
769
+ attn_min = torch.min(attn, dim = 1, keepdim = True)[0]
770
+ attn = ((attn * 0.15) + (attn_max * 0.85 - attn_min)) / (attn_max - attn_min)
771
+ attn = attn.unsqueeze(dim = 2)
772
+
773
+ x = self.tscam_conv(x)
774
+ x = torch.flatten(x, 2) # B, C, T
775
+
776
+ if self.config.htsat_attn_heatmap:
777
+ fpx = interpolate(torch.sigmoid(x).permute(0,2,1).contiguous() * attn, 8 * self.patch_stride[1])
778
+ else:
779
+ fpx = interpolate(torch.sigmoid(x).permute(0,2,1).contiguous(), 8 * self.patch_stride[1])
780
+
781
+ x = self.avgpool(x)
782
+ x = torch.flatten(x, 1)
783
+
784
+ if self.config.loss_type == "clip_ce":
785
+ output_dict = {
786
+ 'framewise_output': fpx, # already sigmoided
787
+ 'clipwise_output': x,
788
+ 'latent_output': latent_output
789
+ }
790
+ else:
791
+ output_dict = {
792
+ 'framewise_output': fpx, # already sigmoided
793
+ 'clipwise_output': torch.sigmoid(x),
794
+ 'latent_output': latent_output
795
+ }
796
+
797
+ else:
798
+ x = self.norm(x) # B N C
799
+ B, N, C = x.shape
800
+
801
+ fpx = x.permute(0,2,1).contiguous().reshape(B, C, frames_num // (2 ** (len(self.depths) + 1)), frames_num // (2 ** (len(self.depths) + 1)) )
802
+ B, C, F, T = fpx.shape
803
+ c_freq_bin = F // self.freq_ratio
804
+ fpx = fpx.reshape(B, C, F // c_freq_bin, c_freq_bin, T)
805
+ fpx = fpx.permute(0,1,3,2,4).contiguous().reshape(B, C, c_freq_bin, -1)
806
+ fpx = torch.sum(fpx, dim = 2)
807
+ fpx = interpolate(fpx.permute(0,2,1).contiguous(), 8 * self.patch_stride[1])
808
+ x = self.avgpool(x.transpose(1, 2)) # B C 1
809
+ x = torch.flatten(x, 1)
810
+ if self.num_classes > 0:
811
+ x = self.head(x)
812
+ fpx = self.head(fpx)
813
+ output_dict = {'framewise_output': torch.sigmoid(fpx),
814
+ 'clipwise_output': torch.sigmoid(x)}
815
+ return output_dict
816
+
817
+ def crop_wav(self, x, crop_size, spe_pos = None):
818
+ time_steps = x.shape[2]
819
+ tx = torch.zeros(x.shape[0], x.shape[1], crop_size, x.shape[3]).to(x.device)
820
+ for i in range(len(x)):
821
+ if spe_pos is None:
822
+ crop_pos = random.randint(0, time_steps - crop_size - 1)
823
+ else:
824
+ crop_pos = spe_pos
825
+ tx[i][0] = x[i, 0, crop_pos:crop_pos + crop_size,:]
826
+ return tx
827
+
828
+ # Reshape the waveform spectrogram to an image-like size, so that a pretrained Swin Transformer model can be used
829
+ def reshape_wav2img(self, x):
830
+ B, C, T, F = x.shape
831
+ target_T = int(self.spec_size * self.freq_ratio)
832
+ target_F = self.spec_size // self.freq_ratio
833
+ assert T <= target_T and F <= target_F, "the wav size should be less than or equal to the swin input size"
834
+ # to avoid bicubic zero error
835
+ if T < target_T:
836
+ x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True)
837
+ if F < target_F:
838
+ x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True)
839
+ x = x.permute(0,1,3,2).contiguous()
840
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2], self.freq_ratio, x.shape[3] // self.freq_ratio)
841
+ # print(x.shape)
842
+ x = x.permute(0,1,3,2,4).contiguous()
843
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3], x.shape[4])
844
+ return x
845
+
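The reshape above folds the time axis into the frequency axis so that a (T, F) mel spectrogram becomes a square spec_size x spec_size image that pretrained Swin weights can consume. A standalone sketch of the same permute/reshape, assuming spec_size=256 and 64 mel bins (so freq_ratio=4; values chosen only for illustration):

```python
import torch

B, C, T, F = 2, 1, 1024, 64   # assumed: 1024 mel frames, 64 mel bins
freq_ratio = 4                # spec_size // mel_bins = 256 // 64

x = torch.randn(B, C, T, F)
x = x.permute(0, 1, 3, 2)                               # (B, C, F, T)
x = x.reshape(B, C, F, freq_ratio, T // freq_ratio)     # split time into freq_ratio chunks
x = x.permute(0, 1, 3, 2, 4).reshape(B, C, F * freq_ratio, T // freq_ratio)
print(x.shape)                                          # torch.Size([2, 1, 256, 256])
```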
846
+ # Repeat the waveform spectrogram to an image-like size, so that a pretrained Swin Transformer model can be used
847
+ def repeat_wat2img(self, x, cur_pos):
848
+ B, C, T, F = x.shape
849
+ target_T = int(self.spec_size * self.freq_ratio)
850
+ target_F = self.spec_size // self.freq_ratio
851
+ assert T <= target_T and F <= target_F, "the wav size should be less than or equal to the swin input size"
852
+ # to avoid bicubic zero error
853
+ if T < target_T:
854
+ x = nn.functional.interpolate(x, (target_T, x.shape[3]), mode="bicubic", align_corners=True)
855
+ if F < target_F:
856
+ x = nn.functional.interpolate(x, (x.shape[2], target_F), mode="bicubic", align_corners=True)
857
+ x = x.permute(0,1,3,2).contiguous() # B C F T
858
+ x = x[:,:,:,cur_pos:cur_pos + self.spec_size]
859
+ x = x.repeat(repeats = (1,1,4,1))
860
+ return x
861
+
862
+ def forward(self, x: torch.Tensor, mixup_lambda = None, infer_mode = False):# out_feat_keys: List[str] = None):
863
+ x = self.spectrogram_extractor(x) # (batch_size, 1, time_steps, freq_bins)
864
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
865
+
866
+
867
+ x = x.transpose(1, 3)
868
+ x = self.bn0(x)
869
+ x = x.transpose(1, 3)
870
+ if self.training:
871
+ x = self.spec_augmenter(x)
872
+ if self.training and mixup_lambda is not None:
873
+ x = do_mixup(x, mixup_lambda)
874
+
875
+ if infer_mode:
876
+ # in infer mode, we need to handle audio inputs of different lengths
877
+ frame_num = x.shape[2]
878
+ target_T = int(self.spec_size * self.freq_ratio)
879
+ repeat_ratio = math.floor(target_T / frame_num)
880
+ x = x.repeat(repeats=(1,1,repeat_ratio,1))
881
+ x = self.reshape_wav2img(x)
882
+ output_dict = self.forward_features(x)
883
+ elif self.config.enable_repeat_mode:
884
+ if self.training:
885
+ cur_pos = random.randint(0, (self.freq_ratio - 1) * self.spec_size - 1)
886
+ x = self.repeat_wat2img(x, cur_pos)
887
+ output_dict = self.forward_features(x)
888
+ else:
889
+ output_dicts = []
890
+ for cur_pos in range(0, (self.freq_ratio - 1) * self.spec_size + 1, self.spec_size):
891
+ tx = x.clone()
892
+ tx = self.repeat_wat2img(tx, cur_pos)
893
+ output_dicts.append(self.forward_features(tx))
894
+ clipwise_output = torch.zeros_like(output_dicts[0]["clipwise_output"]).float().to(x.device)
895
+ framewise_output = torch.zeros_like(output_dicts[0]["framewise_output"]).float().to(x.device)
896
+ for d in output_dicts:
897
+ clipwise_output += d["clipwise_output"]
898
+ framewise_output += d["framewise_output"]
899
+ clipwise_output = clipwise_output / len(output_dicts)
900
+ framewise_output = framewise_output / len(output_dicts)
901
+
902
+ output_dict = {
903
+ 'framewise_output': framewise_output,
904
+ 'clipwise_output': clipwise_output
905
+ }
906
+ else:
907
+ if x.shape[2] > self.freq_ratio * self.spec_size:
908
+ if self.training:
909
+ x = self.crop_wav(x, crop_size=self.freq_ratio * self.spec_size)
910
+ x = self.reshape_wav2img(x)
911
+ output_dict = self.forward_features(x)
912
+ else:
913
+ # Note: crop_size and overlap_size are hard-coded below
914
+ overlap_size = 344 #(x.shape[2] - 1) // 4
915
+ output_dicts = []
916
+ crop_size = 689 #(x.shape[2] - 1) // 2
917
+ for cur_pos in range(0, x.shape[2] - crop_size - 1, overlap_size):
918
+ tx = self.crop_wav(x, crop_size = crop_size, spe_pos = cur_pos)
919
+ tx = self.reshape_wav2img(tx)
920
+ output_dicts.append(self.forward_features(tx))
921
+ clipwise_output = torch.zeros_like(output_dicts[0]["clipwise_output"]).float().to(x.device)
922
+ framewise_output = torch.zeros_like(output_dicts[0]["framewise_output"]).float().to(x.device)
923
+ latent_output = torch.zeros_like(output_dicts[0]["latent_output"]).float().to(x.device)
924
+ for d in output_dicts:
925
+ clipwise_output += d["clipwise_output"]
926
+ framewise_output += d["framewise_output"]
927
+ latent_output += d["latent_output"]
928
+ clipwise_output = clipwise_output / len(output_dicts)
929
+ framewise_output = framewise_output / len(output_dicts)
930
+ latent_output = latent_output / len(output_dicts)
931
+ output_dict = {
932
+ 'framewise_output': framewise_output,
933
+ 'clipwise_output': clipwise_output,
934
+ 'latent_output': latent_output,
935
+ }
936
+ else: # this branch is typically used, and is the simplest one
937
+ x = self.reshape_wav2img(x)
938
+ output_dict = self.forward_features(x)
939
+ # x = self.head(x)
940
+ return output_dict
941
+
942
+ class HTSATWrapper(nn.Module):
943
+ def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
944
+ fmax, classes_num, out_emb):
945
+ super().__init__()
946
+
947
+ # print("parameters are being overridden when using HTSAT")
948
+ # print("HTSAT only supports loading a pretrained model on AudioSet")
949
+ # @TODO later look at what parameters are same and can be merged
950
+
951
+ self.htsat = HTSAT_Swin_Transformer(config=config)
952
+
953
+ def forward(self, x):
954
+ out_dict = self.htsat(x)
955
+ out_dict['embedding'] = out_dict['latent_output']
956
+ return out_dict
ms_clap/src/models/mapper.py ADDED
@@ -0,0 +1,200 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import functional as nnf
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from enum import Enum
7
+ from transformers import GPT2LMHeadModel
8
+ from typing import Tuple, Optional, Union
9
+
10
+ def get_clapcap(name: str):
11
+ if name == "ClapCaption":
12
+ return ClapCaptionModel
13
+ else:
14
+ raise Exception('The ClapCap model {} is incorrect or not supported'.format(name))
15
+
16
+ class MappingType(Enum):
17
+ MLP = 'mlp'
18
+ Transformer = 'transformer'
19
+
20
+ class MLP(nn.Module):
21
+ def __init__(self, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
22
+ super(MLP, self).__init__()
23
+ layers = []
24
+ for i in range(len(sizes) - 1):
25
+ layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=bias))
26
+ if i < len(sizes) - 2:
27
+ layers.append(act())
28
+ self.model = nn.Sequential(*layers)
29
+
30
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
31
+ return self.model(x)
32
+
33
+
34
+ class MlpTransformer(nn.Module):
35
+ def __init__(self, in_dim, h_dim, out_d: Optional[int] = None, act=nnf.relu, dropout=0.):
36
+ super().__init__()
37
+ out_d = out_d if out_d is not None else in_dim
38
+ self.fc1 = nn.Linear(in_dim, h_dim)
39
+ self.act = act
40
+ self.fc2 = nn.Linear(h_dim, out_d)
41
+ self.dropout = nn.Dropout(dropout)
42
+
43
+ def forward(self, x):
44
+ x = self.fc1(x)
45
+ x = self.act(x)
46
+ x = self.dropout(x)
47
+ x = self.fc2(x)
48
+ x = self.dropout(x)
49
+ return x
50
+
51
+ class MultiHeadAttention(nn.Module):
52
+
53
+ def __init__(self, dim_self, dim_ref, num_heads, bias=True, dropout=0.):
54
+ super().__init__()
55
+ self.num_heads = num_heads
56
+ head_dim = dim_self // num_heads
57
+ self.scale = head_dim ** -0.5
58
+ self.to_queries = nn.Linear(dim_self, dim_self, bias=bias)
59
+ self.to_keys_values = nn.Linear(dim_ref, dim_self * 2, bias=bias)
60
+ self.project = nn.Linear(dim_self, dim_self)
61
+ self.dropout = nn.Dropout(dropout)
62
+
63
+ def forward(self, x, y=None, mask=None):
64
+ y = y if y is not None else x
65
+ b, n, c = x.shape
66
+ _, m, d = y.shape
67
+ # b n h dh
68
+ queries = self.to_queries(x).reshape(b, n, self.num_heads, c // self.num_heads)
69
+ # b m 2 h dh
70
+ keys_values = self.to_keys_values(y).reshape(b, m, 2, self.num_heads, c // self.num_heads)
71
+ keys, values = keys_values[:, :, 0], keys_values[:, :, 1]
72
+ attention = torch.einsum('bnhd,bmhd->bnmh', queries, keys) * self.scale
73
+ if mask is not None:
74
+ if mask.dim() == 2:
75
+ mask = mask.unsqueeze(1)
76
+ attention = attention.masked_fill(mask.unsqueeze(3), float("-inf"))
77
+ attention = attention.softmax(dim=2)
78
+ out = torch.einsum('bnmh,bmhd->bnhd', attention, values).reshape(b, n, c)
79
+ out = self.project(out)
80
+ return out, attention
81
+
82
+
83
+ class TransformerLayer(nn.Module):
84
+
85
+ def forward_with_attention(self, x, y=None, mask=None):
86
+ x_, attention = self.attn(self.norm1(x), y, mask)
87
+ x = x + x_
88
+ x = x + self.mlp(self.norm2(x))
89
+ return x, attention
90
+
91
+ def forward(self, x, y=None, mask=None):
92
+ x = x + self.attn(self.norm1(x), y, mask)[0]
93
+ x = x + self.mlp(self.norm2(x))
94
+ return x
95
+
96
+ def __init__(self, dim_self, dim_ref, num_heads, mlp_ratio=4., bias=False, dropout=0., act=nnf.relu,
97
+ norm_layer: nn.Module = nn.LayerNorm):
98
+ super().__init__()
99
+ self.norm1 = norm_layer(dim_self)
100
+ self.attn = MultiHeadAttention(dim_self, dim_ref, num_heads, bias=bias, dropout=dropout)
101
+ self.norm2 = norm_layer(dim_self)
102
+ self.mlp = MlpTransformer(dim_self, int(dim_self * mlp_ratio), act=act, dropout=dropout)
103
+
104
+
105
+ class Transformer(nn.Module):
106
+ def __init__(self, dim_self: int, num_heads: int, num_layers: int, dim_ref: Optional[int] = None,
107
+ mlp_ratio: float = 2., act=nnf.relu, norm_layer: nn.Module = nn.LayerNorm, enc_dec: bool = False):
108
+ super(Transformer, self).__init__()
109
+ dim_ref = dim_ref if dim_ref is not None else dim_self
110
+ self.enc_dec = enc_dec
111
+ if enc_dec:
112
+ num_layers = num_layers * 2
113
+ layers = []
114
+ for i in range(num_layers):
115
+ if i % 2 == 0 and enc_dec: # cross
116
+ layers.append(TransformerLayer(dim_self, dim_ref, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
117
+ elif enc_dec: # self
118
+ layers.append(TransformerLayer(dim_self, dim_self, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
119
+ else: # self or cross
120
+ layers.append(TransformerLayer(dim_self, dim_ref, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
121
+ self.layers = nn.ModuleList(layers)
122
+
123
+ def forward_with_attention(self, x, y=None, mask=None):
124
+ attentions = []
125
+ for layer in self.layers:
126
+ x, att = layer.forward_with_attention(x, y, mask)
127
+ attentions.append(att)
128
+ return x, attentions
129
+
130
+ def forward(self, x, y=None, mask=None):
131
+ for i, layer in enumerate(self.layers):
132
+ if i % 2 == 0 and self.enc_dec: # cross
133
+ x = layer(x, y)
134
+ elif self.enc_dec: # self
135
+ x = layer(x, x, mask)
136
+ else: # self or cross
137
+ x = layer(x, y, mask)
138
+ return x
139
+
140
+
141
+ class TransformerMapper(nn.Module):
142
+ def __init__(self, dim_clip: int, dim_embedding: int, prefix_length: int, clip_length: int, num_layers: int = 8):
143
+ super(TransformerMapper, self).__init__()
144
+ self.clip_length = clip_length
145
+ self.transformer = Transformer(dim_embedding, 8, num_layers)
146
+ self.linear = nn.Linear(dim_clip, clip_length * dim_embedding)
147
+ self.prefix_const = nn.Parameter(torch.randn(prefix_length, dim_embedding), requires_grad=True)
148
+
149
+ def forward(self, x):
150
+ x = self.linear(x).view(x.shape[0], self.clip_length, -1)
151
+ prefix = self.prefix_const.unsqueeze(0).expand(x.shape[0], *self.prefix_const.shape)
152
+ prefix = torch.cat((x, prefix), dim=1)
153
+ out = self.transformer(prefix)[:, self.clip_length:]
154
+ return out
155
+
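A sketch of how the mapper above turns a single audio embedding into a fixed-length prefix for the language model (the 512/768 dimensions, prefix length of 10, and 2 layers are assumptions here, not values mandated by this file):

```python
import torch

mapper = TransformerMapper(dim_clip=512, dim_embedding=768,
                           prefix_length=10, clip_length=10, num_layers=2)
audio_emb = torch.randn(4, 512)          # e.g. one CLAP embedding per clip
prefix = mapper(audio_emb)
print(prefix.shape)                      # torch.Size([4, 10, 768])
```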
156
+ class ClapCaptionModel(nn.Module):
157
+ def __init__(self, clap, text_decoder: str, prefix_length: int, clip_length: Optional[int] = None, prefix_size: int = 512,
158
+ num_layers: int = 8, normalize_prefix: bool = True, mapping_type: str = None,\
159
+ freeze_audio_encoder_weights: bool = True, freeze_gpt_weights: bool = True):
160
+ super(ClapCaptionModel, self).__init__()
161
+ self.clap = clap.audio_encoder
162
+ self.prefix_length = prefix_length
163
+ self.normalize_prefix = normalize_prefix
164
+ self.gpt = GPT2LMHeadModel.from_pretrained(text_decoder)
165
+ self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
166
+ if mapping_type == 'mlp':
167
+ self.clap_project = MLP((prefix_size, (self.gpt_embedding_size * prefix_length) // 2,
168
+ self.gpt_embedding_size * prefix_length))
169
+ else:
170
+ self.clap_project = TransformerMapper(prefix_size, self.gpt_embedding_size, prefix_length,
171
+ clip_length, num_layers)
172
+
173
+ # Freeze all CLAP parameters
174
+ if freeze_audio_encoder_weights:
175
+ for p in self.clap.parameters():
176
+ p.requires_grad = False
177
+
178
+ if freeze_gpt_weights:
179
+ for p in self.gpt.parameters():
180
+ p.requires_grad = False
181
+
182
+ def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
183
+ return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)
184
+
185
+ def forward(self, audios: torch.Tensor, tokens: torch.Tensor, mask: Optional[torch.Tensor] = None,
186
+ labels: Optional[torch.Tensor] = None):
187
+ # get audio embeddings
188
+ prefix, _ = self.clap(audios)
189
+ # normalize prefix (audio embedding)
190
+ if self.normalize_prefix:
191
+ prefix = prefix / prefix.norm(2, -1).reshape(-1,1)
192
+
193
+ embedding_text = self.gpt.transformer.wte(tokens['input_ids'])
194
+ prefix_projections = self.clap_project(prefix).view(-1, self.prefix_length, self.gpt_embedding_size)
195
+ embedding_cat = torch.cat((prefix_projections, embedding_text), dim=1)
196
+ if labels is not None:
197
+ dummy_token = self.get_dummy_token(tokens['input_ids'].shape[0], tokens['input_ids'].device)
198
+ labels = torch.cat((dummy_token, tokens['input_ids']), dim=1)
199
+ out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
200
+ return out
ms_clap/src/models/pytorch_utils.py ADDED
@@ -0,0 +1,184 @@
1
+ import numpy as np
2
+ import time
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ def move_data_to_device(x, device):
8
+ if 'float' in str(x.dtype):
9
+ x = torch.Tensor(x)
10
+ elif 'int' in str(x.dtype):
11
+ x = torch.LongTensor(x)
12
+ else:
13
+ return x
14
+
15
+ return x.to(device)
16
+
17
+
18
+ def do_mixup(x, mixup_lambda):
19
+ """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes
20
+ (1, 3, 5, ...).
21
+ Args:
22
+ x: (batch_size * 2, ...)
23
+ mixup_lambda: (batch_size * 2,)
24
+ Returns:
25
+ out: (batch_size, ...)
26
+ """
27
+ out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \
28
+ x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1)
29
+ return out
30
+
31
+
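A minimal sketch of do_mixup: a batch of 4 examples collapses into 2 mixed examples, pairing indices (0, 1) and (2, 3). The waveform length and lambda values below are illustrative.

```python
import torch

x = torch.randn(4, 16000)                       # (batch_size * 2, ...)
lam = torch.tensor([0.7, 0.3, 0.5, 0.5])        # (batch_size * 2,)
mixed = do_mixup(x, lam)
print(mixed.shape)                              # torch.Size([2, 16000])
```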
32
+ def append_to_dict(dict, key, value):
33
+ if key in dict.keys():
34
+ dict[key].append(value)
35
+ else:
36
+ dict[key] = [value]
37
+
38
+
39
+ def interpolate(x, ratio):
40
+ """Interpolate data in time domain. This is used to compensate the
41
+ resolution reduction in downsampling of a CNN.
42
+
43
+ Args:
44
+ x: (batch_size, time_steps, classes_num)
45
+ ratio: int, ratio to interpolate
46
+ Returns:
47
+ upsampled: (batch_size, time_steps * ratio, classes_num)
48
+ """
49
+ (batch_size, time_steps, classes_num) = x.shape
50
+ upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
51
+ upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
52
+ return upsampled
53
+
54
+
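For example (sizes assumed), framewise predictions that were downsampled 32x in time can be brought back to the original frame rate:

```python
import torch

x = torch.randn(2, 10, 527)          # (batch_size, time_steps, classes_num)
up = interpolate(x, ratio=32)
print(up.shape)                      # torch.Size([2, 320, 527])
```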
55
+ def pad_framewise_output(framewise_output, frames_num):
56
+ """Pad framewise_output to the same length as input frames. The pad value
57
+ is the same as the value of the last frame.
58
+ Args:
59
+ framewise_output: (batch_size, frames_num, classes_num)
60
+ frames_num: int, number of frames to pad
61
+ Outputs:
62
+ output: (batch_size, frames_num, classes_num)
63
+ """
64
+ pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
65
+ """tensor for padding"""
66
+
67
+ output = torch.cat((framewise_output, pad), dim=1)
68
+ """(batch_size, frames_num, classes_num)"""
69
+
70
+ return output
71
+
72
+
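And a matching sketch for pad_framewise_output, which extends the framewise output to the requested number of frames by repeating the last frame (sizes assumed):

```python
import torch

x = torch.randn(2, 100, 527)
padded = pad_framewise_output(x, frames_num=128)
print(padded.shape)                  # torch.Size([2, 128, 527])
```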
73
+ def count_parameters(model):
74
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
75
+
76
+
77
+ def count_flops(model, audio_length):
78
+ """Count flops. Code modified from others' implementation.
79
+ """
80
+ multiply_adds = True
81
+ list_conv2d=[]
82
+ def conv2d_hook(self, input, output):
83
+ batch_size, input_channels, input_height, input_width = input[0].size()
84
+ output_channels, output_height, output_width = output[0].size()
85
+
86
+ kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
87
+ bias_ops = 1 if self.bias is not None else 0
88
+
89
+ params = output_channels * (kernel_ops + bias_ops)
90
+ flops = batch_size * params * output_height * output_width
91
+
92
+ list_conv2d.append(flops)
93
+
94
+ list_conv1d=[]
95
+ def conv1d_hook(self, input, output):
96
+ batch_size, input_channels, input_length = input[0].size()
97
+ output_channels, output_length = output[0].size()
98
+
99
+ kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
100
+ bias_ops = 1 if self.bias is not None else 0
101
+
102
+ params = output_channels * (kernel_ops + bias_ops)
103
+ flops = batch_size * params * output_length
104
+
105
+ list_conv1d.append(flops)
106
+
107
+ list_linear=[]
108
+ def linear_hook(self, input, output):
109
+ batch_size = input[0].size(0) if input[0].dim() == 2 else 1
110
+
111
+ weight_ops = self.weight.nelement() * (2 if multiply_adds else 1)
112
+ bias_ops = self.bias.nelement()
113
+
114
+ flops = batch_size * (weight_ops + bias_ops)
115
+ list_linear.append(flops)
116
+
117
+ list_bn=[]
118
+ def bn_hook(self, input, output):
119
+ list_bn.append(input[0].nelement() * 2)
120
+
121
+ list_relu=[]
122
+ def relu_hook(self, input, output):
123
+ list_relu.append(input[0].nelement() * 2)
124
+
125
+ list_pooling2d=[]
126
+ def pooling2d_hook(self, input, output):
127
+ batch_size, input_channels, input_height, input_width = input[0].size()
128
+ output_channels, output_height, output_width = output[0].size()
129
+
130
+ kernel_ops = self.kernel_size * self.kernel_size
131
+ bias_ops = 0
132
+ params = output_channels * (kernel_ops + bias_ops)
133
+ flops = batch_size * params * output_height * output_width
134
+
135
+ list_pooling2d.append(flops)
136
+
137
+ list_pooling1d=[]
138
+ def pooling1d_hook(self, input, output):
139
+ batch_size, input_channels, input_length = input[0].size()
140
+ output_channels, output_length = output[0].size()
141
+
142
+ kernel_ops = self.kernel_size[0]
143
+ bias_ops = 0
144
+
145
+ params = output_channels * (kernel_ops + bias_ops)
146
+ flops = batch_size * params * output_length
147
+
148
+ list_pooling2d.append(flops)
149
+
150
+ def foo(net):
151
+ childrens = list(net.children())
152
+ if not childrens:
153
+ if isinstance(net, nn.Conv2d):
154
+ net.register_forward_hook(conv2d_hook)
155
+ elif isinstance(net, nn.Conv1d):
156
+ net.register_forward_hook(conv1d_hook)
157
+ elif isinstance(net, nn.Linear):
158
+ net.register_forward_hook(linear_hook)
159
+ elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d):
160
+ net.register_forward_hook(bn_hook)
161
+ elif isinstance(net, nn.ReLU):
162
+ net.register_forward_hook(relu_hook)
163
+ elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d):
164
+ net.register_forward_hook(pooling2d_hook)
165
+ elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d):
166
+ net.register_forward_hook(pooling1d_hook)
167
+ else:
168
+ print('Warning: flop of module {} is not counted!'.format(net))
169
+ return
170
+ for c in childrens:
171
+ foo(c)
172
+
173
+ # Register hook
174
+ foo(model)
175
+
176
+ device = next(model.parameters()).device
177
+ input = torch.rand(1, audio_length).to(device)
178
+
179
+ out = model(input)
180
+
181
+ total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \
182
+ sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d)
183
+
184
+ return total_flops
ms_clap/src/models/utils.py ADDED
@@ -0,0 +1,26 @@
1
+ import argparse
2
+ import yaml
3
+ import sys
4
+
5
+ def read_config_as_args(config_path,args=None,is_config_str=False):
6
+ return_dict = {}
7
+
8
+ if config_path is not None:
9
+ if is_config_str:
10
+ yml_config = yaml.load(config_path, Loader=yaml.FullLoader)
11
+ else:
12
+ with open(config_path, "r") as f:
13
+ yml_config = yaml.load(f, Loader=yaml.FullLoader)
14
+
15
+ if args != None:
16
+ for k, v in yml_config.items():
17
+ if k in args.__dict__:
18
+ args.__dict__[k] = v
19
+ else:
20
+ sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k))
21
+ else:
22
+ for k, v in yml_config.items():
23
+ return_dict[k] = v
24
+
25
+ args = args if args != None else return_dict
26
+ return argparse.Namespace(**args)
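A minimal usage sketch of the helper above, assuming it is imported from this module; the YAML keys shown are placeholders rather than the repository's actual config:

```python
cfg = read_config_as_args("sample_rate: 44100\nmel_bins: 64", is_config_str=True)
print(cfg.sample_rate, cfg.mel_bins)   # 44100 64
```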
ms_clap/src/zero_shot_classification.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ This is an example using CLAP to perform zeroshot
3
+ classification on ESC50 (https://github.com/karolpiczak/ESC-50).
4
+ """
5
+
6
+ from CLAPWrapper import CLAPWrapper
7
+ from esc50_dataset import ESC50
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+ from sklearn.metrics import accuracy_score
12
+
13
+ # Load dataset
14
+ root_path = "root_path" # Folder with ESC-50-master/
15
+ dataset = ESC50(root=root_path, download=True) #If download=False code assumes base_folder='ESC-50-master' in esc50_dataset.py
16
+ prompt = 'this is the sound of '
17
+ y = [prompt + x for x in dataset.classes]
18
+
19
+ # Load and initialize CLAP
20
+ weights_path = "weights_path"
21
+ clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
22
+
23
+ # Computing text embeddings
24
+ text_embeddings = clap_model.get_text_embeddings(y)
25
+
26
+ # Computing audio embeddings
27
+ y_preds, y_labels = [], []
28
+ for i in tqdm(range(len(dataset))):
29
+ x, _, one_hot_target = dataset.__getitem__(i)
30
+ audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
31
+ similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
32
+ y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
33
+ y_preds.append(y_pred)
34
+ y_labels.append(one_hot_target.detach().cpu().numpy())
35
+
36
+
37
+ y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
38
+ acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
39
+ print('ESC50 Accuracy {}'.format(acc))
40
+
41
+ """
42
+ The output:
43
+
44
+ ESC50 Accuracy: 93.9%
45
+
46
+ """
ms_clap/src/zero_shot_predictions.py ADDED
@@ -0,0 +1,51 @@
1
+ """
2
+ This is an example using CLAP for zero-shot inference.
3
+ """
4
+ from CLAPWrapper import CLAPWrapper
5
+ import torch.nn.functional as F
6
+
7
+ # Define classes for zero-shot
8
+ # Should be in lower case and can be more than one word
9
+ classes = ['coughing','sneezing','drinking sipping', 'breathing', 'brushing teeth']
10
+ ground_truth = ['coughing']
11
+ # Add prompt
12
+ prompt = 'this is a sound of '
13
+ class_prompts = [prompt + x for x in classes]
14
+ #Load audio files
15
+ audio_files = ['audio_file']
16
+
17
+ # Load and initialize CLAP
18
+ weights_path = "weights_path"
19
+ # Setting use_cuda = True will load the model on a GPU using CUDA
20
+ clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)
21
+
22
+ # compute text embeddings from natural text
23
+ text_embeddings = clap_model.get_text_embeddings(class_prompts)
24
+
25
+ # compute the audio embeddings from an audio file
26
+ audio_embeddings = clap_model.get_audio_embeddings(audio_files, resample=True)
27
+
28
+ # compute the similarity between audio_embeddings and text_embeddings
29
+ similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
30
+
31
+ similarity = F.softmax(similarity, dim=1)
32
+ values, indices = similarity[0].topk(5)
33
+
34
+ # Print the results
35
+ print("Ground Truth: {}".format(ground_truth))
36
+ print("Top predictions:\n")
37
+ for value, index in zip(values, indices):
38
+ print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")
39
+
40
+ """
41
+ The output (the exact numbers may vary):
42
+
43
+ Ground Truth: coughing
44
+ Top predictions:
45
+
46
+ coughing: 98.55%
47
+ sneezing: 1.24%
48
+ drinking sipping: 0.15%
49
+ breathing: 0.02%
50
+ brushing teeth: 0.01%
51
+ """
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ numpy
2
+ scipy
3
+ scikit-learn
4
+ librosa
5
+ soundfile
6
+ pydub
7
+ torch==2.0.1
8
+ torchaudio==2.0.2
9
+ torchlibrosa==0.1.0
10
+ torchvision==0.15.2
11
+ transformers==4.27.4
12
+ einops
13
+ huggingface-hub
14
+ laion-clap==1.1.3
src/.DS_Store ADDED
Binary file (6.15 kB).
 
src/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
src/factory.py ADDED
@@ -0,0 +1,198 @@
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import sys
8
+ sys.path.append('../')
9
+
10
+ from typing import Optional
11
+ from copy import deepcopy
12
+
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+ from ms_clap.src.CLAPWrapper import CLAPWrapper
+ from laion_clap import CLAP_Module # used by the 'laion-clap' branch below
15
+
16
+ import torch
17
+ from torch import nn
18
+
19
+ try:
20
+ from .flamingo import Flamingo
21
+ from .flamingo_lm import FlamingoLMMixin
22
+ from .utils import extend_instance
23
+ except:
24
+ from flamingo import Flamingo
25
+ from flamingo_lm import FlamingoLMMixin
26
+ from utils import extend_instance
27
+
28
+
29
+ class CLAP(nn.Module):
30
+ def __init__(self, clap_config):
31
+ super(CLAP, self).__init__()
32
+ self.method = clap_config["method"]
33
+ device_id = f'cuda:{torch.cuda.current_device()}'
34
+
35
+ if self.method == 'laion-clap':
36
+ # https://github.com/LAION-AI/CLAP
37
+ if clap_config["model_name"] in ['630k-audioset-best', '630k-best', '630k-audioset-fusion-best', '630k-fusion-best']:
38
+ amodel = 'HTSAT-tiny'
39
+ elif clap_config["model_name"] in ['music_speech_audioset_epoch_15_esc_89.98']:
40
+ amodel = 'HTSAT-base'
41
+ else:
42
+ raise NotImplementedError
43
+
44
+ enable_fusion = 'fusion' in clap_config["model_name"].lower()
45
+ self.laion_clap = CLAP_Module(enable_fusion=enable_fusion, amodel=amodel, device=device_id)
46
+ self.laion_clap.load_ckpt(ckpt=clap_config["checkpoint"])
47
+
48
+ for param in self.laion_clap.parameters():
49
+ param.requires_grad = False
50
+ self.laion_clap.eval()
51
+
52
+ print('loaded laion-clap model: {}'.format(clap_config["checkpoint"]))
53
+
54
+ elif self.method == 'microsoft-clap':
55
+ # https://github.com/microsoft/CLAP
56
+ self.ms_clap = CLAPWrapper(
57
+ clap_config["checkpoint"],
58
+ config_root=clap_config["config_root"],
59
+ version=clap_config['model_name'],
60
+ use_cuda=True
61
+ )
62
+
63
+ if clap_config['model_name'] in ['2022', '2023']:
64
+ for param in self.ms_clap.clap.parameters():
65
+ param.requires_grad = False
66
+ self.ms_clap.clap.eval()
67
+ else:
68
+ for param in self.ms_clap.clapcap.parameters():
69
+ param.requires_grad = False
70
+ self.ms_clap.clapcap.eval()
71
+
72
+ print('loaded microsoft-clap model: {}'.format(clap_config["checkpoint"]))
73
+
74
+ else:
75
+ raise NotImplementedError
76
+
77
+ def forward(self, audio_clips):
78
+
79
+ if len(audio_clips.shape) == 2:
80
+ audio_clips = audio_clips.unsqueeze(0)
81
+ assert len(audio_clips.shape) == 3
82
+
83
+ audio_embeds = []
84
+ for x in audio_clips:
85
+ if self.method == 'laion-clap':
86
+ audio_embed = self.laion_clap.get_audio_embedding_from_data(x=x, use_tensor=True)
87
+ elif self.method == 'microsoft-clap':
88
+ audio_embed = self.ms_clap.get_audio_embeddings_from_clips(x)
89
+
90
+ audio_embeds.append(audio_embed)
91
+
92
+ audio_embeds = torch.stack(audio_embeds, dim=0)
93
+ audio_embeds.requires_grad = False
94
+
95
+ return audio_embeds
96
+
97
+
98
+ def create_model_and_transforms(
99
+ clap_config: dict,
100
+ lang_encoder_path: str,
101
+ tokenizer_path: str,
102
+ audio_transformer_kwargs: dict,
103
+ cross_attn_every_n_layers: int = 1,
104
+ use_local_files: bool = False,
105
+ decoder_layers_attr_name: str = None,
106
+ freeze_lm_embeddings: bool = False,
107
+ unfreeze_full_lm: bool = False,
108
+ cache_dir: Optional[str] = None,
109
+ **flamingo_kwargs,
110
+ ):
111
+ clap = CLAP(clap_config)
112
+
113
+ text_tokenizer = AutoTokenizer.from_pretrained(
114
+ tokenizer_path,
115
+ local_files_only=use_local_files,
116
+ trust_remote_code=True,
117
+ cache_dir=cache_dir,
118
+ )
119
+ text_tokenizer.add_special_tokens(
120
+ {"additional_special_tokens": ["<audio>", "<|endofchunk|>"]}
121
+ )
122
+ if text_tokenizer.pad_token is None:
123
+ text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
124
+ if text_tokenizer.sep_token is None:
125
+ text_tokenizer.add_special_tokens({"sep_token": "<SEP>"})
126
+
127
+ lang_encoder = AutoModelForCausalLM.from_pretrained(
128
+ lang_encoder_path,
129
+ local_files_only=use_local_files,
130
+ trust_remote_code=True,
131
+ cache_dir=cache_dir,
132
+ )
133
+
134
+ extend_instance(lang_encoder, FlamingoLMMixin)
135
+
136
+ if decoder_layers_attr_name is None:
137
+ decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
138
+ lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
139
+ lang_encoder.resize_token_embeddings(len(text_tokenizer))
140
+
141
+ unfreeze_clap = False
142
+
143
+ model = Flamingo(
144
+ clap,
145
+ unfreeze_clap,
146
+ lang_encoder,
147
+ text_tokenizer.encode("<|endofchunk|>")[-1],
148
+ text_tokenizer.encode("<audio>")[-1],
149
+ text_tokenizer.sep_token_id,
150
+ audio_embed_dim=clap_config["audio_embed_dim"],
151
+ audio_transformer_kwargs=audio_transformer_kwargs,
152
+ cross_attn_every_n_layers=cross_attn_every_n_layers,
153
+ **flamingo_kwargs,
154
+ )
155
+
156
+ model.requires_grad_(False)
157
+ assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
158
+
159
+ model.audio_transformer.requires_grad_(True)
160
+ model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
161
+ if not freeze_lm_embeddings:
162
+ model.lang_encoder.get_input_embeddings().requires_grad_(True)
163
+
164
+ if unfreeze_full_lm:
165
+ model.lang_encoder.requires_grad_(True)
166
+
167
+ if unfreeze_clap:
168
+ model.clap.requires_grad_(True)
169
+
170
+ print("Flamingo model initialized with {:,} trainable parameters (audio transformer has {:,}, LM has {:,})".format(
171
+ sum(p.numel() for p in model.parameters() if p.requires_grad),
172
+ sum(p.numel() for p in model.audio_transformer.parameters() if p.requires_grad),
173
+ sum(p.numel() for p in model.lang_encoder.parameters() if p.requires_grad)
174
+ ))
175
+
176
+ return model, text_tokenizer
177
+
178
+
179
+ def _infer_decoder_layers_attr_name(model):
180
+ for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
181
+ if k.lower() in model.__class__.__name__.lower():
182
+ return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
183
+
184
+ raise ValueError(
185
+ f"We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
186
+ )
187
+
188
+
189
+ __KNOWN_DECODER_LAYERS_ATTR_NAMES = {
190
+ "opt": "model.decoder.layers",
191
+ "gptj": "transformer.h",
192
+ "gpt-j": "transformer.h",
193
+ "pythia": "gpt_neox.layers",
194
+ "llama": "model.layers",
195
+ "gptneoxforcausallm": "gpt_neox.layers",
196
+ "mpt": "transformer.blocks",
197
+ "mosaicgpt": "transformer.blocks",
198
+ }
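As an illustration of the lookup above (the stand-in class below exists only so its name contains "opt"; it is not a real transformers import):

```python
class OPTForCausalLM:          # stand-in: name contains "opt", so the lookup matches
    pass

print(_infer_decoder_layers_attr_name(OPTForCausalLM()))   # model.decoder.layers
```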
src/flamingo.py ADDED
@@ -0,0 +1,260 @@
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
+ # Licensed under the MIT license.
+
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
+ # LICENSE is in incl_licenses directory.
+
+ import torch
+ from einops import rearrange
+ from torch import nn
+
+ from torch.distributed.fsdp.wrap import (
+     enable_wrap,
+     wrap,
+ )
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+ from torch.distributed.fsdp import (
+     FullyShardedDataParallel as FSDP,
+ )
+
+ try:
+     from .helpers import TransformerEncoder
+     from .utils import apply_with_stopping_condition
+ except:
+     from helpers import TransformerEncoder
+     from utils import apply_with_stopping_condition
+
+
+ class Flamingo(nn.Module):
+     def __init__(
+         self,
+         clap: nn.Module,
+         unfreeze_clap: bool,
+         lang_encoder: nn.Module,
+         eoc_token_id: int,
+         media_token_id: int,
+         sep_token_id: int,
+         audio_embed_dim: int,
+         audio_transformer_kwargs: dict,
+         cross_attn_every_n_layers: int = 1,
+         gradient_checkpointing: bool = False,
+     ):
+         super().__init__()
+
+         self.eoc_token_id = eoc_token_id
+         self.media_token_id = media_token_id
+         self.sep_token_id = sep_token_id
+         self.audio_embed_dim = audio_embed_dim
+         self.clap = clap  # .to(torch.cuda.current_device())
+         self.unfreeze_clap = unfreeze_clap
+         self.clap.requires_grad_(unfreeze_clap)
+
+         if hasattr(lang_encoder.config, "d_model"):
+             self.lang_dim = lang_encoder.config.d_model  # mpt uses d_model
+         else:
+             self.lang_dim = lang_encoder.config.hidden_size
+
+         n_head = audio_transformer_kwargs["n_head"]
+         n_layers = audio_transformer_kwargs["n_layers"]
+         d_inner = audio_transformer_kwargs["d_inner"]
+         max_num_media = audio_transformer_kwargs["max_num_media"]
+         max_window_per_audio = audio_transformer_kwargs["max_window_per_audio"]
+         assert audio_embed_dim % n_head == 0
+
+         self.audio_transformer = TransformerEncoder(
+             d_word_vec=audio_embed_dim,
+             n_layers=n_layers,
+             n_head=n_head,
+             d_k=audio_embed_dim // n_head,
+             d_v=audio_embed_dim // n_head,
+             d_model=audio_embed_dim,
+             d_inner=d_inner,
+             dropout=0.0,
+             n_position=max_num_media,
+             scale_emb=True
+         )
+
+         self.lang_encoder = lang_encoder
+         self.lang_encoder.init_flamingo(
+             media_token_id=media_token_id,
+             lang_hidden_size=self.lang_dim,
+             audio_hidden_size=self.audio_embed_dim,
+             max_window_per_audio=max_window_per_audio,
+             cross_attn_every_n_layers=cross_attn_every_n_layers,
+             gradient_checkpointing=gradient_checkpointing,
+         )
+
+         self._use_gradient_checkpointing = gradient_checkpointing
+         self.audio_transformer._use_gradient_checkpointing = gradient_checkpointing
+         self.clap._use_gradient_checkpointing = gradient_checkpointing
+
+     def forward(
+         self,
+         audio_x: torch.Tensor,
+         audio_x_mask: torch.Tensor,
+         lang_x: torch.Tensor,
+         attention_mask: torch.Tensor = None,
+         labels: torch.Tensor = None,
+         clear_conditioned_layers: bool = True,
+         past_key_values=None,
+         use_cache: bool = False,
+     ):
+         assert (
+             self.lang_encoder.initialized_flamingo
+         ), "Flamingo layers are not initialized. Please call `init_flamingo` first."
+
+         assert (
+             self.lang_encoder._use_cached_audio_x or audio_x is not None
+         ), "Must provide either audio_x or have precached media using cache_media()."
+
+         if self.lang_encoder._use_cached_audio_x:
+             assert (
+                 audio_x is None
+             ), "Expect audio_x to be None when media has been cached using cache_media(). Try uncache_media() first."
+             assert self.lang_encoder.is_conditioned()
+
+         else:
+             self._encode_audio_x(audio_x=audio_x, audio_x_mask=audio_x_mask)
+             self._condition_media_locations(input_ids=lang_x)
+
+         output = self.lang_encoder(
+             input_ids=lang_x,
+             attention_mask=attention_mask,
+             labels=labels,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+         )
+
+         if clear_conditioned_layers:
+             self.lang_encoder.clear_conditioned_layers()
+
+         return output
+
+     def generate(
+         self,
+         audio_x: torch.Tensor,
+         audio_x_mask: torch.Tensor,
+         lang_x: torch.Tensor,
+         attention_mask: torch.Tensor = None,
+         **kwargs,
+     ):
+         num_beams = kwargs.pop("num_beams", 1)
+         if num_beams > 1:
+             audio_x = audio_x.repeat_interleave(num_beams, dim=0)
+
+         self.lang_encoder._use_cached_audio_x = True
+         self._encode_audio_x(audio_x=audio_x, audio_x_mask=audio_x_mask)
+
+         eos_token_id = kwargs.pop("eos_token_id", self.eoc_token_id)
+         output = self.lang_encoder.generate(
+             input_ids=lang_x,
+             attention_mask=attention_mask,
+             eos_token_id=eos_token_id,
+             num_beams=num_beams,
+             **kwargs,
+         )
+
+         self.lang_encoder.clear_conditioned_layers()
+         self.lang_encoder._use_cached_audio_x = False
+         return output
+
+     def _encode_audio_x(self, audio_x: torch.Tensor, audio_x_mask: torch.Tensor):
+         """
+         rearrange code based on https://github.com/dhansmair/flamingo-mini
+         """
+
+         assert audio_x.ndim == 3, "audio_x should be of shape (B, num_window, window_length)"
+
+         with torch.no_grad():
+             audio_embeds = self.clap(audio_x)
+         B, L, D = audio_embeds.shape  # L is number of windows, D is feature dim
+         assert D == self.audio_embed_dim
+
+         assert audio_x_mask.ndim == 2, "audio_x_mask should be of shape (B, L)"
+
+         if B > 1 and audio_x_mask.shape[0] == 1:
+             audio_x_mask = audio_x_mask.repeat(B, 1)
+
+         assert audio_x_mask.shape[0] == B and audio_x_mask.shape[1] == L, "{} != ({},{})".format(audio_x_mask.shape, B, L)
+
+         audio_x_out = self.audio_transformer(audio_embeds)  # B, L, D
+         audio_x_out = audio_x_out.unsqueeze(2)  # B, L, n=1, D
+         audio_x_mask = audio_x_mask.unsqueeze(2)  # B, L, n=1
+
+         for layer in self.lang_encoder._get_decoder_layers():
+             layer.condition_audio_x(audio_x_out, audio_x_mask)
+
+     def wrap_fsdp(self, wrapper_kwargs, device_id):
+         # unfreeze the decoder layers
+         for block in self.lang_encoder.old_decoder_blocks:
+             block.requires_grad_(True)
+
+         # wrap in FSDP
+         with enable_wrap(wrapper_cls=FSDP, **wrapper_kwargs):
+             self.audio_transformer = wrap(wrap(self.audio_transformer))
+             self.lang_encoder.old_decoder_blocks = nn.ModuleList(
+                 wrap(wrap(block)) for block in self.lang_encoder.old_decoder_blocks
+             )
+             self.lang_encoder.gated_cross_attn_layers = nn.ModuleList(
+                 wrap(wrap(layer)) if layer is not None else None
+                 for layer in self.lang_encoder.gated_cross_attn_layers
+             )
+             self.lang_encoder.init_flamingo_layers(self._use_gradient_checkpointing)
+             self.lang_encoder.set_input_embeddings(
+                 wrap(wrap(self.lang_encoder.get_input_embeddings()))
+             )
+
+             if hasattr(self.lang_encoder, 'set_output_embeddings'):
+                 self.lang_encoder.set_output_embeddings(
+                     wrap(wrap(self.lang_encoder.get_output_embeddings()))
+                 )
+             else:
+                 print('skip wrapping output embeddings')
+
+         # manually move non-FSDP managed parameters to device_id
+         # these are all in lang_encoder
+         apply_with_stopping_condition(
+             module=self.lang_encoder,
+             apply_fn=lambda m: m.to(device_id),
+             apply_condition=lambda m: len(list(m.children())) == 0,
+             stopping_condition=lambda m: isinstance(m, FSDP),
+         )
+
+         # clap shouldn't be wrapped; should be on each gpu
+         if self.unfreeze_clap:
+             apply_with_stopping_condition(
+                 module=self.clap,
+                 apply_fn=lambda m: m.to(device_id),
+                 apply_condition=lambda m: len(list(m.children())) == 0,
+                 stopping_condition=lambda m: isinstance(m, FSDP),
+             )
+
+         # exclude the original decoder layers from the optimizer
+         for block in self.lang_encoder.old_decoder_blocks:
+             for p in block.parameters():
+                 p.exclude_from_optimizer = True
+
+         # set up clip_grad_norm_ function
+         def clip_grad_norm_(max_norm):
+             self.audio_transformer.clip_grad_norm_(max_norm)
+             for layer in self.lang_encoder.gated_cross_attn_layers:
+                 if layer is not None:
+                     layer.clip_grad_norm_(max_norm)
+             self.lang_encoder.get_input_embeddings().clip_grad_norm_(max_norm)
+
+         self.clip_grad_norm_ = clip_grad_norm_
+
+     def _condition_media_locations(self, input_ids: torch.Tensor):
+         media_locations = (input_ids == self.media_token_id)
+
+         for layer in self.lang_encoder._get_decoder_layers():
+             layer.condition_media_locations(media_locations)
+
+     def cache_media(self, input_ids: torch.Tensor, audio_x: torch.Tensor, audio_x_mask: torch.Tensor):
+         self._encode_audio_x(audio_x=audio_x, audio_x_mask=audio_x_mask)
+         self._condition_media_locations(input_ids=input_ids)
+         self.lang_encoder._use_cached_audio_x = True
+
+     def uncache_media(self):
+         self.lang_encoder.clear_conditioned_layers()
+         self.lang_encoder._use_cached_audio_x = False
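As a rough guide to the tensor contract above, a hedged usage sketch follows; `clap_encoder`, `llm`, the token ids, and all sizes are illustrative placeholders for objects configured elsewhere in the repo, not values fixed by this file. `audio_x` is a batch of raw audio windows of shape (B, num_windows, window_length), `audio_x_mask` is (B, num_windows), and `lang_x` contains the `<audio>` placeholder token whose id is `media_token_id`.

model = Flamingo(
    clap=clap_encoder,                     # assumed: returns (B, num_windows, audio_embed_dim) features
    unfreeze_clap=False,
    lang_encoder=llm,                      # a causal LM already extended with FlamingoLMMixin
    eoc_token_id=eoc_id,
    media_token_id=audio_id,
    sep_token_id=sep_id,
    audio_embed_dim=2048,                  # assumed; must match the clap feature dim
    audio_transformer_kwargs=dict(n_head=8, n_layers=3, d_inner=4096,
                                  max_num_media=128, max_window_per_audio=16),
)
out = model(audio_x=audio_x, audio_x_mask=audio_x_mask, lang_x=lang_x,
            attention_mask=attn_mask, labels=labels)        # HF-style output with .loss
gen = model.generate(audio_x=audio_x, audio_x_mask=audio_x_mask, lang_x=prompt_ids,
                     attention_mask=prompt_mask, max_new_tokens=64, num_beams=2)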
src/flamingo_lm.py ADDED
@@ -0,0 +1,177 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION.
+ # Licensed under the MIT license.
+
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
+ # LICENSE is in incl_licenses directory.
+
+ import torch.nn as nn
+
+ try:
+     from .helpers import GatedCrossAttentionBlock
+     from .utils import getattr_recursive, setattr_recursive
+ except:
+     from helpers import GatedCrossAttentionBlock
+     from utils import getattr_recursive, setattr_recursive
+
+
+ class FlamingoLayer(nn.Module):
+     """
+     FlamingoLayer is a wrapper around the GatedCrossAttentionBlock and DecoderLayer.
+     """
+
+     def __init__(
+         self, gated_cross_attn_layer, decoder_layer, gradient_checkpointing=False
+     ):
+         super().__init__()
+         self.gated_cross_attn_layer = gated_cross_attn_layer
+         self.decoder_layer = decoder_layer
+         self.audio_x = None
+         self.audio_x_mask = None
+         self.few_shot_mask = None
+         self.media_locations = None
+         if self.gated_cross_attn_layer is not None:
+             self.gated_cross_attn_layer._use_gradient_checkpointing = (
+                 gradient_checkpointing
+             )
+         self.decoder_layer._use_gradient_checkpointing = gradient_checkpointing
+
+     def is_conditioned(self) -> bool:
+         """Check whether the layer is conditioned."""
+         return (self.audio_x is not None) and (self.audio_x_mask is not None) and (self.media_locations is not None)
+
+     def condition_audio_x(self, audio_x, audio_x_mask):
+         self.audio_x = audio_x
+         self.audio_x_mask = audio_x_mask
+
+     def condition_media_locations(self, media_locations):
+         self.media_locations = media_locations
+
+     def condition_use_cached_media(self, use_cached_media):
+         self.use_cached_media = use_cached_media
+
+     def forward(
+         self,
+         lang_x,
+         attention_mask=None,
+         **decoder_layer_kwargs,
+     ):
+         if self.gated_cross_attn_layer is not None:
+             if self.audio_x is None:
+                 raise ValueError("audio_x must be conditioned before forward pass")
+
+             if self.media_locations is None:
+                 raise ValueError(
+                     "media_locations must be conditioned before forward pass"
+                 )
+
+             lang_x = self.gated_cross_attn_layer(
+                 lang_x,
+                 self.audio_x,
+                 self.audio_x_mask,
+                 media_locations=self.media_locations,
+                 use_cached_media=self.use_cached_media,
+             )
+
+         # Normal decoder layer
+         lang_x = self.decoder_layer(
+             lang_x, attention_mask=attention_mask, **decoder_layer_kwargs
+         )
+         return lang_x
+
+
+ class FlamingoLMMixin(nn.Module):
+     """
+     Mixin to add cross-attention layers to a language model.
+     """
+
+     def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
+         self.decoder_layers_attr_name = decoder_layers_attr_name
+
+     def _get_decoder_layers(self):
+         return getattr_recursive(self, self.decoder_layers_attr_name)
+
+     def _set_decoder_layers(self, value):
+         setattr_recursive(self, self.decoder_layers_attr_name, value)
+
+     def init_flamingo(
+         self,
+         media_token_id,
+         lang_hidden_size,
+         audio_hidden_size,
+         max_window_per_audio,
+         cross_attn_every_n_layers,
+         gradient_checkpointing,
+     ):
+         """
+         Initialize Flamingo by adding a new gated cross attn to the decoder. Store the media token id for computing the media locations.
+         """
+         self.old_decoder_blocks = self._get_decoder_layers()
+         self.gated_cross_attn_layers = nn.ModuleList(
+             [
+                 GatedCrossAttentionBlock(
+                     dim=lang_hidden_size,
+                     dim_audio=audio_hidden_size,
+                     max_window_per_audio=max_window_per_audio,
+                     only_attend_immediate_media=False,
+                 )
+                 if (layer_idx + 1) % cross_attn_every_n_layers == 0
+                 else None
+                 for layer_idx, _ in enumerate(self._get_decoder_layers())
+             ]
+         )
+         self.init_flamingo_layers(gradient_checkpointing)
+         self.media_token_id = media_token_id
+         self.initialized_flamingo = True
+         self._use_cached_audio_x = False
+
+     def init_flamingo_layers(self, gradient_checkpointing):
+         """
+         Re-initializes the FlamingoLayers.
+         Propagates any changes made to self.gated_cross_attn_layers or self.old_decoder_blocks.
+         """
+         self._set_decoder_layers(
+             nn.ModuleList(
+                 [
+                     FlamingoLayer(
+                         gated_cross_attn_layer, decoder_layer, gradient_checkpointing
+                     )
+                     for gated_cross_attn_layer, decoder_layer in zip(
+                         self.gated_cross_attn_layers, self.old_decoder_blocks
+                     )
+                 ]
+             )
+         )
+
+     def forward(self, input_ids, attention_mask, **kwargs):
+         """Condition the Flamingo layers on the media locations before forward()"""
+         if not self.initialized_flamingo:
+             raise ValueError(
+                 "Flamingo layers are not initialized. Please call `init_flamingo` first."
+             )
+
+         media_locations = input_ids == self.media_token_id
+
+         use_cached_media_locations = (
+             self._use_cached_audio_x
+             and self.is_conditioned()
+             and not media_locations.any()
+         )
+
+         for layer in self._get_decoder_layers():
+             if not use_cached_media_locations:
+                 layer.condition_media_locations(media_locations)
+             layer.condition_use_cached_media(use_cached_media_locations)
+
+         kwargs["input_ids"] = input_ids
+         kwargs["attention_mask"] = attention_mask
+         return super().forward(**kwargs)
+
+     def is_conditioned(self) -> bool:
+         """Check whether all decoder layers are already conditioned."""
+         return all(l.is_conditioned() for l in self._get_decoder_layers())
+
+     def clear_conditioned_layers(self):
+         for layer in self._get_decoder_layers():
+             layer.condition_audio_x(None, None)
+             layer.condition_media_locations(None)
+             layer.condition_use_cached_media(None)
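The mixin is intended to be grafted onto an existing decoder-only model instance rather than subclassed up front. A hedged sketch of that wiring follows; the `extend_instance` helper is defined in src/utils.py below, the OPT attribute path comes from the `__KNOWN_DECODER_LAYERS_ATTR_NAMES` table earlier in this diff, and the checkpoint and numeric values are illustrative assumptions.

from transformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")   # illustrative checkpoint
extend_instance(llm, FlamingoLMMixin)                             # llm now uses the mixin's forward()
llm.set_decoder_layers_attr_name("model.decoder.layers")          # from __KNOWN_DECODER_LAYERS_ATTR_NAMES
llm.init_flamingo(
    media_token_id=audio_token_id,          # id of the <audio> placeholder token (assumed)
    lang_hidden_size=llm.config.hidden_size,
    audio_hidden_size=2048,                 # assumed; must match the audio transformer dim
    max_window_per_audio=16,                # assumed
    cross_attn_every_n_layers=1,
    gradient_checkpointing=False,
)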
src/helpers.py ADDED
@@ -0,0 +1,379 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION.
+ # Licensed under the MIT license.
+
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
+ # LICENSE is in incl_licenses directory.
+
+ # Adapted from https://github.com/lucidrains/flamingo-pytorch under the MIT license.
+ # LICENSE is in incl_licenses directory.
+
+ # Adapted from https://github.com/jadore801120/attention-is-all-you-need-pytorch under the MIT license.
+ # LICENSE is in incl_licenses directory.
+
+ from einops import rearrange, repeat
+ from einops_exts import rearrange_many
+
+ import numpy as np
+
+ import torch
+ from torch import einsum, nn
+ import torch.nn.functional as F
+
+
+ def exists(val):
+     return val is not None
+
+
+ def FeedForward(dim, mult=4):
+     inner_dim = int(dim * mult)
+     return nn.Sequential(
+         nn.LayerNorm(dim),
+         nn.Linear(dim, inner_dim, bias=False),
+         nn.GELU(),
+         nn.Linear(inner_dim, dim, bias=False),
+     )
+
+
+ class ScaledDotProductAttention(nn.Module):
+     ''' Scaled Dot-Product Attention '''
+
+     def __init__(self, temperature, attn_dropout=0.1):
+         super().__init__()
+         self.temperature = temperature
+         self.dropout = nn.Dropout(attn_dropout)
+
+     def forward(self, q, k, v, mask=None):
+
+         attn = torch.matmul(q / self.temperature, k.transpose(2, 3))
+
+         if mask is not None:
+             attn = attn.masked_fill(mask == 0, -1e9)
+
+         attn = self.dropout(F.softmax(attn, dim=-1))
+         output = torch.matmul(attn, v)
+
+         return output, attn
+
+
+ class MultiHeadAttention(nn.Module):
+     ''' Multi-Head Attention module '''
+
+     def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
+         super().__init__()
+
+         self.n_head = n_head
+         self.d_k = d_k
+         self.d_v = d_v
+
+         self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
+         self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
+         self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
+         self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
+
+         self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)
+
+         self.dropout = nn.Dropout(dropout)
+         self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+
+
+     def forward(self, q, k, v, mask=None):
+
+         d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
+         sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
+
+         residual = q
+
+         # Pass through the pre-attention projection: b x lq x (n*dv)
+         # Separate different heads: b x lq x n x dv
+         q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
+         k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
+         v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
+
+         # Transpose for attention dot product: b x n x lq x dv
+         q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
+
+         if mask is not None:
+             mask = mask.unsqueeze(1)  # For head axis broadcasting.
+
+         q, attn = self.attention(q, k, v, mask=mask)
+
+         # Transpose to move the head dimension back: b x lq x n x dv
+         # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
+         q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
+         q = self.dropout(self.fc(q))
+         q += residual
+
+         q = self.layer_norm(q)
+
+         return q, attn
+
+
+ class PositionwiseFeedForward(nn.Module):
+     ''' A two-feed-forward-layer module '''
+
+     def __init__(self, d_in, d_hid, dropout=0.1):
+         super().__init__()
+         self.w_1 = nn.Linear(d_in, d_hid)  # position-wise
+         self.w_2 = nn.Linear(d_hid, d_in)  # position-wise
+         self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+
+         residual = x
+
+         x = self.w_2(F.relu(self.w_1(x)))
+         x = self.dropout(x)
+         x += residual
+
+         x = self.layer_norm(x)
+
+         return x
+
+
+ class PositionalEncoding(nn.Module):
+
+     def __init__(self, d_hid, n_position=200):
+         super(PositionalEncoding, self).__init__()
+         self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))
+
+     def _get_sinusoid_encoding_table(self, n_position, d_hid):
+
+         def get_position_angle_vec(position):
+             return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+
+         sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+         sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+         sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+
+         return torch.FloatTensor(sinusoid_table).unsqueeze(0)
+
+     def forward(self, x):
+         return x + self.pos_table[:, :x.size(1)].clone().detach()
+
+
+ class EncoderLayer(nn.Module):
+     ''' Compose with two layers '''
+
+     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.0):
+         super(EncoderLayer, self).__init__()
+         self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
+         self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
+
+     def forward(self, enc_input, slf_attn_mask=None):
+         enc_output, enc_slf_attn = self.slf_attn(
+             enc_input, enc_input, enc_input, mask=slf_attn_mask)
+         enc_output = self.pos_ffn(enc_output)
+         return enc_output, enc_slf_attn
+
+
+ class TransformerEncoder(nn.Module):
+     ''' An encoder model with a self-attention mechanism. '''
+
+     def __init__(
+             self, d_word_vec=512, n_layers=6, n_head=8, d_k=64, d_v=64,
+             d_model=512, d_inner=2048, dropout=0.0, n_position=16, scale_emb=True):
+
+         super().__init__()
+
+         if n_position > 0:
+             self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
+         else:
+             self.position_enc = lambda x: x
+         self.dropout = nn.Dropout(p=dropout)
+         self.layer_stack = nn.ModuleList([
+             EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
+             for _ in range(n_layers)])
+         self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+         self.scale_emb = scale_emb
+         self.d_model = d_model
+
+     def forward(self, src_seq, return_attns=False):
+         if len(src_seq.shape) == 2:
+             src_seq = src_seq.unsqueeze(1)
+         B, L, D = src_seq.shape
+
+         enc_slf_attn_list = []
+
+         causal_mask = None
+
+         enc_output = src_seq
+         if self.scale_emb:
+             enc_output = enc_output * self.d_model ** 0.5
+         enc_output = self.dropout(self.position_enc(enc_output))
+         enc_output = self.layer_norm(enc_output)
+
+         for enc_layer in self.layer_stack:
+             enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=causal_mask)
+             enc_slf_attn_list += [enc_slf_attn] if return_attns else []
+
+         if return_attns:
+             return enc_output, enc_slf_attn_list
+         return enc_output
+
+
+ # gated cross attention
+ class MaskedCrossAttention(nn.Module):
+     def __init__(
+         self,
+         *,
+         dim,
+         dim_audio,
+         max_window_per_audio,
+         dim_head=64,
+         heads=8,
+         only_attend_immediate_media=True,
+     ):
+         super().__init__()
+         self.max_window_per_audio = max_window_per_audio
+         self.scale = dim_head**-0.5
+         self.heads = heads
+         inner_dim = dim_head * heads
+
+         self.norm = nn.LayerNorm(dim)
+
+         self.to_q = nn.Linear(dim, inner_dim, bias=False)
+         self.to_kv = nn.Linear(dim_audio, inner_dim * 2, bias=False)
+         self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+         self.only_attend_immediate_media = only_attend_immediate_media
+
+     def forward(
+         self,
+         x,
+         media, media_mask,
+         media_locations=None,
+         use_cached_media=False
+     ):
+
+         if not use_cached_media:
+             assert (
+                 media_locations.shape[1] == x.shape[1]
+             ), f"media_location.shape is {media_locations.shape} but x.shape is {x.shape}"
+
+         T_txt = x.shape[1]
+         B, L = media.shape[:2]
+         assert media.shape[2] == 1  # extra dim
+         assert L % self.max_window_per_audio == 0  # L must be a multiple of max_window_per_audio (e.g. 4 or 8)
+         h = self.heads
+
+         x = self.norm(x)
+
+         q = self.to_q(x)
+         media = rearrange(media, "b t n d -> b (t n) d")
+
+         k, v = self.to_kv(media).chunk(2, dim=-1)
+         q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)
+
+         q = q * self.scale
+
+         sim = einsum("... i d, ... j d -> ... i j", q, k)
+
+         # mask padded audio embeddings
+         media_mask = rearrange(media_mask, "b i n -> b 1 1 (i n)").bool()  # n = 1 is extra dim
+         sim = sim.masked_fill(~media_mask, -torch.finfo(sim.dtype).max)
+
+         assert self.only_attend_immediate_media is False
+
+         # mask media locations
+         if exists(media_locations):
+             few_shot_mask = torch.zeros(B, T_txt, L).bool().to(sim.device)
+             for batch_idx in range(B):
+                 media_locations_b = media_locations[batch_idx].nonzero()  # locations of <audio>
+                 if len(media_locations_b.shape) > 1:
+                     media_locations_b = media_locations_b.squeeze(-1)
+
+                 for i in range(-1, len(media_locations_b)):
+                     if i == -1:
+                         if len(media_locations_b) == 1:
+                             text_start, text_end = 0, T_txt
+                         else:
+                             text_start, text_end = 0, media_locations_b[i+1]
+
+                     elif i == len(media_locations_b) - 1:
+                         text_start, text_end = media_locations_b[i], T_txt
+
+                     else:
+                         text_start, text_end = media_locations_b[i], media_locations_b[i+1]
+
+                     if self.only_attend_immediate_media:
+                         look_at_window_start = max(i,0) * self.max_window_per_audio
+                     else:
+                         look_at_window_start = 0
+                     look_at_window_end = (max(i,0) + 1) * self.max_window_per_audio
+
+                     few_shot_mask[batch_idx, text_start:text_end, look_at_window_start:look_at_window_end] = True
+
+             sim = sim.masked_fill(~few_shot_mask.unsqueeze(1), -torch.finfo(sim.dtype).max)
+
+         sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+         attn = sim.softmax(dim=-1)
+
+         if exists(media_locations) and self.only_attend_immediate_media:
+             # NOTE: unreachable in this adaptation since only_attend_immediate_media is asserted False above
+             # (and text_time is not defined here); kept from the original open_flamingo implementation.
+             text_without_media_mask = text_time == 0
+             text_without_media_mask = rearrange(
+                 text_without_media_mask, "b i -> b 1 i 1"
+             )
+             attn = attn.masked_fill(text_without_media_mask, 0.0)
+
+         out = einsum("... i j, ... j d -> ... i d", attn, v)
+         out = rearrange(out, "b h n d -> b n (h d)")
+         return self.to_out(out)
+
+
+ class GatedCrossAttentionBlock(nn.Module):
+     def __init__(
+         self,
+         *,
+         dim,
+         dim_audio,
+         max_window_per_audio,
+         dim_head=64,
+         heads=8,
+         ff_mult=4,
+         only_attend_immediate_media=True,
+     ):
+         super().__init__()
+         self.attn = MaskedCrossAttention(
+             dim=dim,
+             dim_audio=dim_audio,
+             max_window_per_audio=max_window_per_audio,
+             dim_head=dim_head,
+             heads=heads,
+             only_attend_immediate_media=only_attend_immediate_media,
+         )
+         self.attn_gate = nn.Parameter(torch.tensor([0.0]))
+
+         self.ff = FeedForward(dim, mult=ff_mult)
+         self.ff_gate = nn.Parameter(torch.tensor([0.0]))
+
+     def forward(
+         self,
+         x,
+         media,
+         media_mask,
+         media_locations=None,
+         use_cached_media=False,
+     ):
+         x = (
+             self.attn(
+                 x,
+                 media,
+                 media_mask,
+                 media_locations=media_locations,
+                 use_cached_media=use_cached_media,
+             )
+             * self.attn_gate.tanh()
+             + x
+         )
+         x = self.ff(x) * self.ff_gate.tanh() + x
+
+         return x
+
+
+ if __name__ == '__main__':
+     enc = TransformerEncoder().cuda()
+     x = torch.randn(4, 512).cuda()
+     output = enc(x)
+     enc._use_gradient_checkpointing = True
+     print(output.shape)
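Beyond the `__main__` smoke test above, a hedged shape check for the gated cross-attention path; all sizes are toy values chosen for illustration, and the snippet assumes helpers.py is importable. Because `attn_gate` and `ff_gate` are initialized to zero, the block starts out as an identity map over `x`, the usual Flamingo trick for stable fine-tuning.

import torch

# x: text hidden states (B, T_txt, dim); media: audio features (B, L, 1, dim_audio);
# media_mask: (B, L, 1); media_locations: (B, T_txt) bool with at least one True per row.
block = GatedCrossAttentionBlock(dim=32, dim_audio=16, max_window_per_audio=4,
                                 only_attend_immediate_media=False)
x = torch.randn(2, 6, 32)
media = torch.randn(2, 4, 1, 16)            # L=4 windows; L % max_window_per_audio == 0
media_mask = torch.ones(2, 4, 1).bool()
media_locations = torch.zeros(2, 6).bool()
media_locations[:, 0] = True                # one <audio> token at position 0
y = block(x, media, media_mask, media_locations=media_locations)
print(y.shape)                              # torch.Size([2, 6, 32]); equals x while the gates are 0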
src/utils.py ADDED
@@ -0,0 +1,54 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION.
+ # Licensed under the MIT license.
+
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
+ # LICENSE is in incl_licenses directory.
+
+ def extend_instance(obj, mixin):
+     """Apply mixins to a class instance after creation"""
+     base_cls = obj.__class__
+     base_cls_name = obj.__class__.__name__
+     obj.__class__ = type(
+         base_cls_name, (mixin, base_cls), {}
+     )  # mixin needs to go first for our forward() logic to work
+
+
+ def getattr_recursive(obj, att):
+     """
+     Return nested attribute of obj
+     Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
+     """
+     if att == "":
+         return obj
+     i = att.find(".")
+     if i < 0:
+         return getattr(obj, att)
+     else:
+         return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
+
+
+ def setattr_recursive(obj, att, val):
+     """
+     Set nested attribute of obj
+     Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
+     """
+     if "." in att:
+         obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
+     setattr(obj, att.split(".")[-1], val)
+
+
+ def apply_with_stopping_condition(
+     module, apply_fn, apply_condition=None, stopping_condition=None, **other_args
+ ):
+     if stopping_condition(module):
+         return
+     if apply_condition(module):
+         apply_fn(module, **other_args)
+     for child in module.children():
+         apply_with_stopping_condition(
+             child,
+             apply_fn,
+             apply_condition=apply_condition,
+             stopping_condition=stopping_condition,
+             **other_args
+         )
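A hedged sketch of how these helpers compose on a toy module (illustrative only; in this repo the same traversal pattern is applied to the language model inside `wrap_fsdp` above):

import torch.nn as nn

toy = nn.Sequential(nn.Linear(4, 4), nn.Sequential(nn.Linear(4, 4), nn.ReLU()))
inner = getattr_recursive(toy, "1.0")            # same object as toy[1][0] -> nn.Linear(4, 4)
setattr_recursive(toy, "1.1", nn.GELU())         # roughly toy[1][1] = nn.GELU()

# Visit only leaf modules, but never descend into the inner Sequential.
apply_with_stopping_condition(
    module=toy,
    apply_fn=lambda m: print(type(m).__name__),
    apply_condition=lambda m: len(list(m.children())) == 0,
    stopping_condition=lambda m: isinstance(m, nn.Sequential) and m is not toy,
)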