Add STYLE model with upgrades
Files changed:
- app.py +44 -21
- audiocraft/__init__.py +1 -1
- audiocraft/data/audio.py +124 -4
- audiocraft/models/__init__.py +5 -1
- audiocraft/models/builders.py +202 -109
- audiocraft/models/flow_matching.py +516 -0
- audiocraft/models/genmodel.py +267 -0
- audiocraft/models/lm.py +78 -24
- audiocraft/models/lm_magnet.py +500 -0
- audiocraft/models/loaders.py +49 -1
- audiocraft/models/magnet.py +88 -0
- audiocraft/models/musicgen.py +33 -4
- audiocraft/modules/codebooks_patterns.py +10 -6
- audiocraft/modules/conditioners.py +362 -15
- audiocraft/modules/jasco_conditioners.py +300 -0
- audiocraft/modules/transformer.py +11 -1
- audiocraft/modules/unet_transformer.py +67 -0
- audiocraft/utils/extend.py +23 -4
- audiocraft/utils/utils.py +28 -0
- requirements.txt +3 -1
app.py
CHANGED
@@ -183,7 +183,7 @@ def load_melody_filepath(melody_filepath, title, assigned_model, topp, temperature, ...):
 
     return gr.update(value=melody_name), gr.update(maximum=MAX_PROMPT_INDEX, value=-1), gr.update(value=assigned_model, interactive=True), gr.update(value=topp), gr.update(value=temperature), gr.update(value=cfg_coef), gr.update(maximum=MAX_OVERLAP)
 
-def predict(model, text, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index = 0, include_title = True, include_settings = True, harmony_only = False, profile = gr.OAuthProfile, segment_length = 30, settings_font_size=28, settings_animate_waveform=False, video_orientation="Landscape", progress=gr.Progress(track_tqdm=True)):
+def predict(model, text, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index = 0, include_title = True, include_settings = True, harmony_only = False, profile = gr.OAuthProfile, segment_length = 30, settings_font_size=28, settings_animate_waveform=False, video_orientation="Landscape", excerpt_duration=3.5, progress=gr.Progress(track_tqdm=True)):
     global MODEL, INTERRUPTED, INTERRUPTING, MOVE_TO_CPU
     output_segments = None
     melody_name = "Not Used"
@@ -251,24 +251,47 @@ def predict(model, text, melody_filepath, duration, dimension, topk, topp, temperature, ...):
 
 
     print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap}')
-    # (previous generation-parameter block, lines 254-264, not shown)
+    if ("style" in model) and melody:
+        # style and text-to-music
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=topk,
+            top_p=topp,
+            temperature=temperature,
+            cfg_coef=cfg_coef,
+            duration=segment_duration,
+            two_step_cfg=False,
+            cfg_coef_beta=5,  # double CFG is only useful for text-and-style conditioning
+        )
+
+        MODEL.set_style_conditioner_params(
+            eval_q=3,  # integer between 1 and 6
+                       # eval_q is the level of quantization that passes
+                       # through the conditioner. When low, the models adheres less to the
+                       # audio conditioning
+            excerpt_length=excerpt_duration,  # the length in seconds that is taken by the model in the provided excerpt, can be
+                                              # between 1.5 and 4.5 seconds but it has to be shortest to the length of the provided conditioning
+        )
+    else:
+        MODEL.set_generation_params(
+            use_sampling=True,
+            top_k=topk,
+            top_p=topp,
+            temperature=temperature,
+            cfg_coef=cfg_coef,
+            duration=segment_duration,
+            two_step_cfg=False,
+            extend_stride=10,
+            rep_penalty=0.5,
+            cfg_coef_beta=None,  # double CFG is only useful for text-and-style conditioning
+        )
     MODEL.set_custom_progress_callback(gr.Progress(track_tqdm=True))
 
     try:
-        if melody and ("melody" in model):
+        if melody and ("melody" or "style" in model):
             # return excess duration, load next model and continue in loop structure building up output_segments
             if duration > MODEL.duration:
-                output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.duration, prompt_index, harmony_only, progress=gr.Progress(track_tqdm=True))
+                output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.duration, prompt_index, harmony_only, excerpt_duration, progress=gr.Progress(track_tqdm=True))
             else:
                 # pure original code
                 sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
@@ -487,11 +510,11 @@ def ui(**kwargs):
         with gr.Column():
             with gr.Row():
                 with gr.Column():
-                    text = gr.Text(label="Describe your music", interactive=True, value="4/4 100bpm 320kbps
+                    text = gr.Text(label="Describe your music", interactive=True, value="4/4 100bpm 320kbps 32khz, Industrial/Electronic Soundtrack, Dark, Intense, Sci-Fi, soft fade-in, soft fade-out", key="prompt", lines=4)
                     autoplay_cb = gr.Checkbox(value=False, label="Autoplay?", key="autoplay_cb")
                 with gr.Column():
                     duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration (s)", interactive=True, key="total_duration", step=1)
-                    model = gr.Radio(["melody", "medium", "small", "large", "melody-large", "stereo-small", "stereo-medium", "stereo-large", "stereo-melody", "stereo-melody-large"], label="AI Model", value="medium", interactive=True, key="chosen_model")
+                    model = gr.Radio(["melody", "medium", "small", "large", "melody-large", "stereo-small", "stereo-medium", "stereo-large", "stereo-melody", "stereo-melody-large", "style"], label="AI Model", value="medium", interactive=True, key="chosen_model")
             with gr.Row():
                 submit = gr.Button("Generate", elem_id="btn-generate")
             # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
@@ -545,7 +568,7 @@ def ui(**kwargs):
     gr.Examples(
         examples=[
             [
-                "4/4 120bpm 320kbps
+                "4/4 120bpm 320kbps 32khz, An 80s driving pop song with heavy drums and synth pads in the background",
                 "./assets/bach.mp3",
                 "melody",
                 "80s Pop Synth",
@@ -554,7 +577,7 @@ def ui(**kwargs):
                 3.5
             ],
             [
-                "4/4 120bpm 320kbps
+                "4/4 120bpm 320kbps 32khz, A cheerful country song with acoustic guitars",
                 "./assets/bolero_ravel.mp3",
                 "stereo-melody-large",
                 "Country Guitar",
@@ -563,7 +586,7 @@ def ui(**kwargs):
                 4.0
             ],
            [
-                "4/4 120bpm 320kbps
+                "4/4 120bpm 320kbps 32khz, 90s rock song with electric guitar and heavy drums",
                 None,
                 "stereo-medium",
                 "90s Rock Guitar",
@@ -572,7 +595,7 @@ def ui(**kwargs):
                 3.75
             ],
             [
-                "4/4 120bpm 320kbps
+                "4/4 120bpm 320kbps 32khz, a light and cheery EDM track, with syncopated drums, aery pads, and strong emotions",
                 "./assets/bach.mp3",
                 "melody-large",
                 "EDM my Bach",
@@ -581,7 +604,7 @@ def ui(**kwargs):
                 3.75
             ],
            [
-                "4/4 320kbps
+                "4/4 320kbps 32khz, lofi slow bpm electro chill with organic samples",
                 None,
                 "medium",
                 "LoFi Chill",
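For context, the same knobs the new "style" branch sets can be exercised outside the Gradio app. The sketch below is hedged: it assumes the upstream facebook/musicgen-style checkpoint and audiocraft's generate_with_chroma entry point from the public MusicGen-Style examples; parameter values are illustrative, not the app's exact defaults.

# Hedged sketch of the style-conditioning path added above (not the app's code).
import torchaudio
from audiocraft.models import MusicGen

model = MusicGen.get_pretrained('facebook/musicgen-style')  # assumed checkpoint id

model.set_generation_params(
    duration=30,           # segment length in seconds
    use_sampling=True,
    top_k=250,
    cfg_coef=3.0,
    cfg_coef_beta=5.0,     # double CFG, only useful for text-and-style conditioning
)
model.set_style_conditioner_params(
    eval_q=3,              # 1..6; lower values adhere less to the audio conditioning
    excerpt_length=3.5,    # seconds of the reference excerpt actually used (1.5..4.5)
)

melody_waveform, sr = torchaudio.load("./assets/bach.mp3")
wav = model.generate_with_chroma(["80s driving pop with heavy drums"], melody_waveform, sr)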
audiocraft/__init__.py
CHANGED
@@ -7,4 +7,4 @@
 # flake8: noqa
 from . import data, modules, models
 
-__version__ = '1.
+__version__ = '1.3.Surn'
audiocraft/data/audio.py
CHANGED
@@ -79,7 +79,7 @@ def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.):
     seek_time (float): Time at which to start reading in the file.
     duration (float): Duration to read from the file. If set to -1, the whole file is read.
 Returns:
-
+    tuple of torch.Tensor, int: Tuple containing audio data and sample rate
 """
 _init_av()
 with av.open(str(filepath)) as af:
@@ -115,7 +115,7 @@ def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.):
 
 
 def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
-               duration: float = -1
+               duration: float = -1.0, pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
     """Read audio by picking the most appropriate backend tool based on the audio format.
 
     Args:
@@ -124,7 +124,7 @@ def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
         duration (float): Duration to read from the file. If set to -1, the whole file is read.
         pad (bool): Pad output audio if not reaching expected duration.
     Returns:
-
+        tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
     """
     fp = Path(filepath)
     if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
@@ -299,4 +299,124 @@ def audio_write2(stem_name: tp.Union[str, Path], ...):
         # we do not want to leave half written files around.
         path.unlink()
         raise
-    return path
+    return path
+
+
+def get_spec(y, sr=16000, n_fft=4096, hop_length=128, dur=8) -> np.ndarray:
+    """Get the mel-spectrogram from the raw audio.
+
+    Args:
+        y (numpy array): raw input
+        sr (int): Sampling rate
+        n_fft (int): Number of samples per FFT. Default is 2048.
+        hop_length (int): Number of samples between successive frames. Default is 512.
+        dur (float): Maxium duration to get the spectrograms
+    Returns:
+        spectro histogram as a numpy array
+    """
+    import librosa
+    import librosa.display
+
+    spectrogram = librosa.feature.melspectrogram(
+        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
+    )
+    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
+    return spectrogram_db
+
+
+def save_spectrograms(
+    ys: tp.List[np.ndarray],
+    sr: int,
+    path: str,
+    names: tp.List[str],
+    n_fft: int = 4096,
+    hop_length: int = 128,
+    dur: float = 8.0,
+):
+    """Plot a spectrogram for an audio file.
+
+    Args:
+        ys: List of audio spectrograms
+        sr (int): Sampling rate of the audio file. Default is 22050 Hz.
+        path (str): Path to the plot file.
+        names: name of each spectrogram plot
+        n_fft (int): Number of samples per FFT. Default is 2048.
+        hop_length (int): Number of samples between successive frames. Default is 512.
+        dur (float): Maxium duration to plot the spectrograms
+
+    Returns:
+        None (plots the spectrogram using matplotlib)
+    """
+    import matplotlib as mpl  # type: ignore
+    import matplotlib.pyplot as plt  # type: ignore
+    import librosa.display
+
+    if not names:
+        names = ["Ground Truth", "Audio Watermarked", "Watermark"]
+    ys = [wav[: int(dur * sr)] for wav in ys]  # crop
+    assert len(names) == len(
+        ys
+    ), f"There are {len(ys)} wavs but {len(names)} names ({names})"
+
+    # Set matplotlib stuff
+    BIGGER_SIZE = 10
+    SMALLER_SIZE = 8
+    linewidth = 234.8775  # linewidth in pt
+
+    plt.rc("font", size=BIGGER_SIZE, family="serif")  # controls default text sizes
+    plt.rcParams["font.family"] = "DeJavu Serif"
+    plt.rcParams["font.serif"] = ["Times New Roman"]
+
+    plt.rc("axes", titlesize=BIGGER_SIZE)  # fontsize of the axes title
+    plt.rc("axes", labelsize=BIGGER_SIZE)  # fontsize of the x and y labels
+    plt.rc("xtick", labelsize=BIGGER_SIZE)  # fontsize of the tick labels
+    plt.rc("ytick", labelsize=SMALLER_SIZE)  # fontsize of the tick labels
+    plt.rc("legend", fontsize=BIGGER_SIZE)  # legend fontsize
+    plt.rc("figure", titlesize=BIGGER_SIZE)
+    height = 1.6 * linewidth / 72.0
+    fig, ax = plt.subplots(
+        nrows=len(ys),
+        ncols=1,
+        sharex=True,
+        figsize=(linewidth / 72.0, height),
+    )
+    fig.tight_layout()
+
+    # Plot the spectrogram
+
+    for i, ysi in enumerate(ys):
+        spectrogram_db = get_spec(ysi, sr=sr, n_fft=n_fft, hop_length=hop_length)
+        if i == 0:
+            cax = fig.add_axes(
+                [
+                    ax[0].get_position().x1 + 0.01,  # type: ignore
+                    ax[-1].get_position().y0,
+                    0.02,
+                    ax[0].get_position().y1 - ax[-1].get_position().y0,
+                ]
+            )
+            fig.colorbar(
+                mpl.cm.ScalarMappable(
+                    norm=mpl.colors.Normalize(
+                        np.min(spectrogram_db), np.max(spectrogram_db)
+                    ),
+                    cmap="magma",
+                ),
+                ax=ax,
+                orientation="vertical",
+                format="%+2.0f dB",
+                cax=cax,
+            )
+        librosa.display.specshow(
+            spectrogram_db,
+            sr=sr,
+            hop_length=hop_length,
+            x_axis="time",
+            y_axis="mel",
+            ax=ax[i],
+        )
+        ax[i].set(title=names[i])
+        ax[i].yaxis.set_label_text(None)
+        ax[i].label_outer()
+    fig.savefig(path, bbox_inches="tight")
+    plt.close()
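A minimal usage sketch of the two helpers added above. It assumes librosa is installed and that out.wav is a hypothetical generated clip; file names are illustrative only.

# Hedged usage sketch for get_spec / save_spectrograms; paths are made up.
import librosa
from audiocraft.data.audio import get_spec, save_spectrograms

ref, sr = librosa.load("./assets/bach.mp3", sr=32000, mono=True)
gen, _ = librosa.load("out.wav", sr=32000, mono=True)   # hypothetical generated clip

spec_db = get_spec(gen, sr=sr)                          # mel-spectrogram in dB, [n_mels, frames]
save_spectrograms([ref, gen], sr=sr, path="spectrograms.png",
                  names=["Reference", "Generated"])     # writes a stacked comparison plot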
audiocraft/models/__init__.py
CHANGED
@@ -12,6 +12,10 @@ from . import builders, loaders
 from .encodec import (
     CompressionModel, EncodecModel, DAC,
     HFEncodecModel, HFEncodecCompressionModel)
-from .musicgen import MusicGen
 from .lm import LMModel
+from .lm_magnet import MagnetLMModel
+from .flow_matching import FlowMatchingModel
 from .encodec import CompressionModel, EncodecModel
+from .musicgen import MusicGen
+from .magnet import MAGNeT
+from .unet import DiffusionUnet
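With the widened export list, the new model families can be imported straight from audiocraft.models. The sketch below is hedged: the MAGNeT checkpoint id is an assumption taken from the upstream model cards, not something this commit pins.

# Hedged sketch: exercising the newly exported classes.
from audiocraft.models import MusicGen, MAGNeT, FlowMatchingModel  # FlowMatchingModel backs JASCO

magnet = MAGNeT.get_pretrained('facebook/magnet-small-10secs')  # assumed checkpoint id
magnet.set_generation_params(temperature=3.0, top_p=0.9)        # MAGNeT's sampling knobs
wav = magnet.generate(['punchy 90s rock riff'])                  # non-autoregressive decoding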
audiocraft/models/builders.py
CHANGED
@@ -11,51 +11,53 @@ from the Hydra config.
 
 import typing as tp
 
-import audiocraft
 import omegaconf
 import torch
 
-
-
-from ..modules.codebooks_patterns import (
-    CodebooksPatternProvider,
-    DelayedPatternProvider,
-    MusicLMPattern,
-    ParallelPatternProvider,
-    UnrolledPatternProvider,
-    CoarseFirstPattern,
-)
-from ..modules.conditioners import (
-    BaseConditioner,
-    ChromaStemConditioner,
-    CLAPEmbeddingConditioner,
-    ConditionFuser,
-    ConditioningProvider,
-    LUTConditioner,
-    T5Conditioner,
-)
-from .unet import DiffusionUnet
+import audiocraft
+
 from .. import quantization as qt
-from ..
+from ..modules.codebooks_patterns import (CoarseFirstPattern,
+                                          CodebooksPatternProvider,
+                                          DelayedPatternProvider,
+                                          MusicLMPattern,
+                                          ParallelPatternProvider,
+                                          UnrolledPatternProvider)
+from ..modules.conditioners import (BaseConditioner, ChromaStemConditioner,
+                                    CLAPEmbeddingConditioner,
+                                    ConditionFuser, JascoCondConst,
+                                    ConditioningProvider, LUTConditioner,
+                                    T5Conditioner, StyleConditioner)
+from ..modules.jasco_conditioners import (JascoConditioningProvider, ChordsEmbConditioner,
+                                          DrumsConditioner, MelodyConditioner)
 from ..modules.diffusion_schedule import MultiBandProcessor, SampleProcessor
+from ..utils.utils import dict_from_config
+from .encodec import (CompressionModel, EncodecModel,
+                      InterleaveStereoCompressionModel)
+from .lm import LMModel
+from .lm_magnet import MagnetLMModel
+from .flow_matching import FlowMatchingModel
+from .unet import DiffusionUnet
+
 
 
-def get_quantizer(
+def get_quantizer(
+    quantizer: str, cfg: omegaconf.DictConfig, dimension: int
+) -> qt.BaseQuantizer:
+    klass = {"no_quant": qt.DummyQuantizer, "rvq": qt.ResidualVectorQuantizer}[
+        quantizer
+    ]
     kwargs = dict_from_config(getattr(cfg, quantizer))
-    if quantizer !=
-        kwargs[
+    if quantizer != "no_quant":
+        kwargs["dimension"] = dimension
     return klass(**kwargs)
 
 
 def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
-    if encoder_name ==
-        kwargs = dict_from_config(getattr(cfg,
-        encoder_override_kwargs = kwargs.pop(
-        decoder_override_kwargs = kwargs.pop(
+    if encoder_name == "seanet":
+        kwargs = dict_from_config(getattr(cfg, "seanet"))
+        encoder_override_kwargs = kwargs.pop("encoder")
+        decoder_override_kwargs = kwargs.pop("decoder")
         encoder_kwargs = {**kwargs, **encoder_override_kwargs}
         decoder_kwargs = {**kwargs, **decoder_override_kwargs}
         encoder = audiocraft.modules.SEANetEncoder(**encoder_kwargs)
@@ -67,44 +69,98 @@ def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
 
 def get_compression_model(cfg: omegaconf.DictConfig) -> CompressionModel:
     """Instantiate a compression model."""
-    if cfg.compression_model ==
-        kwargs = dict_from_config(getattr(cfg,
-        encoder_name = kwargs.pop(
-        quantizer_name = kwargs.pop(
+    if cfg.compression_model == "encodec":
+        kwargs = dict_from_config(getattr(cfg, "encodec"))
+        encoder_name = kwargs.pop("autoencoder")
+        quantizer_name = kwargs.pop("quantizer")
         encoder, decoder = get_encodec_autoencoder(encoder_name, cfg)
         quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
-        frame_rate = kwargs[
-        renormalize = kwargs.pop(
+        frame_rate = kwargs["sample_rate"] // encoder.hop_length
+        renormalize = kwargs.pop("renormalize", False)
         # deprecated params
-        kwargs.pop(
-        return EncodecModel(
-
+        kwargs.pop("renorm", None)
+        return EncodecModel(
+            encoder,
+            decoder,
+            quantizer,
+            frame_rate=frame_rate,
+            renormalize=renormalize,
+            **kwargs,
+        ).to(cfg.device)
     else:
         raise KeyError(f"Unexpected compression model {cfg.compression_model}")
 
 
+def get_jasco_model(cfg: omegaconf.DictConfig,
+                    compression_model: tp.Optional[CompressionModel] = None) -> FlowMatchingModel:
+    kwargs = dict_from_config(getattr(cfg, "transformer_lm"))
+    attribute_dropout = dict_from_config(getattr(cfg, "attribute_dropout"))
+    cls_free_guidance = dict_from_config(getattr(cfg, "classifier_free_guidance"))
+    cfg_prob = cls_free_guidance["training_dropout"]
+    cfg_coef = cls_free_guidance["inference_coef"]
+    fuser = get_condition_fuser(cfg)
+    condition_provider = get_conditioner_provider(kwargs["dim"], cfg).to(cfg.device)
+    if JascoCondConst.DRM.value in condition_provider.conditioners:  # use self_wav for drums
+        assert compression_model is not None
+
+        # use compression model for drums conditioning
+        condition_provider.conditioners.self_wav.compression_model = compression_model
+        condition_provider.conditioners.self_wav.compression_model.requires_grad_(False)
+
+    # downcast to jasco conditioning provider
+    seq_len = cfg.compression_model_framerate * cfg.dataset.segment_duration
+    chords_card = cfg.conditioners.chords.chords_emb.card if JascoCondConst.CRD.value in cfg.conditioners else -1
+    condition_provider = JascoConditioningProvider(device=condition_provider.device,
+                                                   conditioners=condition_provider.conditioners,
+                                                   chords_card=chords_card,
+                                                   sequence_length=seq_len)
+
+    if len(fuser.fuse2cond["cross"]) > 0:  # enforce cross-att programmatically
+        kwargs["cross_attention"] = True
+
+    kwargs.pop("n_q", None)
+    kwargs.pop("card", None)
+
+    return FlowMatchingModel(
+        condition_provider=condition_provider,
+        fuser=fuser,
+        cfg_dropout=cfg_prob,
+        cfg_coef=cfg_coef,
+        attribute_dropout=attribute_dropout,
+        dtype=getattr(torch, cfg.dtype),
+        device=cfg.device,
+        **kwargs,
+    ).to(cfg.device)
+
+
 def get_lm_model(cfg: omegaconf.DictConfig) -> LMModel:
     """Instantiate a transformer LM."""
-    if cfg.lm_model
-        kwargs = dict_from_config(getattr(cfg,
-        n_q = kwargs[
-        q_modeling = kwargs.pop(
-        codebooks_pattern_cfg = getattr(cfg,
-        attribute_dropout = dict_from_config(getattr(cfg,
-        cls_free_guidance = dict_from_config(getattr(cfg,
-        cfg_prob, cfg_coef =
+    if cfg.lm_model in ["transformer_lm", "transformer_lm_magnet"]:
+        kwargs = dict_from_config(getattr(cfg, "transformer_lm"))
+        n_q = kwargs["n_q"]
+        q_modeling = kwargs.pop("q_modeling", None)
+        codebooks_pattern_cfg = getattr(cfg, "codebooks_pattern")
+        attribute_dropout = dict_from_config(getattr(cfg, "attribute_dropout"))
+        cls_free_guidance = dict_from_config(getattr(cfg, "classifier_free_guidance"))
+        cfg_prob, cfg_coef = (
+            cls_free_guidance["training_dropout"],
+            cls_free_guidance["inference_coef"],
+        )
         fuser = get_condition_fuser(cfg)
         condition_provider = get_conditioner_provider(kwargs["dim"], cfg).to(cfg.device)
-        if len(fuser.fuse2cond[
-            kwargs[
+        if len(fuser.fuse2cond["cross"]) > 0:  # enforce cross-att programmatically
+            kwargs["cross_attention"] = True
         if codebooks_pattern_cfg.modeling is None:
-            assert
-
+            assert (
+                q_modeling is not None
+            ), "LM model should either have a codebook pattern defined or transformer_lm.q_modeling"
             codebooks_pattern_cfg = omegaconf.OmegaConf.create(
-                {
+                {"modeling": q_modeling, "delay": {"delays": list(range(n_q))}}
             )
+
         pattern_provider = get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
-
+        lm_class = MagnetLMModel if cfg.lm_model == "transformer_lm_magnet" else LMModel
+        return lm_class(
            pattern_provider=pattern_provider,
            condition_provider=condition_provider,
            fuser=fuser,
@@ -113,67 +169,84 @@ def get_lm_model(cfg: omegaconf.DictConfig) -> LMModel:
            attribute_dropout=attribute_dropout,
            dtype=getattr(torch, cfg.dtype),
            device=cfg.device,
-           **kwargs
+           **kwargs,
        ).to(cfg.device)
    else:
        raise KeyError(f"Unexpected LM model {cfg.lm_model}")
 
 
-def get_conditioner_provider(
+def get_conditioner_provider(
+    output_dim: int, cfg: omegaconf.DictConfig
+) -> ConditioningProvider:
     """Instantiate a conditioning model."""
     device = cfg.device
     duration = cfg.dataset.segment_duration
-    cfg = getattr(cfg,
+    cfg = getattr(cfg, "conditioners")
     dict_cfg = {} if cfg is None else dict_from_config(cfg)
     conditioners: tp.Dict[str, BaseConditioner] = {}
-    condition_provider_args = dict_cfg.pop(
-    condition_provider_args.pop(
-    condition_provider_args.pop(
+    condition_provider_args = dict_cfg.pop("args", {})
+    condition_provider_args.pop("merge_text_conditions_p", None)
+    condition_provider_args.pop("drop_desc_p", None)
 
     for cond, cond_cfg in dict_cfg.items():
-        model_type = cond_cfg[
+        model_type = cond_cfg["model"]
         model_args = cond_cfg[model_type]
-        if model_type ==
-            conditioners[str(cond)] = T5Conditioner(
-
-
-        elif model_type ==
+        if model_type == "t5":
+            conditioners[str(cond)] = T5Conditioner(
+                output_dim=output_dim, device=device, **model_args
+            )
+        elif model_type == "lut":
+            conditioners[str(cond)] = LUTConditioner(
+                output_dim=output_dim, **model_args
+            )
+        elif model_type == "chroma_stem":
            conditioners[str(cond)] = ChromaStemConditioner(
-                output_dim=output_dim,
-                duration=duration,
-                device=device,
-                **model_args
+                output_dim=output_dim, duration=duration, device=device, **model_args
            )
-        elif model_type
+        elif model_type in {"chords_emb", "drum_latents", "melody"}:
+            conditioners_classes = {"chords_emb": ChordsEmbConditioner,
+                                    "drum_latents": DrumsConditioner,
+                                    "melody": MelodyConditioner}
+            conditioner_class = conditioners_classes[model_type]
+            conditioners[str(cond)] = conditioner_class(device=device, **model_args)
+        elif model_type == "clap":
            conditioners[str(cond)] = CLAPEmbeddingConditioner(
-                output_dim=output_dim,
-                device=device,
-                **model_args
-            )
+                output_dim=output_dim, device=device, **model_args
+            )
+        elif model_type == 'style':
+            conditioners[str(cond)] = StyleConditioner(
+                output_dim=output_dim,
+                device=device,
+                **model_args
+            )
        else:
            raise ValueError(f"Unrecognized conditioning model: {model_type}")
-    conditioner = ConditioningProvider(
+    conditioner = ConditioningProvider(
+        conditioners, device=device, **condition_provider_args
+    )
    return conditioner
 
 
 def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
     """Instantiate a condition fuser object."""
-    fuser_cfg = getattr(cfg,
-    fuser_methods = [
-    fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
+    fuser_cfg = getattr(cfg, "fuser")
+    fuser_methods = ["sum", "cross", "prepend", "ignore", "input_interpolate"]
+    fuse2cond = {k: fuser_cfg[k] for k in fuser_methods if k in fuser_cfg}
     kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
     fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
     return fuser
 
 
-def get_codebooks_pattern_provider(
+def get_codebooks_pattern_provider(
+    n_q: int, cfg: omegaconf.DictConfig
+) -> CodebooksPatternProvider:
     """Instantiate a codebooks pattern provider object."""
     pattern_providers = {
-
-
-
-
-
+        "parallel": ParallelPatternProvider,
+        "delay": DelayedPatternProvider,
+        "unroll": UnrolledPatternProvider,
+        "coarse_first": CoarseFirstPattern,
+        "musiclm": MusicLMPattern,
     }
     name = cfg.modeling
     kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
@@ -181,20 +254,23 @@ def get_codebooks_pattern_provider(n_q: int, cfg: omegaconf.DictConfig) -> CodebooksPatternProvider:
     return klass(n_q, **kwargs)
 
 
-def get_debug_compression_model(device=
+def get_debug_compression_model(device="cpu", sample_rate: int = 32000):
     """Instantiate a debug compression model to be used for unit tests."""
-    assert sample_rate in [
+    assert sample_rate in [
+        16000,
+        32000,
+    ], "unsupported sample rate for debug compression model"
     model_ratios = {
         16000: [10, 8, 8],  # 25 Hz at 16kHz
-        32000: [10, 8, 16]  # 25 Hz at 32kHz
+        32000: [10, 8, 16],  # 25 Hz at 32kHz
     }
     ratios: tp.List[int] = model_ratios[sample_rate]
     frame_rate = 25
     seanet_kwargs: dict = {
-
-
-
-
+        "n_filters": 4,
+        "n_residual_layers": 1,
+        "dimension": 32,
+        "ratios": ratios,
     }
     encoder = audiocraft.modules.SEANetEncoder(**seanet_kwargs)
     decoder = audiocraft.modules.SEANetDecoder(**seanet_kwargs)
@@ -202,8 +278,13 @@ def get_debug_compression_model(device='cpu', sample_rate: int = 32000):
     init_x = torch.randn(8, 32, 128)
     quantizer(init_x, 1)  # initialize kmeans etc.
     compression_model = EncodecModel(
-        encoder,
-
+        encoder,
+        decoder,
+        quantizer,
+        frame_rate=frame_rate,
+        sample_rate=sample_rate,
+        channels=1,
+    ).to(device)
     return compression_model.eval()
 
 
@@ -211,48 +292,60 @@ def get_diffusion_model(cfg: omegaconf.DictConfig):
     # TODO Find a way to infer the channels from dset
     channels = cfg.channels
     num_steps = cfg.schedule.num_steps
-    return DiffusionUnet(
-        chin=channels, num_steps=num_steps, **cfg.diffusion_unet)
+    return DiffusionUnet(chin=channels, num_steps=num_steps, **cfg.diffusion_unet)
 
 
 def get_processor(cfg, sample_rate: int = 24000):
     sample_processor = SampleProcessor()
     if cfg.use:
         kw = dict(cfg)
-        kw.pop(
-        kw.pop(
+        kw.pop("use")
+        kw.pop("name")
         if cfg.name == "multi_band_processor":
             sample_processor = MultiBandProcessor(sample_rate=sample_rate, **kw)
     return sample_processor
 
 
-def get_debug_lm_model(device=
+def get_debug_lm_model(device="cpu"):
     """Instantiate a debug LM to be used for unit tests."""
     pattern = DelayedPatternProvider(n_q=4)
     dim = 16
     providers = {
-
+        "description": LUTConditioner(
+            n_bins=128, dim=dim, output_dim=dim, tokenizer="whitespace"
+        ),
     }
     condition_provider = ConditioningProvider(providers)
     fuser = ConditionFuser(
-        {
-
+        {"cross": ["description"], "prepend": [], "sum": [], "input_interpolate": []}
+    )
     lm = LMModel(
-        pattern,
-
-
+        pattern,
+        condition_provider,
+        fuser,
+        n_q=4,
+        card=400,
+        dim=dim,
+        num_heads=4,
+        custom=True,
+        num_layers=2,
+        cross_attention=True,
+        causal=True,
+    )
     return lm.to(device).eval()
 
 
 def get_wrapped_compression_model(
-
-
-    if hasattr(cfg,
+    compression_model: CompressionModel, cfg: omegaconf.DictConfig
+) -> CompressionModel:
+    if hasattr(cfg, "interleave_stereo_codebooks"):
         if cfg.interleave_stereo_codebooks.use:
             kwargs = dict_from_config(cfg.interleave_stereo_codebooks)
-            kwargs.pop(
-            compression_model = InterleaveStereoCompressionModel(
-
+            kwargs.pop("use")
+            compression_model = InterleaveStereoCompressionModel(
+                compression_model, **kwargs
+            )
+    if hasattr(cfg, "compression_model_n_q"):
         if cfg.compression_model_n_q is not None:
            compression_model.set_num_codebooks(cfg.compression_model_n_q)
    return compression_model
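To illustrate how these builders consume Hydra-style configs, the hedged sketch below drives get_codebooks_pattern_provider with a hand-made omegaconf node; the config values are illustrative, not the repository's defaults.

# Hedged sketch: feeding a tiny config into one of the builders above; values are made up.
import omegaconf
from audiocraft.models.builders import get_codebooks_pattern_provider

cfg = omegaconf.OmegaConf.create({
    "modeling": "delay",                  # selects DelayedPatternProvider from the registry
    "delay": {"delays": [0, 1, 2, 3]},    # per-codebook delays
})
pattern_provider = get_codebooks_pattern_provider(n_q=4, cfg=cfg)
pattern = pattern_provider.get_pattern(timesteps=100)   # codebook interleaving over 100 steps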
audiocraft/models/flow_matching.py
ADDED
@@ -0,0 +1,516 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
from dataclasses import dataclass
|
8 |
+
from functools import partial
|
9 |
+
import logging
|
10 |
+
import math
|
11 |
+
import typing as tp
|
12 |
+
import torch
|
13 |
+
from torch import nn
|
14 |
+
from torchdiffeq import odeint # type: ignore
|
15 |
+
from ..modules.streaming import StreamingModule
|
16 |
+
from ..modules.transformer import create_norm_fn, StreamingTransformerLayer
|
17 |
+
from ..modules.unet_transformer import UnetTransformer
|
18 |
+
from ..modules.conditioners import (
|
19 |
+
ConditionFuser,
|
20 |
+
ClassifierFreeGuidanceDropout,
|
21 |
+
AttributeDropout,
|
22 |
+
ConditioningAttributes,
|
23 |
+
JascoCondConst
|
24 |
+
)
|
25 |
+
from ..modules.jasco_conditioners import JascoConditioningProvider
|
26 |
+
from ..modules.activations import get_activation_fn
|
27 |
+
|
28 |
+
from .lm import ConditionTensors, init_layer
|
29 |
+
|
30 |
+
|
31 |
+
logger = logging.getLogger(__name__)
|
32 |
+
|
33 |
+
|
34 |
+
@dataclass
|
35 |
+
class FMOutput:
|
36 |
+
latents: torch.Tensor # [B, T, D]
|
37 |
+
mask: torch.Tensor # [B, T]
|
38 |
+
|
39 |
+
|
40 |
+
class CFGTerm:
|
41 |
+
"""
|
42 |
+
Base class for Multi Source Classifier-Free Guidance (CFG) terms. This class represents a term in the CFG process,
|
43 |
+
which is used to guide the generation process by adjusting the influence of different conditions.
|
44 |
+
Attributes:
|
45 |
+
conditions (dict): A dictionary of conditions that influence the generation process.
|
46 |
+
weight (float): The weight of the CFG term, determining its influence on the generation.
|
47 |
+
"""
|
48 |
+
def __init__(self, conditions, weight):
|
49 |
+
self.conditions = conditions
|
50 |
+
self.weight = weight
|
51 |
+
|
52 |
+
def drop_irrelevant_conds(self, conditions):
|
53 |
+
"""
|
54 |
+
Drops irrelevant conditions from the CFG term. This method should be implemented by subclasses.
|
55 |
+
Args:
|
56 |
+
conditions (dict): The conditions to be filtered.
|
57 |
+
Raises:
|
58 |
+
NotImplementedError: If the method is not implemented in a subclass.
|
59 |
+
"""
|
60 |
+
raise NotImplementedError("No base implementation for setting generation params.")
|
61 |
+
|
62 |
+
|
63 |
+
class AllCFGTerm(CFGTerm):
|
64 |
+
"""
|
65 |
+
A CFG term that retains all conditions. This class does not drop any condition.
|
66 |
+
"""
|
67 |
+
def __init__(self, conditions, weight):
|
68 |
+
super().__init__(conditions, weight)
|
69 |
+
self.drop_irrelevant_conds()
|
70 |
+
|
71 |
+
def drop_irrelevant_conds(self):
|
72 |
+
pass
|
73 |
+
|
74 |
+
|
75 |
+
class NullCFGTerm(CFGTerm):
|
76 |
+
"""
|
77 |
+
A CFG term that drops all conditions, effectively nullifying their influence.
|
78 |
+
"""
|
79 |
+
def __init__(self, conditions, weight):
|
80 |
+
super().__init__(conditions, weight)
|
81 |
+
self.drop_irrelevant_conds()
|
82 |
+
|
83 |
+
def drop_irrelevant_conds(self):
|
84 |
+
"""
|
85 |
+
Drops all conditions by applying a dropout with probability 1.0, effectively nullifying their influence.
|
86 |
+
"""
|
87 |
+
self.conditions = ClassifierFreeGuidanceDropout(p=1.0)(
|
88 |
+
samples=self.conditions,
|
89 |
+
cond_types=["wav", "text", "symbolic"])
|
90 |
+
|
91 |
+
|
92 |
+
class TextCFGTerm(CFGTerm):
|
93 |
+
"""
|
94 |
+
A CFG term that selectively drops conditions based on specified dropout probabilities for different types
|
95 |
+
of conditions, such as 'symbolic' and 'wav'.
|
96 |
+
"""
|
97 |
+
def __init__(self, conditions, weight, model_att_dropout):
|
98 |
+
"""
|
99 |
+
Initializes a TextCFGTerm with specified conditions, weight, and model attention dropout configuration.
|
100 |
+
Args:
|
101 |
+
conditions (dict): The conditions to be used in the CFG process.
|
102 |
+
weight (float): The weight of the CFG term.
|
103 |
+
model_att_dropout (object): The attribute dropouts used by the model.
|
104 |
+
"""
|
105 |
+
super().__init__(conditions, weight)
|
106 |
+
if 'symbolic' in model_att_dropout.p:
|
107 |
+
self.drop_symbolics = {k: 1.0 for k in model_att_dropout.p['symbolic'].keys()}
|
108 |
+
else:
|
109 |
+
self.drop_symbolics = {}
|
110 |
+
if 'wav' in model_att_dropout.p:
|
111 |
+
self.drop_wav = {k: 1.0 for k in model_att_dropout.p['wav'].keys()}
|
112 |
+
else:
|
113 |
+
self.drop_wav = {}
|
114 |
+
self.drop_irrelevant_conds()
|
115 |
+
|
116 |
+
def drop_irrelevant_conds(self):
|
117 |
+
self.conditions = AttributeDropout({'symbolic': self.drop_symbolics,
|
118 |
+
'wav': self.drop_wav})(self.conditions) # drop temporal conds
|
119 |
+
|
120 |
+
|
121 |
+
class FlowMatchingModel(StreamingModule):
|
122 |
+
"""
|
123 |
+
A flow matching model inherits from StreamingModule.
|
124 |
+
This model uses a transformer architecture to process and fuse conditions, applying learned embeddings and
|
125 |
+
transformations and predicts multi-source guided vector fields.
|
126 |
+
Attributes:
|
127 |
+
condition_provider (JascoConditioningProvider): Provider for conditioning attributes.
|
128 |
+
fuser (ConditionFuser): Fuser for combining multiple conditions.
|
129 |
+
dim (int): Dimensionality of the model's main features.
|
130 |
+
num_heads (int): Number of attention heads in the transformer.
|
131 |
+
flow_dim (int): Dimensionality of the flow features.
|
132 |
+
chords_dim (int): Dimensionality for chord embeddings, if used.
|
133 |
+
drums_dim (int): Dimensionality for drums embeddings, if used.
|
134 |
+
melody_dim (int): Dimensionality for melody embeddings, if used.
|
135 |
+
hidden_scale (int): Scaling factor for the dimensionality of the feedforward network in the transformer.
|
136 |
+
norm (str): Type of normalization to use ('layer_norm' or other supported types).
|
137 |
+
norm_first (bool): Whether to apply normalization before other operations in the transformer layers.
|
138 |
+
bias_proj (bool): Whether to include bias in the projection layers.
|
139 |
+
weight_init (Optional[str]): Method for initializing weights.
|
140 |
+
depthwise_init (Optional[str]): Method for initializing depthwise convolutional layers.
|
141 |
+
zero_bias_init (bool): Whether to initialize biases to zero.
|
142 |
+
cfg_dropout (float): Dropout rate for configuration settings.
|
143 |
+
cfg_coef (float): Coefficient for configuration influence.
|
144 |
+
attribute_dropout (Dict[str, Dict[str, float]]): Dropout rates for specific attributes.
|
145 |
+
time_embedding_dim (int): Dimensionality of time embeddings.
|
146 |
+
**kwargs: Additional keyword arguments for the transformer.
|
147 |
+
Methods:
|
148 |
+
__init__: Initializes the model with the specified attributes and configuration.
|
149 |
+
"""
|
150 |
+
def __init__(self, condition_provider: JascoConditioningProvider,
|
151 |
+
fuser: ConditionFuser,
|
152 |
+
dim: int = 128,
|
153 |
+
num_heads: int = 8,
|
154 |
+
flow_dim: int = 128,
|
155 |
+
chords_dim: int = 0,
|
156 |
+
drums_dim: int = 0,
|
157 |
+
melody_dim: int = 0,
|
158 |
+
hidden_scale: int = 4,
|
159 |
+
norm: str = 'layer_norm',
|
160 |
+
norm_first: bool = False,
|
161 |
+
bias_proj: bool = True,
|
162 |
+
weight_init: tp.Optional[str] = None,
|
163 |
+
depthwise_init: tp.Optional[str] = None,
|
164 |
+
zero_bias_init: bool = False,
|
165 |
+
cfg_dropout: float = 0,
|
166 |
+
cfg_coef: float = 1.0,
|
167 |
+
attribute_dropout: tp.Dict[str, tp.Dict[str, float]] = {},
|
168 |
+
time_embedding_dim: int = 128,
|
169 |
+
**kwargs):
|
170 |
+
super().__init__()
|
171 |
+
self.cfg_coef = cfg_coef
|
172 |
+
|
173 |
+
self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout)
|
174 |
+
self.att_dropout = AttributeDropout(p=attribute_dropout)
|
175 |
+
self.condition_provider = condition_provider
|
176 |
+
self.fuser = fuser
|
177 |
+
self.dim = dim # transformer dim
|
178 |
+
self.flow_dim = flow_dim
|
179 |
+
self.chords_dim = chords_dim
|
180 |
+
self.emb = nn.Linear(flow_dim + chords_dim + drums_dim + melody_dim, dim, bias=False)
|
181 |
+
if 'activation' in kwargs:
|
182 |
+
kwargs['activation'] = get_activation_fn(kwargs['activation'])
|
183 |
+
|
184 |
+
self.transformer = UnetTransformer(
|
185 |
+
d_model=dim, num_heads=num_heads, dim_feedforward=int(hidden_scale * dim),
|
186 |
+
norm=norm, norm_first=norm_first,
|
187 |
+
layer_class=StreamingTransformerLayer,
|
188 |
+
**kwargs)
|
189 |
+
self.out_norm: tp.Optional[nn.Module] = None
|
190 |
+
if norm_first:
|
191 |
+
self.out_norm = create_norm_fn(norm, dim)
|
192 |
+
self.linear = nn.Linear(dim, flow_dim, bias=bias_proj)
|
193 |
+
self._init_weights(weight_init, depthwise_init, zero_bias_init)
|
194 |
+
self._fsdp: tp.Optional[nn.Module]
|
195 |
+
self.__dict__['_fsdp'] = None
|
196 |
+
|
197 |
+
# init time parameter embedding
|
198 |
+
self.d_temb1 = time_embedding_dim
|
199 |
+
self.d_temb2 = 4 * time_embedding_dim
|
200 |
+
self.temb = nn.Module()
|
201 |
+
self.temb.dense = nn.ModuleList([
|
202 |
+
torch.nn.Linear(self.d_temb1,
|
203 |
+
self.d_temb2),
|
204 |
+
torch.nn.Linear(self.d_temb2,
|
205 |
+
self.d_temb2),
|
206 |
+
])
|
207 |
+
self.temb_proj = nn.Linear(self.d_temb2, dim)
|
208 |
+
|
209 |
+
def _get_timestep_embedding(self, timesteps, embedding_dim):
|
210 |
+
"""
|
211 |
+
#######################################################################################################
|
212 |
+
TAKEN FROM: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/model.py
|
213 |
+
#######################################################################################################
|
214 |
+
This matches the implementation in Denoising Diffusion Probabilistic Models:
|
215 |
+
From Fairseq.
|
216 |
+
Build sinusoidal embeddings.
|
217 |
+
This matches the implementation in tensor2tensor, but differs slightly
|
218 |
+
from the description in Section 3.5 of "Attention Is All You Need".
|
219 |
+
"""
|
220 |
+
assert len(timesteps.shape) == 1
|
221 |
+
|
222 |
+
half_dim = embedding_dim // 2
|
223 |
+
emb = math.log(10000) / (half_dim - 1)
|
224 |
+
emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
|
225 |
+
emb = emb.to(device=timesteps.device)
|
226 |
+
emb = timesteps.float()[:, None] * emb[None, :]
|
227 |
+
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
228 |
+
if embedding_dim % 2 == 1: # zero pad
|
229 |
+
emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
|
230 |
+
return emb
|
231 |
+
|
232 |
+
def _embed_time_parameter(self, t: torch.Tensor):
|
233 |
+
"""
|
234 |
+
#######################################################################################################
|
235 |
+
TAKEN FROM: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/model.py
|
236 |
+
#######################################################################################################
|
237 |
+
"""
|
238 |
+
temb = self._get_timestep_embedding(t.flatten(), self.d_temb1)
|
239 |
+
temb = self.temb.dense[0](temb)
|
240 |
+
temb = temb * torch.sigmoid(temb) # swish activation
|
241 |
+
temb = self.temb.dense[1](temb)
|
242 |
+
return temb
|
243 |
+
|
244 |
+
def _init_weights(self, weight_init: tp.Optional[str], depthwise_init: tp.Optional[str], zero_bias_init: bool):
|
245 |
+
"""Initialization of the transformer module weights.
|
246 |
+
|
247 |
+
Args:
|
248 |
+
weight_init (str, optional): Weight initialization strategy. See ``get_init_fn`` for valid options.
|
249 |
+
depthwise_init (str, optional): Depthwise initialization strategy. The following options are valid:
|
250 |
+
'current' where the depth corresponds to the current layer index or 'global' where the total number
|
251 |
+
of layer is used as depth. If not set, no depthwise initialization strategy is used.
|
252 |
+
zero_bias_init (bool): Whether to initialize bias to zero or not.
|
253 |
+
"""
|
254 |
+
assert depthwise_init is None or depthwise_init in ['current', 'global']
|
255 |
+
assert depthwise_init is None or weight_init is not None, \
|
256 |
+
"If 'depthwise_init' is defined, a 'weight_init' method should be provided."
|
257 |
+
assert not zero_bias_init or weight_init is not None, \
|
258 |
+
"If 'zero_bias_init', a 'weight_init' method should be provided"
|
259 |
+
|
260 |
+
if weight_init is None:
|
261 |
+
return
|
262 |
+
|
263 |
+
init_layer(self.emb, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
|
264 |
+
|
265 |
+
for layer_idx, tr_layer in enumerate(self.transformer.layers):
|
266 |
+
depth = None
|
267 |
+
if depthwise_init == 'current':
|
268 |
+
depth = layer_idx + 1
|
269 |
+
elif depthwise_init == 'global':
|
270 |
+
depth = len(self.transformer.layers)
|
271 |
+
init_fn = partial(init_layer, method=weight_init, init_depth=depth, zero_bias_init=zero_bias_init)
|
272 |
+
tr_layer.apply(init_fn)
|
273 |
+
|
274 |
+
init_layer(self.linear, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
|
275 |
+
|
276 |
+
def _align_seq_length(self,
|
277 |
+
cond: torch.Tensor,
|
278 |
+
seq_len: int = 500):
|
279 |
+
# trim if needed
|
280 |
+
cond = cond[:, :seq_len, :]
|
281 |
+
|
282 |
+
# pad if needed
|
283 |
+
B, T, C = cond.shape
|
284 |
+
if T < seq_len:
|
285 |
+
cond = torch.cat((cond, torch.zeros((B, seq_len - T, C), dtype=cond.dtype, device=cond.device)), dim=1)
|
286 |
+
|
287 |
+
return cond
|
288 |
+
|
289 |
+
def forward(self,
|
290 |
+
latents: torch.Tensor,
|
291 |
+
t: torch.Tensor,
|
292 |
+
conditions: tp.List[ConditioningAttributes],
|
293 |
+
condition_tensors: tp.Optional[ConditionTensors] = None) -> torch.Tensor:
|
294 |
+
"""Apply flow matching forward pass on latents and conditions.
|
295 |
+
Given a tensor of noisy latents of shape [B, T, D] with D the flow dim and T the sequence steps,
|
296 |
+
and a time parameter tensor t, return the vector field with shape [B, T, D].
|
297 |
+
|
298 |
+
Args:
|
299 |
+
latents (torch.Tensor): noisy latents.
|
300 |
+
conditions (list of ConditioningAttributes): Conditions to use when modeling
|
301 |
+
the given codes. Note that when evaluating multiple time with the same conditioning
|
302 |
+
you should pre-compute those and pass them as `condition_tensors`.
|
303 |
+
condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
|
304 |
+
tensors, see `conditions`.
|
305 |
+
Returns:
|
306 |
+
torch.Tensor: estimated vector field v_theta.
|
307 |
+
"""
|
308 |
+
assert condition_tensors is not None, "FlowMatchingModel requires pre-calculation of condition tensors"
|
309 |
+
assert not conditions, "Shouldn't pass unprocessed conditions to FlowMatchingModel."
|
310 |
+
|
311 |
+
B, T, D = latents.shape
|
312 |
+
x = latents
|
313 |
+
|
314 |
+
# concat temporal conditions on the feature dimension
|
315 |
+
temporal_conds = JascoCondConst.ALL.value
|
316 |
+
for cond in temporal_conds:
|
317 |
+
if cond not in condition_tensors:
|
318 |
+
continue
|
319 |
+
c = self._align_seq_length(condition_tensors[cond][0], seq_len=T)
|
320 |
+
x = torch.concat((x, c), dim=-1)
|
321 |
+
|
322 |
+
# project to transformer dimension
|
323 |
+
input_ = self.emb(x)
|
324 |
+
|
325 |
+
input_, cross_attention_input = self.fuser(input_, condition_tensors)
|
326 |
+
|
327 |
+
# embed time parameter
|
328 |
+
t_embs = self._embed_time_parameter(t)
|
329 |
+
|
330 |
+
# add it to cross_attention_input
|
331 |
+
cross_attention_input = cross_attention_input + self.temb_proj(t_embs[:, None, :])
|
332 |
+
|
333 |
+
out = self.transformer(input_, cross_attention_src=cross_attention_input)
|
334 |
+
|
335 |
+
if self.out_norm:
|
336 |
+
out = self.out_norm(out)
|
337 |
+
v_theta = self.linear(out) # [B, T, D]
|
338 |
+
|
339 |
+
# remove the prefix from the model outputs
|
340 |
+
if len(self.fuser.fuse2cond['prepend']) > 0:
|
341 |
+
v_theta = v_theta[:, :, -T:]
|
342 |
+
|
343 |
+
return v_theta # [B, T, D]
|
344 |
+
|
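For reference, a minimal sketch of the temporal-condition handling in forward() above: each temporal condition is trimmed or zero-padded to the latent sequence length and concatenated on the feature dimension before the input projection. The tensor shapes and the `chords` name are illustrative stand-ins, not taken from the diff.

import torch

def align_seq_length(cond: torch.Tensor, seq_len: int) -> torch.Tensor:
    cond = cond[:, :seq_len, :]                       # trim if too long
    B, T, C = cond.shape
    if T < seq_len:                                   # zero-pad if too short
        pad = torch.zeros(B, seq_len - T, C, dtype=cond.dtype, device=cond.device)
        cond = torch.cat((cond, pad), dim=1)
    return cond

latents = torch.randn(2, 500, 128)                    # [B, T, D] noisy latents
chords = torch.randn(2, 235, 12)                      # hypothetical temporal condition [B, T', C]
x = torch.cat((latents, align_seq_length(chords, latents.shape[1])), dim=-1)  # [B, 500, 140]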
345 |
+
def _multi_source_cfg_preprocess(self,
|
346 |
+
conditions: tp.List[ConditioningAttributes],
|
347 |
+
cfg_coef_all: float,
|
348 |
+
cfg_coef_txt: float,
|
349 |
+
min_weight: float = 1e-6):
|
350 |
+
"""
|
351 |
+
Preprocesses the CFG terms for multi-source conditional generation.
|
352 |
+
Args:
|
353 |
+
conditions (list): A list of conditions to be applied.
|
354 |
+
cfg_coef_all (float): The coefficient for all conditions.
|
355 |
+
cfg_coef_txt (float): The coefficient for text conditions.
|
356 |
+
min_weight (float): The minimal absolute weight for calculating a CFG term.
|
357 |
+
Returns:
|
358 |
+
tuple: A tuple containing condition_tensors and cfg_terms.
|
359 |
+
condition_tensors is a dictionary or ConditionTensors object with tokenized conditions.
|
360 |
+
cfg_terms is a list of CFGTerm objects with weights adjusted based on the coefficients.
|
361 |
+
"""
|
362 |
+
condition_tensors: tp.Optional[ConditionTensors]
|
363 |
+
cfg_terms = []
|
364 |
+
if conditions:
|
365 |
+
# conditional terms
|
366 |
+
cfg_terms = [AllCFGTerm(conditions=conditions, weight=cfg_coef_all),
|
367 |
+
TextCFGTerm(conditions=conditions, weight=cfg_coef_txt,
|
368 |
+
model_att_dropout=self.att_dropout)]
|
369 |
+
|
370 |
+
# add null term
|
371 |
+
cfg_terms.append(NullCFGTerm(conditions=conditions, weight=1 - sum([ct.weight for ct in cfg_terms])))
|
372 |
+
|
373 |
+
# remove terms with negligible weight
|
374 |
+
for ct in cfg_terms:
|
375 |
+
if abs(ct.weight) < min_weight:
|
376 |
+
cfg_terms.remove(ct)
|
377 |
+
|
378 |
+
conds: tp.List[ConditioningAttributes] = sum([ct.conditions for ct in cfg_terms], [])
|
379 |
+
tokenized = self.condition_provider.tokenize(conds)
|
380 |
+
condition_tensors = self.condition_provider(tokenized)
|
381 |
+
else:
|
382 |
+
condition_tensors = {}
|
383 |
+
|
384 |
+
return condition_tensors, cfg_terms
|
385 |
+
|
386 |
+
def estimated_vector_field(self, z, t, condition_tensors=None, cfg_terms=[]):
|
387 |
+
"""
|
388 |
+
Estimates the vector field for the given latent variables and time parameter,
|
389 |
+
conditioned on the provided conditions.
|
390 |
+
Args:
|
391 |
+
z (Tensor): The latent variables.
|
392 |
+
t (float): The time variable.
|
393 |
+
condition_tensors (ConditionTensors, optional): The condition tensors. Defaults to None.
|
394 |
+
cfg_terms (list, optional): The list of CFG terms. Defaults to an empty list.
|
395 |
+
Returns:
|
396 |
+
Tensor: The estimated vector field.
|
397 |
+
"""
|
398 |
+
if len(cfg_terms) > 1:
|
399 |
+
z = z.repeat(len(cfg_terms), 1, 1) # duplicate noisy latents for multi-source CFG
|
400 |
+
v_thetas = self(latents=z, t=t, conditions=[], condition_tensors=condition_tensors)
|
401 |
+
return self._multi_source_cfg_postprocess(v_thetas, cfg_terms)
|
402 |
+
|
403 |
+
def _multi_source_cfg_postprocess(self, v_thetas, cfg_terms):
|
404 |
+
"""
|
405 |
+
Postprocesses the vector fields generated for each CFG term to combine them into a single vector field.
|
406 |
+
Multi source guidance occurs here.
|
407 |
+
Args:
|
408 |
+
v_thetas (Tensor): The vector fields for each CFG term.
|
409 |
+
cfg_terms (list): The CFG terms used.
|
410 |
+
Returns:
|
411 |
+
Tensor: The combined vector field.
|
412 |
+
"""
|
413 |
+
if len(cfg_terms) <= 1:
|
414 |
+
return v_thetas
|
415 |
+
v_theta_per_term = v_thetas.chunk(len(cfg_terms))
|
416 |
+
return sum([ct.weight * term_vf for ct, term_vf in zip(cfg_terms, v_theta_per_term)])
|
417 |
+
|
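For reference, a minimal numeric sketch (not the library API) of how _multi_source_cfg_preprocess and _multi_source_cfg_postprocess above cooperate: the null term is given weight 1 - (cfg_coef_all + cfg_coef_txt) so the weights sum to one, and the per-term vector fields estimated on the stacked batch are recombined as a weighted sum.

import torch

cfg_coef_all, cfg_coef_txt = 3.0, 1.0
weights = [cfg_coef_all, cfg_coef_txt, 1.0 - (cfg_coef_all + cfg_coef_txt)]  # [3.0, 1.0, -3.0]

B, T, D = 2, 500, 128
# stand-ins for the model output on the stacked batch: fully-conditional,
# text-only and unconditional vector-field estimates
v_thetas = torch.randn(len(weights) * B, T, D)
v_per_term = v_thetas.chunk(len(weights))                      # split back into the 3 terms
v_combined = sum(w * v for w, v in zip(weights, v_per_term))   # [B, T, D]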
418 |
+
@torch.no_grad()
|
419 |
+
def generate(self,
|
420 |
+
prompt: tp.Optional[torch.Tensor] = None,
|
421 |
+
conditions: tp.List[ConditioningAttributes] = [],
|
422 |
+
num_samples: tp.Optional[int] = None,
|
423 |
+
max_gen_len: int = 256,
|
424 |
+
callback: tp.Optional[tp.Callable[[int, int], None]] = None,
|
425 |
+
cfg_coef_all: float = 3.0,
|
426 |
+
cfg_coef_txt: float = 1.0,
|
427 |
+
euler: bool = False,
|
428 |
+
euler_steps: int = 100,
|
429 |
+
ode_rtol: float = 1e-5,
|
430 |
+
ode_atol: float = 1e-5,
|
431 |
+
) -> torch.Tensor:
|
432 |
+
"""
|
433 |
+
Generate audio latents given a prompt or unconditionally. This method supports both Euler integration
|
434 |
+
and adaptive ODE solving to generate sequences based on the specified conditions and configuration coefficients.
|
435 |
+
|
436 |
+
Args:
|
437 |
+
prompt (torch.Tensor, optional): Initial prompt to condition the generation. Defaults to None.
|
438 |
+
conditions (List[ConditioningAttributes]): List of conditioning attributes - text, symbolic or audio.
|
439 |
+
num_samples (int, optional): Number of samples to generate.
|
440 |
+
If None, it is inferred from the number of conditions.
|
441 |
+
max_gen_len (int): Maximum length of the generated sequence.
|
442 |
+
callback (Callable[[int, int], None], optional): Callback function to monitor the generation process.
|
443 |
+
cfg_coef_all (float): Coefficient for the fully conditional CFG term.
|
444 |
+
cfg_coef_txt (float): Coefficient for text CFG term.
|
445 |
+
euler (bool): If True, use Euler integration, otherwise use adaptive ODE solver.
|
446 |
+
euler_steps (int): Number of Euler steps to perform if Euler integration is used.
|
447 |
+
ode_rtol (float): ODE solver rtol threshold.
|
448 |
+
ode_atol (float): ODE solver atol threshold.
|
449 |
+
|
450 |
+
Returns:
|
451 |
+
torch.Tensor: Generated latents, shaped as (num_samples, max_gen_len, feature_dim).
|
452 |
+
"""
|
453 |
+
|
454 |
+
assert not self.training, "generation shouldn't be used in training mode."
|
455 |
+
first_param = next(iter(self.parameters()))
|
456 |
+
device = first_param.device
|
457 |
+
|
458 |
+
# Checking all input shapes are consistent.
|
459 |
+
possible_num_samples = []
|
460 |
+
if num_samples is not None:
|
461 |
+
possible_num_samples.append(num_samples)
|
462 |
+
elif prompt is not None:
|
463 |
+
possible_num_samples.append(prompt.shape[0])
|
464 |
+
elif conditions:
|
465 |
+
possible_num_samples.append(len(conditions))
|
466 |
+
else:
|
467 |
+
possible_num_samples.append(1)
|
468 |
+
assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent input shapes"
|
469 |
+
num_samples = possible_num_samples[0]
|
470 |
+
|
471 |
+
condition_tensors, cfg_terms = self._multi_source_cfg_preprocess(conditions, cfg_coef_all, cfg_coef_txt)
|
472 |
+
|
473 |
+
# flow matching inference
|
474 |
+
B, T, D = num_samples, max_gen_len, self.flow_dim
|
475 |
+
|
476 |
+
z_0 = torch.randn((B, T, D), device=device)
|
477 |
+
|
478 |
+
if euler:
|
479 |
+
# vanilla Euler integration
|
480 |
+
dt = (1 / euler_steps)
|
481 |
+
z = z_0
|
482 |
+
t = torch.zeros((1, ), device=device)
|
483 |
+
for _ in range(euler_steps):
|
484 |
+
v_theta = self.estimated_vector_field(z, t,
|
485 |
+
condition_tensors=condition_tensors,
|
486 |
+
cfg_terms=cfg_terms)
|
487 |
+
z = z + dt * v_theta
|
488 |
+
t = t + dt
|
489 |
+
z_1 = z
|
490 |
+
else:
|
491 |
+
# solve with dynamic ode integrator (dopri5)
|
492 |
+
t = torch.tensor([0, 1.0 - 1e-5], device=device)
|
493 |
+
num_evals = 0
|
494 |
+
|
495 |
+
# define ode vector field function
|
496 |
+
def inner_ode_func(t, z):
|
497 |
+
nonlocal num_evals
|
498 |
+
num_evals += 1
|
499 |
+
if callback is not None:
|
500 |
+
ESTIMATED_ODE_SOLVER_STEPS = 300
|
501 |
+
callback(num_evals, ESTIMATED_ODE_SOLVER_STEPS)
|
502 |
+
return self.estimated_vector_field(z, t,
|
503 |
+
condition_tensors=condition_tensors,
|
504 |
+
cfg_terms=cfg_terms)
|
505 |
+
|
506 |
+
ode_opts: dict = {"options": {}}
|
507 |
+
z = odeint(
|
508 |
+
inner_ode_func,
|
509 |
+
z_0,
|
510 |
+
t,
|
511 |
+
**{"atol": ode_atol, "rtol": ode_rtol, **ode_opts},
|
512 |
+
)
|
513 |
+
logger.info("Generated in %d steps", num_evals)
|
514 |
+
z_1 = z[-1]
|
515 |
+
|
516 |
+
return z_1
|
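A minimal sketch of the Euler branch of generate() above, with a toy vector_field standing in for self.estimated_vector_field (which would run the transformer); only the integration loop is reproduced, under illustrative shapes.

import torch

def vector_field(z: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    return -z  # toy stand-in: pull the sample towards zero

euler_steps = 100
B, T, D = 1, 256, 128
z = torch.randn(B, T, D)               # z_0 ~ N(0, I)
t = torch.zeros(1)
dt = 1.0 / euler_steps
for _ in range(euler_steps):
    z = z + dt * vector_field(z, t)    # z <- z + dt * v_theta(z, t)
    t = t + dt
z_1 = z                                # generated latents at t ~= 1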
audiocraft/models/genmodel.py
ADDED
@@ -0,0 +1,267 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
"""
|
8 |
+
Base implementation for audio generative models. This base implementation
|
9 |
+
combines all the required components to run inference with pretrained audio
|
10 |
+
generative models. It can be easily inherited by downstream model classes to
|
11 |
+
provide easy access to the generation API.
|
12 |
+
"""
|
13 |
+
|
14 |
+
from abc import ABC, abstractmethod
|
15 |
+
import typing as tp
|
16 |
+
|
17 |
+
import omegaconf
|
18 |
+
import torch
|
19 |
+
|
20 |
+
from .encodec import CompressionModel
|
21 |
+
from .lm import LMModel
|
22 |
+
from .builders import get_wrapped_compression_model
|
23 |
+
from ..data.audio_utils import convert_audio
|
24 |
+
from ..modules.conditioners import ConditioningAttributes
|
25 |
+
from ..utils.autocast import TorchAutocast
|
26 |
+
|
27 |
+
|
28 |
+
class BaseGenModel(ABC):
|
29 |
+
"""Base generative model with convenient generation API.
|
30 |
+
|
31 |
+
Args:
|
32 |
+
name (str): name of the model.
|
33 |
+
compression_model (CompressionModel): Compression model
|
34 |
+
used to map audio to invertible discrete representations.
|
35 |
+
lm (LMModel): Language model over discrete representations.
|
36 |
+
max_duration (float, optional): maximum duration the model can produce,
|
37 |
+
otherwise, inferred from the training params.
|
38 |
+
"""
|
39 |
+
def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
|
40 |
+
max_duration: tp.Optional[float] = None):
|
41 |
+
self.name = name
|
42 |
+
self.compression_model = compression_model
|
43 |
+
self.lm = lm
|
44 |
+
self.cfg: tp.Optional[omegaconf.DictConfig] = None
|
45 |
+
# Just to be safe, let's put everything in eval mode.
|
46 |
+
self.compression_model.eval()
|
47 |
+
self.lm.eval()
|
48 |
+
|
49 |
+
if hasattr(lm, 'cfg'):
|
50 |
+
cfg = lm.cfg
|
51 |
+
assert isinstance(cfg, omegaconf.DictConfig)
|
52 |
+
self.cfg = cfg
|
53 |
+
|
54 |
+
if self.cfg is not None:
|
55 |
+
self.compression_model = get_wrapped_compression_model(self.compression_model, self.cfg)
|
56 |
+
|
57 |
+
if max_duration is None:
|
58 |
+
if self.cfg is not None:
|
59 |
+
max_duration = lm.cfg.dataset.segment_duration # type: ignore
|
60 |
+
else:
|
61 |
+
raise ValueError("You must provide max_duration when building directly your GenModel")
|
62 |
+
assert max_duration is not None
|
63 |
+
|
64 |
+
self.max_duration: float = max_duration
|
65 |
+
self.duration = self.max_duration
|
66 |
+
|
67 |
+
# self.extend_stride is the length of audio extension when generating samples longer
|
68 |
+
# than self.max_duration. NOTE: the derived class must set self.extend_stride to a
|
69 |
+
# positive float value when generating with self.duration > self.max_duration.
|
70 |
+
self.extend_stride: tp.Optional[float] = None
|
71 |
+
self.device = next(iter(lm.parameters())).device
|
72 |
+
self.generation_params: dict = {}
|
73 |
+
self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
|
74 |
+
if self.device.type == 'cpu':
|
75 |
+
self.autocast = TorchAutocast(enabled=False)
|
76 |
+
else:
|
77 |
+
self.autocast = TorchAutocast(
|
78 |
+
enabled=True, device_type=self.device.type, dtype=torch.float16)
|
79 |
+
|
80 |
+
@property
|
81 |
+
def frame_rate(self) -> float:
|
82 |
+
"""Roughly the number of AR steps per seconds."""
|
83 |
+
return self.compression_model.frame_rate
|
84 |
+
|
85 |
+
@property
|
86 |
+
def sample_rate(self) -> int:
|
87 |
+
"""Sample rate of the generated audio."""
|
88 |
+
return self.compression_model.sample_rate
|
89 |
+
|
90 |
+
@property
|
91 |
+
def audio_channels(self) -> int:
|
92 |
+
"""Audio channels of the generated audio."""
|
93 |
+
return self.compression_model.channels
|
94 |
+
|
95 |
+
def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
|
96 |
+
"""Override the default progress callback."""
|
97 |
+
self._progress_callback = progress_callback
|
98 |
+
|
99 |
+
@abstractmethod
|
100 |
+
def set_generation_params(self, *args, **kwargs):
|
101 |
+
"""Set the generation parameters."""
|
102 |
+
raise NotImplementedError("No base implementation for setting generation params.")
|
103 |
+
|
104 |
+
@staticmethod
|
105 |
+
@abstractmethod
|
106 |
+
def get_pretrained(name: str, device=None):
|
107 |
+
raise NotImplementedError("No base implementation for getting pretrained model")
|
108 |
+
|
109 |
+
@torch.no_grad()
|
110 |
+
def _prepare_tokens_and_attributes(
|
111 |
+
self,
|
112 |
+
descriptions: tp.Sequence[tp.Optional[str]],
|
113 |
+
prompt: tp.Optional[torch.Tensor],
|
114 |
+
) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
|
115 |
+
"""Prepare model inputs.
|
116 |
+
|
117 |
+
Args:
|
118 |
+
descriptions (list of str): A list of strings used as text conditioning.
|
119 |
+
prompt (torch.Tensor): A batch of waveforms used for continuation.
|
120 |
+
"""
|
121 |
+
attributes = [
|
122 |
+
ConditioningAttributes(text={'description': description})
|
123 |
+
for description in descriptions]
|
124 |
+
|
125 |
+
if prompt is not None:
|
126 |
+
if descriptions is not None:
|
127 |
+
assert len(descriptions) == len(prompt), "Prompt and nb. descriptions don't match"
|
128 |
+
prompt = prompt.to(self.device)
|
129 |
+
prompt_tokens, scale = self.compression_model.encode(prompt)
|
130 |
+
assert scale is None
|
131 |
+
else:
|
132 |
+
prompt_tokens = None
|
133 |
+
return attributes, prompt_tokens
|
134 |
+
|
135 |
+
def generate_unconditional(self, num_samples: int, progress: bool = False,
|
136 |
+
return_tokens: bool = False) -> tp.Union[torch.Tensor,
|
137 |
+
tp.Tuple[torch.Tensor, torch.Tensor]]:
|
138 |
+
"""Generate samples in an unconditional manner.
|
139 |
+
|
140 |
+
Args:
|
141 |
+
num_samples (int): Number of samples to be generated.
|
142 |
+
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
|
143 |
+
"""
|
144 |
+
descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
|
145 |
+
attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
|
146 |
+
tokens = self._generate_tokens(attributes, prompt_tokens, progress)
|
147 |
+
if return_tokens:
|
148 |
+
return self.generate_audio(tokens), tokens
|
149 |
+
return self.generate_audio(tokens)
|
150 |
+
|
151 |
+
def generate(self, descriptions: tp.List[str], progress: bool = False, return_tokens: bool = False) \
|
152 |
+
-> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
|
153 |
+
"""Generate samples conditioned on text.
|
154 |
+
|
155 |
+
Args:
|
156 |
+
descriptions (list of str): A list of strings used as text conditioning.
|
157 |
+
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
|
158 |
+
"""
|
159 |
+
attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
|
160 |
+
assert prompt_tokens is None
|
161 |
+
tokens = self._generate_tokens(attributes, prompt_tokens, progress)
|
162 |
+
if return_tokens:
|
163 |
+
return self.generate_audio(tokens), tokens
|
164 |
+
return self.generate_audio(tokens)
|
165 |
+
|
166 |
+
def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
|
167 |
+
descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
|
168 |
+
progress: bool = False, return_tokens: bool = False) \
|
169 |
+
-> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
|
170 |
+
"""Generate samples conditioned on audio prompts and an optional text description.
|
171 |
+
|
172 |
+
Args:
|
173 |
+
prompt (torch.Tensor): A batch of waveforms used for continuation.
|
174 |
+
Prompt should be [B, C, T], or [C, T] if only one sample is generated.
|
175 |
+
prompt_sample_rate (int): Sampling rate of the given audio waveforms.
|
176 |
+
descriptions (list of str, optional): A list of strings used as text conditioning. Defaults to None.
|
177 |
+
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
|
178 |
+
"""
|
179 |
+
if prompt.dim() == 2:
|
180 |
+
prompt = prompt[None]
|
181 |
+
if prompt.dim() != 3:
|
182 |
+
raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
|
183 |
+
prompt = convert_audio(prompt, prompt_sample_rate, self.sample_rate, self.audio_channels)
|
184 |
+
if descriptions is None:
|
185 |
+
descriptions = [None] * len(prompt)
|
186 |
+
attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
|
187 |
+
assert prompt_tokens is not None
|
188 |
+
tokens = self._generate_tokens(attributes, prompt_tokens, progress)
|
189 |
+
if return_tokens:
|
190 |
+
return self.generate_audio(tokens), tokens
|
191 |
+
return self.generate_audio(tokens)
|
192 |
+
|
193 |
+
def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
|
194 |
+
prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
|
195 |
+
"""Generate discrete audio tokens given audio prompt and/or conditions.
|
196 |
+
|
197 |
+
Args:
|
198 |
+
attributes (list of ConditioningAttributes): Conditions used for generation (here text).
|
199 |
+
prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
|
200 |
+
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
|
201 |
+
Returns:
|
202 |
+
torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
|
203 |
+
"""
|
204 |
+
total_gen_len = int(self.duration * self.frame_rate)
|
205 |
+
max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
|
206 |
+
current_gen_offset: int = 0
|
207 |
+
|
208 |
+
def _progress_callback(generated_tokens: int, tokens_to_generate: int):
|
209 |
+
generated_tokens += current_gen_offset
|
210 |
+
if self._progress_callback is not None:
|
211 |
+
# Note that total_gen_len might be quite wrong depending on the
|
212 |
+
# codebook pattern used, but with delay it is almost accurate.
|
213 |
+
self._progress_callback(generated_tokens, tokens_to_generate)
|
214 |
+
else:
|
215 |
+
print(f'{generated_tokens: 6d} / {tokens_to_generate: 6d}', end='\r')
|
216 |
+
|
217 |
+
if prompt_tokens is not None:
|
218 |
+
assert max_prompt_len >= prompt_tokens.shape[-1], \
|
219 |
+
"Prompt is longer than audio to generate"
|
220 |
+
|
221 |
+
callback = None
|
222 |
+
if progress:
|
223 |
+
callback = _progress_callback
|
224 |
+
|
225 |
+
if self.duration <= self.max_duration:
|
226 |
+
# generate by sampling from LM, simple case.
|
227 |
+
with self.autocast:
|
228 |
+
gen_tokens = self.lm.generate(
|
229 |
+
prompt_tokens, attributes,
|
230 |
+
callback=callback, max_gen_len=total_gen_len, **self.generation_params)
|
231 |
+
|
232 |
+
else:
|
233 |
+
assert self.extend_stride is not None, "Stride should be defined to generate beyond max_duration"
|
234 |
+
assert self.extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
|
235 |
+
all_tokens = []
|
236 |
+
if prompt_tokens is None:
|
237 |
+
prompt_length = 0
|
238 |
+
else:
|
239 |
+
all_tokens.append(prompt_tokens)
|
240 |
+
prompt_length = prompt_tokens.shape[-1]
|
241 |
+
|
242 |
+
stride_tokens = int(self.frame_rate * self.extend_stride)
|
243 |
+
while current_gen_offset + prompt_length < total_gen_len:
|
244 |
+
time_offset = current_gen_offset / self.frame_rate
|
245 |
+
chunk_duration = min(self.duration - time_offset, self.max_duration)
|
246 |
+
max_gen_len = int(chunk_duration * self.frame_rate)
|
247 |
+
with self.autocast:
|
248 |
+
gen_tokens = self.lm.generate(
|
249 |
+
prompt_tokens, attributes,
|
250 |
+
callback=callback, max_gen_len=max_gen_len, **self.generation_params)
|
251 |
+
if prompt_tokens is None:
|
252 |
+
all_tokens.append(gen_tokens)
|
253 |
+
else:
|
254 |
+
all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
|
255 |
+
prompt_tokens = gen_tokens[:, :, stride_tokens:]
|
256 |
+
prompt_length = prompt_tokens.shape[-1]
|
257 |
+
current_gen_offset += stride_tokens
|
258 |
+
|
259 |
+
gen_tokens = torch.cat(all_tokens, dim=-1)
|
260 |
+
return gen_tokens
|
261 |
+
|
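A rough sketch (arithmetic only, no model) of the striding logic in _generate_tokens above when self.duration > self.max_duration: each window generates up to max_duration seconds and the tail of the previous window (everything past extend_stride) is carried over as the prompt for the next one. The concrete numbers are illustrative.

frame_rate = 50
max_duration, duration, extend_stride = 30.0, 75.0, 15.0

total_gen_len = int(duration * frame_rate)
stride_tokens = int(frame_rate * extend_stride)
current_gen_offset, prompt_length = 0, 0

windows = []
while current_gen_offset + prompt_length < total_gen_len:
    time_offset = current_gen_offset / frame_rate
    chunk_duration = min(duration - time_offset, max_duration)
    windows.append((time_offset, chunk_duration))
    prompt_length = int(chunk_duration * frame_rate) - stride_tokens  # tokens carried to next window
    current_gen_offset += stride_tokens

print(windows)  # [(0.0, 30.0), (15.0, 30.0), (30.0, 30.0), (45.0, 30.0)]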
262 |
+
def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
|
263 |
+
"""Generate Audio from tokens."""
|
264 |
+
assert gen_tokens.dim() == 3
|
265 |
+
with torch.no_grad():
|
266 |
+
gen_audio = self.compression_model.decode(gen_tokens, None)
|
267 |
+
return gen_audio
|
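For context, a hypothetical usage sketch of the generation API that BaseGenModel subclasses expose (shown with MusicGen, assuming the usual audiocraft entry points; the model name and prompt are placeholders):

from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

model = MusicGen.get_pretrained('facebook/musicgen-small')
model.set_generation_params(duration=10)                                    # seconds
wav = model.generate(['lofi hip hop beat with warm piano'], progress=True)  # [B, C, T]
audio_write('sample', wav[0].cpu(), model.sample_rate, strategy='loudness')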
audiocraft/models/lm.py
CHANGED
@@ -23,6 +23,7 @@ from ..modules.conditioners import (
|
|
23 |
ConditioningProvider,
|
24 |
ConditioningAttributes,
|
25 |
ConditionType,
|
|
|
26 |
)
|
27 |
from ..modules.codebooks_patterns import CodebooksPatternProvider
|
28 |
from ..modules.activations import get_activation_fn
|
@@ -219,7 +220,8 @@ class LMModel(StreamingModule):
|
|
219 |
|
220 |
def forward(self, sequence: torch.Tensor,
|
221 |
conditions: tp.List[ConditioningAttributes],
|
222 |
-
condition_tensors: tp.Optional[ConditionTensors] = None
|
|
|
223 |
"""Apply language model on sequence and conditions.
|
224 |
Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and
|
225 |
S the sequence steps, return the logits with shape [B, card, K, S].
|
@@ -231,6 +233,9 @@ class LMModel(StreamingModule):
|
|
231 |
you should pre-compute those and pass them as `condition_tensors`.
|
232 |
condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
|
233 |
tensors, see `conditions`.
|
|
|
|
|
|
|
234 |
Returns:
|
235 |
torch.Tensor: Logits.
|
236 |
"""
|
@@ -250,7 +255,8 @@ class LMModel(StreamingModule):
|
|
250 |
|
251 |
input_, cross_attention_input = self.fuser(input_, condition_tensors)
|
252 |
|
253 |
-
out = self.transformer(input_, cross_attention_src=cross_attention_input
|
|
|
254 |
if self.out_norm:
|
255 |
out = self.out_norm(out)
|
256 |
logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1) # [B, K, S, card]
|
@@ -264,7 +270,9 @@ class LMModel(StreamingModule):
|
|
264 |
def compute_predictions(
|
265 |
self, codes: torch.Tensor,
|
266 |
conditions: tp.List[ConditioningAttributes],
|
267 |
-
condition_tensors: tp.Optional[ConditionTensors] = None
|
|
|
|
|
268 |
"""Given an input tensor of codes [B, K, T] and list of conditions, runs the model
|
269 |
forward using the specified codes interleaving pattern.
|
270 |
|
@@ -276,6 +284,11 @@ class LMModel(StreamingModule):
|
|
276 |
you should pre-compute those and pass them as `condition_tensors`.
|
277 |
condition_tensors (dict[str, ConditionType], optional): pre-computed conditioning
|
278 |
tensors, see `conditions`.
|
|
|
|
|
|
|
|
|
|
|
279 |
Returns:
|
280 |
LMOutput: Language model outputs
|
281 |
logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
|
@@ -290,17 +303,18 @@ class LMModel(StreamingModule):
|
|
290 |
# map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
|
291 |
pattern = self.pattern_provider.get_pattern(T)
|
292 |
sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
|
293 |
-
codes, self.special_token_id, keep_only_valid_steps=
|
294 |
)
|
|
|
295 |
# apply model on pattern sequence
|
296 |
model = self if self._fsdp is None else self._fsdp
|
297 |
-
logits = model(sequence_codes, conditions, condition_tensors) # [B, K, S, card]
|
298 |
# map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
|
299 |
# and provide the corresponding mask over invalid positions of tokens
|
300 |
logits = logits.permute(0, 3, 1, 2) # [B, card, K, S]
|
301 |
# note: we use nans as special token to make it obvious if we feed unexpected logits
|
302 |
logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
|
303 |
-
logits, float('nan'), keep_only_valid_steps=
|
304 |
)
|
305 |
logits = logits.permute(0, 2, 3, 1) # [B, K, T, card]
|
306 |
logits_mask = logits_mask[None, :, :].expand(B, -1, -1) # [K, T] -> [B, K, T]
|
@@ -315,6 +329,7 @@ class LMModel(StreamingModule):
|
|
315 |
top_k: int = 0,
|
316 |
top_p: float = 0.0,
|
317 |
cfg_coef: tp.Optional[float] = None,
|
|
|
318 |
two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
|
319 |
"""Sample next token from the model given a sequence and a set of conditions. The model supports
|
320 |
multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
|
@@ -330,6 +345,13 @@ class LMModel(StreamingModule):
|
|
330 |
top_k (int): K for "top-k" sampling.
|
331 |
top_p (float): P for "top-p" sampling.
|
332 |
cfg_coef (float, optional): classifier free guidance coefficient
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
333 |
Returns:
|
334 |
next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
|
335 |
"""
|
@@ -337,7 +359,23 @@ class LMModel(StreamingModule):
|
|
337 |
cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
|
338 |
model = self if self._fsdp is None else self._fsdp
|
339 |
two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
|
340 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
assert isinstance(cfg_conditions, tuple), type(cfg_conditions)
|
342 |
condition_tensors, null_condition_tensors = cfg_conditions
|
343 |
cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
|
@@ -390,23 +428,30 @@ class LMModel(StreamingModule):
|
|
390 |
top_k: int = 250,
|
391 |
top_p: float = 0.0,
|
392 |
cfg_coef: tp.Optional[float] = None,
|
|
|
393 |
two_step_cfg: tp.Optional[bool] = None,
|
394 |
remove_prompts: bool = False,
|
395 |
check: bool = False,
|
396 |
-
callback: tp.Optional[tp.Callable[[int, int], None]] = None
|
|
|
397 |
"""Generate tokens sampling from the model given a prompt or unconditionally. Generation can
|
398 |
-
be
|
399 |
|
400 |
Args:
|
401 |
prompt (torch.Tensor, optional): Prompt tokens of shape [B, K, T].
|
402 |
-
|
403 |
num_samples (int, optional): Number of samples to generate when no prompt and no conditions are given.
|
404 |
max_gen_len (int): Maximum generation length.
|
405 |
use_sampling (bool): Whether to use a sampling strategy or not.
|
406 |
temp (float): Sampling temperature.
|
407 |
top_k (int): K for "top-k" sampling.
|
408 |
top_p (float): P for "top-p" sampling.
|
409 |
-
|
|
|
|
|
|
|
|
|
|
|
410 |
two_step_cfg (bool, optional): Whether to perform classifier-free guidance with two steps generation.
|
411 |
remove_prompts (bool): Whether to remove prompts from generation or not.
|
412 |
check (bool): Whether to apply further checks on generated sequence.
|
@@ -441,18 +486,27 @@ class LMModel(StreamingModule):
|
|
441 |
# the padding structure is exactly the same between train and test.
|
442 |
# With a batch size of 1, this can be slower though.
|
443 |
cfg_conditions: CFGConditions
|
444 |
-
|
445 |
-
if
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
self.condition_provider(self.condition_provider.tokenize(null_conditions)),
|
451 |
-
)
|
452 |
-
else:
|
453 |
-
conditions = conditions + null_conditions
|
454 |
tokenized = self.condition_provider.tokenize(conditions)
|
455 |
cfg_conditions = self.condition_provider(tokenized)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
456 |
else:
|
457 |
cfg_conditions = {}
|
458 |
|
@@ -463,8 +517,8 @@ class LMModel(StreamingModule):
|
|
463 |
B, K, T = prompt.shape
|
464 |
start_offset = T
|
465 |
print(f"start_offset: {start_offset} | max_gen_len: {max_gen_len}")
|
466 |
-
assert start_offset
|
467 |
-
|
468 |
pattern = self.pattern_provider.get_pattern(max_gen_len)
|
469 |
# this token is used as default value for codes that are not generated yet
|
470 |
unknown_token = -1
|
@@ -496,7 +550,7 @@ class LMModel(StreamingModule):
|
|
496 |
# sample next token from the model, next token shape is [B, K, 1]
|
497 |
next_token = self._sample_next_token(
|
498 |
curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
|
499 |
-
cfg_coef=cfg_coef, two_step_cfg=two_step_cfg)
|
500 |
# ensure the tokens that should be masked are properly set to special_token_id
|
501 |
# as the model never output special_token_id
|
502 |
valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
|
|
|
23 |
ConditioningProvider,
|
24 |
ConditioningAttributes,
|
25 |
ConditionType,
|
26 |
+
_drop_description_condition
|
27 |
)
|
28 |
from ..modules.codebooks_patterns import CodebooksPatternProvider
|
29 |
from ..modules.activations import get_activation_fn
|
|
|
220 |
|
221 |
def forward(self, sequence: torch.Tensor,
|
222 |
conditions: tp.List[ConditioningAttributes],
|
223 |
+
condition_tensors: tp.Optional[ConditionTensors] = None,
|
224 |
+
stage: int = -1) -> torch.Tensor:
|
225 |
"""Apply language model on sequence and conditions.
|
226 |
Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and
|
227 |
S the sequence steps, return the logits with shape [B, card, K, S].
|
|
|
233 |
you should pre-compute those and pass them as `condition_tensors`.
|
234 |
condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
|
235 |
tensors, see `conditions`.
|
236 |
+
stage (int): The codebook level that is being predicted. Relevant for MAGNeT
|
237 |
+
in which prediction is done in a codebook-by-codebook manner.
|
238 |
+
Takes values in range(n_q), and ignored by default.
|
239 |
Returns:
|
240 |
torch.Tensor: Logits.
|
241 |
"""
|
|
|
255 |
|
256 |
input_, cross_attention_input = self.fuser(input_, condition_tensors)
|
257 |
|
258 |
+
out = self.transformer(input_, cross_attention_src=cross_attention_input,
|
259 |
+
src_mask=(self.attn_mask_per_stage[stage] if stage >= 0 else None)) # type: ignore
|
260 |
if self.out_norm:
|
261 |
out = self.out_norm(out)
|
262 |
logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1) # [B, K, S, card]
|
|
|
270 |
def compute_predictions(
|
271 |
self, codes: torch.Tensor,
|
272 |
conditions: tp.List[ConditioningAttributes],
|
273 |
+
condition_tensors: tp.Optional[ConditionTensors] = None,
|
274 |
+
stage: int = -1,
|
275 |
+
keep_only_valid_steps: bool = True) -> LMOutput:
|
276 |
"""Given an input tensor of codes [B, K, T] and list of conditions, runs the model
|
277 |
forward using the specified codes interleaving pattern.
|
278 |
|
|
|
284 |
you should pre-compute those and pass them as `condition_tensors`.
|
285 |
condition_tensors (dict[str, ConditionType], optional): pre-computed conditioning
|
286 |
tensors, see `conditions`.
|
287 |
+
stage (int): The codebook level that is being predicted. Relevant for MAGNeT
|
288 |
+
in which prediction is done in a codebook-by-codebook manner.
|
289 |
+
Takes values in range(n_q), and ignored by default.
|
290 |
+
keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
|
291 |
+
Steps that are beyond valid steps will be replaced by the special_token in that case.
|
292 |
Returns:
|
293 |
LMOutput: Language model outputs
|
294 |
logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
|
|
|
303 |
# map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
|
304 |
pattern = self.pattern_provider.get_pattern(T)
|
305 |
sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
|
306 |
+
codes, self.special_token_id, keep_only_valid_steps=keep_only_valid_steps,
|
307 |
)
|
308 |
+
|
309 |
# apply model on pattern sequence
|
310 |
model = self if self._fsdp is None else self._fsdp
|
311 |
+
logits = model(sequence_codes, conditions, condition_tensors, stage=stage) # [B, K, S, card]
|
312 |
# map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
|
313 |
# and provide the corresponding mask over invalid positions of tokens
|
314 |
logits = logits.permute(0, 3, 1, 2) # [B, card, K, S]
|
315 |
# note: we use nans as special token to make it obvious if we feed unexpected logits
|
316 |
logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
|
317 |
+
logits, float('nan'), keep_only_valid_steps=keep_only_valid_steps
|
318 |
)
|
319 |
logits = logits.permute(0, 2, 3, 1) # [B, K, T, card]
|
320 |
logits_mask = logits_mask[None, :, :].expand(B, -1, -1) # [K, T] -> [B, K, T]
|
|
|
329 |
top_k: int = 0,
|
330 |
top_p: float = 0.0,
|
331 |
cfg_coef: tp.Optional[float] = None,
|
332 |
+
cfg_coef_beta: tp.Optional[float] = None,
|
333 |
two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
|
334 |
"""Sample next token from the model given a sequence and a set of conditions. The model supports
|
335 |
multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
|
|
|
345 |
top_k (int): K for "top-k" sampling.
|
346 |
top_p (float): P for "top-p" sampling.
|
347 |
cfg_coef (float, optional): classifier free guidance coefficient
|
348 |
+
cfg_coef_beta (float, optional): If None, simple classifier free guidance is used with cfg_coef.
|
349 |
+
If not None, we apply double classifier free guidance as introduced in MusicGen-Style
|
350 |
+
in paragraph 4.3 (https://arxiv.org/pdf/2407.12563). This beta coefficient is meant to
|
351 |
+
push the text condition more than the style condition in the case where both text and style
|
352 |
+
conditions are being used.
|
353 |
+
two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
|
354 |
+
|
355 |
Returns:
|
356 |
next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
|
357 |
"""
|
|
|
359 |
cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
|
360 |
model = self if self._fsdp is None else self._fsdp
|
361 |
two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
|
362 |
+
if cfg_coef_beta is not None:
|
363 |
+
assert isinstance(cfg_conditions, dict)
|
364 |
+
condition_tensors = cfg_conditions
|
365 |
+
if condition_tensors:
|
366 |
+
# Preparing for CFG, predicting conditional text and style, conditional style
|
367 |
+
# and unconditional
|
368 |
+
sequence = torch.cat([sequence, sequence, sequence], dim=0)
|
369 |
+
all_logits = model(
|
370 |
+
sequence,
|
371 |
+
conditions=[], condition_tensors=condition_tensors)
|
372 |
+
if condition_tensors:
|
373 |
+
cond_logits, wav_logits, uncond_logits = all_logits.split(B, dim=0) # [B, K, T, card]
|
374 |
+
logits = uncond_logits + cfg_coef * (
|
375 |
+
wav_logits + cfg_coef_beta * (cond_logits - wav_logits) - uncond_logits
|
376 |
+
)
|
377 |
+
|
378 |
+
elif two_step_cfg and cfg_conditions != {}:
|
379 |
assert isinstance(cfg_conditions, tuple), type(cfg_conditions)
|
380 |
condition_tensors, null_condition_tensors = cfg_conditions
|
381 |
cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
|
|
|
428 |
top_k: int = 250,
|
429 |
top_p: float = 0.0,
|
430 |
cfg_coef: tp.Optional[float] = None,
|
431 |
+
cfg_coef_beta: tp.Optional[float] = None,
|
432 |
two_step_cfg: tp.Optional[bool] = None,
|
433 |
remove_prompts: bool = False,
|
434 |
check: bool = False,
|
435 |
+
callback: tp.Optional[tp.Callable[[int, int], None]] = None,
|
436 |
+
) -> torch.Tensor:
|
437 |
"""Generate tokens sampling from the model given a prompt or unconditionally. Generation can
|
438 |
+
be performed in a greedy fashion or using sampling with top K and top P strategies.
|
439 |
|
440 |
Args:
|
441 |
prompt (torch.Tensor, optional): Prompt tokens of shape [B, K, T].
|
442 |
+
conditions (list of ConditioningAttributes, optional): List of conditions.
|
443 |
num_samples (int, optional): Number of samples to generate when no prompt and no conditions are given.
|
444 |
max_gen_len (int): Maximum generation length.
|
445 |
use_sampling (bool): Whether to use a sampling strategy or not.
|
446 |
temp (float): Sampling temperature.
|
447 |
top_k (int): K for "top-k" sampling.
|
448 |
top_p (float): P for "top-p" sampling.
|
449 |
+
cfg_coef (float, optional): Classifier-free guidance coefficient.
|
450 |
+
cfg_coef_beta (float, optional): If None, simple classifier free guidance is used with cfg_coef.
|
451 |
+
If not None, we apply double classifier free guidance as introduced in MusicGen-Style
|
452 |
+
in paragraph 4.3 (https://arxiv.org/pdf/2407.12563). This beta coefficient is meant to
|
453 |
+
push the text condition more than the style condition in the case where both text and style
|
454 |
+
conditions are being used.
|
455 |
two_step_cfg (bool, optional): Whether to perform classifier-free guidance with two steps generation.
|
456 |
remove_prompts (bool): Whether to remove prompts from generation or not.
|
457 |
check (bool): Whether to apply further checks on generated sequence.
|
|
|
486 |
# the padding structure is exactly the same between train and test.
|
487 |
# With a batch size of 1, this can be slower though.
|
488 |
cfg_conditions: CFGConditions
|
489 |
+
cfg_conditions = {}
|
490 |
+
if cfg_coef_beta is not None:
|
491 |
+
if conditions:
|
492 |
+
wav_conditions = _drop_description_condition(conditions)
|
493 |
+
null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
|
494 |
+
conditions = conditions + wav_conditions + null_conditions
|
|
|
|
|
|
|
|
|
495 |
tokenized = self.condition_provider.tokenize(conditions)
|
496 |
cfg_conditions = self.condition_provider(tokenized)
|
497 |
+
elif conditions:
|
498 |
+
two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
|
499 |
+
if conditions:
|
500 |
+
null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
|
501 |
+
if two_step_cfg:
|
502 |
+
cfg_conditions = (
|
503 |
+
self.condition_provider(self.condition_provider.tokenize(conditions)),
|
504 |
+
self.condition_provider(self.condition_provider.tokenize(null_conditions)),
|
505 |
+
)
|
506 |
+
else:
|
507 |
+
conditions = conditions + null_conditions
|
508 |
+
tokenized = self.condition_provider.tokenize(conditions)
|
509 |
+
cfg_conditions = self.condition_provider(tokenized)
|
510 |
else:
|
511 |
cfg_conditions = {}
|
512 |
|
|
|
517 |
B, K, T = prompt.shape
|
518 |
start_offset = T
|
519 |
print(f"start_offset: {start_offset} | max_gen_len: {max_gen_len}")
|
520 |
+
assert start_offset < max_gen_len
|
521 |
+
|
522 |
pattern = self.pattern_provider.get_pattern(max_gen_len)
|
523 |
# this token is used as default value for codes that are not generated yet
|
524 |
unknown_token = -1
|
|
|
550 |
# sample next token from the model, next token shape is [B, K, 1]
|
551 |
next_token = self._sample_next_token(
|
552 |
curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
|
553 |
+
cfg_coef=cfg_coef, cfg_coef_beta=cfg_coef_beta, two_step_cfg=two_step_cfg)
|
554 |
# ensure the tokens that should be masked are properly set to special_token_id
|
555 |
# as the model never output special_token_id
|
556 |
valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
|
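For reference, a minimal sketch of the double classifier-free guidance combination that the cfg_coef_beta branch above adds to _sample_next_token (MusicGen-Style, paragraph 4.3 of arXiv 2407.12563): the model runs on a tripled batch (text+style, style-only, unconditional) and the logits are recombined. Random tensors stand in for the three logit groups.

import torch

B, K, T, card = 1, 4, 1, 2048
cfg_coef, cfg_coef_beta = 3.0, 5.0

all_logits = torch.randn(3 * B, K, T, card)            # output on the tripled batch
cond_logits, wav_logits, uncond_logits = all_logits.split(B, dim=0)
logits = uncond_logits + cfg_coef * (
    wav_logits + cfg_coef_beta * (cond_logits - wav_logits) - uncond_logits
)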
audiocraft/models/lm_magnet.py
ADDED
@@ -0,0 +1,500 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import logging
|
8 |
+
import math
|
9 |
+
import typing as tp
|
10 |
+
import torch
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
from ..utils import utils
|
14 |
+
from ..modules.conditioners import (
|
15 |
+
ClassifierFreeGuidanceDropout,
|
16 |
+
ConditioningAttributes,
|
17 |
+
ConditionType,
|
18 |
+
)
|
19 |
+
from .lm import LMModel
|
20 |
+
|
21 |
+
logger = logging.getLogger(__name__)
|
22 |
+
ConditionTensors = tp.Dict[str, ConditionType]
|
23 |
+
CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
|
24 |
+
|
25 |
+
|
26 |
+
class MagnetLMModel(LMModel):
|
27 |
+
"""Transformer-based, non-autoregressive model, operates on multiple streams of audio tokens (MAGNeT).
|
28 |
+
Args:
|
29 |
+
subcodes_context (int): The number of timesteps attended in the self-attention blocks of codebooks > 0.
|
30 |
+
When set to -1, attention is unrestricted and all timesteps are attended. Defaults to 5.
|
31 |
+
compression_model_framerate (int): frame rate of the audio tokenizer.
|
32 |
+
segment_duration (int): Sample length in seconds.
|
33 |
+
span_len (int): Determines the length of masking spans. This is the minimal length of consecutive masked tokens,
|
34 |
+
for both training and inference. Defaults to 3.
|
35 |
+
**kwargs: Additional parameters for the LMModel.
|
36 |
+
"""
|
37 |
+
def __init__(self, subcodes_context: int = 5, compression_model_framerate: int = 50,
|
38 |
+
segment_duration: int = 10, span_len: int = 3, **kwargs):
|
39 |
+
super().__init__(**kwargs)
|
40 |
+
self.causal = kwargs['causal']
|
41 |
+
self.subcodes_context = subcodes_context
|
42 |
+
self.span_len = span_len
|
43 |
+
self._build_attn_masks(compression_model_framerate=compression_model_framerate,
|
44 |
+
segment_duration=segment_duration,
|
45 |
+
num_heads=kwargs['num_heads'],
|
46 |
+
device=kwargs['device'], dtype=kwargs['dtype'])
|
47 |
+
|
48 |
+
def restricted_context_attn_mask(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
|
49 |
+
"""Creates a restricted attention mask (local attention map) where the context
|
50 |
+
is determined by self.subcodes_context.
|
51 |
+
Args:
|
52 |
+
seq_len (int): token sequence length.
|
53 |
+
device (torch.device): device of the output tensor.
|
54 |
+
dtype (torch.dtype): data type of the output tensor.
|
55 |
+
Returns:
|
56 |
+
torch.Tensor: The restricted attention mask.
|
57 |
+
"""
|
58 |
+
# Return a context restricted non-causal att mask
|
59 |
+
queries_pos = torch.arange(seq_len, device=device).view(-1, 1)
|
60 |
+
keys_pos = torch.arange(seq_len, device=device).view(1, -1)
|
61 |
+
|
62 |
+
delta = queries_pos - keys_pos
|
63 |
+
valid = torch.abs(delta) <= self.subcodes_context
|
64 |
+
return torch.where(
|
65 |
+
valid,
|
66 |
+
torch.zeros([], device=device, dtype=dtype),
|
67 |
+
torch.full([], float('-inf'), device=device, dtype=dtype))
|
68 |
+
|
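A tiny numeric illustration of the banded (local) attention mask built by restricted_context_attn_mask above: entries are 0 where |query - key| <= subcodes_context and -inf elsewhere. The sequence length here is illustrative.

import torch

seq_len, subcodes_context = 6, 2
q = torch.arange(seq_len).view(-1, 1)
k = torch.arange(seq_len).view(1, -1)
valid = (q - k).abs() <= subcodes_context
mask = torch.where(valid, torch.zeros([]), torch.full([], float('-inf')))
print(mask)  # 6x6 band matrix: 0 on the +/-2 diagonal band, -inf outside it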
69 |
+
def _stage_attn_mask(self, stage: int, seq_len: int, num_heads: int,
|
70 |
+
device: torch.device, dtype: torch.dtype) -> tp.Optional[torch.Tensor]:
|
71 |
+
"""Creates a restricted attention mask given the stage (codebook index).
|
72 |
+
Args:
|
73 |
+
stage (int): The codebook index. Takes values in [0, n_q].
|
74 |
+
seq_len (int): Token sequence length.
|
75 |
+
num_heads (int): Num transformer attention heads.
|
76 |
+
device (torch.device): device of the output tensor.
|
77 |
+
dtype (torch.dtype): data type of the output tensor.
|
78 |
+
Returns:
|
79 |
+
torch.Tensor: Either a restricted attention mask or None if stage attention is unrestricted.
|
80 |
+
"""
|
81 |
+
sa_mask = None
|
82 |
+
|
83 |
+
if stage > 0 and self.subcodes_context > -1:
|
84 |
+
# parallel - non-causal - with restricted subcodes context
|
85 |
+
sa_mask = self.restricted_context_attn_mask(seq_len, device=device, dtype=dtype)
|
86 |
+
|
87 |
+
if sa_mask is not None:
|
88 |
+
# Repeat for each attention head
|
89 |
+
sa_mask = sa_mask.repeat((1, num_heads, 1, 1))
|
90 |
+
|
91 |
+
# align8 to enable memory efficient attention
|
92 |
+
MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR = 8
|
93 |
+
seq_len_aligned = \
|
94 |
+
int(np.ceil(seq_len / MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR)) * MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR
|
95 |
+
|
96 |
+
sa_mask_aligned = torch.zeros((1, num_heads, seq_len_aligned, seq_len_aligned), device=device, dtype=dtype)
|
97 |
+
sa_mask_aligned[..., :seq_len, :seq_len] = sa_mask
|
98 |
+
sa_mask = sa_mask_aligned
|
99 |
+
|
100 |
+
return sa_mask
|
101 |
+
|
102 |
+
def _build_attn_masks(self, compression_model_framerate: int, segment_duration: int, num_heads: int,
|
103 |
+
device: torch.device, dtype: torch.dtype):
|
104 |
+
"""Construct attention mask per stage. For each of the RVQ codebook levels in the [0, n_q] range,
|
105 |
+
either a local attention map or None would be stored as an entry in the self.attn_mask_per_stage list.
|
106 |
+
Args:
|
107 |
+
compression_model_framerate (int): The frame rate of the tokenizer.
|
108 |
+
segment_duration (int): Sample length in seconds.
|
109 |
+
num_heads (int): Num transformer attention heads.
|
110 |
+
device (torch.device): device of the output tensor.
|
111 |
+
dtype (torch.dtype): data type of the output tensor.
|
112 |
+
"""
|
113 |
+
seq_len = compression_model_framerate * segment_duration
|
114 |
+
self.attn_mask_per_stage = [self._stage_attn_mask(stage, seq_len, num_heads,
|
115 |
+
device, dtype) for stage in range(self.n_q)]
|
116 |
+
|
117 |
+
@torch.no_grad()
|
118 |
+
def generate(self,
|
119 |
+
prompt: tp.Optional[torch.Tensor] = None,
|
120 |
+
conditions: tp.List[ConditioningAttributes] = [],
|
121 |
+
num_samples: tp.Optional[int] = None,
|
122 |
+
max_gen_len: int = 256,
|
123 |
+
use_sampling: bool = True,
|
124 |
+
temp: float = 1.0,
|
125 |
+
top_k: int = 250,
|
126 |
+
top_p: float = 0.0,
|
127 |
+
cfg_coef: tp.Optional[float] = None,
|
128 |
+
cfg_coef_beta: tp.Optional[float] = None,
|
129 |
+
two_step_cfg: tp.Optional[bool] = None,
|
130 |
+
remove_prompts: bool = False,
|
131 |
+
check: bool = False,
|
132 |
+
callback: tp.Optional[tp.Callable[[int, int], None]] = None,
|
133 |
+
**kwargs) -> torch.Tensor:
|
134 |
+
|
135 |
+
assert cfg_coef is None, "Unsupported in MAGNeT. Use max_cfg_coef,min_cfg_coef instead."
|
136 |
+
assert two_step_cfg is None, "MAGNeT currently doesn't support two step classifier-free-guidance."
|
137 |
+
assert remove_prompts is False, "MAGNeT currently doesn't support the remove_prompts arg."
|
138 |
+
assert check is False, "MAGNeT currently doesn't support the check arg."
|
139 |
+
assert cfg_coef_beta is None, "MAGNeT currently doesn't support the cfg_coef_beta arg."
|
140 |
+
# Call the MAGNeT-specific generation method
|
141 |
+
return self._generate_magnet(prompt=prompt,
|
142 |
+
conditions=conditions,
|
143 |
+
num_samples=num_samples,
|
144 |
+
max_gen_len=max_gen_len,
|
145 |
+
use_sampling=use_sampling,
|
146 |
+
temp=temp,
|
147 |
+
top_k=top_k,
|
148 |
+
top_p=top_p,
|
149 |
+
callback=callback, **kwargs)
|
150 |
+
|
151 |
+
@torch.no_grad()
|
152 |
+
def _generate_magnet(self,
|
153 |
+
prompt: tp.Optional[torch.Tensor] = None,
|
154 |
+
conditions: tp.List[ConditioningAttributes] = [],
|
155 |
+
num_samples: tp.Optional[int] = None,
|
156 |
+
max_gen_len: int = 256,
|
157 |
+
use_sampling: bool = True,
|
158 |
+
temp: float = 3.0,
|
159 |
+
top_k: int = 0,
|
160 |
+
top_p: float = 0.9,
|
161 |
+
callback: tp.Optional[tp.Callable[[int, int], None]] = None,
|
162 |
+
max_cfg_coef: float = 10.0,
|
163 |
+
min_cfg_coef: float = 1.0,
|
164 |
+
decoding_steps: tp.List[int] = [20, 10, 10, 10],
|
165 |
+
anneal_temp: bool = True,
|
166 |
+
span_scoring='max',
|
167 |
+
span_arrangement='nonoverlap') -> torch.Tensor:
|
168 |
+
"""Generate audio tokens given textual conditions, and optionally given audio prompts,
|
169 |
+
by running MAGNeT's iterative decoding algorithm for each of the n_q RVQ levels.
|
170 |
+
Args:
|
171 |
+
prompt (torch.Tensor): Prompt tokens of shape [B, K, T].
|
172 |
+
conditions (list of ConditioningAttributes): List of conditions.
|
173 |
+
num_samples (int): Number of samples to generate when no prompt and no conditions are given.
|
174 |
+
max_gen_len (int): Maximum generation length.
|
175 |
+
use_sampling (bool): Whether to use a sampling strategy or not.
|
176 |
+
temp (float): Initial sampling temperature.
|
177 |
+
top_k (int): k for "top-k" sampling.
|
178 |
+
top_p (float): p for "top-p" sampling.
|
179 |
+
callback (Callback): Callback function to report generation progress.
|
180 |
+
max_cfg_coef (float): Initial coefficient used for classifier free guidance.
|
181 |
+
min_cfg_coef (float): Final coefficient used for classifier free guidance.
|
182 |
+
decoding_steps (list of n_q ints): The number of iterative decoding steps,
|
183 |
+
for each of the n_q RVQ codebooks.
|
184 |
+
anneal_temp (bool): When set to True, softmax temperature will be linearly decayed to zero at each stage.
|
185 |
+
span_scoring (str): Use the maximum probability of each span ('max')
|
186 |
+
or the product of probabilities ('prod').
|
187 |
+
span_arrangement (str): Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1').
|
188 |
+
in the masking scheme.
|
189 |
+
Returns:
|
190 |
+
torch.Tensor: Generated tokens.
|
191 |
+
"""
|
192 |
+
assert not self.training, "generation shouldn't be used in training mode."
|
193 |
+
first_param = next(iter(self.parameters()))
|
194 |
+
device = first_param.device
|
195 |
+
|
196 |
+
# Checking all input shapes are consistent.
|
197 |
+
possible_num_samples = []
|
198 |
+
if num_samples is not None:
|
199 |
+
possible_num_samples.append(num_samples)
|
200 |
+
elif prompt is not None:
|
201 |
+
possible_num_samples.append(prompt.shape[0])
|
202 |
+
elif conditions:
|
203 |
+
possible_num_samples.append(len(conditions))
|
204 |
+
else:
|
205 |
+
possible_num_samples.append(1)
|
206 |
+
assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent input shapes"
|
207 |
+
num_samples = possible_num_samples[0]
|
208 |
+
|
209 |
+
# below we create set of conditions: one conditional and one unconditional
|
210 |
+
# to do that we merge the regular condition together with the null condition
|
211 |
+
# we then do 1 forward pass instead of 2.
|
212 |
+
cfg_conditions: tp.Optional[ConditionTensors]
|
213 |
+
if conditions:
|
214 |
+
null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
|
215 |
+
conditions = conditions + null_conditions
|
216 |
+
tokenized = self.condition_provider.tokenize(conditions)
|
217 |
+
cfg_conditions = self.condition_provider(tokenized)
|
218 |
+
else:
|
219 |
+
cfg_conditions = {}
|
220 |
+
|
221 |
+
if prompt is None:
|
222 |
+
assert num_samples > 0
|
223 |
+
prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
|
224 |
+
|
225 |
+
B, K, prompt_length = prompt.shape
|
226 |
+
start_offset = prompt_length
|
227 |
+
assert start_offset < max_gen_len
|
228 |
+
|
229 |
+
mask_id = self.special_token_id
|
230 |
+
|
231 |
+
# we generate codes with a fixed sequence length
|
232 |
+
shape = (B, K, max_gen_len)
|
233 |
+
|
234 |
+
gen_codes = torch.full(shape, mask_id, dtype=torch.long, device=device)
|
235 |
+
# filling the gen_codes with the prompt if needed
|
236 |
+
gen_codes[..., :start_offset] = prompt
|
237 |
+
# create the gen_sequence with proper interleaving from the pattern: [B, K, S]
|
238 |
+
gen_sequence = gen_codes
|
239 |
+
|
240 |
+
curr_step = 0
|
241 |
+
for stage, n_steps in zip(range(self.n_q), decoding_steps):
|
242 |
+
gen_sequence, curr_step = self._generate_stage(gen_sequence,
|
243 |
+
cfg_conditions,
|
244 |
+
stage=stage,
|
245 |
+
device=device,
|
246 |
+
prompt_length=prompt_length,
|
247 |
+
prompt=prompt,
|
248 |
+
temp=temp,
|
249 |
+
max_cfg_coef=max_cfg_coef,
|
250 |
+
min_cfg_coef=min_cfg_coef,
|
251 |
+
top_k=top_k,
|
252 |
+
top_p=top_p,
|
253 |
+
timesteps=n_steps,
|
254 |
+
anneal_temp=anneal_temp,
|
255 |
+
span_scoring=span_scoring,
|
256 |
+
use_sampling=use_sampling,
|
257 |
+
span_arrangement=span_arrangement,
|
258 |
+
curr_step=curr_step,
|
259 |
+
total_steps=sum(decoding_steps),
|
260 |
+
callback=callback)
|
261 |
+
|
262 |
+
return gen_sequence
|
263 |
+
|
264 |
+
@torch.no_grad()
|
265 |
+
def _generate_stage(self,
|
266 |
+
gen_sequence: torch.Tensor,
|
267 |
+
condition_tensors: tp.Optional[ConditionTensors],
|
268 |
+
stage: int,
|
269 |
+
device: torch.device,
|
270 |
+
prompt_length: int = 0,
|
271 |
+
prompt: tp.Optional[torch.Tensor] = None,
|
272 |
+
use_sampling: bool = True,
|
273 |
+
temp: float = 3.0,
|
274 |
+
max_cfg_coef: float = 10.0,
|
275 |
+
min_cfg_coef: float = 1.0,
|
276 |
+
top_k: int = 0,
|
277 |
+
top_p: float = 0.0,
|
278 |
+
timesteps: int = 10,
|
279 |
+
anneal_temp: bool = True,
|
280 |
+
span_scoring: str = 'max',
|
281 |
+
span_arrangement: str = 'nonoverlap',
|
282 |
+
curr_step: int = 0,
|
283 |
+
total_steps: int = 0,
|
284 |
+
callback: tp.Optional[tp.Callable[[int, int], None]] = None) -> tp.Tuple[torch.Tensor, int]:
|
285 |
+
"""Generate audio tokens of a single RVQ level (stage), given the previously generated stages,
|
286 |
+
and the textual conditions.
|
287 |
+
Args:
|
288 |
+
gen_sequence (torch.Tensor): Previously generated tokens.
|
289 |
+
condition_tensors (tp.Optional[ConditionTensors]): pre-computed conditioning tensors.
|
290 |
+
stage (int): RVQ level to generate.
|
291 |
+
device (torch.device): device of the output tensor.
|
292 |
+
prompt_length (int): Temporal length of the audio prompt.
|
293 |
+
prompt (torch.Tensor): Prompt tokens of shape [B, K, T].
|
294 |
+
use_sampling (bool): Whether to use a sampling strategy or not.
|
295 |
+
temp (float): Initial sampling temperature.
|
296 |
+
max_cfg_coef (float): Initial coefficient used for classifier free guidance.
|
297 |
+
min_cfg_coef (float): Final coefficient used for classifier free guidance.
|
298 |
+
top_k (int): k for "top-k" sampling.
|
299 |
+
top_p (float): p for "top-p" sampling.
|
300 |
+
timesteps (int): Number of iterative decoding steps.
|
301 |
+
anneal_temp (bool): When set to True, the softmax temperature is linearly annealed to zero within each stage.
|
302 |
+
span_scoring (str): Use the maximum probability of each span ('max')
|
303 |
+
or the product of probabilities ('prod').
|
304 |
+
span_arrangement (str): Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1') in the masking scheme.
|
306 |
+
curr_step (int): Global iterative decoding step counter.
|
307 |
+
total_steps (int): Total decoding steps.
|
308 |
+
callback (Callback): Callback function to report generation progress.
|
309 |
+
Returns:
|
310 |
+
tuple(torch.Tensor, int): Generated tokens and the current decoding step counter.
|
311 |
+
"""
|
312 |
+
B, K, T = gen_sequence.shape
|
313 |
+
shape = (B, 1, T) # generating a single codebook per stage
|
314 |
+
|
315 |
+
mask_id = self.special_token_id
|
316 |
+
stage_gen_seq = torch.full(shape, mask_id, dtype=torch.long, device=device)
|
317 |
+
|
318 |
+
assert span_arrangement == 'nonoverlap' or span_arrangement == 'stride1'
|
319 |
+
chunk_masking = self.span_len > 1 and span_arrangement == 'nonoverlap'
|
320 |
+
|
321 |
+
DONT_REMASK_ME_SCORE = -1e4
|
322 |
+
|
323 |
+
model = self if self._fsdp is None else self._fsdp
|
324 |
+
|
325 |
+
if chunk_masking:
|
326 |
+
# span-wise scores
|
327 |
+
n_chunks = T // self.span_len
|
328 |
+
if T % self.span_len != 0:
|
329 |
+
# trim sequence ending to achieve a multiple of span_len
|
330 |
+
T = self.span_len * n_chunks
|
331 |
+
gen_sequence = gen_sequence[..., :T]
|
332 |
+
stage_gen_seq = stage_gen_seq[..., :T]
|
333 |
+
|
334 |
+
chunked_shape = (B, 1, n_chunks)
|
335 |
+
n_prompt_chunks = prompt_length // self.span_len
|
336 |
+
scores = torch.zeros(chunked_shape, dtype=torch.float32, device=device)
|
337 |
+
scores[..., :n_prompt_chunks] = DONT_REMASK_ME_SCORE
|
338 |
+
num_chunks_to_gen = n_chunks - n_prompt_chunks
|
339 |
+
else:
|
340 |
+
# token-wise scores
|
341 |
+
scores = torch.zeros(shape, dtype=torch.float32, device=device)
|
342 |
+
scores[..., :prompt_length] = DONT_REMASK_ME_SCORE
|
343 |
+
gen_T = T - prompt_length
|
344 |
+
|
345 |
+
# run MAGNeT iterative decoding for "timesteps" iterations
|
346 |
+
for timestep, steps_left in zip(torch.linspace(0, 1, timesteps, device=device), reversed(range(timesteps))):
|
347 |
+
|
348 |
+
mask_p = torch.cos(timestep * math.pi * 0.5)
|
349 |
+
|
350 |
+
if chunk_masking:
|
351 |
+
num_masked = max(int((mask_p * num_chunks_to_gen).item()), 1)
|
352 |
+
else:
|
353 |
+
num_masked = max(int((mask_p * gen_T).item()), 1)
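# --- Editorial sketch (not part of lm_magnet.py): the masking rate follows a cosine schedule,
# mask_p = cos(pi/2 * t) for t going 0 -> 1, so the number of re-masked positions shrinks from
# roughly everything down to at least one. Standalone illustration with assumed values:
#   import math
#   timesteps, gen_T = 10, 500
#   schedule = [max(int(math.cos(math.pi * 0.5 * i / (timesteps - 1)) * gen_T), 1)
#               for i in range(timesteps)]   # strictly decreasing, starts at 500, ends at 1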
|
354 |
+
|
355 |
+
# masking
|
356 |
+
run_lps_masking = (span_arrangement == 'stride1') and self.span_len > 1
|
357 |
+
if run_lps_masking:
|
358 |
+
# masking of the k least probable overlapping (stride 1) spans
|
359 |
+
mask = torch.concat((
|
360 |
+
[self._least_probable_span_masking(scores[[i], :, :], num_masked).to(device)
|
361 |
+
for i in range(B)]), dim=0)
|
362 |
+
stage_gen_seq[mask] = mask_id
|
363 |
+
else:
|
364 |
+
# masking of the k least probable non-overlapping spans
|
365 |
+
masked = scores.topk(num_masked, dim=-1).indices
|
366 |
+
if chunk_masking:
|
367 |
+
chunks_mask = torch.full(chunked_shape, False, dtype=torch.bool, device=device)
|
368 |
+
chunks_mask = chunks_mask.scatter(2, masked, True)
|
369 |
+
mask = torch.repeat_interleave(chunks_mask, self.span_len, dim=-1)
|
370 |
+
stage_gen_seq[mask] = mask_id
|
371 |
+
else:
|
372 |
+
stage_gen_seq = stage_gen_seq.scatter(2, masked, mask_id)
|
373 |
+
|
374 |
+
if prompt is not None:
|
375 |
+
stage_gen_seq[..., :prompt_length] = prompt[:, stage, :].unsqueeze(1)
|
376 |
+
|
377 |
+
gen_sequence[:, [stage], :] = stage_gen_seq
|
378 |
+
if condition_tensors:
|
379 |
+
# duplicate input for classifier free guidance
|
380 |
+
sequence = torch.cat([gen_sequence, gen_sequence], dim=0)
|
381 |
+
|
382 |
+
all_logits = model(sequence, [], condition_tensors, stage=stage)
|
383 |
+
|
384 |
+
if condition_tensors:
|
385 |
+
# classifier free guidance with annealing
|
386 |
+
cond_logits, uncond_logits = all_logits.split(B, dim=0) # [B, K, T, card]
|
387 |
+
clsfg_coef = float(mask_p) * max_cfg_coef + (1 - float(mask_p)) * min_cfg_coef
|
388 |
+
logits = uncond_logits + (cond_logits - uncond_logits) * clsfg_coef
|
389 |
+
else:
|
390 |
+
logits = all_logits
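# --- Editorial sketch (not part of lm_magnet.py): the guidance coefficient above is annealed with
# the masking rate, moving linearly from max_cfg_coef (early, heavily masked steps) to
# min_cfg_coef (late steps). With the defaults max_cfg_coef=10.0 and min_cfg_coef=1.0:
#   mask_p = 1.0  ->  clsfg_coef = 10.0
#   mask_p = 0.5  ->  clsfg_coef = 5.5
#   mask_p = 0.0  ->  clsfg_coef = 1.0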
|
391 |
+
|
392 |
+
# temperature annealing - linear
|
393 |
+
t = temp * (steps_left / timesteps) if anneal_temp else temp
|
394 |
+
|
395 |
+
# sampling
|
396 |
+
logits = logits[:, stage, :, :].unsqueeze(1)
|
397 |
+
probs = torch.softmax(logits / max(t, 1e-2), dim=-1)
|
398 |
+
if use_sampling:
|
399 |
+
if top_p > 0.0:
|
400 |
+
sampled_tokens = utils.sample_top_p(probs, p=top_p)
|
401 |
+
elif top_k > 0:
|
402 |
+
sampled_tokens = utils.sample_top_k(probs, k=top_k)
|
403 |
+
else:
|
404 |
+
sampled_tokens = utils.multinomial(probs, num_samples=1)
|
405 |
+
else:
|
406 |
+
sampled_tokens = torch.argmax(logits, dim=-1, keepdim=True)
|
407 |
+
|
408 |
+
# place mask_id token in each of the masked positions
|
409 |
+
mask = stage_gen_seq == mask_id
|
410 |
+
stage_gen_seq = torch.where(mask, sampled_tokens[..., 0], stage_gen_seq)
|
411 |
+
gen_sequence[:, [stage], :] = stage_gen_seq
|
412 |
+
|
413 |
+
# get probs of sampled tokens
|
414 |
+
sampled_probs = torch.gather(probs, 3, sampled_tokens)[..., 0]
|
415 |
+
|
416 |
+
# span scoring
|
417 |
+
if chunk_masking:
|
418 |
+
if span_scoring == 'max':
|
419 |
+
# max in linear space
|
420 |
+
scores = 1 - torch.max(sampled_probs.reshape((B, 1, n_chunks, -1)), dim=-1)[0]
|
421 |
+
elif span_scoring == 'prod':
|
422 |
+
# prod in log space
|
423 |
+
scores = torch.sum(-torch.log(sampled_probs).reshape((B, 1, n_chunks, -1)), dim=-1)
|
424 |
+
else:
|
425 |
+
raise NotImplementedError
|
426 |
+
else:
|
427 |
+
# prod in log space for lps masking (stride1)
|
428 |
+
scores = -torch.log(sampled_probs)
|
429 |
+
|
430 |
+
# Fix unmasked tokens by placing inf probs (-inf scores)
|
431 |
+
if chunk_masking:
|
432 |
+
scores = scores.masked_fill(~chunks_mask, DONT_REMASK_ME_SCORE)
|
433 |
+
else:
|
434 |
+
scores = scores.masked_fill(~mask, DONT_REMASK_ME_SCORE)
|
435 |
+
|
436 |
+
if callback is not None:
|
437 |
+
curr_step += 1
|
438 |
+
callback(curr_step, total_steps)
|
439 |
+
|
440 |
+
return gen_sequence, curr_step
|
441 |
+
|
442 |
+
def _construct_spans_mask(self, span_starts: torch.Tensor, T: int, device: torch.device) -> torch.Tensor:
|
443 |
+
"""Build a [1x1xT] boolean mask consists of overlapping spans of True values, where
|
444 |
+
span_starts defines the initial index of each span, and the span length is
|
445 |
+
defined by self.span_len.
|
446 |
+
Args:
|
447 |
+
span_starts (torch.Tensor): Boolean mask determines the temporal location of each span start.
|
448 |
+
T (int): Sequence length.
|
449 |
+
device (torch.device): device of the output tensor.
|
450 |
+
Returns:
|
451 |
+
torch.Tensor: Spans mask of shape [1x1xT]
|
452 |
+
"""
|
453 |
+
mask = torch.full((1, 1, T), False, device=device)
|
454 |
+
mask[:, :, span_starts] = True
|
455 |
+
shifted_mask = mask.clone()
|
456 |
+
for _ in range(self.span_len - 1):
|
457 |
+
shifted_mask = torch.concat((torch.full((1, 1, 1), False, device=device), shifted_mask[:, :, :-1]), dim=-1)
|
458 |
+
mask = torch.logical_or(mask, shifted_mask)
|
459 |
+
return mask
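# --- Editorial sketch (not part of lm_magnet.py): a small worked example of the span mask built
# above, assuming span_len = 3, T = 8 and a single span starting at index 2. Each pass of the loop
# ORs in a copy shifted one step to the right, so positions 2, 3 and 4 end up True:
#   start mask      : [F, F, T, F, F, F, F, F]
#   after 1st shift : [F, F, T, T, F, F, F, F]
#   after 2nd shift : [F, F, T, T, T, F, F, F]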
|
460 |
+
|
461 |
+
def _least_probable_span_masking(self, scores: torch.Tensor, num_masked_trg: int) -> torch.Tensor:
|
462 |
+
"""Construct a [1x1xT] boolean mask, consists of the u least probable spans,
|
463 |
+
where the token probability is determined by -scores, and the total
|
464 |
+
number of masked tokens is as closest as possible to num_masked_trg.
|
465 |
+
Find u using binary search.
|
466 |
+
Args:
|
467 |
+
scores (torch.Tensor): Per token score [-log(prob)]
|
468 |
+
num_masked_trg: int: The desired amount of tokens to be masked.
|
469 |
+
Returns:
|
470 |
+
torch.Tensor: Spans mask of shape [1x1xT]
|
471 |
+
"""
|
472 |
+
T = scores.shape[-1]
|
473 |
+
device = scores.device
|
474 |
+
scores_unfolded = scores.unfold(2, self.span_len, 1)
|
475 |
+
# Span score is the product of probs (sum in log space)
|
476 |
+
span_scores = scores_unfolded.sum(dim=-1)
|
477 |
+
spans_by_scores = torch.argsort(span_scores[0, 0], descending=True)
|
478 |
+
|
479 |
+
num_masked_trg = max(num_masked_trg, self.span_len)
|
480 |
+
|
481 |
+
# Binary search for u - the number of least probable overlapping masked spans s.t.
|
482 |
+
# the total masking rate is the closest to num_masked_trg / T.
|
483 |
+
min_u = num_masked_trg // self.span_len
|
484 |
+
max_u = num_masked_trg - self.span_len + 1
|
485 |
+
mid = round(0.5 * (min_u + max_u))
|
486 |
+
|
487 |
+
if mid == min_u or mid == max_u:
|
488 |
+
return self._construct_spans_mask(spans_by_scores[:mid], T, device)
|
489 |
+
|
490 |
+
while mid > min_u and mid < max_u:
|
491 |
+
mask = self._construct_spans_mask(spans_by_scores[:mid], T, device)
|
492 |
+
n_masked = mask.sum()
|
493 |
+
if n_masked > num_masked_trg:
|
494 |
+
max_u = mid
|
495 |
+
mid = round(0.5 * (min_u + max_u))
|
496 |
+
else:
|
497 |
+
min_u = mid
|
498 |
+
mid = round(0.5 * (min_u + max_u))
|
499 |
+
|
500 |
+
return mask
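# --- Editorial sketch (not part of lm_magnet.py): intuition for the binary search above, assuming
# span_len = 3 and num_masked_trg = 10. Overlapping spans cover a variable number of tokens, so the
# search looks for the span count u whose union is closest to the target:
#   min_u = 10 // 3      # 3 spans: even if disjoint they already cover 9 tokens
#   max_u = 10 - 3 + 1   # 8 spans: fully overlapping (stride 1) they cover exactly 10 tokens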
|
audiocraft/models/loaders.py
CHANGED
@@ -28,6 +28,7 @@ from omegaconf import OmegaConf, DictConfig
|
|
28 |
import torch
|
29 |
|
30 |
import audiocraft
|
|
|
31 |
from . import builders
|
32 |
from .encodec import CompressionModel
|
33 |
|
@@ -47,6 +48,7 @@ HF_MODEL_CHECKPOINTS_MAP = {
|
|
47 |
"stereo-large": "facebook/musicgen-stereo-large",
|
48 |
"stereo-melody": "facebook/musicgen-stereo-melody",
|
49 |
"stereo-melody-large": "facebook/musicgen-stereo-melody-large",
|
|
|
50 |
}
|
51 |
|
52 |
|
@@ -156,7 +158,7 @@ def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu',
|
|
156 |
# Handle newer model formats that might not have xp.cfg
|
157 |
if 'xp.cfg' not in pkg:
|
158 |
if file_or_url_or_id in ['melody-large', 'stereo-melody', 'stereo-medium',
|
159 |
-
'stereo-small', 'stereo-large', 'stereo-melody-large']:
|
160 |
print(f"Using fallback configuration for {file_or_url_or_id}")
|
161 |
# Create a default configuration based on the model type
|
162 |
# This is where you'd need to add model-specific configurations
|
@@ -212,6 +214,52 @@ def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_di
|
|
212 |
return model
|
213 |
|
214 |
|
|
|
|
|
|
|
|
215 |
def load_mbd_ckpt(file_or_url_or_id: tp.Union[Path, str],
|
216 |
filename: tp.Optional[str] = None,
|
217 |
cache_dir: tp.Optional[str] = None):
|
|
|
28 |
import torch
|
29 |
|
30 |
import audiocraft
|
31 |
+
|
32 |
from . import builders
|
33 |
from .encodec import CompressionModel
|
34 |
|
|
|
48 |
"stereo-large": "facebook/musicgen-stereo-large",
|
49 |
"stereo-melody": "facebook/musicgen-stereo-melody",
|
50 |
"stereo-melody-large": "facebook/musicgen-stereo-melody-large",
|
51 |
+
"style": "facebook/musicgen-style",
|
52 |
}
|
53 |
|
54 |
|
|
|
158 |
# Handle newer model formats that might not have xp.cfg
|
159 |
if 'xp.cfg' not in pkg:
|
160 |
if file_or_url_or_id in ['melody-large', 'stereo-melody', 'stereo-medium',
|
161 |
+
'stereo-small', 'stereo-large', 'stereo-melody-large', 'style']:
|
162 |
print(f"Using fallback configuration for {file_or_url_or_id}")
|
163 |
# Create a default configuration based on the model type
|
164 |
# This is where you'd need to add model-specific configurations
|
|
|
214 |
return model
|
215 |
|
216 |
|
217 |
+
def load_lm_model_magnet(file_or_url_or_id: tp.Union[Path, str], compression_model_frame_rate: int,
|
218 |
+
device='cpu', cache_dir: tp.Optional[str] = None):
|
219 |
+
pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
|
220 |
+
cfg = OmegaConf.create(pkg['xp.cfg'])
|
221 |
+
cfg.device = str(device)
|
222 |
+
if cfg.device == 'cpu':
|
223 |
+
cfg.dtype = 'float32'
|
224 |
+
else:
|
225 |
+
cfg.dtype = 'float16'
|
226 |
+
_delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
|
227 |
+
_delete_param(cfg, 'conditioners.args.drop_desc_p')
|
228 |
+
|
229 |
+
cfg.transformer_lm.compression_model_framerate = compression_model_frame_rate
|
230 |
+
cfg.transformer_lm.segment_duration = cfg.dataset.segment_duration
|
231 |
+
cfg.transformer_lm.span_len = cfg.masking.span_len
|
232 |
+
|
233 |
+
# MAGNeT models v1 support only xformers backend.
|
234 |
+
from audiocraft.modules.transformer import set_efficient_attention_backend
|
235 |
+
|
236 |
+
if cfg.transformer_lm.memory_efficient:
|
237 |
+
set_efficient_attention_backend("xformers")
|
238 |
+
|
239 |
+
model = builders.get_lm_model(cfg)
|
240 |
+
model.load_state_dict(pkg['best_state'])
|
241 |
+
model.eval()
|
242 |
+
model.cfg = cfg
|
243 |
+
return model
|
244 |
+
|
245 |
+
|
246 |
+
def load_jasco_model(file_or_url_or_id: tp.Union[Path, str],
|
247 |
+
compression_model: CompressionModel,
|
248 |
+
device='cpu', cache_dir: tp.Optional[str] = None):
|
249 |
+
pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
|
250 |
+
cfg = OmegaConf.create(pkg['xp.cfg'])
|
251 |
+
cfg.device = str(device)
|
252 |
+
if cfg.device == 'cpu':
|
253 |
+
cfg.dtype = 'float32'
|
254 |
+
else:
|
255 |
+
cfg.dtype = 'float16'
|
256 |
+
model = builders.get_jasco_model(cfg, compression_model)
|
257 |
+
model.load_state_dict(pkg['best_state'])
|
258 |
+
model.eval()
|
259 |
+
model.cfg = cfg
|
260 |
+
return model
|
261 |
+
|
262 |
+
|
263 |
def load_mbd_ckpt(file_or_url_or_id: tp.Union[Path, str],
|
264 |
filename: tp.Optional[str] = None,
|
265 |
cache_dir: tp.Optional[str] = None):
|
audiocraft/models/magnet.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
"""
|
8 |
+
Main model for using MAGNeT. This will combine all the required components
|
9 |
+
and provide easy access to the generation API.
|
10 |
+
"""
|
11 |
+
import typing as tp
|
12 |
+
import torch
|
13 |
+
|
14 |
+
from .genmodel import BaseGenModel
|
15 |
+
from .loaders import load_compression_model, load_lm_model_magnet
|
16 |
+
|
17 |
+
|
18 |
+
class MAGNeT(BaseGenModel):
|
19 |
+
"""MAGNeT main model with convenient generation API.
|
20 |
+
Args:
|
21 |
+
See MusicGen class.
|
22 |
+
"""
|
23 |
+
def __init__(self, **kwargs):
|
24 |
+
super().__init__(**kwargs)
|
25 |
+
# MAGNeT operates over a fixed sequence length defined in its config.
|
26 |
+
self.duration = self.lm.cfg.dataset.segment_duration
|
27 |
+
self.set_generation_params()
|
28 |
+
|
29 |
+
@staticmethod
|
30 |
+
def get_pretrained(name: str = 'facebook/magnet-small-10secs', device=None):
|
31 |
+
"""Return pretrained model, we provide six models:
|
32 |
+
- facebook/magnet-small-10secs (300M), text to music, 10-second audio samples.
|
33 |
+
# see: https://huggingface.co/facebook/magnet-small-10secs
|
34 |
+
- facebook/magnet-medium-10secs (1.5B), text to music, 10-second audio samples.
|
35 |
+
# see: https://huggingface.co/facebook/magnet-medium-10secs
|
36 |
+
- facebook/magnet-small-30secs (300M), text to music, 30-second audio samples.
|
37 |
+
# see: https://huggingface.co/facebook/magnet-small-30secs
|
38 |
+
- facebook/magnet-medium-30secs (1.5B), text to music, 30-second audio samples.
|
39 |
+
# see: https://huggingface.co/facebook/magnet-medium-30secs
|
40 |
+
- facebook/audio-magnet-small (300M), text to sound-effect (10-second samples).
|
41 |
+
# see: https://huggingface.co/facebook/audio-magnet-small
|
42 |
+
- facebook/audio-magnet-medium (1.5B), text to sound-effect (10-second samples).
|
43 |
+
# see: https://huggingface.co/facebook/audio-magnet-medium
|
44 |
+
"""
|
45 |
+
if device is None:
|
46 |
+
if torch.cuda.device_count():
|
47 |
+
device = 'cuda'
|
48 |
+
else:
|
49 |
+
device = 'cpu'
|
50 |
+
|
51 |
+
compression_model = load_compression_model(name, device=device)
|
52 |
+
lm = load_lm_model_magnet(name, compression_model_frame_rate=int(compression_model.frame_rate), device=device)
|
53 |
+
|
54 |
+
if 'self_wav' in lm.condition_provider.conditioners:
|
55 |
+
lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
|
56 |
+
|
57 |
+
kwargs = {'name': name, 'compression_model': compression_model, 'lm': lm}
|
58 |
+
return MAGNeT(**kwargs)
|
59 |
+
|
60 |
+
def set_generation_params(self, use_sampling: bool = True, top_k: int = 0,
|
61 |
+
top_p: float = 0.9, temperature: float = 3.0,
|
62 |
+
max_cfg_coef: float = 10.0, min_cfg_coef: float = 1.0,
|
63 |
+
decoding_steps: tp.List[int] = [20, 10, 10, 10],
|
64 |
+
span_arrangement: str = 'nonoverlap'):
|
65 |
+
"""Set the generation parameters for MAGNeT.
|
66 |
+
|
67 |
+
Args:
|
68 |
+
use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
|
69 |
+
top_k (int, optional): top_k used for sampling. Defaults to 0.
|
70 |
+
top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.9.
|
71 |
+
temperature (float, optional): Initial softmax temperature parameter. Defaults to 3.0.
|
72 |
+
max_cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 10.0.
|
73 |
+
min_cfg_coef (float, optional): End coefficient of classifier free guidance annealing. Defaults to 1.0.
|
74 |
+
decoding_steps (list of n_q ints, optional): The number of iterative decoding steps,
|
75 |
+
for each of the n_q RVQ codebooks.
|
76 |
+
span_arrangement (str, optional): Use either non-overlapping spans ('nonoverlap')
|
77 |
+
or overlapping spans ('stride1') in the masking scheme.
|
78 |
+
"""
|
79 |
+
self.generation_params = {
|
80 |
+
'use_sampling': use_sampling,
|
81 |
+
'temp': temperature,
|
82 |
+
'top_k': top_k,
|
83 |
+
'top_p': top_p,
|
84 |
+
'max_cfg_coef': max_cfg_coef,
|
85 |
+
'min_cfg_coef': min_cfg_coef,
|
86 |
+
'decoding_steps': [int(s) for s in decoding_steps],
|
87 |
+
'span_arrangement': span_arrangement
|
88 |
+
}
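# --- Editorial usage sketch (not part of magnet.py), assuming MAGNeT is exported from
# audiocraft.models and that generate()/sample_rate come from BaseGenModel in genmodel.py:
#   from audiocraft.models import MAGNeT
#   model = MAGNeT.get_pretrained('facebook/magnet-small-10secs')
#   model.set_generation_params(decoding_steps=[20, 10, 10, 10], span_arrangement='nonoverlap')
#   wav = model.generate(['80s electric guitar riff with driving drums'])  # [B, C, T] tensor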
|
audiocraft/models/musicgen.py
CHANGED
@@ -18,11 +18,12 @@ import torch
|
|
18 |
import gradio as gr
|
19 |
|
20 |
from .encodec import CompressionModel
|
|
|
21 |
from .lm import LMModel
|
22 |
from .builders import get_debug_compression_model, get_debug_lm_model, get_wrapped_compression_model
|
23 |
from .loaders import load_compression_model, load_lm_model, HF_MODEL_CHECKPOINTS_MAP
|
24 |
from ..data.audio_utils import convert_audio
|
25 |
-
from ..modules.conditioners import ConditioningAttributes, WavCondition
|
26 |
from ..utils.autocast import TorchAutocast
|
27 |
|
28 |
MelodyList = tp.List[tp.Optional[torch.Tensor]]
|
@@ -108,6 +109,7 @@ class MusicGen:
|
|
108 |
- stereo-melody (1.5B) text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-stereo-melody
|
109 |
- stereo-large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-stereo-large
|
110 |
- stereo-melody-large (3.3B), text to music, and text+melody to music # see: https://huggingface.co/facebook/musicgen-stereo-melody-large
|
|
|
111 |
"""
|
112 |
|
113 |
if device is None:
|
@@ -120,7 +122,7 @@ class MusicGen:
|
|
120 |
# used only for unit tests
|
121 |
compression_model = get_debug_compression_model(device)
|
122 |
lm = get_debug_lm_model(device)
|
123 |
-
return MusicGen(name, compression_model, lm)
|
124 |
|
125 |
if name not in HF_MODEL_CHECKPOINTS_MAP:
|
126 |
if not os.path.isfile(name) and not os.path.isdir(name):
|
@@ -143,6 +145,7 @@ class MusicGen:
|
|
143 |
def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
|
144 |
top_p: float = 0.0, temperature: float = 1.0,
|
145 |
duration: float = 30.0, cfg_coef: float = 3.0,
|
|
|
146 |
two_step_cfg: bool = False, extend_stride: float = 10, rep_penalty: float = None):
|
147 |
"""Set the generation parameters for MusicGen.
|
148 |
|
@@ -153,6 +156,10 @@ class MusicGen:
|
|
153 |
temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
|
154 |
duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
|
155 |
cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
|
|
|
|
|
|
|
|
|
156 |
two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
|
157 |
instead of batching together the two. This has some impact on how things
|
158 |
are padded but seems to have little impact in practice.
|
@@ -172,8 +179,30 @@ class MusicGen:
|
|
172 |
'top_p': top_p,
|
173 |
'cfg_coef': cfg_coef,
|
174 |
'two_step_cfg': two_step_cfg,
|
|
|
175 |
}
|
176 |
|
|
|
|
|
|
|
|
177 |
def set_custom_progress_callback(self, progress_callback: tp.Union[tp.Callable[[int, int], None],gr.Progress] = None):
|
178 |
"""Override the default progress callback."""
|
179 |
self._progress_callback = progress_callback
|
@@ -399,8 +428,8 @@ class MusicGen:
|
|
399 |
"""Generate discrete audio tokens given audio prompt and/or conditions.
|
400 |
|
401 |
Args:
|
402 |
-
attributes (
|
403 |
-
prompt_tokens (
|
404 |
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
|
405 |
Returns:
|
406 |
torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
|
|
|
18 |
import gradio as gr
|
19 |
|
20 |
from .encodec import CompressionModel
|
21 |
+
from .genmodel import BaseGenModel
|
22 |
from .lm import LMModel
|
23 |
from .builders import get_debug_compression_model, get_debug_lm_model, get_wrapped_compression_model
|
24 |
from .loaders import load_compression_model, load_lm_model, HF_MODEL_CHECKPOINTS_MAP
|
25 |
from ..data.audio_utils import convert_audio
|
26 |
+
from ..modules.conditioners import ConditioningAttributes, WavCondition, StyleConditioner
|
27 |
from ..utils.autocast import TorchAutocast
|
28 |
|
29 |
MelodyList = tp.List[tp.Optional[torch.Tensor]]
|
|
|
109 |
- stereo-melody (1.5B) text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-stereo-melody
|
110 |
- stereo-large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-stereo-large
|
111 |
- stereo-melody-large (3.3B), text to music, and text+melody to music # see: https://huggingface.co/facebook/musicgen-stereo-melody-large
|
112 |
+
- musicgen-style (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-style
|
113 |
"""
|
114 |
|
115 |
if device is None:
|
|
|
122 |
# used only for unit tests
|
123 |
compression_model = get_debug_compression_model(device)
|
124 |
lm = get_debug_lm_model(device)
|
125 |
+
return MusicGen(name, compression_model, lm, max_duration=30)
|
126 |
|
127 |
if name not in HF_MODEL_CHECKPOINTS_MAP:
|
128 |
if not os.path.isfile(name) and not os.path.isdir(name):
|
|
|
145 |
def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
|
146 |
top_p: float = 0.0, temperature: float = 1.0,
|
147 |
duration: float = 30.0, cfg_coef: float = 3.0,
|
148 |
+
cfg_coef_beta: tp.Optional[float] = None,
|
149 |
two_step_cfg: bool = False, extend_stride: float = 10, rep_penalty: float = None):
|
150 |
"""Set the generation parameters for MusicGen.
|
151 |
|
|
|
156 |
temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
|
157 |
duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
|
158 |
cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
|
159 |
+
cfg_coef_beta (float, optional): beta coefficient in double classifier free guidance.
|
160 |
+
Should only be used for MusicGen melody if we want to push the text condition more than
|
161 |
+
the audio conditioning. See paragraph 4.3 in https://arxiv.org/pdf/2407.12563 to understand
|
162 |
+
double CFG.
|
163 |
two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
|
164 |
instead of batching together the two. This has some impact on how things
|
165 |
are padded but seems to have little impact in practice.
|
|
|
179 |
'top_p': top_p,
|
180 |
'cfg_coef': cfg_coef,
|
181 |
'two_step_cfg': two_step_cfg,
|
182 |
+
'cfg_coef_beta': cfg_coef_beta,
|
183 |
}
|
184 |
|
185 |
+
def set_style_conditioner_params(self, eval_q: int = 3, excerpt_length: float = 3.0,
|
186 |
+
ds_factor: tp.Optional[int] = None,
|
187 |
+
encodec_n_q: tp.Optional[int] = None) -> None:
|
188 |
+
"""Set the parameters of the style conditioner
|
189 |
+
Args:
|
190 |
+
eval_q (int): the number of residual quantization streams used to quantize the style condition
|
191 |
+
the smaller it is, the narrower the information bottleneck
|
192 |
+
excerpt_length (float): the excerpt length in seconds that is extracted from the audio
|
193 |
+
conditioning
|
194 |
+
ds_factor (int, optional): the downsampling factor used to downsample the style tokens before
|
195 |
+
using them as a prefix
|
196 |
+
encodec_n_q (int, optional): if encodec is used as a feature extractor, sets the number
|
197 |
+
of streams that is used to extract features
|
198 |
+
"""
|
199 |
+
assert isinstance(self.lm.condition_provider.conditioners.self_wav, StyleConditioner), \
|
200 |
+
"Only use this function if you model is MusicGen-Style"
|
201 |
+
self.lm.condition_provider.conditioners.self_wav.set_params(eval_q=eval_q,
|
202 |
+
excerpt_length=excerpt_length,
|
203 |
+
ds_factor=ds_factor,
|
204 |
+
encodec_n_q=encodec_n_q)
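# --- Editorial usage sketch (not part of musicgen.py): driving the new STYLE checkpoint through
# the two setters above; the coefficients are illustrative, and double CFG is enabled simply by
# passing cfg_coef_beta:
#   model = MusicGen.get_pretrained('style')
#   model.set_generation_params(duration=10, cfg_coef=3.0, cfg_coef_beta=5.0)
#   model.set_style_conditioner_params(eval_q=3, excerpt_length=3.0)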
|
205 |
+
|
206 |
def set_custom_progress_callback(self, progress_callback: tp.Union[tp.Callable[[int, int], None],gr.Progress] = None):
|
207 |
"""Override the default progress callback."""
|
208 |
self._progress_callback = progress_callback
|
|
|
428 |
"""Generate discrete audio tokens given audio prompt and/or conditions.
|
429 |
|
430 |
Args:
|
431 |
+
attributes (list of ConditioningAttributes): Conditions used for generation (text/melody).
|
432 |
+
prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
|
433 |
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
|
434 |
Returns:
|
435 |
torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
|
audiocraft/modules/codebooks_patterns.py
CHANGED
@@ -30,7 +30,7 @@ class Pattern:
|
|
30 |
|
31 |
The pattern provides convenient methods to build and revert interleaved sequences from it:
|
32 |
``build_pattern_sequence`` maps a given a dense input tensor of multi-codebook sequence from [B, K, T]
|
33 |
-
to the interleaved sequence of shape [B, K, S] applying the pattern, with
|
34 |
K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
|
35 |
for the output sequence. The unfilled positions are replaced with a special token and the built sequence
|
36 |
is returned along with a mask indicating valid tokens.
|
@@ -49,7 +49,6 @@ class Pattern:
|
|
49 |
|
50 |
def __post_init__(self):
|
51 |
assert len(self.layout) > 0
|
52 |
-
assert self.layout[0] == []
|
53 |
self._validate_layout()
|
54 |
self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes)
|
55 |
self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes)
|
@@ -93,6 +92,9 @@ class Pattern:
|
|
93 |
valid_step = len(self.layout) - self.max_delay
|
94 |
return self.layout[:valid_step]
|
95 |
|
|
|
|
|
|
|
96 |
def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
|
97 |
"""Get codebook coordinates in the layout that corresponds to the specified timestep t
|
98 |
and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
|
@@ -202,7 +204,7 @@ class Pattern:
|
|
202 |
f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"
|
203 |
|
204 |
# ensure we take the appropriate indexes to keep the model output from the first special token as well
|
205 |
-
if is_model_output:
|
206 |
ref_layout = ref_layout[1:]
|
207 |
|
208 |
# single item indexing being super slow with pytorch vs. numpy, so we use numpy here
|
@@ -335,7 +337,8 @@ class DelayedPatternProvider(CodebooksPatternProvider):
|
|
335 |
assert sorted(self.delays) == self.delays
|
336 |
|
337 |
def get_pattern(self, timesteps: int) -> Pattern:
|
338 |
-
|
|
|
339 |
max_delay = max(self.delays)
|
340 |
if self.empty_initial:
|
341 |
out += [[] for _ in range(self.empty_initial)]
|
@@ -360,9 +363,10 @@ class ParallelPatternProvider(DelayedPatternProvider):
|
|
360 |
|
361 |
Args:
|
362 |
n_q (int): Number of codebooks.
|
|
|
363 |
"""
|
364 |
-
def __init__(self, n_q: int):
|
365 |
-
super().__init__(n_q, [0] * n_q)
|
366 |
|
367 |
|
368 |
class UnrolledPatternProvider(CodebooksPatternProvider):
|
|
|
30 |
|
31 |
The pattern provides convenient methods to build and revert interleaved sequences from it:
|
32 |
``build_pattern_sequence`` maps a given a dense input tensor of multi-codebook sequence from [B, K, T]
|
33 |
+
to the interleaved sequence of shape [B, K, S] applying the pattern, with B being the batch size,
|
34 |
K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
|
35 |
for the output sequence. The unfilled positions are replaced with a special token and the built sequence
|
36 |
is returned along with a mask indicating valid tokens.
|
|
|
49 |
|
50 |
def __post_init__(self):
|
51 |
assert len(self.layout) > 0
|
|
|
52 |
self._validate_layout()
|
53 |
self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes)
|
54 |
self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes)
|
|
|
92 |
valid_step = len(self.layout) - self.max_delay
|
93 |
return self.layout[:valid_step]
|
94 |
|
95 |
+
def starts_with_special_token(self):
|
96 |
+
return self.layout[0] == []
|
97 |
+
|
98 |
def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
|
99 |
"""Get codebook coordinates in the layout that corresponds to the specified timestep t
|
100 |
and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
|
|
|
204 |
f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"
|
205 |
|
206 |
# ensure we take the appropriate indexes to keep the model output from the first special token as well
|
207 |
+
if is_model_output and self.starts_with_special_token():
|
208 |
ref_layout = ref_layout[1:]
|
209 |
|
210 |
# single item indexing being super slow with pytorch vs. numpy, so we use numpy here
|
|
|
337 |
assert sorted(self.delays) == self.delays
|
338 |
|
339 |
def get_pattern(self, timesteps: int) -> Pattern:
|
340 |
+
omit_special_token = self.empty_initial < 0
|
341 |
+
out: PatternLayout = [] if omit_special_token else [[]]
|
342 |
max_delay = max(self.delays)
|
343 |
if self.empty_initial:
|
344 |
out += [[] for _ in range(self.empty_initial)]
|
|
|
363 |
|
364 |
Args:
|
365 |
n_q (int): Number of codebooks.
|
366 |
+
empty_initial (int): Prepend with N empty list of coordinates.
|
367 |
"""
|
368 |
+
def __init__(self, n_q: int, empty_initial: int = 0):
|
369 |
+
super().__init__(n_q, [0] * n_q, empty_initial=empty_initial)
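# --- Editorial sketch (not part of codebooks_patterns.py): with the default empty_initial=0 the
# layout keeps its leading special-token step, while a negative value drops it, which is exactly
# what starts_with_special_token() reports:
#   assert ParallelPatternProvider(n_q=4).get_pattern(16).starts_with_special_token()
#   assert not ParallelPatternProvider(n_q=4, empty_initial=-1).get_pattern(16).starts_with_special_token()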
|
370 |
|
371 |
|
372 |
class UnrolledPatternProvider(CodebooksPatternProvider):
|
audiocraft/modules/conditioners.py
CHANGED
@@ -15,8 +15,8 @@ import random
|
|
15 |
import re
|
16 |
import typing as tp
|
17 |
import warnings
|
18 |
-
|
19 |
import einops
|
|
|
20 |
from num2words import num2words
|
21 |
import spacy
|
22 |
from transformers import RobertaTokenizer, T5EncoderModel, T5Tokenizer # type: ignore
|
@@ -24,10 +24,10 @@ import torch
|
|
24 |
from torch import nn
|
25 |
import torch.nn.functional as F
|
26 |
from torch.nn.utils.rnn import pad_sequence
|
27 |
-
|
28 |
from .chroma import ChromaExtractor
|
29 |
from .streaming import StreamingModule
|
30 |
-
from .transformer import create_sin_embedding
|
31 |
from ..data.audio import audio_read
|
32 |
from ..data.audio_dataset import SegmentInfo
|
33 |
from ..data.audio_utils import convert_audio
|
@@ -43,6 +43,15 @@ TextCondition = tp.Optional[str] # a text condition can be a string or None (if
|
|
43 |
ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask
|
44 |
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
class WavCondition(tp.NamedTuple):
|
47 |
wav: torch.Tensor
|
48 |
length: torch.Tensor
|
@@ -60,11 +69,17 @@ class JointEmbedCondition(tp.NamedTuple):
|
|
60 |
seek_time: tp.List[tp.Optional[float]] = []
|
61 |
|
62 |
|
|
|
|
|
|
|
|
|
|
|
63 |
@dataclass
|
64 |
class ConditioningAttributes:
|
65 |
text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
|
66 |
wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
|
67 |
joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
|
|
|
68 |
|
69 |
def __getitem__(self, item):
|
70 |
return getattr(self, item)
|
@@ -81,19 +96,25 @@ class ConditioningAttributes:
|
|
81 |
def joint_embed_attributes(self):
|
82 |
return self.joint_embed.keys()
|
83 |
|
|
|
|
|
|
|
|
|
84 |
@property
|
85 |
def attributes(self):
|
86 |
return {
|
87 |
"text": self.text_attributes,
|
88 |
"wav": self.wav_attributes,
|
89 |
"joint_embed": self.joint_embed_attributes,
|
|
|
90 |
}
|
91 |
|
92 |
def to_flat_dict(self):
|
93 |
return {
|
94 |
**{f"text.{k}": v for k, v in self.text.items()},
|
95 |
**{f"wav.{k}": v for k, v in self.wav.items()},
|
96 |
-
**{f"joint_embed.{k}": v for k, v in self.joint_embed.items()}
|
|
|
97 |
}
|
98 |
|
99 |
@classmethod
|
@@ -177,6 +198,44 @@ def nullify_joint_embed(embed: JointEmbedCondition) -> JointEmbedCondition:
|
|
177 |
)
|
178 |
|
179 |
|
|
|
|
|
|
|
180 |
class Tokenizer:
|
181 |
"""Base tokenizer implementation
|
182 |
(in case we want to introduce more advances tokenizers in the future).
|
@@ -297,7 +356,8 @@ class BaseConditioner(nn.Module):
|
|
297 |
super().__init__()
|
298 |
self.dim = dim
|
299 |
self.output_dim = output_dim
|
300 |
-
self.
|
|
|
301 |
|
302 |
def tokenize(self, *args, **kwargs) -> tp.Any:
|
303 |
"""Should be any part of the processing that will lead to a synchronization
|
@@ -495,8 +555,9 @@ class WaveformConditioner(BaseConditioner):
|
|
495 |
wav, lengths, *_ = x
|
496 |
with torch.no_grad():
|
497 |
embeds = self._get_wav_embedding(x)
|
498 |
-
|
499 |
-
|
|
|
500 |
|
501 |
if lengths is not None and self._use_masking:
|
502 |
lengths = lengths / self._downsampling_factor()
|
@@ -607,7 +668,7 @@ class ChromaStemConditioner(WaveformConditioner):
|
|
607 |
with self.autocast:
|
608 |
wav = convert_audio(
|
609 |
wav, sample_rate, self.demucs.samplerate, self.demucs.audio_channels) # type: ignore
|
610 |
-
stems = apply_model(self.demucs, wav, device=self.device)
|
611 |
stems = stems[:, self.stem_indices] # extract relevant stems for melody conditioning
|
612 |
mix_wav = stems.sum(1) # merge extracted stems to single waveform
|
613 |
mix_wav = convert_audio(mix_wav, self.demucs.samplerate, self.sample_rate, 1) # type: ignore
|
@@ -698,6 +759,250 @@ class ChromaStemConditioner(WaveformConditioner):
|
|
698 |
return x
|
699 |
|
700 |
|
|
|
|
|
|
|
|
701 |
class JointEmbeddingConditioner(BaseConditioner):
|
702 |
"""Joint embedding conditioning supporting both audio or text conditioning.
|
703 |
|
@@ -996,13 +1301,48 @@ class CLAPEmbeddingConditioner(JointEmbeddingConditioner):
|
|
996 |
return embed, empty_idx
|
997 |
|
998 |
|
999 |
-
def
|
|
|
|
|
|
|
|
|
1000 |
"""Utility function for nullifying an attribute inside an ConditioningAttributes object.
|
1001 |
If the condition is of type "wav", then nullify it using `nullify_condition` function.
|
1002 |
If the condition is of any other type, set its value to None.
|
1003 |
Works in-place.
|
1004 |
"""
|
1005 |
-
if condition_type not in ['text', 'wav', 'joint_embed']:
|
1006 |
raise ValueError(
|
1007 |
"dropout_condition got an unexpected condition type!"
|
1008 |
f" expected 'text', 'wav' or 'joint_embed' but got '{condition_type}'"
|
@@ -1021,6 +1361,8 @@ def dropout_condition(sample: ConditioningAttributes, condition_type: str, condi
|
|
1021 |
elif condition_type == 'joint_embed':
|
1022 |
embed = sample.joint_embed[condition]
|
1023 |
sample.joint_embed[condition] = nullify_joint_embed(embed)
|
|
|
|
|
1024 |
else:
|
1025 |
sample.text[condition] = None
|
1026 |
|
@@ -1071,7 +1413,7 @@ class AttributeDropout(DropoutModule):
|
|
1071 |
return samples
|
1072 |
|
1073 |
samples = deepcopy(samples)
|
1074 |
-
for condition_type, ps in self.p.items(): # for condition types [text, wav]
|
1075 |
for condition, p in ps.items(): # for attributes of each type (e.g., [artist, genre])
|
1076 |
if torch.rand(1, generator=self.rng).item() < p:
|
1077 |
for sample in samples:
|
@@ -1094,7 +1436,9 @@ class ClassifierFreeGuidanceDropout(DropoutModule):
|
|
1094 |
super().__init__(seed=seed)
|
1095 |
self.p = p
|
1096 |
|
1097 |
-
def forward(self, samples: tp.List[ConditioningAttributes]
|
|
|
|
|
1098 |
"""
|
1099 |
Args:
|
1100 |
samples (list[ConditioningAttributes]): List of conditions.
|
@@ -1111,10 +1455,11 @@ class ClassifierFreeGuidanceDropout(DropoutModule):
|
|
1111 |
|
1112 |
# nullify conditions of all attributes
|
1113 |
samples = deepcopy(samples)
|
1114 |
-
for condition_type in
|
1115 |
for sample in samples:
|
1116 |
for condition in sample.attributes[condition_type]:
|
1117 |
-
dropout_condition(sample, condition_type, condition
|
|
|
1118 |
return samples
|
1119 |
|
1120 |
def __repr__(self):
|
@@ -1339,7 +1684,7 @@ class ConditionFuser(StreamingModule):
|
|
1339 |
cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
|
1340 |
cross_attention_pos_emb_scale (int): Scale for positional embeddings in cross attention if used.
|
1341 |
"""
|
1342 |
-
FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]
|
1343 |
|
1344 |
def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
|
1345 |
cross_attention_pos_emb_scale: float = 1.0):
|
@@ -1399,6 +1744,8 @@ class ConditionFuser(StreamingModule):
|
|
1399 |
cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
|
1400 |
else:
|
1401 |
cross_attention_output = cond
|
|
|
|
|
1402 |
else:
|
1403 |
raise ValueError(f"unknown op ({op})")
|
1404 |
|
|
|
15 |
import re
|
16 |
import typing as tp
|
17 |
import warnings
|
|
|
18 |
import einops
|
19 |
+
import flashy
|
20 |
from num2words import num2words
|
21 |
import spacy
|
22 |
from transformers import RobertaTokenizer, T5EncoderModel, T5Tokenizer # type: ignore
|
|
|
24 |
from torch import nn
|
25 |
import torch.nn.functional as F
|
26 |
from torch.nn.utils.rnn import pad_sequence
|
27 |
+
from enum import Enum
|
28 |
from .chroma import ChromaExtractor
|
29 |
from .streaming import StreamingModule
|
30 |
+
from .transformer import create_sin_embedding, StreamingTransformer
|
31 |
from ..data.audio import audio_read
|
32 |
from ..data.audio_dataset import SegmentInfo
|
33 |
from ..data.audio_utils import convert_audio
|
|
|
43 |
ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask
|
44 |
|
45 |
|
46 |
+
class JascoCondConst(Enum):
|
47 |
+
DRM = 'self_wav'
|
48 |
+
CRD = 'chords'
|
49 |
+
MLD = 'melody'
|
50 |
+
SYM = {'chords', 'melody'}
|
51 |
+
LAT = {'self_wav'}
|
52 |
+
ALL = ['chords', 'self_wav', 'melody'] # order matters
|
53 |
+
|
54 |
+
|
55 |
class WavCondition(tp.NamedTuple):
|
56 |
wav: torch.Tensor
|
57 |
length: torch.Tensor
|
|
|
69 |
seek_time: tp.List[tp.Optional[float]] = []
|
70 |
|
71 |
|
72 |
+
class SymbolicCondition(tp.NamedTuple):
|
73 |
+
frame_chords: tp.Optional[torch.Tensor] = None
|
74 |
+
melody: tp.Optional[torch.Tensor] = None
|
75 |
+
|
76 |
+
|
77 |
@dataclass
|
78 |
class ConditioningAttributes:
|
79 |
text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
|
80 |
wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
|
81 |
joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
|
82 |
+
symbolic: tp.Dict[str, SymbolicCondition] = field(default_factory=dict)
|
83 |
|
84 |
def __getitem__(self, item):
|
85 |
return getattr(self, item)
|
|
|
96 |
def joint_embed_attributes(self):
|
97 |
return self.joint_embed.keys()
|
98 |
|
99 |
+
@property
|
100 |
+
def symbolic_attributes(self):
|
101 |
+
return self.symbolic.keys()
|
102 |
+
|
103 |
@property
|
104 |
def attributes(self):
|
105 |
return {
|
106 |
"text": self.text_attributes,
|
107 |
"wav": self.wav_attributes,
|
108 |
"joint_embed": self.joint_embed_attributes,
|
109 |
+
"symbolic": self.symbolic_attributes,
|
110 |
}
|
111 |
|
112 |
def to_flat_dict(self):
|
113 |
return {
|
114 |
**{f"text.{k}": v for k, v in self.text.items()},
|
115 |
**{f"wav.{k}": v for k, v in self.wav.items()},
|
116 |
+
**{f"joint_embed.{k}": v for k, v in self.joint_embed.items()},
|
117 |
+
**{f"symbolic.{k}": v for k, v in self.symbolic.items()}
|
118 |
}
|
119 |
|
120 |
@classmethod
|
|
|
198 |
)
|
199 |
|
200 |
|
201 |
+
def nullify_chords(sym_cond: SymbolicCondition, null_chord_idx: int = 194) -> SymbolicCondition:
|
202 |
+
"""Nullify the symbolic condition by setting all frame chords to a specified null chord index.
|
203 |
+
Args:
|
204 |
+
sym_cond (SymbolicCondition): The symbolic condition containing frame chords to be nullified.
|
205 |
+
null_chord_idx (int, optional): The index to use for nullifying the chords. Defaults to 194 (Chordino).
|
206 |
+
Returns:
|
207 |
+
SymbolicCondition: A new symbolic condition with all frame chords set to the null chord index.
|
208 |
+
"""
|
209 |
+
return SymbolicCondition(frame_chords=torch.ones_like(sym_cond.frame_chords) * null_chord_idx) # type: ignore
|
210 |
+
|
211 |
+
|
212 |
+
def nullify_melody(sym_cond: SymbolicCondition) -> SymbolicCondition:
|
213 |
+
"""Nullify the symbolic condition by replacing the melody matrix with zeros matrix.
|
214 |
+
Args:
|
215 |
+
sym_cond (SymbolicCondition): The symbolic condition containing frame chords to be nullified.
|
216 |
+
null_chord_idx (int, optional): The index to use for nullifying the chords. Defaults to 194 (Chordino).
|
217 |
+
Returns:
|
218 |
+
SymbolicCondition: A new symbolic condition with all frame chords set to the null chord index.
|
219 |
+
"""
|
220 |
+
return SymbolicCondition(melody=torch.zeros_like(sym_cond.melody)) # type: ignore
|
221 |
+
|
222 |
+
|
223 |
+
def _drop_description_condition(conditions: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
|
224 |
+
"""Drop the text condition but keep the wav conditon on a list of ConditioningAttributes.
|
225 |
+
This is useful to calculate l_style in the double classifier free guidance formula.
|
226 |
+
See paragraph 4.3 in https://arxiv.org/pdf/2407.12563
|
227 |
+
|
228 |
+
Args:
|
229 |
+
conditions (tp.List[ConditioningAttributes]): List of conditions.
|
230 |
+
"""
|
231 |
+
# We assert that description and self_wav are in the conditions
|
232 |
+
for condition in conditions:
|
233 |
+
assert 'description' in condition.text.keys()
|
234 |
+
assert 'self_wav' in condition.wav.keys()
|
235 |
+
return AttributeDropout(p={'text': {'description': 1.0},
|
236 |
+
'wav': {'self_wav': 0.0}})(conditions)
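# --- Editorial sketch (not part of conditioners.py): given attributes that carry both a text
# 'description' and a 'self_wav' style excerpt, the helper above is meant to return a copy in which
# only the text has been dropped, so the model can be queried with style-only conditioning for the
# double classifier-free guidance of arXiv:2407.12563:
#   style_only_conditions = _drop_description_condition(conditions)   # text nullified, self_wav kept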
|
237 |
+
|
238 |
+
|
239 |
class Tokenizer:
|
240 |
"""Base tokenizer implementation
|
241 |
(in case we want to introduce more advances tokenizers in the future).
|
|
|
356 |
super().__init__()
|
357 |
self.dim = dim
|
358 |
self.output_dim = output_dim
|
359 |
+
if self.output_dim > -1: # omit projection when output_dim <= 0
|
360 |
+
self.output_proj = nn.Linear(dim, output_dim)
|
361 |
|
362 |
def tokenize(self, *args, **kwargs) -> tp.Any:
|
363 |
"""Should be any part of the processing that will lead to a synchronization
|
|
|
555 |
wav, lengths, *_ = x
|
556 |
with torch.no_grad():
|
557 |
embeds = self._get_wav_embedding(x)
|
558 |
+
if hasattr(self, 'output_proj'):
|
559 |
+
embeds = embeds.to(self.output_proj.weight)
|
560 |
+
embeds = self.output_proj(embeds)
|
561 |
|
562 |
if lengths is not None and self._use_masking:
|
563 |
lengths = lengths / self._downsampling_factor()
|
|
|
668 |
with self.autocast:
|
669 |
wav = convert_audio(
|
670 |
wav, sample_rate, self.demucs.samplerate, self.demucs.audio_channels) # type: ignore
|
671 |
+
stems = apply_model(self.demucs, wav, device=self.device) # type: ignore
|
672 |
stems = stems[:, self.stem_indices] # extract relevant stems for melody conditioning
|
673 |
mix_wav = stems.sum(1) # merge extracted stems to single waveform
|
674 |
mix_wav = convert_audio(mix_wav, self.demucs.samplerate, self.sample_rate, 1) # type: ignore
|
|
|
759 |
return x
|
760 |
|
761 |
|
762 |
+
class FeatureExtractor(WaveformConditioner):
|
763 |
+
"""
|
764 |
+
Feature Extractor used for the style conditioner of the paper AUDIO CONDITIONING
|
765 |
+
FOR MUSIC GENERATION VIA DISCRETE BOTTLENECK FEATURES.
|
766 |
+
|
767 |
+
Given a waveform, we extract a randomly subsampled excerpt of a defined length.
|
768 |
+
Then, we feed this excerpt to a feature extractor.
|
769 |
+
|
770 |
+
Args:
|
771 |
+
model_name (str): 'encodec' or 'mert'.
|
772 |
+
sample_rate (str): sample rate of the input audio. (32000)
|
773 |
+
encodec_checkpoint (str): if encodec is used as a feature extractor, checkpoint
|
774 |
+
of the model. ('//pretrained/facebook/encodec_32khz' is the default)
|
775 |
+
encodec_n_q (int): if encodec is used as a feature extractor it sets the number of
|
776 |
+
quantization streams used in it.
|
777 |
+
length (float): length in seconds of the random subsampled excerpt that is used
|
778 |
+
for conditioning.
|
779 |
+
dim (int): The internal representation dimension.
|
780 |
+
output_dim (int): Output dimension for the conditioner.
|
781 |
+
device (tp.Union[torch.device, str], optional): Device for the conditioner.
|
782 |
+
compute_mask (bool): whether to mask the tokens corresponding to the subsampled
|
783 |
+
excerpt in the computation of the music language model cross-entropy loss.
|
784 |
+
use_middle_of_segment (bool): if True, always take the middle of the input
|
785 |
+
instead of a random subsampled excerpt.
|
786 |
+
ds_rate_compression (int): downsampling parameter of the compression model used
|
787 |
+
for the music language model. (640 for encodec_32khz)
|
788 |
+
num_codebooks_lm (int): the number of codebooks used by the music language model.
|
789 |
+
"""
|
790 |
+
def __init__(
|
791 |
+
self, model_name: str,
|
792 |
+
sample_rate: int, encodec_checkpoint: str, encodec_n_q: int, length: float,
|
793 |
+
dim: int, output_dim: int, device: tp.Union[torch.device, str],
|
794 |
+
compute_mask: bool = True,
|
795 |
+
use_middle_of_segment: bool = False, ds_rate_compression: int = 640,
|
796 |
+
num_codebooks_lm: int = 4
|
797 |
+
):
|
798 |
+
assert model_name in ['encodec', 'mert']
|
799 |
+
if model_name == 'encodec':
|
800 |
+
from ..solvers.compression import CompressionSolver
|
801 |
+
feat_extractor = CompressionSolver.model_from_checkpoint(encodec_checkpoint, device)
|
802 |
+
elif model_name == 'mert':
|
803 |
+
from transformers import AutoModel
|
804 |
+
feat_extractor = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
|
805 |
+
super().__init__(
|
806 |
+
dim=dim,
|
807 |
+
output_dim=output_dim,
|
808 |
+
device=device
|
809 |
+
)
|
810 |
+
self.sample_rate = sample_rate
|
811 |
+
self.compute_mask = compute_mask
|
812 |
+
self.feat_extractor: nn.Module
|
813 |
+
self.embed: tp.Union[nn.ModuleList, nn.Linear]
|
814 |
+
if model_name == 'encodec':
|
815 |
+
self.__dict__["feat_extractor"] = feat_extractor.to(device)
|
816 |
+
self.encodec_n_q = encodec_n_q
|
817 |
+
self.embed = nn.ModuleList([nn.Embedding(feat_extractor.cardinality, dim) for _ in range(encodec_n_q)])
|
818 |
+
if model_name == 'mert':
|
819 |
+
self.__dict__["feat_extractor"] = feat_extractor.eval().to(device)
|
820 |
+
self.embed = nn.Linear(768, dim) # hardcoded
|
821 |
+
self.length_subwav = int(length * sample_rate)
|
822 |
+
self.ds_rate_compression = ds_rate_compression
|
823 |
+
self.model_name = model_name
|
824 |
+
self.use_middle_of_segment = use_middle_of_segment
|
825 |
+
self.num_codebooks_lm = num_codebooks_lm
|
826 |
+
|
827 |
+
def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
|
828 |
+
if x.wav.shape[-1] == 1:
|
829 |
+
self.temp_mask = None
|
830 |
+
return torch.zeros(x.wav.shape[0], 1, self.dim, device=self.device)
|
831 |
+
else:
|
832 |
+
with torch.no_grad():
|
833 |
+
if self.use_middle_of_segment:
|
834 |
+
start = int((x.wav.shape[-1] - self.length_subwav) / 2)
|
835 |
+
wav = x.wav[:, :, start:start+self.length_subwav]
|
836 |
+
else:
|
837 |
+
start = random.randint(0, x.wav.shape[-1] - self.length_subwav)
|
838 |
+
wav = x.wav[:, :, start:start+self.length_subwav]
|
839 |
+
if self.compute_mask:
|
840 |
+
self.temp_mask = self._get_mask_wav(x, start)
|
841 |
+
if self.model_name == 'encodec':
|
842 |
+
tokens = self.feat_extractor.encode(wav)[0] # type: ignore
|
843 |
+
elif self.model_name == 'mert':
|
844 |
+
wav = convert_audio(wav, from_rate=x.sample_rate[0], to_rate=24000, to_channels=1)
|
845 |
+
embeds = self.feat_extractor(wav.squeeze(-2)).last_hidden_state
|
846 |
+
if self.model_name == 'encodec':
|
847 |
+
tokens = tokens[:, :self.encodec_n_q]
|
848 |
+
embeds = sum([self.embed[k](tokens[:, k]) for k in range(self.encodec_n_q)]) # type: ignore
|
849 |
+
else:
|
850 |
+
embeds = self.embed(embeds)
|
851 |
+
|
852 |
+
return embeds # [B, T, dim]
|
853 |
+
|
854 |
+
def _downsampling_factor(self):
|
855 |
+
if self.model_name == 'encodec':
|
856 |
+
return self.sample_rate / self.feat_extractor.frame_rate
|
857 |
+
elif self.model_name == 'mert':
|
858 |
+
return self.sample_rate / 75
|
859 |
+
|
860 |
+
def _get_mask_wav(self, x: WavCondition, start: int) -> tp.Union[torch.Tensor, None]:
|
861 |
+
if x.wav.shape[-1] == 1:
|
862 |
+
return None
|
863 |
+
total_length = int(x.wav.shape[-1] / self.ds_rate_compression)
|
864 |
+
mask_length = int(self.length_subwav / self.ds_rate_compression)
|
865 |
+
start = int(start / self.ds_rate_compression)
|
866 |
+
mask = torch.ones(x.wav.shape[0], self.num_codebooks_lm,
|
867 |
+
total_length, device=self.device, dtype=torch.bool)
|
868 |
+
mask[:, :, start:start+mask_length] = 0
|
869 |
+
return mask
|
870 |
+
|
871 |
+
|
872 |
+
class StyleConditioner(FeatureExtractor):
|
873 |
+
"""Conditioner from the paper AUDIO CONDITIONING FOR MUSIC GENERATION VIA
|
874 |
+
DISCRETE BOTTLENECK FEATURES.
|
875 |
+
Given an audio input, it is passed through a Feature Extractor and a
|
876 |
+
transformer encoder. Then it is quantized through RVQ.
|
877 |
+
|
878 |
+
Args:
|
879 |
+
transformer_scale (str): size of the transformer. See __init__ for the available sizes.
|
880 |
+
ds_factor (int): the downsampling factor applied to the representation after quantization.
|
881 |
+
encodec_n_q (int): if encodec is used as a feature extractor it sets the number of
|
882 |
+
quantization streams used in it.
|
883 |
+
n_q_out (int): the number of quantization streams used for the RVQ. If increased, there
|
884 |
+
is more information passing as a conditioning.
|
885 |
+
eval_q (int): the number of quantization streams used for the RVQ at evaluation time.
|
886 |
+
q_dropout (bool): if True, at training time, a random number of stream is sampled
|
887 |
+
at each step in the interval [1, n_q_out].
|
888 |
+
bins (int): the codebook size used for each quantization stream.
|
889 |
+
varying_lengths (List[float]): list of the min and max duration in seconds for the
|
890 |
+
randomly subsampled excerpt at training time. For each step a length is sampled
|
891 |
+
in this interval.
|
892 |
+
batch_norm (bool): use of batch normalization after the transformer. Stabilizes the
|
893 |
+
training.
|
894 |
+
rvq_threshold_ema_dead_code (float): threshold for dropping dead codes in the
|
895 |
+
RVQ.
|
896 |
+
"""
|
897 |
+
def __init__(self, transformer_scale: str = 'default', ds_factor: int = 15, encodec_n_q: int = 4,
|
898 |
+
n_q_out: int = 6, eval_q: int = 3, q_dropout: bool = True, bins: int = 1024,
|
899 |
+
varying_lengths: tp.List[float] = [1.5, 4.5],
|
900 |
+
batch_norm: bool = True, rvq_threshold_ema_dead_code: float = 0.1,
|
901 |
+
**kwargs):
|
902 |
+
tr_args: tp.Dict[str, tp.Any]
|
903 |
+
if transformer_scale == 'xsmall':
|
904 |
+
tr_args = {'d_model': 256, 'num_heads': 8, 'num_layers': 4}
|
905 |
+
elif transformer_scale == 'large':
|
906 |
+
tr_args = {'d_model': 1024, 'num_heads': 16, 'num_layers': 24}
|
907 |
+
elif transformer_scale == 'default':
|
908 |
+
tr_args = {'d_model': 512, 'num_heads': 8, 'num_layers': 8}
|
909 |
+
elif transformer_scale == 'none':
|
910 |
+
tr_args = {'d_model': 512}
|
911 |
+
tr_args.update({
|
912 |
+
'memory_efficient': True, 'activation': 'gelu',
|
913 |
+
'norm_first': True, 'causal': False, 'layer_scale': None,
|
914 |
+
'bias_ff': False, 'bias_attn': False,
|
915 |
+
})
|
916 |
+
dim = tr_args['d_model']
|
917 |
+
super().__init__(dim=dim, encodec_n_q=encodec_n_q, **kwargs)
|
918 |
+
|
919 |
+
self.ds_factor = ds_factor
|
920 |
+
if transformer_scale == 'none':
|
921 |
+
self.transformer = None
|
922 |
+
else:
|
923 |
+
self.transformer = StreamingTransformer(dim_feedforward=int(4 * dim), **tr_args)
|
924 |
+
self.n_q_out = n_q_out
|
925 |
+
self.eval_q = eval_q
|
926 |
+
self.rvq = None
|
927 |
+
if n_q_out > 0:
|
928 |
+
self.rvq = ResidualVectorQuantizer(dim, n_q=n_q_out, q_dropout=q_dropout, bins=bins,
|
929 |
+
threshold_ema_dead_code=rvq_threshold_ema_dead_code)
|
930 |
+
self.autocast = TorchAutocast(enabled=self.device != 'cpu', device_type=self.device, dtype=torch.float32)
|
931 |
+
self.varying_lengths = varying_lengths
|
932 |
+
self.batch_norm = None
|
933 |
+
if batch_norm:
|
934 |
+
self.batch_norm = nn.BatchNorm1d(dim, affine=False)
|
935 |
+
self.mask = None
|
936 |
+
|
937 |
+
def _get_wav_embedding(self, wav: WavCondition) -> torch.Tensor:
|
938 |
+
with self.autocast:
|
939 |
+
# Sample the length of the excerpts
|
940 |
+
if self.varying_lengths and self.training:
|
941 |
+
assert len(self.varying_lengths) == 2
|
942 |
+
length = random.uniform(self.varying_lengths[0], self.varying_lengths[1])
|
943 |
+
self.length_subwav = int(length * self.sample_rate)
|
944 |
+
z1 = super()._get_wav_embedding(wav)
|
945 |
+
if self.compute_mask:
|
946 |
+
self.mask = self.temp_mask # type: ignore
|
947 |
+
self.temp_mask = None
|
948 |
+
|
949 |
+
if self.transformer is not None:
|
950 |
+
out1 = self.transformer(z1)
|
951 |
+
else:
|
952 |
+
out1 = z1
|
953 |
+
if self.batch_norm:
|
954 |
+
out1 = self.batch_norm(out1.transpose(1, 2)).transpose(1, 2)
|
955 |
+
# Apply quantization
|
956 |
+
if self.rvq:
|
957 |
+
if self.training:
|
958 |
+
self.rvq.set_num_codebooks(self.n_q_out)
|
959 |
+
else:
|
960 |
+
self.rvq.set_num_codebooks(self.eval_q)
|
961 |
+
out1 = self.rvq(out1.transpose(1, 2), frame_rate=1.)
|
962 |
+
if self.training:
|
963 |
+
flashy.distrib.average_tensors(self.rvq.buffers())
|
964 |
+
out1 = out1.x.transpose(1, 2)
|
965 |
+
# Apply fix downsample
|
966 |
+
out1 = out1[:, ::self.ds_factor]
|
967 |
+
|
968 |
+
return out1
|
969 |
+
|
970 |
+
def set_params(self, eval_q: int = 3,
|
971 |
+
excerpt_length: float = 3.0,
|
972 |
+
ds_factor: tp.Optional[int] = None, encodec_n_q: tp.Optional[int] = None):
|
973 |
+
"""Modify the parameters of the SSL or introduce new parameters to add noise to
|
974 |
+
the conditioning or to downsample it
|
975 |
+
|
976 |
+
Args:
|
977 |
+
eval_q (int): number of codebooks used when evaluating the model
|
978 |
+
excerpt_length (float): the length of the excerpts used to condition the model
|
979 |
+
"""
|
980 |
+
self.eval_q = eval_q
|
981 |
+
self.length_subwav = int(excerpt_length * self.sample_rate)
|
982 |
+
if ds_factor is not None:
|
983 |
+
self.ds_factor = ds_factor
|
984 |
+
if encodec_n_q is not None:
|
985 |
+
self.encodec_n_q = encodec_n_q
|
986 |
+
|
987 |
+
def _downsampling_factor(self):
|
988 |
+
df = super()._downsampling_factor()
|
989 |
+
return df * self.ds_factor
|
990 |
+
|
991 |
+
def forward(self, x: WavCondition) -> ConditionType:
|
992 |
+
wav, lengths, *_ = x
|
993 |
+
|
994 |
+
embeds = self._get_wav_embedding(x)
|
995 |
+
embeds = embeds.to(self.output_proj.weight)
|
996 |
+
embeds = self.output_proj(embeds)
|
997 |
+
|
998 |
+
lengths = lengths / self._downsampling_factor()
|
999 |
+
mask = length_to_mask(lengths, max_len=embeds.shape[1]).int() # type: ignore
|
1000 |
+
|
1001 |
+
embeds = (embeds * mask.unsqueeze(2).to(self.device))
|
1002 |
+
|
1003 |
+
return embeds, mask
|
1004 |
+
|
1005 |
+
|
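For orientation, here is a minimal, hypothetical sketch of how these knobs are typically driven at inference time. The `self_wav` attribute path mirrors the one used later in this PR's `extend.py` changes; treat the exact wiring as an assumption rather than part of this diff.

# Hypothetical sketch: trading reference-audio fidelity against text-prompt freedom.
# Assumes a loaded MusicGen-Style model whose StyleConditioner is registered under
# `self_wav`, as the extend.py changes in this PR assume.
def tune_style_conditioning(model, strength: int = 3, excerpt_seconds: float = 3.0):
    cond = model.lm.condition_provider.conditioners.self_wav  # StyleConditioner instance
    # strength in [1, n_q_out]: more RVQ streams kept at eval time -> output follows the
    # reference audio more closely; fewer streams leave more room for the text prompt.
    cond.set_params(eval_q=strength, excerpt_length=excerpt_seconds)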
 class JointEmbeddingConditioner(BaseConditioner):
     """Joint embedding conditioning supporting both audio or text conditioning.
...
     return embed, empty_idx


+def dropout_symbolic_conditions(sample: ConditioningAttributes,
+                                condition: str, null_chord_idx: int = 194) -> ConditioningAttributes:
+    """Applies dropout to a symbolic condition within the sample by setting its value to a null index.
+
+    Args:
+        sample (ConditioningAttributes): The sample containing symbolic attributes to potentially drop out.
+        condition (str): The specific condition within the symbolic attributes to apply dropout to.
+        null_chord_idx (int, optional): The index used to represent a null chord. Defaults to 194.
+    Returns:
+        ConditioningAttributes: The modified sample with dropout applied to the specified condition.
+    Raises:
+        ValueError: If the specified condition is not present in the sample's symbolic attributes.
+    """
+    if sample.symbolic == {} or sample.symbolic[JascoCondConst.CRD.value].frame_chords.shape[-1] <= 1:  # type: ignore
+        # nothing to drop
+        return sample
+
+    if condition not in getattr(sample, 'symbolic'):
+        raise ValueError(
+            "dropout_symbolic_conditions received an unexpected condition!"
+            f" expected {sample.symbolic.keys()}"
+            f" but got '{condition}'!"
+        )
+
+    if condition == JascoCondConst.CRD.value:
+        sample.symbolic[condition] = nullify_chords(sample.symbolic[condition], null_chord_idx=null_chord_idx)
+    elif condition == JascoCondConst.MLD.value:
+        sample.symbolic[condition] = nullify_melody(sample.symbolic[condition])
+
+    return sample
+
+
+def dropout_condition(sample: ConditioningAttributes,
+                      condition_type: str, condition: str,
+                      **kwargs) -> ConditioningAttributes:
     """Utility function for nullifying an attribute inside a ConditioningAttributes object.
     If the condition is of type "wav", then nullify it using the `nullify_condition` function.
     If the condition is of any other type, set its value to None.
     Works in-place.
     """
+    if condition_type not in ['text', 'wav', 'joint_embed', 'symbolic']:
         raise ValueError(
             "dropout_condition got an unexpected condition type!"
             f" expected 'text', 'wav' or 'joint_embed' but got '{condition_type}'"
...
     elif condition_type == 'joint_embed':
         embed = sample.joint_embed[condition]
         sample.joint_embed[condition] = nullify_joint_embed(embed)
+    elif condition_type == 'symbolic':
+        sample = dropout_symbolic_conditions(sample=sample, condition=condition, **kwargs)
     else:
         sample.text[condition] = None

...
             return samples

         samples = deepcopy(samples)
+        for condition_type, ps in self.p.items():  # for condition types [text, wav, symbolic]
             for condition, p in ps.items():  # for attributes of each type (e.g., [artist, genre])
                 if torch.rand(1, generator=self.rng).item() < p:
                     for sample in samples:
...
         super().__init__(seed=seed)
         self.p = p

+    def forward(self, samples: tp.List[ConditioningAttributes],
+                cond_types: tp.List[str] = ["wav", "text"],
+                **kwargs) -> tp.List[ConditioningAttributes]:
         """
         Args:
             samples (list[ConditioningAttributes]): List of conditions.
...

         # nullify conditions of all attributes
         samples = deepcopy(samples)
+        for condition_type in cond_types:
             for sample in samples:
                 for condition in sample.attributes[condition_type]:
+                    dropout_condition(sample, condition_type, condition,
+                                      **kwargs)
         return samples

     def __repr__(self):
...
         cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
         cross_attention_pos_emb_scale (int): Scale for positional embeddings in cross attention if used.
     """
+    FUSING_METHODS = ["sum", "prepend", "cross", "ignore", "input_interpolate"]

     def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
                  cross_attention_pos_emb_scale: float = 1.0):
...
                     cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
                 else:
                     cross_attention_output = cond
+            elif op == 'ignore':
+                continue
             else:
                 raise ValueError(f"unknown op ({op})")

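Taken together, the changes above let the classifier-free-guidance dropout path also null out the new symbolic (chords/melody) conditions. A hedged sketch of the intended call pattern (the class name and the concrete `samples` batch are assumptions, not shown in this diff):

# Hypothetical sketch of the extended dropout path. `samples` would be a list of
# ConditioningAttributes carrying text, wav and symbolic conditions.
cfg_dropout = ClassifierFreeGuidanceDropout(p=1.0, seed=1234)
# Null every condition type, including the new 'symbolic' one, to build the
# unconditional branch used for classifier-free guidance.
null_samples = cfg_dropout(samples, cond_types=["wav", "text", "symbolic"])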
audiocraft/modules/jasco_conditioners.py
ADDED
@@ -0,0 +1,300 @@
+import torch
+import typing as tp
+from itertools import chain
+from pathlib import Path
+from torch import nn
+from .conditioners import (ConditioningAttributes, BaseConditioner, ConditionType,
+                           ConditioningProvider, JascoCondConst,
+                           WaveformConditioner, WavCondition, SymbolicCondition)
+from ..data.audio import audio_read
+from ..data.audio_utils import convert_audio
+from ..utils.autocast import TorchAutocast
+from ..utils.cache import EmbeddingCache
+
+
+class MelodyConditioner(BaseConditioner):
+    """
+    A conditioner that handles melody conditioning from a pre-computed salience matrix.
+    Attributes:
+        card (int): The cardinality of the melody matrix.
+        out_dim (int): The dimensionality of the output projection.
+        device (Union[torch.device, str]): The device on which the embeddings are stored.
+    """
+    def __init__(self, card: int, out_dim: int, device: tp.Union[torch.device, str] = 'cpu', **kwargs):
+        super().__init__(dim=card, output_dim=out_dim)
+        self.device = device
+
+    def tokenize(self, x: SymbolicCondition) -> SymbolicCondition:
+        return SymbolicCondition(melody=x.melody.to(self.device))  # type: ignore
+
+    def forward(self, x: SymbolicCondition) -> ConditionType:
+        embeds = self.output_proj(x.melody.permute(0, 2, 1))  # type: ignore
+        mask = torch.ones_like(embeds[..., 0])
+        return embeds, mask
+
+
+class ChordsEmbConditioner(BaseConditioner):
+    """
+    A conditioner that embeds chord symbols into a continuous vector space.
+    Attributes:
+        card (int): The cardinality of the chord vocabulary.
+        out_dim (int): The dimensionality of the output embeddings.
+        device (Union[torch.device, str]): The device on which the embeddings are stored.
+    """
+    def __init__(self, card: int, out_dim: int, device: tp.Union[torch.device, str] = 'cpu', **kwargs):
+        vocab_size = card + 1  # card + 1 - extra slot for the null chord used during dropout
+        super().__init__(dim=vocab_size, output_dim=-1)  # output_dim=-1 to avoid another projection
+        self.emb = nn.Embedding(vocab_size, out_dim, device=device)
+        self.device = device
+
+    def tokenize(self, x: SymbolicCondition) -> SymbolicCondition:
+        return SymbolicCondition(frame_chords=x.frame_chords.to(self.device))  # type: ignore
+
+    def forward(self, x: SymbolicCondition) -> ConditionType:
+        embeds = self.emb(x.frame_chords)
+        mask = torch.ones_like(embeds[..., 0])
+        return embeds, mask
+
+
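As a shape reference, the sketch below mirrors what ChordsEmbConditioner does to a frame-level chord sequence, using plain torch rather than instantiating the class itself (the sizes are illustrative):

# Standalone illustration of the chord embedding above.
import torch
from torch import nn

card, out_dim, B, T = 194, 512, 2, 500
emb = nn.Embedding(card + 1, out_dim)              # extra slot for the null chord used at dropout
frame_chords = torch.randint(0, card + 1, (B, T))  # one chord index per frame
embeds = emb(frame_chords)                         # (B, T, out_dim)
mask = torch.ones_like(embeds[..., 0])             # (B, T): every frame is valid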
+class DrumsConditioner(WaveformConditioner):
+    def __init__(self, out_dim: int, sample_rate: int, blurring_factor: int = 3,
+                 cache_path: tp.Optional[tp.Union[str, Path]] = None,
+                 compression_model_latent_dim: int = 128,
+                 compression_model_framerate: float = 50,
+                 segment_duration: float = 10.0,
+                 device: tp.Union[torch.device, str] = 'cpu',
+                 **kwargs):
+        """Drum stem conditioner.
+
+        Args:
+            out_dim (int): dimensionality of the output projection.
+            sample_rate (int): sample rate of the conditioning audio.
+            blurring_factor (int, optional): number of frames averaged together by the temporal blur. Defaults to 3.
+            cache_path (tp.Optional[tp.Union[str, Path]], optional): path to a precomputed cache. Defaults to None.
+            compression_model_latent_dim (int, optional): latent dimension of the compression model. Defaults to 128.
+            compression_model_framerate (float, optional): frame rate of the representation model. Defaults to 50.
+            segment_duration (float, optional): duration in seconds of each audio segment. Defaults to 10.0.
+            device (tp.Union[torch.device, str], optional): device. Defaults to 'cpu'.
+        """
+        from demucs import pretrained
+        self.sample_rate = sample_rate
+        self.__dict__['demucs'] = pretrained.get_model('htdemucs').to(device)
+        stem_sources: list = self.demucs.sources  # type: ignore
+        self.stem_idx = stem_sources.index('drums')
+        self.compression_model = None
+        self.latent_dim = compression_model_latent_dim
+        super().__init__(dim=self.latent_dim, output_dim=out_dim, device=device)
+        self.autocast = TorchAutocast(enabled=device != 'cpu', device_type=self.device, dtype=torch.float32)
+        self._use_masking = False
+        self.blurring_factor = blurring_factor
+        self.seq_len = int(segment_duration * compression_model_framerate)
+        self.cache = None  # To train with an EmbeddingCache, call self.create_embedding_cache(cache_path)
+
+    def create_embedding_cache(self, cache_path):
+        if cache_path is not None:
+            self.cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
+                                        compute_embed_fn=self._calc_coarse_drum_codes_for_cache,
+                                        extract_embed_fn=self._load_drum_codes_chunk)
+
+    @torch.no_grad()
+    def _get_drums_stem(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
+        """Get the part of the wav that holds the drums by extracting the stems from the wav."""
+        from demucs.apply import apply_model
+        from demucs.audio import convert_audio
+        with self.autocast:
+            wav = convert_audio(
+                wav, sample_rate, self.demucs.samplerate, self.demucs.audio_channels)  # type: ignore
+            stems = apply_model(self.demucs, wav, device=self.device)
+            drum_stem = stems[:, self.stem_idx]  # extract the relevant stem for drums conditioning
+            return convert_audio(drum_stem, self.demucs.samplerate, self.sample_rate, 1)  # type: ignore
+
+    def _temporal_blur(self, z: torch.Tensor):
+        # z: (B, T, C)
+        B, T, C = z.shape
+        if T % self.blurring_factor != 0:
+            # reflect-pad the right of dim=1 by T % self.blurring_factor
+            pad_val = self.blurring_factor - T % self.blurring_factor
+            z = torch.nn.functional.pad(z, (0, 0, 0, pad_val), mode='reflect')
+        z = z.reshape(B, -1, self.blurring_factor, C).sum(dim=2) / self.blurring_factor
+        z = z.unsqueeze(2).repeat(1, 1, self.blurring_factor, 1).reshape(B, -1, C)
+        z = z[:, :T]
+        assert z.shape == (B, T, C)
+        return z
+
+    @torch.no_grad()
+    def _extract_coarse_drum_codes(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
+        assert self.compression_model is not None
+
+        # stem separation of drums
+        drums = self._get_drums_stem(wav, sample_rate)
+
+        # continuous encoding with compression model
+        latents = self.compression_model.model.encoder(drums)
+
+        # quantization to coarsest codebook
+        coarsest_quantizer = self.compression_model.model.quantizer.layers[0]
+        drums = coarsest_quantizer.encode(latents).to(torch.int16)
+        return drums
+
+    @torch.no_grad()
+    def _calc_coarse_drum_codes_for_cache(self, path: tp.Union[str, Path],
+                                          x: WavCondition, idx: int,
+                                          max_duration_to_process: float = 600) -> torch.Tensor:
+        """Extract blurred drum latents from the whole audio waveform at the given path."""
+        wav, sr = audio_read(path)
+        wav = wav[None].to(self.device)
+        wav = convert_audio(wav, sr, self.sample_rate, to_channels=1)
+
+        max_frames_to_process = int(max_duration_to_process * self.sample_rate)
+        if wav.shape[-1] > max_frames_to_process:
+            # process very long tracks in chunks
+            start = 0
+            codes = []
+            while start < wav.shape[-1] - 1:
+                wav_chunk = wav[..., start: start + max_frames_to_process]
+                codes.append(self._extract_coarse_drum_codes(wav_chunk, self.sample_rate)[0])
+                start += max_frames_to_process
+            return torch.cat(codes)
+
+        return self._extract_coarse_drum_codes(wav, self.sample_rate)[0]
+
+    def _load_drum_codes_chunk(self, full_coarse_drum_codes: torch.Tensor, x: WavCondition, idx: int) -> torch.Tensor:
+        """Extract a chunk of coarse drum codes from the full sequence derived from the full waveform."""
+        wav_length = x.wav.shape[-1]
+        seek_time = x.seek_time[idx]
+        assert seek_time is not None, (
+            "WavCondition seek_time is required "
+            "when extracting chunks from pre-computed drum codes.")
+        assert self.compression_model is not None
+        frame_rate = self.compression_model.frame_rate
+        target_length = int(frame_rate * wav_length / self.sample_rate)
+        target_length = max(target_length, self.seq_len)
+        index = int(frame_rate * seek_time)
+        out = full_coarse_drum_codes[index: index + target_length]
+        # pad
+        out = torch.cat((out, torch.zeros(target_length - out.shape[0], dtype=out.dtype, device=out.device)))
+        return out.to(self.device)
+
+    @torch.no_grad()
+    def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
+        bs = x.wav.shape[0]
+        if x.wav.shape[-1] <= 1:
+            # null condition
+            return torch.zeros((bs, self.seq_len, self.latent_dim), device=x.wav.device, dtype=x.wav.dtype)
+
+        # extract coarse drum codes
+        no_undefined_paths = all(p is not None for p in x.path)
+        no_nullified_cond = x.wav.shape[-1] > 1
+        if self.cache is not None and no_undefined_paths and no_nullified_cond:
+            paths = [Path(p) for p in x.path if p is not None]
+            codes = self.cache.get_embed_from_cache(paths, x)
+        else:
+            assert all(sr == x.sample_rate[0] for sr in x.sample_rate), "All sample rates in batch should be equal."
+            codes = self._extract_coarse_drum_codes(x.wav, x.sample_rate[0])
+
+        assert self.compression_model is not None
+        # decode back to the continuous representation of the compression model
+        codes = codes.unsqueeze(1).permute(1, 0, 2)  # (B, T) -> (1, B, T)
+        codes = codes.to(torch.int64)
+        latents = self.compression_model.model.quantizer.decode(codes)
+
+        latents = latents.permute(0, 2, 1)  # [B, C, T] -> [B, T, C]
+
+        # temporal blurring
+        return self._temporal_blur(latents)
+
+    def tokenize(self, x: WavCondition) -> WavCondition:
+        """Apply WavConditioner tokenization and populate the cache if needed."""
+        x = super().tokenize(x)
+        no_undefined_paths = all(p is not None for p in x.path)
+        if self.cache is not None and no_undefined_paths:
+            paths = [Path(p) for p in x.path if p is not None]
+            self.cache.populate_embed_cache(paths, x)
+        return x
+
+
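The temporal blur above is the key detail: it averages each group of `blurring_factor` frames and repeats the average back out, so the conditioning keeps coarse rhythmic energy rather than exact drum onsets. A standalone sketch of the same operation:

# Standalone illustration of _temporal_blur (same math, outside the class).
import torch

def temporal_blur(z: torch.Tensor, blurring_factor: int = 3) -> torch.Tensor:
    B, T, C = z.shape
    if T % blurring_factor != 0:
        pad = blurring_factor - T % blurring_factor
        z = torch.nn.functional.pad(z, (0, 0, 0, pad), mode='reflect')   # pad the time axis
    z = z.reshape(B, -1, blurring_factor, C).mean(dim=2)                 # group average
    z = z.unsqueeze(2).repeat(1, 1, blurring_factor, 1).reshape(B, -1, C)
    return z[:, :T]                                                      # original length, piecewise constant

blurred = temporal_blur(torch.randn(1, 500, 128))                        # (1, 500, 128)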
+class JascoConditioningProvider(ConditioningProvider):
+    """
+    A conditioning provider that manages and tokenizes the various conditioning attributes used by JASCO models.
+    Attributes:
+        chords_card (int): The cardinality of the chord vocabulary.
+        sequence_length (int): The length of the sequence for padding purposes.
+        melody_dim (int): The dimensionality of the melody matrix.
+    """
+    def __init__(self, *args,
+                 chords_card: int = 194,
+                 sequence_length: int = 500,
+                 melody_dim: int = 53, **kwargs):
+        self.null_chord = chords_card
+        self.sequence_len = sequence_length
+        self.melody_dim = melody_dim
+        super().__init__(*args, **kwargs)
+
+    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
+        """Match attributes/wavs with the existing conditioners in self and tokenize them accordingly.
+        This should be called before starting any real GPU work to avoid synchronization points.
+        This will return a dict matching conditioner names to their arbitrary tokenized representations.
+
+        Args:
+            inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
+                text and wav conditions.
+        """
+        assert all([isinstance(x, ConditioningAttributes) for x in inputs]), (
+            "Got unexpected input types for conditioner! should be tp.List[ConditioningAttributes]",
+            f" but types were {set([type(x) for x in inputs])}"
+        )
+
+        output = {}
+        text = self._collate_text(inputs)
+        wavs = self._collate_wavs(inputs)
+
+        symbolic = self._collate_symbolic(inputs, self.conditioners.keys())
+
+        assert set(text.keys() | wavs.keys() | symbolic.keys()).issubset(set(self.conditioners.keys())), (
+            f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
+            f"got {text.keys(), wavs.keys(), symbolic.keys()}"
+        )
+
+        for attribute, batch in chain(text.items(), wavs.items(), symbolic.items()):
+            output[attribute] = self.conditioners[attribute].tokenize(batch)
+        return output
+
+    def _collate_symbolic(self, samples: tp.List[ConditioningAttributes],
+                          conditioner_keys: tp.Set) -> tp.Dict[str, SymbolicCondition]:
+        output = {}
+
+        # collate only if a symbolic condition exists
+        if any(x in conditioner_keys for x in JascoCondConst.SYM.value):
+
+            for s in samples:
+                # hydrate with the null chord if chords do not exist - for inference support
+                if (s.symbolic == {} or
+                        s.symbolic[JascoCondConst.CRD.value].frame_chords is None or
+                        s.symbolic[JascoCondConst.CRD.value].frame_chords.shape[-1] <= 1):  # type: ignore
+                    # no chords conditioning - fill with the null chord token
+                    s.symbolic[JascoCondConst.CRD.value] = SymbolicCondition(
+                        frame_chords=torch.ones(self.sequence_len, dtype=torch.int32) * self.null_chord)
+
+                if (s.symbolic == {} or
+                        s.symbolic[JascoCondConst.MLD.value].melody is None or
+                        s.symbolic[JascoCondConst.MLD.value].melody.shape[-1] <= 1):  # type: ignore
+                    # no melody conditioning - fill with an all-zero melody matrix
+                    s.symbolic[JascoCondConst.MLD.value] = SymbolicCondition(
+                        melody=torch.zeros((self.melody_dim, self.sequence_len)))
+
+            if JascoCondConst.CRD.value in conditioner_keys:
+                # pad to max
+                max_seq_len = max(
+                    [s.symbolic[JascoCondConst.CRD.value].frame_chords.shape[-1] for s in samples])  # type: ignore
+                padded_chords = [
+                    torch.cat((x.symbolic[JascoCondConst.CRD.value].frame_chords,  # type: ignore
+                               torch.ones(max_seq_len -
+                                          x.symbolic[JascoCondConst.CRD.value].frame_chords.shape[-1],  # type: ignore
+                                          dtype=torch.int32) * self.null_chord))
+                    for x in samples
+                ]
+                output[JascoCondConst.CRD.value] = SymbolicCondition(frame_chords=torch.stack(padded_chords))
+            if JascoCondConst.MLD.value in conditioner_keys:
+                melodies = torch.stack([x.symbolic[JascoCondConst.MLD.value].melody for x in samples])  # type: ignore
+                output[JascoCondConst.MLD.value] = SymbolicCondition(melody=melodies)
+        return output
audiocraft/modules/transformer.py
CHANGED
@@ -315,7 +315,6 @@ class StreamingMultiheadAttention(StreamingModule):
     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                 key_padding_mask=None, need_weights=False, attn_mask=None,
                 average_attn_weights=True, is_causal=False):
-        assert attn_mask is None
         assert not is_causal, ("New param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")

@@ -329,7 +328,10 @@ class StreamingMultiheadAttention(StreamingModule):
         assert self.causal or self.cross_attention, \
             "Streaming only available for causal or cross attention"

+        custom_attn_mask = attn_mask is not None
+
         if self.causal:
+            assert attn_mask is None
             # At the moment we specialize only for the self-attention case.
             assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
             assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
@@ -398,6 +400,14 @@ class StreamingMultiheadAttention(StreamingModule):
         if self.attention_as_float32:
             q, k, v = [x.float() for x in [q, k, v]]
         if self.memory_efficient:
+            if custom_attn_mask:
+                # When using a custom attn mask:
+                # Move to query's device, repeat for each sample, remove align8 padding
+                seq_len = query.shape[1]
+                attn_mask = attn_mask.to(q.dtype)
+                attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
+                attn_mask = attn_mask[..., :seq_len, :seq_len]
+
             p = self.dropout if self.training else 0
             if _efficient_attention_backend == 'torch':
                 x = torch.nn.functional.scaled_dot_product_attention(
audiocraft/modules/unet_transformer.py
ADDED
@@ -0,0 +1,67 @@
+import torch
+import typing as tp
+from .transformer import StreamingTransformer, create_sin_embedding
+
+
+class UnetTransformer(StreamingTransformer):
+    """U-net Transformer for processing sequences with optional skip connections.
+    This transformer architecture incorporates U-net style skip connections
+    between layers, which can be optionally enabled. It inherits from
+    StreamingTransformer.
+
+    Args:
+        d_model (int): Dimension of the model, typically the number of expected features in the input.
+        num_layers (int): Total number of layers in the transformer.
+        skip_connections (bool, optional): Flag to determine whether skip connections should be used.
+            Defaults to False.
+        layer_dropout_p (float, optional): if given, defines the Bernoulli probability of dropping a
+            skip connection at training time.
+        **kwargs: Additional keyword arguments forwarded to `StreamingTransformer`.
+    """
+    def __init__(self, d_model: int, num_layers: int, skip_connections: bool = False,
+                 layer_dropout_p: tp.Optional[float] = None, **kwargs):
+        super().__init__(d_model=d_model,
+                         num_layers=num_layers,
+                         **kwargs)
+        self.skip_connect = skip_connections
+        if self.skip_connect:
+            self.skip_projections = torch.nn.ModuleList([torch.nn.Linear(d_model * 2, d_model)
+                                                         for _ in range(num_layers // 2)])
+        self.num_layers = num_layers
+        self.layer_drop_p = max(min(layer_dropout_p, 1.), 0.) if layer_dropout_p is not None else 0.0
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        B, T, C = x.shape
+
+        if 'offsets' in self._streaming_state:
+            offsets = self._streaming_state['offsets']
+        else:
+            offsets = torch.zeros(B, dtype=torch.long, device=x.device)
+
+        if self.positional_embedding in ['sin', 'sin_rope']:
+            positions = torch.arange(T, device=x.device).view(1, -1, 1)
+            positions = positions + offsets.view(-1, 1, 1)
+            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
+            x = x + self.positional_scale * pos_emb
+
+        skip_connections: tp.List[torch.Tensor] = []
+
+        for i, layer in enumerate(self.layers):
+            if self.skip_connect and i >= self.num_layers // 2:
+                # in the second half of the layers, add the skip connection
+                # and linearly project the concatenated features back to d_model
+                x = torch.cat([x, skip_connections.pop()], dim=-1)
+                x = self.skip_projections[i % len(self.skip_projections)](x)
+
+            x = self._apply_layer(layer, x, *args, **kwargs)
+
+            if self.skip_connect and i < self.num_layers // 2:
+                if self.training and torch.rand(1,) < self.layer_drop_p:  # drop skip
+                    skip_connections.append(torch.zeros_like(x))
+                else:
+                    skip_connections.append(x)
+
+        if self._is_streaming:
+            self._streaming_state['offsets'] = offsets + T
+
+        return x
audiocraft/utils/extend.py
CHANGED
@@ -51,7 +51,7 @@ def separate_audio_segments(audio, segment_duration=30, overlap=1):
     print(f"separate_audio_segments: {len(segments)} segments of length {segment_samples // sr} seconds")
     return segments

-def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:int=1, segment_duration:int=30, prompt_index:int=0, harmony_only:bool= False, progress= gr.Progress(track_tqdm=True)):
+def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:int=1, segment_duration:int=30, prompt_index:int=0, harmony_only:bool= False, excerpt_duration:float=3.5, progress= gr.Progress(track_tqdm=True)):
     # generate audio segments
     melody_segments = separate_audio_segments(melody, segment_duration, 0)

@@ -96,7 +96,7 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
         pbar.update(1)
     print(f"melody_segments: {len(melody_segments)} fixed")

-    # Iterate over the segments to create list of
+    # Iterate over the segments to create list of Melody tensors
     for segment_idx in range(total_segments):
         if INTERRUPTING:
             return [], duration
@@ -119,6 +119,10 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
             verse = verse[None]
             verse = verse[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]

+            # Reduce the length of verse to sr * excerpt_duration
+            if ("style" in MODEL.name):
+                verse = verse[:, :, :int(sr * excerpt_duration)]
+
             # Append the segment to the melodys list
             melodys.append(verse)
             pbar.update(1)
@@ -139,10 +143,17 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
         top_p=MODEL.generation_params["top_p"],
         temperature=MODEL.generation_params["temp"],
         cfg_coef=MODEL.generation_params["cfg_coef"],
+        cfg_coef_beta=MODEL.generation_params["cfg_coef_beta"],
         duration=segment_duration,
         two_step_cfg=False,
-        rep_penalty=0.5
+        rep_penalty=0.5,
     )
+    if ("style" in MODEL.name):
+        MODEL.set_style_conditioner_params(
+            eval_q=MODEL.lm.condition_provider.conditioners.self_wav.eval_q,  # integer between 1 and 6
+            excerpt_length=excerpt_duration,  # seconds of the excerpt used by the model; between 1.5 and 4.5 s, and no longer than the provided conditioning audio
+        )
+
     # Generate a new prompt segment. This will be applied to all segments for consistency
     print(f"Generating New Prompt Segment: {text} from verse {prompt_index}\r")
     prompt_segment = MODEL.generate_with_all(
@@ -168,10 +179,18 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
                 top_p=MODEL.generation_params["top_p"],
                 temperature=MODEL.generation_params["temp"],
                 cfg_coef=MODEL.generation_params["cfg_coef"],
+                cfg_coef_beta=MODEL.generation_params["cfg_coef_beta"],
                 duration=mod_duration,
                 two_step_cfg=False,
-                rep_penalty=0.5
+                rep_penalty=0.5,
             )
+
+            if ("style" in MODEL.name):
+                MODEL.set_style_conditioner_params(
+                    eval_q=MODEL.lm.condition_provider.conditioners.self_wav.eval_q,  # integer between 1 and 6
+                    excerpt_length=min(excerpt_duration, mod_duration),  # seconds of the excerpt used; between 1.5 and 4.5 s, and no longer than the provided conditioning audio
+                )
+
             try:
                 # get last chunk
                 verse = verse[:, :, -mod_duration*MODEL.sample_rate:]
audiocraft/utils/utils.py
CHANGED
@@ -298,3 +298,31 @@ def load_clap_state_dict(clap_model, path: tp.Union[str, Path]):
     pkg = load_state_dict(path)
     pkg.pop('text_branch.embeddings.position_ids', None)
     clap_model.model.load_state_dict(pkg)
+
+def construct_frame_chords(
+    min_timestamp: int,
+    chord_changes: tp.List[tp.Tuple[float, str]],
+    mapping_dict: tp.Dict,
+    prev_chord: str,
+    frame_rate: float,
+    segment_duration: float,
+) -> tp.List[str]:
+    """Translate symbolic chords [(start_time, chord), ...] into a frame-level sequence of chord indices."""
+
+    frames = [
+        frame / frame_rate
+        for frame in range(
+            min_timestamp, int(min_timestamp + segment_duration * frame_rate)
+        )
+    ]
+
+    frame_chords = []
+    current_chord = prev_chord
+
+    for frame in frames:
+        while chord_changes and frame >= chord_changes[0][0]:
+            current_chord = chord_changes.pop(0)[1]
+        current_chord = 'N' if current_chord in {None, ''} else current_chord
+        frame_chords.append(mapping_dict[current_chord])
+
+    return frame_chords
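A hypothetical worked example for `construct_frame_chords` (the chord-to-index `mapping_dict` here is made up; the real one ships with the JASCO chord vocabulary):

# Hypothetical usage of construct_frame_chords.
mapping_dict = {'N': 0, 'C': 1, 'G': 2}
chord_changes = [(0.0, 'C'), (2.0, 'G')]   # (start_time_in_seconds, chord)
frames = construct_frame_chords(
    min_timestamp=0,
    chord_changes=chord_changes,
    mapping_dict=mapping_dict,
    prev_chord='N',
    frame_rate=2.0,                        # 2 frames per second
    segment_duration=3.0,                  # -> 6 frames
)
print(frames)  # [1, 1, 1, 1, 2, 2]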
requirements.txt
CHANGED
@@ -10,14 +10,16 @@ soundfile
 huggingface_hub
 hf_xet
 tqdm
-transformers
+transformers==4.43.4 # need Encodec there.
 xformers>=0.0.23 --index-url https://download.pytorch.org/whl/cu124
 demucs
 librosa==0.11.0
 soundfile
 gradio[oauth]
 pillow
+torchdiffeq
 torchmetrics
+nnAudio
 encodec
 protobuf>=3.20.1
 filetype