hertz-dev

Paused

App Files Files Community

calculating committed on Nov 7, 2024

Commit

824afbf

1 Parent(s): 35d94d0

committing...

Browse files

Files changed (14) hide show

app.py +245 -0
ioblocks.py +333 -0
model.py +443 -0
requirements.txt +14 -0
tokenizer.py +581 -0
transformer.py +382 -0
utils/__init__.py +3 -0
utils/__pycache__/__init__.cpython-310.pyc +0 -0
utils/__pycache__/blocks.cpython-310.pyc +0 -0
utils/__pycache__/dist.cpython-310.pyc +0 -0
utils/__pycache__/interp.cpython-310.pyc +0 -0
utils/blocks.py +92 -0
utils/dist.py +99 -0
utils/interp.py +84 -0

app.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import gradio as gr
+import torch as T
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+import matplotlib.pyplot as plt
+from utils import load_ckpt, print_colored
+from tokenizer import make_tokenizer
+from model import get_hertz_dev_config
+from typing import Tuple
+import numpy as np
+import os
+# Global variables for model and tokenizer.
+# Both start as None; init_model() fills them in once and returns the cached
+# pair on later calls, and generate_completion() reads them.
+global_generator = None
+global_tokenizer = None
+# Pre-loaded into the input widget by create_interface() when the file exists.
+default_audio_path = "testingtesting.wav"  # Your default audio file
+def init_model(use_pure_audio_ablation: bool = False) -> Tuple[nn.Module, object]:
+    """Initialize the model and tokenizer"""
+    global global_generator, global_tokenizer
+    if global_generator is not None and global_tokenizer is not None:
+        return global_generator, global_tokenizer
+    device = 'cuda' if T.cuda.is_available() else 'cpu'
+    T.cuda.set_device(0) if device == 'cuda' else None
+    print_colored("Initializing model and tokenizer...", "blue")
+    global_tokenizer = make_tokenizer(device)
+    model_config = get_hertz_dev_config(is_split=False, use_pure_audio_ablation=use_pure_audio_ablation)
+    global_generator = model_config()
+    global_generator = global_generator.eval().to(T.bfloat16).to(device)
+    print_colored("Model initialization complete!", "green")
+    return global_generator, global_tokenizer
+def process_audio(audio_path: str, sr: int) -> T.Tensor:
+    """Load and preprocess audio file"""
+    audio_tensor, sr = torchaudio.load(audio_path)
+    if audio_tensor.shape[0] == 2:
+        audio_tensor = audio_tensor.mean(dim=0).unsqueeze(0)
+    if sr != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+        audio_tensor = resampler(audio_tensor)
+    max_samples = 16000 * 60 * 5  # 5 minutes
+    if audio_tensor.shape[1] > max_samples:
+        audio_tensor = audio_tensor[:, :max_samples]
+    return audio_tensor.unsqueeze(0)
+def generate_completion(
+    audio_file,
+    prompt_len_seconds: float = 3.0,
+    num_completions: int = 5,
+    generation_seconds: float = 20.0,
+    token_temp: float = 0.8,
+    categorical_temp: float = 0.5,
+    gaussian_temp: float = 0.1,
+    progress=gr.Progress(track_tqdm=True)
+) -> list:
+    """Generate audio completions from the input audio"""
+    device = 'cuda' if T.cuda.is_available() else 'cpu'
+    # Use existing model and tokenizer
+    generator, audio_tokenizer = global_generator, global_tokenizer
+    progress(0, desc="Processing input audio...")
+    # Process input audio
+    prompt_audio = process_audio(audio_file, sr=16000)
+    prompt_len = int(prompt_len_seconds * 8)
+    progress(0.2, desc="Encoding prompt...")
+    # Encode prompt
+    with T.autocast(device_type='cuda', dtype=T.bfloat16):
+        encoded_prompt_audio = audio_tokenizer.latent_from_data(prompt_audio.to(device))
+    completions = []
+    for i in range(num_completions):
+        progress((i + 1) / num_completions, desc=f"Generating completion {i+1}/{num_completions}")
+        # Generate completion
+        encoded_prompt = encoded_prompt_audio[:, :prompt_len]
+        with T.autocast(device_type='cuda', dtype=T.bfloat16):
+            completed_audio_batch = generator.completion(
+                encoded_prompt,
+                temps=(token_temp, (categorical_temp, gaussian_temp)),
+                use_cache=True,
+                gen_len=int(generation_seconds * 8)
+            )
+            decoded_completion = audio_tokenizer.data_from_latent(completed_audio_batch.bfloat16())
+        # Process audio for output
+        audio_tensor = decoded_completion.cpu().squeeze()
+        if audio_tensor.ndim == 1:
+            audio_tensor = audio_tensor.unsqueeze(0)
+        audio_tensor = audio_tensor.float()
+        if audio_tensor.abs().max() > 1:
+            audio_tensor = audio_tensor / audio_tensor.abs().max()
+        # Trim to include only the generated portion
+        output_audio = audio_tensor[:, max(prompt_len*2000 - 16000, 0):]
+        completions.append((16000, output_audio.numpy().T))
+    progress(1.0, desc="Generation complete!")
+    return completions
+def create_interface():
+    # Initialize model at startup
+    init_model()
+    with gr.Blocks(title="Audio Completion Generator") as app:
+        gr.Markdown("""
+        # Audio Completion Generator
+        Upload an audio file (or use the default) and generate AI completions based on the prompt.
+        """)
+        with gr.Row():
+            with gr.Column():
+                # Load the default audio if it exists
+                default_value = default_audio_path if os.path.exists(default_audio_path) else None
+                audio_input = gr.Audio(
+                    label="Input Audio",
+                    type="filepath",
+                    sources=["microphone", "upload"],
+                    value=default_value
+                )
+                with gr.Row():
+                    prompt_len = gr.Slider(
+                        minimum=1,
+                        maximum=10,
+                        value=3,
+                        step=0.5,
+                        label="Prompt Length (seconds)"
+                    )
+                    default_num_completions = 5
+                    num_completions = gr.Slider(
+                        minimum=1,
+                        maximum=10,
+                        value=default_num_completions,
+                        step=1,
+                        label="Number of Completions"
+                    )
+                    gen_length = gr.Slider(
+                        minimum=5,
+                        maximum=60,
+                        value=20,
+                        step=5,
+                        label="Generation Length (seconds)"
+                    )
+                with gr.Row():
+                    token_temp = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.8,
+                        step=0.1,
+                        label="Token Temperature"
+                    )
+                    cat_temp = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.5,
+                        step=0.1,
+                        label="Categorical Temperature"
+                    )
+                    gauss_temp = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.1,
+                        step=0.1,
+                        label="Gaussian Temperature"
+                    )
+                generate_btn = gr.Button("Generate Completions")
+                status_text = gr.Markdown("Ready")
+            with gr.Column():
+                output_audios = []
+                for i in range(10):  # Create 10 audio components
+                    output_audios.append(gr.Audio(
+                        label=f"Generated Completion {i+1}",
+                        type="numpy",
+                        visible=False
+                    ))
+        def update_visibility(num):
+            return [gr.update(visible=(i < num)) for i in range(10)]
+        def generate_with_status(*args):
+            status_text.value = "Processing input audio..."
+            completions = generate_completion(*args)
+            status_text.value = "Generation complete!"
+            # Prepare outputs for all audio components
+            outputs = []
+            for i in range(10):
+                if i < len(completions):
+                    outputs.append(completions[i])
+                else:
+                    outputs.append(None)
+            return outputs
+        # Set initial visibility on load
+        app.load(
+            fn=update_visibility,
+            inputs=[num_completions],
+            outputs=output_audios
+        )
+        # Update visibility when slider changes
+        num_completions.change(
+            fn=update_visibility,
+            inputs=[num_completions],
+            outputs=output_audios
+        )
+        generate_btn.click(
+            fn=generate_with_status,
+            inputs=[
+                audio_input,
+                prompt_len,
+                num_completions,
+                gen_length,
+                token_temp,
+                cat_temp,
+                gauss_temp
+            ],
+            outputs=output_audios
+        )
+    return app
+# Script entry point: build the interface and start serving it.
+if __name__ == "__main__":
+    app = create_interface()
+    # NOTE(review): share=True requests a public Gradio share link — confirm
+    # that exposing the app publicly is intended.
+    app.launch(share=True)

ioblocks.py ADDED Viewed

	@@ -0,0 +1,333 @@

+from __future__ import annotations
+from functools import partial
+from contextlib import nullcontext
+from typing import List, Tuple
+from math import ceil
+import torch as T
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributed as dist
+from torch import Tensor, int32
+from torch.amp import autocast
+from einops import rearrange, pack, unpack
+from utils import si_module, exists, default, maybe
+@si_module
+class GaussianMixtureIOLayer(nn.Module):
+    class Config:
+        latent_dim: int
+        dim: int
+        num_components: int
+    def __init__(self, c: Config):
+        super().__init__()
+        self.latent_dim = c.latent_dim
+        self.num_components = c.num_components
+        self.input_projection = nn.Linear(c.latent_dim, c.dim)
+        self.fc_loc = nn.Linear(c.dim, c.num_components * c.latent_dim)
+        self.fc_scale = nn.Linear(c.dim, c.num_components * c.latent_dim)
+        self.fc_weight = nn.Linear(c.dim, c.num_components)
+    def _square_plus(self, x):
+        return (x + T.sqrt(T.square(x) + 4)) / 2
+    def input(self, sampled_latents: T.Tensor) -> T.Tensor:
+        """Pre-sampled latents T.Tensor (B, L, Z) -> float tensor (B, L, D)"""
+        hidden = self.input_projection(sampled_latents)
+        return hidden
+    def output(self, h: T.Tensor) -> Tuple[T.Tensor, T.Tensor, T.Tensor]:
+        """float tensor (B, L, D) -> Tuple of locs, scales, and weights"""
+        batch_size, seq_len, _ = h.shape
+        locs = self.fc_loc(h).view(batch_size, seq_len, self.num_components, self.latent_dim)
+        scales = T.clamp(self._square_plus(self.fc_scale(h)), min=1e-6).view(batch_size, seq_len, self.num_components, self.latent_dim)
+        weights = self.fc_weight(h).view(batch_size, seq_len, self.num_components)
+        return (locs, scales, weights)
+    def loss(self, data, dataHat):
+        locs, scales, weights = dataHat
+        log_probs = -0.5 * T.sum(
+            (data.unsqueeze(-2) - locs).pow(2) / scales.pow(2) +
+            2 * T.log(scales) +
+            T.log(T.tensor(2 * T.pi)),
+            dim=-1
+        )
+        log_weights = F.log_softmax(weights, dim=-1)
+        return -T.logsumexp(log_weights + log_probs, dim=-1)
+    def temp_sample(self, orig_pdist, temp):
+        locs, scales, weights = orig_pdist
+        if temp is None:
+            component_samples = locs + scales * T.randn_like(scales)
+            mixture_samples = F.gumbel_softmax(weights, hard=True)
+            sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
+        elif isinstance(temp, tuple):
+            assert len(temp) == 2
+            categorical_temp, gaussian_temp = temp
+            component_samples = locs + scales * gaussian_temp * T.randn_like(scales)
+            mixture_samples = F.gumbel_softmax(weights / categorical_temp, hard=True)
+            sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
+        else:
+            component_samples = locs + scales * temp * T.randn_like(scales)
+            mixture_samples = F.gumbel_softmax(weights / temp, hard=True)
+            sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
+        return sampled
+class GPTOutput(nn.Module):
+    def __init__(self, dim, vocab_size):
+        super().__init__()
+        self.output = nn.Linear(dim, vocab_size, bias=False)
+    def forward(self, x):
+        return self.output(x)
+# helper functions
+def pack_one(t, pattern):
+    return pack([t], pattern)
+def unpack_one(t, ps, pattern):
+    return unpack(t, ps, pattern)[0]
+def first(l):
+    return l[0]
+def round_up_multiple(num, mult):
+    return ceil(num / mult) * mult
+def get_code_utilization(codes, codebook_size, get_global=False):
+    if get_global and dist.is_initialized():
+        world_size = dist.get_world_size()
+    else:
+        world_size = 1
+    if world_size > 1:
+        gathered_tokens = [T.zeros_like(codes) for _ in range(world_size)]
+        dist.all_gather(gathered_tokens, codes)
+        gathered_tokens = T.cat(gathered_tokens, dim=0)
+    else:
+        gathered_tokens = codes
+    unique_tokens = len(T.unique(gathered_tokens))
+    code_utilization = unique_tokens / min(gathered_tokens.numel(), codebook_size)
+    return code_utilization
+# tensor helpers
+def round_ste(z: Tensor) -> Tensor:
+    """Round with straight through gradients."""
+    zhat = z.round()
+    return z + (zhat - z).detach()
+# main class
+# lucidrains fsq
+@si_module
+class FSQ(nn.Module):
+    @property
+    def needs_float32_params(self):
+        return True
+    class Config:
+        levels: List[int]
+        dim: int | None = None
+        num_codebooks: int = 1
+        keep_num_codebooks_dim: bool | None = None
+        scale: float | None = None
+        allowed_dtypes: Tuple[str, ...] = ('float32', 'float64')
+        channel_first: bool = False
+        projection_has_bias: bool = True
+        return_indices: bool = True
+        force_quantization_f32: bool = True
+        use_rms: bool = False
+    def __init__(self, c: Config):
+        super().__init__()
+        _levels = T.tensor(c.levels, dtype=int32)
+        self.register_buffer("_levels", _levels, persistent = False)
+        _basis = T.cumprod(T.tensor([1] + c.levels[:-1]), dim=0, dtype=int32)
+        self.register_buffer("_basis", _basis, persistent = False)
+        self.scale = c.scale
+        codebook_dim = len(c.levels)
+        self.codebook_dim = codebook_dim
+        effective_codebook_dim = codebook_dim * c.num_codebooks
+        self.num_codebooks = c.num_codebooks
+        self.allowed_dtypes = []
+        for dtype_str in c.allowed_dtypes:
+            if hasattr(T, dtype_str):
+                self.allowed_dtypes.append(getattr(T, dtype_str))
+            else:
+                raise ValueError(f"Invalid dtype string: {dtype_str}")
+        self.effective_codebook_dim = effective_codebook_dim
+        keep_num_codebooks_dim = default(c.keep_num_codebooks_dim, c.num_codebooks > 1)
+        assert not (c.num_codebooks > 1 and not keep_num_codebooks_dim)
+        self.keep_num_codebooks_dim = keep_num_codebooks_dim
+        self.dim = default(c.dim, len(_levels) * c.num_codebooks)
+        self.channel_first = c.channel_first
+        has_projections = self.dim != effective_codebook_dim
+        self.project_in = nn.Linear(self.dim, effective_codebook_dim, bias = c.projection_has_bias) if has_projections else nn.Identity()
+        self.project_out = nn.Linear(effective_codebook_dim, self.dim, bias = c.projection_has_bias) if has_projections else nn.Identity()
+        self.has_projections = has_projections
+        self.return_indices = c.return_indices
+        if c.return_indices:
+            self.codebook_size = self._levels.prod().item()
+            implicit_codebook = self._indices_to_codes(T.arange(self.codebook_size))
+            self.register_buffer("implicit_codebook", implicit_codebook, persistent = False)
+        self.allowed_dtypes = c.allowed_dtypes
+        self.force_quantization_f32 = c.force_quantization_f32
+        self.latent_loss = None
+    def latent_metric(self, codes, get_global=False):
+        return {'code_util_estimate': get_code_utilization(codes, self.codebook_size, get_global)}
+    def repr_from_latent(self, latent):
+        return self.indices_to_codes(latent)
+    def bound(self, z, eps: float = 1e-3):
+        """ Bound `z`, an array of shape (..., d). """
+        half_l = (self._levels - 1) * (1 + eps) / 2
+        offset = T.where(self._levels % 2 == 0, 0.5, 0.0)
+        shift = (offset / half_l).atanh()
+        return (z + shift).tanh() * half_l - offset
+    def quantize(self, z):
+        """ Quantizes z, returns quantized zhat, same shape as z. """
+        quantized = round_ste(self.bound(z))
+        half_width = self._levels // 2 # Renormalize to [-1, 1].
+        return quantized / half_width
+    def _scale_and_shift(self, zhat_normalized):
+        half_width = self._levels // 2
+        return (zhat_normalized * half_width) + half_width
+    def _scale_and_shift_inverse(self, zhat):
+        half_width = self._levels // 2
+        return (zhat - half_width) / half_width
+    def _indices_to_codes(self, indices):
+        level_indices = self.indices_to_level_indices(indices)
+        codes = self._scale_and_shift_inverse(level_indices)
+        return codes
+    def codes_to_indices(self, zhat):
+        """ Converts a `code` to an index in the codebook. """
+        assert zhat.shape[-1] == self.codebook_dim
+        zhat = self._scale_and_shift(zhat)
+        return (zhat * self._basis).sum(dim=-1).to(int32)
+    def indices_to_level_indices(self, indices):
+        """ Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings """
+        indices = rearrange(indices, '... -> ... 1')
+        codes_non_centered = (indices // self._basis) % self._levels
+        return codes_non_centered
+    def indices_to_codes(self, indices):
+        """ Inverse of `codes_to_indices`. """
+        assert exists(indices)
+        is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
+        codes = self._indices_to_codes(indices)
+        if self.keep_num_codebooks_dim:
+            codes = rearrange(codes, '... c d -> ... (c d)')
+        codes = self.project_out(codes)
+        if is_img_or_video or self.channel_first:
+            codes = rearrange(codes, 'b ... d -> b d ...')
+        return codes
+    # @autocast(device_type='cuda', enabled = False)
+    def forward(self, z, return_codes=False):
+        """
+        einstein notation
+        b - batch
+        n - sequence (or flattened spatial dimensions)
+        d - feature dimension
+        c - number of codebook dim
+        """
+        is_img_or_video = z.ndim >= 4
+        need_move_channel_last = is_img_or_video or self.channel_first
+        # standardize image or video into (batch, seq, dimension)
+        if need_move_channel_last:
+            z = rearrange(z, 'b d ... -> b ... d')
+            z, ps = pack_one(z, 'b * d')
+        assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}'
+        z = self.project_in(z)
+        z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)
+        # whether to force quantization step to be full precision or not
+        force_f32 = self.force_quantization_f32
+        quantization_context = partial(autocast, device_type='cuda', enabled = False) if force_f32 else nullcontext
+        with quantization_context():
+            orig_dtype = z.dtype
+            if force_f32 and orig_dtype not in self.allowed_dtypes:
+                z = z.float()
+            codes = self.quantize(z)
+            # returning indices could be optional
+            indices = None
+            if self.return_indices:
+                indices = self.codes_to_indices(codes)
+            codes = rearrange(codes, 'b n c d -> b n (c d)')
+            codes = codes.type(orig_dtype)
+        # project out
+        if return_codes:
+            return codes, indices
+        out = self.project_out(codes)
+        # reconstitute image or video dimensions
+        if need_move_channel_last:
+            out = unpack_one(out, ps, 'b * d')
+            out = rearrange(out, 'b ... d -> b d ...')
+            indices = maybe(unpack_one)(indices, ps, 'b * c')
+        if not self.keep_num_codebooks_dim and self.return_indices:
+            indices = maybe(rearrange)(indices, '... 1 -> ...')
+        # return quantized output and indices
+        return out, indices

model.py ADDED Viewed

	@@ -0,0 +1,443 @@

+from typing import Optional, Tuple
+import torch as T
+import torch.nn as nn
+import torch.nn.functional as F
+from ioblocks import GaussianMixtureIOLayer, FSQ
+from transformer import Stack, ShapeRotator, Block as PerfBlock, GPTOutput, CACHE_FILL_VALUE, FFNN, Norm
+from tokenizer import make_tokenizer
+from utils import si_module, exists, isnt, tqdm0, print0, default, print0_colored
+from utils import load_ckpt
+@si_module
+class LatentQuantizer(nn.Module):
+    """Feed-forward network followed by an FSQ compressor.
+
+    ``forward`` maps inputs through an optional input projection, the
+    feed-forward network and the compressor; it can also decode known
+    compressor indices directly.
+    """
+    class Config:
+        compressor_config: Optional[FSQ.Config] = None
+        dim: Optional[int] = None
+        ff_dim: Optional[int] = None
+        input_dim: int = None
+        # Arguments forwarded to load_ckpt(*from_pretrained) when set.
+        from_pretrained: Optional[Tuple[str, str]] = None
+    def __init__(self, c: Config):
+        super().__init__()
+        # The checkpoint is fetched first but only applied once every
+        # submodule below has been created.
+        if exists(c.from_pretrained):
+            checkpoint = load_ckpt(*c.from_pretrained)
+        else:
+            assert exists(c.compressor_config), f'hmm {c}'
+        self.compressor = c.compressor_config()
+        self.ffnn = FFNN(c.dim, c.ff_dim)
+        # Without an input_dim the input is passed through unchanged.
+        self.input = nn.Linear(c.input_dim, c.dim) if exists(c.input_dim) else nn.Identity()
+        if exists(c.from_pretrained):
+            self.load_state_dict(checkpoint)
+    @T.no_grad()
+    def forward(self, x, return_latent=False, known_latent=None):
+        """
+        x: (B, S, D)
+
+        When ``known_latent`` is given, ``x`` is ignored and the codes for
+        those indices are returned. Otherwise returns the compressor output,
+        together with its tokens when ``return_latent`` is true.
+        """
+        if exists(known_latent):
+            return self.compressor.indices_to_codes(known_latent)
+        x = self.input(x)
+        x = self.ffnn(x)
+        x, tokens = self.compressor(x)
+        if return_latent:
+            return x, tokens
+        return x
+@si_module
+class TransformerVAE(nn.Module):
+    class Config:
+        io_config: Optional[GaussianMixtureIOLayer.Config] = None
+        stack_config: Optional[Stack.Config] = None
+        quantizer_config: Optional[LatentQuantizer.Config] = None
+        plex_layer: int = None
+        plex_roll: int = 1
+        split: bool = True
+        from_pretrained: Optional[Tuple[str, str]] = None
+    def __init__(self, c: Config):
+        super().__init__()
+        if exists(c.from_pretrained):
+            checkpoint = load_ckpt(*c.from_pretrained)
+        else:
+            assert (exists(c.io_config) and exists(c.stack_config) and exists(c.quantizer_config)), f'hmm {c}'
+        self.io = c.io_config()
+        self.stack = c.stack_config()
+        self.plex_layer = c.stack_config.layers//2
+        self.plex_roll = c.plex_roll
+        self.plex_dim = c.quantizer_config.dim
+        assert self.plex_dim is not None and c.stack_config.dim is not None, f'One of the following are None: self.plex_dim: {self.plex_dim}, c.stack_config.dim: {c.stack_config.dim}'
+        self.plex_projection = nn.Linear(self.plex_dim, c.stack_config.dim)
+        self.out_norm = Norm(c.stack_config.dim)
+        if c.split:
+            self.io2 = c.io_config()
+            self.plex_projection2 = nn.Linear(self.plex_dim, c.stack_config.dim)
+            self.io2.fc_loc = None
+            self.io2.fc_scale = None
+            self.io2.fc_weight = None
+        kv_heads = c.stack_config.kv_heads or c.stack_config.n_head
+        head_dim = c.stack_config.dim // c.stack_config.n_head
+        self.cache_num_layers = c.stack_config.layers + ((c.stack_config.layers - self.plex_layer) if c.split else 0)
+        cache_shape = [self.cache_num_layers, c.stack_config.seq_len, 2, kv_heads, head_dim]
+        self.cache_shape = cache_shape
+        self.cache = [None] * self.cache_num_layers
+        if exists(c.from_pretrained):
+            result = self.load_state_dict(checkpoint, strict=False)
+            print0_colored(result, 'yellow')
+        self.quantizer = c.quantizer_config().eval()
+        self.quantizer.requires_grad = False
+    @T.no_grad()
+    def quantize(self, x):
+        if self.c.split:
+            x1, x2 = x.chunk(2, dim=-1)
+            with T.autocast(device_type='cuda', dtype=T.bfloat16):
+                quantized1 = self.quantizer(x1)
+                quantized2 = self.quantizer(x2)
+            return quantized1, quantized2
+        else:
+            with T.autocast(device_type='cuda', dtype=T.bfloat16):
+                return self.quantizer(x)
+    @T.no_grad()
+    def untokenize(self, token_data):
+        return self.quantizer(None, known_latent=token_data)
+    def init_cache(self, bsize, device, dtype, length:int=None):
+        cache_shape = self.cache_shape.copy()
+        cache_shape[1] = length or cache_shape[1]
+        self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
+    def deinit_cache(self):
+        self.cache = [None] * self.cache_num_layers
+    @T.no_grad()
+    def forward(self, data, next_tokens: Optional[Tuple[T.Tensor, T.Tensor]] = None, temps: Optional[Tuple[float, Tuple[float, float]]] = None):
+        if self.c.split:
+            x1, x2 = data.chunk(2, dim=-1)
+            x = self.io.input(x1) + self.io2.input(x2)
+        else:
+            x = self.io.input(data)
+        cache_idx = 0
+        for l, layer in enumerate(self.stack.layers):
+            if l == self.plex_layer:
+                if self.c.split:
+                    plex1, plex2 = self.quantize(data)
+                    plex1 = T.roll(plex1, -self.c.plex_roll, dims=1)
+                    plex2 = T.roll(plex2, -self.c.plex_roll, dims=1)
+                    if exists(next_tokens):
+                        plex1[:, -1:] = self.untokenize(next_tokens[0])
+                        plex2[:, -1:] = self.untokenize(next_tokens[1])
+                    x1 = x + self.plex_projection(plex1)
+                    x2 = x + self.plex_projection2(plex2)
+                else:
+                    plex = self.quantize(data)
+                    plex = T.roll(plex, -self.c.plex_roll, dims=1)
+                    if exists(next_tokens):
+                        plex[:, -1:] = self.untokenize(next_tokens)
+                    x = x + self.plex_projection(plex)
+            if l < self.plex_layer:
+                x = layer(x, kv=self.cache[l])
+            else:
+                if self.c.split:
+                    x1 = layer(x1, kv=self.cache[self.plex_layer + cache_idx])
+                    cache_idx += 1
+                    x2 = layer(x2, kv=self.cache[self.plex_layer + cache_idx])
+                    cache_idx += 1
+                else:
+                    x = layer(x, kv=self.cache[l])
+        with T.autocast(device_type='cuda', dtype=T.bfloat16):
+            if self.c.split:
+                x1, x2 = self.out_norm(x1), self.out_norm(x2)
+                out1, out2 = self.io.output(x1), self.io.output(x2)
+            else:
+                x = self.out_norm(x)
+                out = self.io.output(x)
+        if isnt(temps):
+            if self.c.split:
+                return out1, out2
+            else:
+                return out
+        else:
+            if self.c.split:
+                next_data1 = self.io.temp_sample(out1, temps)[:, -1:, :]
+                next_data2 = self.io2.temp_sample(out2, temps)[:, -1:, :]
+                next_data = T.cat([next_data1, next_data2], dim=-1)
+                return next_data
+            else:
+                next_data = self.io.temp_sample(out, temps)[:, -1:, :]
+                return next_data
+@si_module
+class HertzDevModel(nn.Module):
+    class Config:
+        dim: int
+        vocab_size: int
+        stack_config: Optional[Stack.Config] = None
+        latent_size: int = 32
+        split: bool = True
+        quantizer_config: Optional[LatentQuantizer.Config] = None
+        resynthesizer_config: Optional[TransformerVAE.Config] = None
+        from_pretrained: Optional[Tuple[str, str]] = None
+    def __init__(self, c: Config):
+        super().__init__()
+        if exists(c.from_pretrained):
+            checkpoint = load_ckpt(*c.from_pretrained)
+        else:
+            assert (exists(c.stack_config)), f'hmm {c}'
+        self.input = nn.Linear(c.latent_size, c.dim)
+        if self.c.split:
+            self.input2 = nn.Linear(c.latent_size, c.dim)
+        self.shape_rotator = ShapeRotator(c.stack_config.dim//c.stack_config.n_head, c.stack_config.seq_len, theta=c.stack_config.theta)
+        self.layers = nn.ModuleList([
+            PerfBlock(
+                dim=c.stack_config.dim,
+                layer_id=l,
+                n_head=c.stack_config.n_head,
+                kv_heads=c.stack_config.kv_heads,
+                ff_dim=c.stack_config.ff_dim,
+                eps=c.stack_config.eps,
+                shape_rotator=self.shape_rotator,
+            ) for l in range(c.stack_config.layers)
+        ])
+        self.output = GPTOutput(c.dim, c.vocab_size)
+        if self.c.split:
+            self.output2 = GPTOutput(c.dim, c.vocab_size)
+        self.cache = [None] * c.stack_config.layers
+        self.kv_heads = c.stack_config.kv_heads or c.stack_config.n_head
+        self.head_dim = c.stack_config.dim // c.stack_config.n_head
+        if exists(c.from_pretrained):
+            result = self.load_state_dict(checkpoint, strict=False)
+            print0_colored(result, 'yellow')
+        self.resynthesizer = c.resynthesizer_config().eval()
+        self.resynthesizer.requires_grad = False
+        self.audio_tokenizer = make_tokenizer(device='cpu')
+        self.audio_cache = None
+        self.audio_latent_cache = None
+        self.use_audio_cache = False
+    @T.no_grad()
+    def tokenize(self, audio_data):
+        orig_audio_shape = audio_data.shape
+        if exists(self.audio_cache):
+            audio_data = T.cat([self.audio_cache, audio_data], dim=-1)
+            self.audio_cache = audio_data[..., -(6*16_000):]
+        elif self.use_audio_cache:
+            self.audio_cache = audio_data[..., -(6*16_000):]
+        if audio_data.shape[1] == 2:
+            enc_ch1 = self.audio_tokenizer.latent_from_data(audio_data[:, 0:1])
+            enc_ch2 = self.audio_tokenizer.latent_from_data(audio_data[:, 1:2])
+            return T.cat([enc_ch1, enc_ch2], dim=-1)[:, -(orig_audio_shape[-1]//2000):]
+        else:
+            return self.audio_tokenizer.latent_from_data(audio_data)[:, -(orig_audio_shape[-1]//2000):]
+    @T.no_grad()
+    def untokenize(self, token_data):
+        if exists(self.audio_latent_cache):
+            token_data = T.cat([self.audio_latent_cache, token_data], dim=1)
+            self.audio_latent_cache = token_data[:, -(6*8):]
+        elif self.use_audio_cache:
+            self.audio_latent_cache = token_data[:, -(6*8):]
+        if token_data.shape[-1] == 2*self.c.latent_size:
+            dec_ch1 = self.audio_tokenizer.data_from_latent(token_data[:, :self.c.latent_size])
+            dec_ch2 = self.audio_tokenizer.data_from_latent(token_data[:, self.c.latent_size:])
+            return T.cat([dec_ch1, dec_ch2], dim=1)[..., -(token_data.shape[1]*2000):]
+        else:
+            return self.audio_tokenizer.data_from_latent(token_data)[..., -(token_data.shape[1]*2000):]
+    def init_cache(self, bsize, device, dtype, length:int=None):
+        cache_shape = [self.c.stack_config.layers, length or self.c.stack_config.seq_len, 2, self.kv_heads, self.head_dim]
+        self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
+        self.resynthesizer.init_cache(bsize, device, dtype, length)
+        self.use_audio_cache = True
+    def deinit_cache(self):
+        self.cache = [None] * len(self.layers)
+        self.resynthesizer.deinit_cache()
+        self.audio_cache = None
+        self.audio_latent_cache = None
+        self.use_audio_cache = False
+    @T.no_grad()
+    def forward(self, data):
+        if self.c.split:
+            x1, x2 = data.chunk(2, dim=-1)
+            x = self.input(x1) + self.input2(x2)
+        else:
+            x = self.input(data)
+        for l, layer in enumerate(self.layers):
+            x = layer(x, kv=self.cache[l])
+        if self.c.split:
+            return self.output(x), self.output2(x)
+        else:
+            return self.output(x)
+    @T.no_grad()
+    def next_audio_from_audio(self, audio_data: T.Tensor, temps=(0.8, (0.5, 0.1))):
+        latents_in = self.tokenize(audio_data)
+        next_latents = self.next_latent(latents_in, temps)
+        next_model_latent = next_latents[..., self.c.latent_size:]
+        audio_decoded = self.untokenize(next_model_latent)[..., -2000:]
+        return audio_decoded
+    @T.no_grad()
+    def next_latent(self, model_input: T.Tensor, temps=(0.8, (0.5, 0.1))):
+        if self.c.split:
+            logits1, logits2 = self.forward(model_input)
+            next_logits1 = logits1[:, -1]
+            next_logits2 = logits2[:, -1]
+            next_token1 = F.softmax(next_logits1 / temps[0], dim=-1).multinomial(1)
+            next_token2 = F.softmax(next_logits2 / temps[0], dim=-1).multinomial(1)
+            next_input = self.resynthesizer(model_input, next_tokens=(next_token1, next_token2), temps=temps[1])
+        else:
+            logits = self.forward(model_input)
+            next_logits = logits[:, -1]
+            next_token = F.softmax(next_logits / temps[0], dim=-1).multinomial(1)
+            next_input = self.resynthesizer(model_input, next_tokens=next_token, temps=temps[1])
+        return next_input
+    @T.no_grad()
+    def completion(self, data: T.Tensor, temps=(0.8, (0.5, 0.1)), gen_len=None, use_cache=True) -> T.Tensor:
+        """
+        only accepts latent-space data.
+        """
+        if use_cache:
+            self.init_cache(data.shape[0], data.device, T.bfloat16)
+        next_input = generated = data
+        target_len = min(data.shape[1] + default(gen_len, data.shape[1]), self.c.stack_config.seq_len)
+        for _ in tqdm0(range(data.shape[1], target_len)):
+            model_input = next_input if use_cache else generated
+            next_input = self.next_latent(model_input, temps)
+            generated = T.cat([generated, next_input], dim=1)
+        if use_cache:
+            self.deinit_cache()
+        return generated
+def get_hertz_dev_config(is_split=True, use_pure_audio_ablation=False):
+    if is_split:
+        checkpoints = [('inference_care_50000', 'e4ff4fe5c7e9f066410d2a5673b7a935'), ('inference_scion_54000', 'cb8bc484423922747b277ebc2933af5d')]
+    elif not use_pure_audio_ablation:
+        checkpoints = [('inference_whip_72000', '5e7cee7316900737d55fc5d44cc7a8f7'), ('inference_caraway_112000', 'fcb8368ef8ebf7712f3e31e6856da580')]
+    else:
+        checkpoints = [('inference_whip_72000', '5e7cee7316900737d55fc5d44cc7a8f7'), ('inference_syrup_110000', '353c48f553f1706824c11f3bb6a049e9')]
+    quantizer_config=LatentQuantizer.Config(
+        from_pretrained=('inference_volcano_3', 'd42bf674022c5f84b051d5d7794f6169'),
+        compressor_config=FSQ.Config(
+            levels=[8,8,8,8,8],
+            dim=2048,
+            num_codebooks=1,
+            keep_num_codebooks_dim=None,
+            scale=None,
+            allowed_dtypes=['float32', 'float64', 'bfloat16'],
+            channel_first=False,
+            projection_has_bias=True,
+            return_indices=True,
+            force_quantization_f32=True,
+            use_rms=False
+        ),
+        dim=2048,
+        ff_dim=8192,
+        input_dim=32
+    )
+    resynthesizer_config=TransformerVAE.Config(
+        io_config=GaussianMixtureIOLayer.Config(
+            latent_dim=32,
+            dim=4096,
+            num_components=8,
+        ),
+        stack_config=Stack.Config(
+            layers=8,
+            dim=4096,
+            seq_len=8192,
+            n_head=16,
+            ff_dim=11008,
+            kv_heads=16,
+            eps=1e-5,
+            theta=10_000
+        ),
+        quantizer_config=quantizer_config,
+        plex_layer=None,
+        plex_roll=1,
+        split=is_split,
+        from_pretrained=checkpoints[0],
+    )
+    return HertzDevModel.Config(
+        dim=4096,
+        vocab_size=32_768,
+        stack_config=Stack.Config(
+            layers=32,
+            dim=4096,
+            seq_len=2048,
+            n_head=32,
+            ff_dim=None,
+            kv_heads=None,
+            eps=1e-5,
+            theta=10_000,
+        ),
+        quantizer_config=quantizer_config,
+        resynthesizer_config=resynthesizer_config,
+        split=is_split,
+        from_pretrained=checkpoints[1],
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+torch==2.5.1
+torchaudio==2.5.1
+einops==0.8.0
+tqdm==4.66.6
+ipython==8.29.0
+numpy==1.26.3
+soundfile==0.12.1
+websockets==13.1
+requests==2.32.3
+sounddevice==0.5.1
+matplotlib==3.9.2
+fastapi==0.115.4
+uvicorn==0.32.0
+gradio==5.5.0

tokenizer.py ADDED Viewed

	@@ -0,0 +1,581 @@

+import math
+from dataclasses import dataclass
+from typing import Union, Tuple, Literal
+import torch as T
+import torch.nn as nn
+from torch.nn.utils.parametrizations import weight_norm
+from utils import load_ckpt
+from utils.interp import print_colored
+from utils import si_module, get_activation
+# Adapted from https://github.com/facebookresearch/AudioDec
+def Conv1d1x1(in_channels, out_channels, bias=True):
+    return nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias)
+class NonCausalConv1d(nn.Module):
+    """1D noncausal convolution w/ 2-sides padding."""
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=-1,
+            dilation=1,
+            groups=1,
+            bias=True):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        if padding < 0:
+            padding = (kernel_size - 1) // 2 * dilation
+        self.dilation = dilation
+        self.conv = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Float tensor variable with the shape  (B, C, T).
+        Returns:
+            Tensor: Float tensor variable with the shape (B, C, T).
+        """
+        x = self.conv(x)
+        return x
+class NonCausalConvTranspose1d(nn.Module):
+    """1D noncausal transpose convolution."""
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding=-1,
+        output_padding=-1,
+        groups=1,
+        bias=True,
+    ):
+        super().__init__()
+        if padding < 0:
+            padding = (stride+1) // 2
+        if output_padding < 0:
+            output_padding = 1 if stride % 2 else 0
+        self.deconv = nn.ConvTranspose1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+            groups=groups,
+            bias=bias,
+        )
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Float tensor variable with the shape  (B, C, T).
+        Returns:
+            Tensor: Float tensor variable with the shape (B, C', T').
+        """
+        x = self.deconv(x)
+        return x
+class CausalConv1d(NonCausalConv1d):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        dilation=1,
+        groups=1,
+        bias=True
+    ):
+        super(CausalConv1d, self).__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=0,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self.stride = stride
+        self.pad_length = (kernel_size - 1) * dilation
+    def forward(self, x):
+        pad = nn.ConstantPad1d((self.pad_length, 0), 0.0)
+        x = pad(x)
+        return self.conv(x)
+class CausalConvTranspose1d(NonCausalConvTranspose1d):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        bias=True,
+        pad_buffer=None,
+    ):
+        super(CausalConvTranspose1d, self).__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=0,
+            output_padding=0,
+            bias=bias,
+        )
+        self.stride = stride
+        self.pad_length = (math.ceil(kernel_size/stride) - 1)
+        if pad_buffer is None:
+            pad_buffer = T.zeros(1, in_channels, self.pad_length)
+        self.register_buffer("pad_buffer", pad_buffer)
+    def forward(self, x):
+        pad = nn.ReplicationPad1d((self.pad_length, 0))
+        x = pad(x)
+        return self.deconv(x)[:, :, self.stride : -self.stride]
+    def inference(self, x):
+        x = T.cat((self.pad_buffer, x), -1)
+        self.pad_buffer = x[:, :, -self.pad_length:]
+        return self.deconv(x)[:, :, self.stride : -self.stride]
+    def reset_buffer(self):
+        self.pad_buffer.zero_()
+class NonCausalResUnit(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=7,
+        dilation=1,
+        bias=False,
+    ):
+        super().__init__()
+        self.activation = nn.ELU()
+        self.conv1 = NonCausalConv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            dilation=dilation,
+            bias=bias,
+        )
+        self.conv2 = Conv1d1x1(out_channels, out_channels, bias)
+    def forward(self, x):
+        y = self.conv1(self.activation(x))
+        y = self.conv2(self.activation(y))
+        return x + y
+class CausalResUnit(NonCausalResUnit):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=7,
+        dilation=1,
+        bias=False,
+    ):
+        super(CausalResUnit, self).__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            bias=bias,
+        )
+        self.conv1 = CausalConv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            dilation=dilation,
+            bias=bias,
+        )
+    def inference(self, x):
+        y = self.conv1.inference(self.activation(x))
+        y = self.conv2(self.activation(y))
+        return x + y
+class ResNetBlock(nn.Module):
+    def __init__(self,
+        in_channels,
+        out_channels,
+        stride,
+        kernel_size=7,
+        dilations=(1, 3, 9),
+        bias=True,
+        mode='encoder',
+    ):
+        super().__init__()
+        assert mode in ('encoder', 'decoder'), f"Mode ({mode}) is not supported!"
+        self.mode = mode
+        self.stride = stride
+        ConvUnit = CausalConv1d if mode == 'encoder' else CausalConvTranspose1d
+        res_channels = in_channels if mode == 'encoder' else out_channels
+        res_units = [CausalResUnit(
+            res_channels,
+            res_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+        ) for dilation in dilations]
+        if in_channels == out_channels:
+            if mode == 'encoder':
+                self.pool = nn.AvgPool1d(kernel_size=stride, stride=stride)
+            if mode == 'decoder':
+                self.upsample = nn.Upsample(scale_factor=stride, mode='nearest')
+            conv_unit = nn.Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                bias=bias,
+            ) if in_channels != out_channels else nn.Identity()
+        else:
+            conv_unit = ConvUnit(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(2 * stride),
+                stride=stride,
+                bias=bias,
+            )
+        if mode == 'encoder':
+            if in_channels == out_channels:
+                self.res_block = nn.Sequential(*res_units, self.pool, conv_unit)
+            else:
+                self.res_block = nn.Sequential(*res_units, conv_unit)
+        elif mode == 'decoder':
+            if in_channels == out_channels:
+                self.res_block = nn.Sequential(self.upsample, conv_unit, *res_units)
+            else:
+                self.res_block = nn.Sequential(conv_unit, *res_units)
+    def forward(self, x):
+        out = x
+        for unit in self.res_block:
+            out = unit(out)
+        return out
+    def inference(self, x):
+        for unit in self.res_block:
+            x = unit.inference(x)
+        return x
+@si_module
+class ResNetStack(nn.Module):
+    """
+    ResNet encoder or decoder stack. Channel ratios
+    and strides take the default order of from
+    data/io-layer, to the middle of the model.
+    """
+    class Config:
+        input_channels: int = 1
+        output_channels: int = 1
+        encode_channels: int = 32
+        decode_channel_multiplier: int = 1
+        latent_dim: int = None
+        kernel_size: int = 7
+        bias: bool = True
+        channel_ratios: Tuple[int, ...] = (2, 4, 8, 16)
+        strides: Tuple[int, ...] = (3, 4, 5, 5)
+        mode: Literal['encoder', 'decoder'] = 'encoder'
+    def __init__(self, c: Config):
+        super().__init__()
+        assert c.mode in ('encoder', 'decoder'), f"Mode ({c.mode}) is not supported!"
+        self.mode = c.mode
+        assert len(c.channel_ratios) == len(c.strides)
+        channel_ratios = (1,) + c.channel_ratios
+        strides = c.strides
+        self.middle_channels = c.encode_channels * channel_ratios[-1]
+        if c.mode == 'decoder':
+            channel_ratios = tuple(reversed(channel_ratios))
+            strides = tuple(reversed(strides))
+        self.multiplier = c.decode_channel_multiplier if c.mode == 'decoder' else 1
+        res_blocks = [ResNetBlock(
+            c.encode_channels * channel_ratios[s_idx] * self.multiplier,
+            c.encode_channels * channel_ratios[s_idx+1] * self.multiplier,
+            stride,
+            kernel_size=c.kernel_size,
+            bias=c.bias,
+            mode=c.mode,
+        ) for s_idx, stride in enumerate(strides)]
+        data_conv = CausalConv1d(
+            in_channels=c.input_channels if c.mode == 'encoder' else c.encode_channels * self.multiplier,
+            out_channels=c.encode_channels if c.mode == 'encoder' else c.output_channels,
+            kernel_size=c.kernel_size,
+            stride=1,
+            bias=False,
+        )
+        if c.mode == 'encoder':
+            self.res_stack = nn.Sequential(data_conv, *res_blocks)
+        elif c.mode == 'decoder':
+            self.res_stack = nn.Sequential(*res_blocks, data_conv)
+        if c.latent_dim is not None:
+            self.latent_proj = Conv1d1x1(self.middle_channels, c.latent_dim, bias=c.bias) if c.mode == 'encoder' else Conv1d1x1(c.latent_dim, self.middle_channels, bias=c.bias)
+        if self.multiplier != 1:
+            self.multiplier_proj = Conv1d1x1(self.middle_channels, self.middle_channels * self.multiplier, bias=c.bias)
+    def forward(self, x, return_feats=False):
+        if self.c.latent_dim is not None and self.mode == 'decoder':
+            x = self.latent_proj(x)
+        if self.multiplier != 1:
+            x = self.multiplier_proj(x)
+        feats = []
+        for block in self.res_stack:
+            x = block(x)
+            if return_feats:
+                feats.append(x)
+        if self.c.latent_dim is not None and self.mode == 'encoder':
+            x = self.latent_proj(x)
+            if return_feats:
+                feats.append(x)
+        if return_feats:
+            return feats
+        return x
+    def inference(self, x):
+        for block in self.res_stack:
+            x = block.inference(x)
+        return x
+    def reset_buffer(self):
+        def _reset_buffer(m):
+            if isinstance(m, CausalConv1d) or isinstance(m, CausalConvTranspose1d):
+                m.reset_buffer()
+        self.apply(_reset_buffer)
+    def reset_parameters(self):
+        def _reset_parameters(m):
+            if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
+                m.weight.data.normal_(0.0, 0.01)
+        self.apply(_reset_parameters)
+    def apply_weight_norm(self):
+        def _apply_weight_norm(m):
+            if isinstance(m, nn.Conv1d) or isinstance(
+                m, nn.ConvTranspose1d
+            ):
+                nn.utils.parametrizations.weight_norm(m)
+        self.apply(_apply_weight_norm)
+    def remove_weight_norm(self):
+        def _remove_weight_norm(m):
+            try:
+                print(m)
+                nn.utils.remove_weight_norm(m)
+            except ValueError:  # this module didn't have weight norm
+                return
+        self.apply(_remove_weight_norm)
+@si_module
+class GaussianZ(nn.Module):
+    class Config:
+        dim: int
+        latent_dim: int
+        bias: bool = False
+        use_weight_norm: bool = False
+    def __init__(self, c: Config):
+        super().__init__()
+        self.proj_in = nn.Linear(c.dim, c.latent_dim * 2, bias=c.bias)
+        self.proj_out = nn.Linear(c.latent_dim, c.dim, bias=c.bias)
+        if c.use_weight_norm:
+            self.proj_in = weight_norm(self.proj_in)
+            self.proj_out = weight_norm(self.proj_out)
+    def reparam(self, mu, logvar):
+        std = T.exp(logvar / 2)
+        eps = T.randn_like(std)
+        return mu + eps * std
+    def kl_divergence(self, mu, logvar):
+        return T.mean(-0.5 * T.sum(
+                1 + logvar - mu.pow(2) - logvar.exp(),
+                dim=(1, 2))
+            )
+    def repr_from_latent(self, latent: Union[dict, T.Tensor]):
+        if isinstance(latent, T.Tensor):
+            z = latent
+        else:
+            z = self.reparam(latent['mu'], latent['logvar'])
+        l = self.proj_out(z)
+        return l
+    def forward(self, x: T.Tensor) -> Tuple[T.Tensor, dict]:
+        mu, logvar = self.proj_in(x).chunk(2, dim=-1)
+        kl_div = self.kl_divergence(mu, logvar)
+        z = self.reparam(mu, logvar)
+        xhat = self.proj_out(z)
+        latent = {'mu': mu, 'logvar': logvar, 'z': z, 'kl_divergence': kl_div}
+        return xhat, latent
+@si_module
+class WaveCodec(nn.Module):
+    class Config:
+        resnet_config: ResNetStack.Config = None
+        sample_rate: int = 16_000
+        use_weight_norm: bool = False
+        compressor_config: dataclass = None
+        norm_stddev: float = 1.0
+    def __init__(self, c: Config):
+        super().__init__()
+        self.norm_stddev = c.norm_stddev
+        self.encoder = c.resnet_config(mode='encoder')
+        self.sample_rate = c.sample_rate
+        self.total_stride = 1
+        for stride in c.resnet_config.strides:
+            self.total_stride *= stride
+        self.tokens_per_second = self.sample_rate / self.total_stride
+        self.compressor = c.compressor_config(dim=self.encoder.middle_channels)
+        self.decoder = c.resnet_config(mode='decoder')
+        if c.use_weight_norm:
+            self.encoder.apply_weight_norm()
+            self.decoder.apply_weight_norm()
+            self.encoder.reset_parameters()
+            self.decoder.reset_parameters()
+    def encode(self, data):
+        return self.encoder(data/self.norm_stddev)
+    def decode(self, latent):
+        return self.decoder(latent.transpose(1, 2))*self.norm_stddev
+    @T.no_grad()
+    def latent_from_data(self, data, get_parameters=False):
+        x = self.encode(data)
+        l_in = x.transpose(1, 2)
+        l, latent = self.compressor(l_in)
+        return latent['z'] if not get_parameters else {
+            'mu': latent['mu'],
+            'logvar': latent['logvar'],
+            'z': latent['z'],
+        }
+    @T.no_grad()
+    def data_from_latent(self, latent):
+        l = self.compressor.repr_from_latent(latent)
+        x = self.decode(l)
+        return x
+    def process(self, x):
+        return self.latent_from_data(x)
+    def unprocess(self, latent):
+        return self.data_from_latent(latent)
+    def forward(self, audio_input):
+        x = self.encode(audio_input)
+        l_in = x.transpose(1, 2)
+        l, latent = self.compressor(l_in)
+        xhat = self.decode(l)
+        return xhat, latent
+def make_tokenizer(device='cuda'):
+    generator_config = WaveCodec.Config(
+        resnet_config=ResNetStack.Config(
+            input_channels=1,
+            output_channels=1,
+            encode_channels=16,
+            decode_channel_multiplier=4,
+            kernel_size=7,
+            bias=True,
+            channel_ratios=(4, 8, 16, 16, 16, 16),
+            strides=(2, 2, 4, 5, 5, 5),
+            mode=None,
+        ),
+        use_weight_norm=True,
+        compressor_config=GaussianZ.Config(
+            dim=None,
+            latent_dim=32,
+            bias=True,
+            use_weight_norm=True
+        ),
+        norm_stddev=0.05,
+    )
+    checkpoint = load_ckpt("inference_apatosaurus_95000", expected_hash="ba876edb97b988e9196e449dd176ca97")
+    tokenizer = generator_config()
+    load_result = tokenizer.load_state_dict(checkpoint, strict=False)
+    print_colored(f"Loaded tokenizer state dict: {load_result}", "grey")
+    tokenizer = tokenizer.eval()
+    # Only convert to bfloat16 if using CUDA
+    if device == 'cuda':
+        tokenizer = tokenizer.bfloat16()
+    tokenizer = tokenizer.to(device)
+    tokenizer.requires_grad_ = False
+    return tokenizer

transformer.py ADDED Viewed

	@@ -0,0 +1,382 @@

+from typing import Optional, Tuple, MutableMapping
+from typing import Union
+import math
+from contextlib import nullcontext
+import torch
+import torch as T
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.attention import SDPBackend
+from einops import rearrange
+from utils import si_module, default, exists, load_ckpt
+CACHE_FILL_VALUE = -1
+def get_cache_len(cache: Optional[Tensor]) -> int:
+    """
+    cache: (batch, seq_len, 2, kv_heads, head_dim)
+    """
+    if cache is None:
+        return 0
+    nonzeros = T.any(cache.flatten(2) != CACHE_FILL_VALUE, dim=-1)
+    length = nonzeros.sum(dim=-1).int()
+    assert T.all(length == length[0])
+    return length[0]
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(x, cos, sin, offset: int = 0):
+    assert (
+        cos.shape[1] >= offset + x.shape[1]
+    ), f"Offset and/or input sequence is too large,\
+        \n offset: {offset}, seq_len: {x.shape[1]}, max: {cos.shape[1]}"
+    cos_out = cos[:, offset : offset + x.shape[1], :, :]
+    sin_out = sin[:, offset : offset + x.shape[1], :, :]
+    return (x * cos_out) + (rotate_half(x) * sin_out)
+# Adapted from https://github.com/foundation-model-stack/foundation-model-stack
+class ShapeRotator:
+    def __init__(
+        self,
+        dim: int,
+        end: int,
+        theta: float = 10_000,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.ratio = theta
+        self.cached_freqs: MutableMapping[int, MutableMapping[int, torch.Tensor]] = {}
+        self.max_seq_len_cached: MutableMapping[int, int] = {}
+        self.ntk_scaling = False
+        self.max_seq_len = end
+    def compute_freqs_cis(self, device, max_seq_len=None):
+        alpha = 1
+        dev_idx = device.index
+        max_seq_len = default(max_seq_len, self.max_seq_len)
+        if dev_idx not in self.cached_freqs:
+            self.cached_freqs[dev_idx] = {}
+        if dev_idx not in self.max_seq_len_cached:
+            self.max_seq_len_cached[dev_idx] = 0
+        if self.max_seq_len_cached[dev_idx] > 0:
+            return 1
+        max_seq_len = max(max_seq_len, self.max_seq_len)
+        if (
+            1 in self.cached_freqs[dev_idx]
+            and max_seq_len <= self.max_seq_len_cached[dev_idx]
+        ):
+            return 1
+        ratio = self.ratio
+        dim = self.dim
+        freqs = 1.0 / (ratio ** (torch.arange(0, dim, 2, device=device).float() / dim))
+        t = torch.arange(max_seq_len, device=device, dtype=freqs.dtype)
+        freqs = torch.einsum("i,j->ij", t, freqs)
+        emb = torch.cat((freqs, freqs), dim=-1).to(device)
+        cos_to_cache = emb.cos()[None, :, None, :]
+        sin_to_cache = emb.sin()[None, :, None, :]
+        self.max_seq_len_cached[dev_idx] = max_seq_len
+        self.cached_freqs[dev_idx][alpha] = torch.stack(
+            [
+                cos_to_cache,
+                sin_to_cache,
+            ],
+            dim=-1,
+        )
+        return alpha
+    def rotate(
+        self,
+        q: Tensor,
+        k: Tensor,
+        offset: int = 0,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args
+        ----
+        q : torch.Tensor
+            Embedded query tensor, expected size is B x S x H x Eh
+        k : torch.Tensor
+            Embedded query tensor, expected size is B x S x H x Eh
+        """
+        assert len(q.size()) == 4
+        assert len(k.size()) == 4
+        seq_len = self.max_seq_len
+        alpha = self.compute_freqs_cis(q.device, seq_len)
+        freqs = self.cached_freqs[q.device.index][alpha]
+        freqs = freqs.float()  # 1 L D/2 2 2
+        q_out = apply_rotary_pos_emb(q, freqs[..., 0], freqs[..., 1], offset=offset).type_as(q)
+        k_out = apply_rotary_pos_emb(k, freqs[..., 0], freqs[..., 1], offset=offset).type_as(k)
+        return q_out.view_as(q), k_out.view_as(k)
+class Linear(nn.Linear):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, bias=False)
+class Norm(nn.Module):
+    def __init__(self,
+            dim: int,
+            eps: float = 1e-5,) -> None:
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(T.ones((dim,)))
+    def forward(self, input: Tensor) -> Tensor:
+        return F.layer_norm(input, (self.weight.shape[0],), weight=self.weight, bias=None, eps=self.eps)
+class FFNN(nn.Module):
+    def __init__(self,
+            dim: int,
+            expand_dim: int = None,):
+        super().__init__()
+        expand_dim = default(expand_dim, 256 * ((int(2 * 4 * dim / 3) + 256 - 1) // 256))
+        self.dim = dim
+        self.expand_dim = expand_dim
+        self.gateup_proj = Linear(dim, 2*expand_dim)
+        self.down_proj = Linear(expand_dim, dim)
+    def forward(self, x):
+        gate, up = self.gateup_proj(x).chunk(2, dim=-1)
+        return self.down_proj(up * F.silu(gate))
+class GQA(nn.Module):
+    def __init__(self,
+            dim: int,
+            n_head: int,
+            shape_rotator: ShapeRotator,
+            kv_heads: Optional[int] = None,
+            eps: float = 1e-5,
+            causal: bool = True,):
+        super().__init__()
+        self.n_heads = n_head
+        self.kv_heads = default(kv_heads, n_head)
+        self.head_dim = dim // n_head
+        self.causal = causal
+        self.proj_qkv = Linear(dim, self.head_dim*(n_head+2*self.kv_heads))
+        self.norm_q = Norm(self.head_dim*n_head, eps=eps)
+        self.norm_k = Norm(self.head_dim*self.kv_heads, eps=eps)
+        self.attn_out = Linear(dim, dim)
+        self.shape_rotator = shape_rotator
+    def _sdpa(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        k = k.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
+        v = v.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
+        with nn.attention.sdpa_kernel(SDPBackend.FLASH_ATTENTION) if k.device.type == 'cuda' else nullcontext():
+            x = F.scaled_dot_product_attention(
+                q.transpose(1, 2),
+                k.transpose(1, 2),
+                v.transpose(1, 2),
+                is_causal=False if (q.size(1) != k.size(1)) else self.causal,
+            )
+        x = x.transpose(1, 2).contiguous()
+        return x
+    def _attend(self, q: Tensor, k: Tensor, v: Tensor, kv_cache: Optional[Tensor] = None,):
+        cache_len = get_cache_len(kv_cache)
+        q, k = self.shape_rotator.rotate(q, k, offset=cache_len)
+        if exists(kv_cache):
+            k = T.cat([kv_cache[:, :cache_len, 0], k], dim=1)
+            v = T.cat([kv_cache[:, :cache_len, 1], v], dim=1)
+            kv_cache[:, :k.size(1), 0] = k
+            kv_cache[:, :v.size(1), 1] = v
+        x = self._sdpa(q, k, v)
+        return self.attn_out(rearrange(x, 'b s h d -> b s (h d)'))
+    def _project(self, x):
+        full_q, full_k, full_v = self.proj_qkv(x).chunk(3, dim=-1)
+        normed_full_q = self.norm_q(full_q).to(full_q.dtype)
+        normed_full_k = self.norm_k(full_k).to(full_k.dtype)
+        q = rearrange(normed_full_q, 'b s (h d) -> b s h d', h=self.n_heads)
+        k = rearrange(normed_full_k, 'b s (h d) -> b s h d', h=self.kv_heads)
+        v = rearrange(full_v, 'b s (h d) -> b s h d', h=self.kv_heads)
+        return q, k, v
+    def forward(self,
+            x: Tensor,
+            kv: Optional[Tensor] = None,):
+        """
+        x: (B, S, D)
+        kv: (B, S, H, D)
+        """
+        q, k, v = self._project(x)
+        return self._attend(q, k, v, kv_cache=kv)
+class PreNormAttn(nn.Module):
+    def __init__(self,
+            dim: int,
+            n_head: int,
+            shape_rotator: ShapeRotator,
+            kv_heads: Optional[int] = None,
+            eps: float = 1e-5,
+            causal: bool = True,):
+        super().__init__()
+        self.attn_norm = Norm(dim, eps=eps)
+        self.attn = GQA(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)
+    def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
+        """
+        x: (B, S, D)
+        kv: (B, S, H, D)
+        """
+        return x + self.attn(self.attn_norm(x), kv)
+class PreNormFFNN(nn.Module):
+    def __init__(self,
+            dim: int,
+            ff_dim: int,
+            eps: float = 1e-5,):
+        super().__init__()
+        self.ffnn_norm = Norm(dim, eps=eps)
+        self.ffnn = FFNN(dim, ff_dim)
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.ffnn(self.ffnn_norm(x))
+class Block(nn.Module):
+    def __init__(self,
+            dim: int,
+            layer_id: int = 0,
+            n_head: int = 16,
+            kv_heads: Optional[int] = None,
+            ff_dim: Optional[int] = None,
+            eps: float = 1e-5,
+            causal: bool = True,
+            shape_rotator: ShapeRotator = None):
+        super().__init__()
+        self.attn = PreNormAttn(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)
+        self.ffnn = PreNormFFNN(dim, ff_dim, eps=eps)
+        self.dim = dim
+        self.layer_id = layer_id
+        self.head_dim = dim // n_head
+        self.expand_dim = self.ffnn.ffnn.expand_dim
+        self.reset_parameters()
+    def reset_parameters(self):
+        std = 1.0 / math.sqrt(self.dim)
+        nn.init.trunc_normal_(self.ffnn.ffnn.gateup_proj.weight, std=std, a=-3 * std, b=3 * std)
+        nn.init.trunc_normal_(self.attn.attn.proj_qkv.weight, std=std, a=-3 * std, b=3 * std)
+        nn.init.trunc_normal_(self.attn.attn.attn_out.weight, std=std, a=-3 * std, b=3 * std)
+        xstd = 1.0 / math.sqrt(self.expand_dim)
+        nn.init.trunc_normal_(self.ffnn.ffnn.down_proj.weight, std=xstd, a=-3 * xstd, b=3 * xstd)
+    def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
+        """
+        x: (B, S, D)
+        kv: (B, S, H, D)
+        """
+        h = self.attn(x, kv)
+        out = self.ffnn(h)
+        return out
+class GPTOutput(nn.Module):
+    def __init__(self, dim, vocab_size):
+        super().__init__()
+        self.dim = dim
+        self.norm = Norm(dim)
+        self.output = Linear(dim, vocab_size)
+        self.reset_parameters()
+    def reset_parameters(self):
+        std = 1.0 / math.sqrt(self.dim**2)
+        nn.init.trunc_normal_(self.output.weight, std=std, a=-3 * std, b=3 * std)
+    def forward(self, x):
+        return self.output(self.norm(x))
+@si_module
+class Stack(nn.Module):
+    class Config:
+        layers: int
+        dim: int
+        seq_len: int
+        n_head: int = 32
+        ff_dim: int = None
+        kv_heads: int = None
+        eps: float = 1e-5
+        theta: Union[int, float] = 10_000
+        causal: bool = True
+        from_pretrained: Optional[Tuple[str, int]] = None
+    def __init__(self, c: Config):
+        super().__init__()
+        from_pretrained = c.from_pretrained
+        if exists(from_pretrained):
+            checkpoint = load_ckpt(c.from_pretrained)
+        self.shape_rotator = ShapeRotator(c.dim//c.n_head, c.seq_len, theta=c.theta)
+        self.layers = nn.ModuleList([
+            Block(
+                dim=c.dim,
+                layer_id=l,
+                n_head=c.n_head,
+                kv_heads=c.kv_heads,
+                ff_dim=c.ff_dim,
+                eps=c.eps,
+                causal=c.causal,
+                shape_rotator=self.shape_rotator,
+            ) for l in range(c.layers)
+        ])
+        kv_heads = c.kv_heads or c.n_head
+        head_dim = c.dim // c.n_head
+        cache_shape = [c.layers, c.seq_len, 2, kv_heads, head_dim]
+        self.cache_shape = cache_shape
+        self.cache = [None] * c.layers
+        if exists(from_pretrained):
+            self.load_state_dict(checkpoint)
+    def init_cache(self, bsize, device, dtype, length:int=None):
+        if self.cache_shape is None:
+            return
+        cache_shape = self.cache_shape.copy()
+        cache_shape[1] = length or cache_shape[1]
+        self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
+    def deinit_cache(self):
+        self.cache = [None] * len(self.cache)
+    def forward(self, x: Tensor) -> Tensor:
+        for l, layer in enumerate(self.layers):
+            x = layer(x, kv=self.cache[l])
+        return x

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .blocks import *
+from .dist import *
+from .interp import *

utils/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (211 Bytes). View file

utils/__pycache__/blocks.cpython-310.pyc ADDED Viewed

Binary file (3.73 kB). View file

utils/__pycache__/dist.cpython-310.pyc ADDED Viewed

Binary file (3.65 kB). View file

utils/__pycache__/interp.cpython-310.pyc ADDED Viewed

Binary file (3.82 kB). View file

utils/blocks.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from dataclasses import dataclass
+from typing import TypeVar, Generic, Type, Optional
+from functools import wraps
+import time
+import random
+import torch as T
+import torch.nn as nn
+# @TODO: remove si_module from codebase
+# we use this in our research codebase to make modules from callable configs
+si_module_TpV = TypeVar('si_module_TpV')
+def si_module(cls: Type[si_module_TpV]) -> Type[si_module_TpV]:
+    """Class decorator that gives `cls` a callable dataclass `Config`.
+    After decoration:
+    - `cls.Config` is a dataclass (an empty one is created when the class does
+      not declare a `Config` class) whose instances can be called to build `cls`.
+    - `cls.__init__` stores the config instance it was given as `self.c`
+      (None when no argument is a `cls.Config`) and registers an empty,
+      non-persistent `_device_tracker` buffer.
+    - `cls.device` and `cls.dtype` are properties reading that buffer.
+    Returns the same class object, modified in place.
+    """
+    # Fall back to an empty Config when the class does not declare one as a class.
+    if not hasattr(cls, 'Config') or not isinstance(cls.Config, type):
+        class Config:
+            pass
+        cls.Config = Config
+    cls.Config = dataclass(cls.Config)
+    class ConfigWrapper(cls.Config, Generic[si_module_TpV]):
+        def __call__(self, *args, **kwargs) -> si_module_TpV:
+            # With keyword overrides: rebuild the config with the overrides applied
+            # and construct the module from that copy.
+            # NOTE(review): positional *args are not forwarded on this path.
+            if len(kwargs) > 0:
+                config_dict = {field.name: getattr(self, field.name) for field in self.__dataclass_fields__.values()}
+                config_dict.update(kwargs)
+                new_config = type(self)(**config_dict)
+                return cls(new_config)
+            else:
+                return cls(self, *args)
+    # Make the wrapper present itself as "<Class>Config" living in the decorated class's module.
+    ConfigWrapper.__module__ = cls.__module__
+    ConfigWrapper.__name__ = f"{cls.__name__}Config"
+    ConfigWrapper.__qualname__ = f"{cls.__qualname__}.Config"
+    cls.Config = ConfigWrapper
+    original_init = cls.__init__
+    def new_init(self, *args, **kwargs):
+        # First positional argument that is a Config instance, else first keyword one, else None.
+        self.c = next((arg for arg in args if isinstance(arg, cls.Config)), None) or next((arg for arg in kwargs.values() if isinstance(arg, cls.Config)), None)
+        original_init(self, *args, **kwargs)
+        # Registered after the original __init__ has run; persistent=False keeps it out of the state dict.
+        self.register_buffer('_device_tracker', T.Tensor(), persistent=False)
+    cls.__init__ = new_init
+    @property
+    def device(self):
+        """Device of the `_device_tracker` buffer."""
+        return self._device_tracker.device
+    @property
+    def dtype(self):
+        """Dtype of the `_device_tracker` buffer."""
+        return self._device_tracker.dtype
+    cls.device = device
+    cls.dtype = dtype
+    return cls
+def get_activation(nonlinear_activation, nonlinear_activation_params={}):
+    if hasattr(nn, nonlinear_activation):
+        return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
+    else:
+        raise NotImplementedError(f"Activation {nonlinear_activation} not found in torch.nn")
+def exists(v):
+    return v is not None
+def isnt(v):
+    return not exists(v)
+def truthyexists(v):
+    return exists(v) and v is not False
+def truthyattr(obj, attr):
+    return hasattr(obj, attr) and truthyexists(getattr(obj, attr))
+defaultT = TypeVar('defaultT')
+def default(*args: Optional[defaultT]) -> Optional[defaultT]:
+    for arg in args:
+        if exists(arg):
+            return arg
+    return None
+def maybe(fn):
+    @wraps(fn)
+    def inner(x, *args, **kwargs):
+        if not exists(x):
+            return x
+        return fn(x, *args, **kwargs)
+    return inner

utils/dist.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import os
+import torch as T
+import re
+from tqdm import tqdm
+from datetime import timedelta
+import requests
+import hashlib
+from io import BytesIO
+def rank0():
+    rank = os.environ.get('RANK')
+    if rank is None or rank == '0':
+        return True
+    else:
+        return False
+def local0():
+    local_rank = os.environ.get('LOCAL_RANK')
+    if local_rank is None or local_rank == '0':
+        return True
+    else:
+        return False
+class tqdm0(tqdm):
+    def __init__(self, *args, **kwargs):
+        total = kwargs.get('total', None)
+        if total is None and len(args) > 0:
+            try:
+                total = len(args[0])
+            except TypeError:
+                pass
+        if total is not None:
+            kwargs['miniters'] = max(1, total // 20)
+        super().__init__(*args, **kwargs, disable=not rank0(), bar_format='{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]')
+def print0(*args, **kwargs):
+    if rank0():
+        print(*args, **kwargs)
+_PRINTED_IDS = set()
+def printonce(*args, id=None, **kwargs):
+    if id is None:
+        id = ' '.join(map(str, args))
+    if id not in _PRINTED_IDS:
+        print(*args, **kwargs)
+        _PRINTED_IDS.add(id)
+def print0once(*args, **kwargs):
+    if rank0():
+        printonce(*args, **kwargs)
+def init_dist():
+    if T.distributed.is_initialized():
+        print0('Distributed already initialized')
+        rank = T.distributed.get_rank()
+        local_rank = int(os.environ.get('LOCAL_RANK', 0))
+        world_size = T.distributed.get_world_size()
+    else:
+        try:
+            rank = int(os.environ['RANK'])
+            local_rank = int(os.environ['LOCAL_RANK'])
+            world_size = int(os.environ['WORLD_SIZE'])
+            device = f'cuda:{local_rank}'
+            T.cuda.set_device(device)
+            T.distributed.init_process_group(backend='nccl', timeout=timedelta(minutes=30), rank=rank, world_size=world_size, device_id=T.device(device))
+            print(f'Rank {rank} of {world_size}.')
+        except Exception as e:
+            print0once(f'Not initializing distributed env: {e}')
+            rank = 0
+            local_rank = 0
+            world_size = 1
+    return rank, local_rank, world_size
+def load_ckpt(load_from_location, expected_hash=None):
+    if local0():
+        os.makedirs('ckpt', exist_ok=True)
+        url = f"https://ckpt.si.inc/hertz-dev/{load_from_location}.pt"
+        save_path = f"ckpt/{load_from_location}.pt"
+        if not os.path.exists(save_path):
+            response = requests.get(url, stream=True)
+            total_size = int(response.headers.get('content-length', 0))
+            with open(save_path, 'wb') as f, tqdm(total=total_size, desc=f'Downloading {load_from_location}.pt', unit='GB', unit_scale=1/(1024*1024*1024)) as pbar:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+        if expected_hash is not None:
+            with open(save_path, 'rb') as f:
+                file_hash = hashlib.md5(f.read()).hexdigest()
+            if file_hash != expected_hash:
+                print(f'Hash mismatch for {save_path}. Expected {expected_hash} but got {file_hash}. Deleting checkpoint and trying again.')
+                os.remove(save_path)
+                return load_ckpt(load_from_location, expected_hash)
+    if T.distributed.is_initialized():
+        T.distributed.barrier() # so that ranks don't try to load checkpoint before it's finished downloading
+    loaded = T.load(f"ckpt/{load_from_location}.pt", weights_only=False, map_location='cpu')
+    return loaded

utils/interp.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import torch as T
+import os
+def rank0():
+    rank = os.environ.get('RANK')
+    if rank is None or rank == '0':
+        return True
+    else:
+        return False
+def print_colored(message, color='reset', bold=False, **kwargs):
+    color_dict = {
+        'bold': '\033[1m',
+        'green': '\033[92m',
+        'yellow': '\033[93m',
+        'red': '\033[91m',
+        'blue': '\033[94m',
+        'grey': '\033[90m',
+        'white': '\033[97m',
+        'reset': '\033[0m'
+    }
+    color_code = color_dict.get(color.lower(), color_dict['reset'])
+    prefix = color_dict['bold'] if bold else ''
+    print(f"{prefix}{color_code}{message}{color_dict['reset']}", **kwargs)
+def print0_colored(*args, **kwargs):
+    if rank0():
+        print_colored(*args, **kwargs)
+def param_count(module):
+    def count_parameters(model):
+        return sum(p.numel() for p in model.parameters() if p.requires_grad)
+    total_params = count_parameters(module)
+    output = [f'Total model parameters: {total_params:,}', '---------------------------']
+    for name, child in module.named_children():
+        params = count_parameters(child)
+        output.append(f'{name} parameters: {params:,}')
+    return '\n'.join(output)
+def model_size_estimation(module):
+    def estimate_size(model):
+        param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
+        buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
+        return param_size + buffer_size
+    total_size = estimate_size(module)
+    output = [f'Total model size: {total_size / 1024**2:.2f} MB', '---------------------------']
+    for name, child in module.named_children():
+        child_size = estimate_size(child)
+        output.append(f'{name} size: {child_size / 1024**2:.2f} MB')
+    return '\n'.join(output)
+def layer_param_distribution(module):
+    def count_parameters(model):
+        return sum(p.numel() for p in model.parameters() if p.requires_grad)
+    def get_layer_types(model):
+        layer_types = {}
+        for name, module in model.named_modules():
+            layer_type = module.__class__.__name__
+            params = sum(p.numel() for p in module.parameters(recurse=False) if p.requires_grad)
+            if params > 0:
+                if layer_type not in layer_types:
+                    layer_types[layer_type] = 0
+                layer_types[layer_type] += params
+        return layer_types
+    total_params = count_parameters(module)
+    layer_types = get_layer_types(module)
+    output = [f'Total trainable parameters: {total_params:,}', '---------------------------']
+    for layer_type, count in sorted(layer_types.items(), key=lambda x: x[1], reverse=True):
+        percentage = (count / total_params) * 100
+        output.append(f'{layer_type}: {count:,} ({percentage:.2f}%)')
+    return '\n'.join(output)