vikhyatk committed (verified) · Commit 9b4ed9c · 1 Parent(s): 4489767

Upload HfMoondream

Files changed (9):
  1. config.json +1 -1
  2. config.py +2 -2
  3. layers.py +52 -51
  4. model.safetensors +2 -2
  5. moondream.py +29 -24
  6. packing.py +52 -0
  7. region.py +3 -3
  8. text.py +87 -36
  9. vision.py +3 -3
config.json CHANGED
@@ -8,6 +8,6 @@
   },
   "config": {},
   "model_type": "moondream1",
-  "torch_dtype": "float16",
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.44.0"
 }
config.py CHANGED
@@ -12,7 +12,7 @@ class TextConfig:
     n_heads: int = 32
     n_kv_heads: int = 32
     prefix_attn: int = 730
-    group_size: int = 128
+    group_size: Optional[int] = 128
 
 
 @dataclass(frozen=True)
@@ -38,7 +38,7 @@ class RegionConfig:
     size_feat_dim: int = 512
     size_out_dim: int = 2048
     inner_dim: int = 8192
-
+    group_size: Optional[int] = 128
 
 @dataclass(frozen=True)
 class TokenizerConfig:
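
Both group_size fields are now Optional: 128 keeps the int4 block layout, while None is the switch the code below (text.py, moondream.py) uses to fall back to plain nn.Linear. Illustrative sketch, not from this diff, assuming config.py imports Optional from typing and that every TextConfig field has a default:

from config import TextConfig  # inside the repo this is a relative import (.config)

quantized = TextConfig()                      # group_size=128 -> QuantizedLinear downstream
full_precision = TextConfig(group_size=None)  # -> plain nn.Linear downstream

for cfg in (quantized, full_precision):
    use_quantized = cfg.group_size is not None
    print(cfg.group_size, "QuantizedLinear" if use_quantized else "nn.Linear")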
layers.py CHANGED
@@ -1,11 +1,13 @@
-import bitblas
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from dataclasses import dataclass
 from typing import Literal
-from bitblas.cache import OperatorCache
-from torch.nn import functional as F
+from torchao import quantize_
+from torchao.quantization import int4_weight_only
+
+from .packing import dequantize_tensor
 
 
 def gelu_approx(x):
@@ -18,65 +20,65 @@ class LinearWeights:
     bias: torch.Tensor
 
 
-class Linear(nn.Module):
-    """
-    Linear layer with support for bitblas quantization.
-    If dtype is torch.int8, it uses bitblas for quantization.
-    Otherwise, it uses a standard nn.Linear layer.
-    """
-
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-        bias: bool = True,
-        dtype: torch.dtype = None,
-        group_size: int = 128,
-    ):
-        super().__init__()
-
-        if dtype == torch.int8:
-            self.linear = bitblas.Linear(
-                in_features=in_features,
-                out_features=out_features,
-                bias=bias,
-                with_zeros=True,
-                zeros_mode="original",
-                with_scaling=True,
-                A_dtype="float16",
-                W_dtype="uint4",
-                accum_dtype="float16",
-                out_dtype="float16",
-                fast_decoding=True,
-                enable_tuning=True,
-                group_size=group_size,
-            )
-        else:
-            self.linear = nn.Linear(
-                in_features=in_features,
-                out_features=out_features,
-                bias=bias,
-                dtype=torch.float16,
-            )
-
-    def forward(self, x):
-        return self.linear(x)
-
-    @property
-    def weight(self) -> torch.Tensor:
-        try:
-            return self.linear.weight
-        except AttributeError:
-            return self.linear.qweight
-
-    @property
-    def bias(self) -> torch.Tensor:
-        return self.linear.bias
-
-
-def linear(x: torch.Tensor, w: LinearWeights) -> torch.Tensor:
-    return F.linear(x, w.weight, w.bias)
-
+def linear(x: torch.Tensor, w: LinearWeights) -> torch.Tensor:
+    return F.linear(x, w.weight, w.bias)
+
+
+class QuantizedLinear(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        dtype: torch.dtype,
+    ):
+        # TODO: Take group_size as an input instead of hardcoding it here.
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.weight = nn.ParameterDict(
+            {
+                "packed": nn.Parameter(
+                    torch.empty(
+                        out_features, in_features // 128, 64, dtype=torch.uint8
+                    ),
+                    requires_grad=False,
+                ),
+                "scales": nn.Parameter(
+                    torch.empty(out_features, in_features // 128), requires_grad=False
+                ),
+            }
+        )
+        self.bias = nn.Parameter(torch.empty(out_features), requires_grad=False)
+        self.unpacked = False
+
+    def unpack(self):
+        self.weight = nn.Parameter(
+            dequantize_tensor(
+                self.weight["packed"],
+                self.weight["scales"],
+                (self.weight["packed"].shape[0], self.weight["packed"].shape[1] * 128),
+                128,
+                torch.bfloat16,
+            )
+        )
+        with torch.device("meta"):
+            self.linear = nn.Linear(
+                self.in_features, self.out_features, dtype=torch.bfloat16
+            )
+        self.linear.weight = self.weight
+        self.linear.bias = nn.Parameter(
+            self.bias.to(torch.bfloat16), requires_grad=False
+        )
+        del self.weight, self.bias
+        quantize_(self, int4_weight_only(group_size=128))
+        torch.cuda.empty_cache()
+        self.unpacked = True
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self.unpacked:
+            self.unpack()
+        return self.linear(x)
+
 
 @dataclass
 class LayerNormWeights:
@@ -96,7 +98,6 @@ class MLPWeights:
 
 
 def mlp(x: torch.Tensor, w: MLPWeights) -> torch.Tensor:
-
     x = w.fc1(x)
     x = gelu_approx(x)
     x = w.fc2(x)
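
For reference, a minimal sketch (not from this diff) of the storage contract QuantizedLinear expects from the checkpoint: int4 values packed two per byte in blocks of 128 along in_features, plus one scale per block; unpacking is deferred until the first forward call. The top-level import is an assumption, since inside the repo the class lives in layers.py and uses a relative import of packing.py:

import torch
from layers import QuantizedLinear  # in the repo: from .layers import QuantizedLinear

lin = QuantizedLinear(in_features=2048, out_features=2048, dtype=torch.bfloat16)
print(lin.weight["packed"].shape)  # torch.Size([2048, 16, 64]): 16 blocks of 128 int4s = 64 bytes each
print(lin.weight["scales"].shape)  # torch.Size([2048, 16]): one scale per block
print(lin.unpacked)                # False; the first forward() runs unpack(), which dequantizes
                                   # to bfloat16 and re-quantizes through torchao int4_weight_only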
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:73e9da0d1091d61630477994669a22011c830c7539e27e659fb63a4d6818f8a2
-size 2080370912
+oid sha256:97076df1a9a09ff4108a69ea59b4c9abf522b248e8425c9334bab98ddbaf4b33
+size 1838828672
moondream.py CHANGED
@@ -12,6 +12,7 @@ from .image_crops import reconstruct_from_crops
 from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
 from .text import build_text_model, text_encoder, lm_head, text_decoder
 from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
+from .layers import QuantizedLinear
 from .utils import remove_outlier_points
 
 
@@ -63,47 +64,49 @@ class KVCache(nn.Module):
 
 
 class MoondreamModel(nn.Module):
-    def __init__(self, config: MoondreamConfig, dtype=torch.float16, setup_caches=True):
+
+    def __init__(
+        self, config: MoondreamConfig, dtype=torch.bfloat16, setup_caches=True
+    ):
         super().__init__()
         self.config = config
-        self.dtype = dtype
-        self.setup_caches_flag = setup_caches
 
         self.tokenizer = Tokenizer.from_pretrained(
             "vikhyatk/moondream2", revision="2025-01-09"
         )
-
         self.vision = build_vision_model(config.vision, dtype)
-
-        self.text = build_text_model(config.text, torch.int8)
+        self.text = build_text_model(config.text, dtype)
 
         # Region Model
+        linear_cls = (
+            QuantizedLinear if config.region.group_size is not None else nn.Linear
+        )
         self.region = nn.ModuleDict(
             {
-                "coord_encoder": nn.Linear(
+                "coord_encoder": linear_cls(
                     config.region.coord_feat_dim, config.region.dim, dtype=dtype
                 ),
                 "coord_decoder": nn.ModuleDict(
                     {
-                        "fc1": nn.Linear(
+                        "fc1": linear_cls(
                             config.region.dim, config.region.inner_dim, dtype=dtype
                         ),
-                        "fc2": nn.Linear(
+                        "fc2": linear_cls(
                             config.region.inner_dim,
                             config.region.coord_out_dim,
                             dtype=dtype,
                         ),
                     }
                 ),
-                "size_encoder": nn.Linear(
+                "size_encoder": linear_cls(
                     config.region.size_feat_dim, config.region.dim, dtype=dtype
                 ),
                 "size_decoder": nn.ModuleDict(
                     {
-                        "fc1": nn.Linear(
+                        "fc1": linear_cls(
                             config.region.dim, config.region.inner_dim, dtype=dtype
                         ),
-                        "fc2": nn.Linear(
+                        "fc2": linear_cls(
                             config.region.inner_dim,
                             config.region.size_out_dim,
                             dtype=dtype,
@@ -129,11 +132,11 @@ class MoondreamModel(nn.Module):
         attn_mask[..., :prefix_attn_len, :prefix_attn_len] = 1
         self.register_buffer("attn_mask", attn_mask, persistent=False)
 
-    def _setup_caches(self):
-        """Setup KV caches for the text model"""
-        if self.text is None:
-            return  # Can't set up caches without text model
+        # Initialize KV caches.
+        if setup_caches:
+            self._setup_caches()
 
+    def _setup_caches(self):
         c = self.config.text
         for b in self.text.blocks:
             b.kv_cache = KVCache(
@@ -166,12 +169,16 @@ class MoondreamModel(nn.Module):
         return logits, hidden
 
     def compile(self):
+        for module in self.modules():
+            if isinstance(module, QuantizedLinear):
+                module.unpack()
+
         # TODO: vision_projection is not being compiled
-        self._vis_enc = torch.compile(
-            self._vis_enc, fullgraph=False, mode="reduce-overhead"
+        self._vis_enc = torch.compile(self._vis_enc, fullgraph=True)
+        self._prefill = torch.compile(self._prefill, fullgraph=True)
+        self._decode_one_tok = torch.compile(
+            self._decode_one_tok, fullgraph=True, mode="reduce-overhead"
         )
-        self._prefill = torch.compile(self._prefill)
-        self._decode_one_tok = torch.compile(self._decode_one_tok)
 
     def _run_vision_encoder(self, image: Image.Image) -> torch.Tensor:
         all_crops, tiling = prepare_crops(image, self.config.vision, device=self.device)
@@ -204,7 +211,6 @@ class MoondreamModel(nn.Module):
 
         # Run through text model in addition to the vision encoder, to minimize
         # re-computation if multiple queries are performed on this image.
-
         with torch.inference_mode():
             img_emb = self._run_vision_encoder(image)
             bos_emb = text_encoder(
@@ -240,7 +246,6 @@ class MoondreamModel(nn.Module):
     def _prefill_prompt(
         self, prompt_tokens: torch.Tensor, pos: int, temperature: float, top_p: float
     ):
-
         with torch.inference_mode():
             prompt_emb = text_encoder(prompt_tokens, self.text)
             torch._dynamo.mark_dynamic(prompt_emb, 1)
@@ -585,11 +590,11 @@ class MoondreamModel(nn.Module):
             self.text,
         )
         x_emb = encode_coordinate(
-            torch.tensor([[[source[0]]]], device=self.device, dtype=torch.float16),
+            torch.tensor([[[source[0]]]], device=self.device, dtype=torch.bfloat16),
             self.region,
         )
         y_emb = encode_coordinate(
-            torch.tensor([[[source[1]]]], device=self.device, dtype=torch.float16),
+            torch.tensor([[[source[1]]]], device=self.device, dtype=torch.bfloat16),
            self.region,
        )
 
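A hypothetical end-to-end sketch (not from this diff): bfloat16 is now the default dtype, the KV caches are built in __init__ when setup_caches=True, and compile() unpacks every QuantizedLinear before the graphs are handed to torch.compile. The top-level imports and the no-argument MoondreamConfig() constructor are assumptions; in the repo the imports are relative and the packed weights come from model.safetensors:

import torch
from config import MoondreamConfig    # in the repo: .config
from moondream import MoondreamModel  # in the repo: .moondream

config = MoondreamConfig()                            # text/region group_size default to 128
model = MoondreamModel(config, dtype=torch.bfloat16)  # KV caches are set up here now
# ... load the int4-packed weights from model.safetensors via load_state_dict ...
model = model.cuda()
model.compile()  # unpacks QuantizedLinear layers up front, then torch.compile()s the hot paths
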
packing.py ADDED
@@ -0,0 +1,52 @@
+import torch
+
+
+def unpack_int4(packed: torch.Tensor, original_length: int) -> torch.Tensor:
+    """
+    Unpack a tensor of uint8 packed bytes (two 4-bit values per byte) into a 1D tensor of int8 values,
+    vectorized over the entire input.
+    """
+    lower = packed & 0xF
+    upper = (packed >> 4) & 0xF
+    # Interleave lower and upper nibbles
+    nibbles = torch.stack([lower, upper], dim=-1).view(-1)[:original_length]
+    nibbles = nibbles.to(torch.int8)
+    nibbles[nibbles >= 8] -= 16
+    return nibbles
+
+
+def dequantize_tensor(
+    packed: torch.Tensor,
+    scales: torch.Tensor,
+    orig_shape: torch.Size,
+    block_size: int,
+    dtype: torch.dtype,
+):
+    """
+    Dequantizes a packed int4 tensor (with given per-block scales) back to bfloat16,
+    using vectorized operations to avoid Python loops.
+    """
+    num_bytes_per_block = (block_size + 1) // 2  # number of packed bytes per block
+    num_blocks_total = packed.numel() // num_bytes_per_block
+    # Reshape to (num_blocks_total, num_bytes_per_block)
+    packed_rows = packed.view(num_blocks_total, num_bytes_per_block)
+
+    # Vectorized unpacking: compute lower and upper nibbles for all rows at once.
+    lower = packed_rows & 0xF
+    upper = (packed_rows >> 4) & 0xF
+    # Create a new dimension for the two nibbles and then flatten.
+    nibbles = torch.stack([lower, upper], dim=2).view(num_blocks_total, -1)
+    # Slice to get exactly block_size values per block.
+    quantized_flat = nibbles[:, :block_size].to(torch.int8)
+    quantized_flat[quantized_flat >= 8] -= 16
+
+    # Reshape to original block structure.
+    last_dim = orig_shape[-1]
+    num_blocks = last_dim // block_size
+    new_shape = orig_shape[:-1] + (num_blocks, block_size)
+    quantized = quantized_flat.view(new_shape)
+
+    # Dequantize using scales.
+    dequantized = quantized.to(torch.float32) * scales.unsqueeze(-1)
+    dequantized = dequantized.view(orig_shape)
+    return dequantized.to(dtype)
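
A round-trip sketch (not from this diff) of the layout dequantize_tensor assumes: 128 values per block, two nibbles per byte with the low nibble first, and one symmetric scale per block. pack_int4 below is a hypothetical inverse written only for illustration; dequantize_tensor is the function from this file, imported here as a top-level module (inside the repo it is .packing):

import torch
from packing import dequantize_tensor


def pack_int4(w: torch.Tensor, block_size: int = 128):
    # Hypothetical inverse of dequantize_tensor: symmetric per-block int4 quantization.
    out_f, in_f = w.shape
    blocks = w.float().view(out_f, in_f // block_size, block_size)
    scales = blocks.abs().amax(dim=-1) / 7.0               # one scale per block
    q = torch.round(blocks / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
    nib = (q & 0xF).to(torch.uint8)                        # two's-complement nibbles
    packed = nib[..., 0::2] | (nib[..., 1::2] << 4)        # low nibble first
    return packed, scales


w = torch.randn(16, 256)
packed, scales = pack_int4(w)
print(packed.shape, scales.shape)       # torch.Size([16, 2, 64]) torch.Size([16, 2])
w_hat = dequantize_tensor(packed, scales, w.shape, 128, torch.bfloat16)
print((w - w_hat.float()).abs().max())  # within half a quantization step plus bf16 rounding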
region.py CHANGED
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import math
 
-from .layers import linear, mlp
+from .layers import mlp
 
 
 def fourier_features(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
@@ -36,7 +36,7 @@ def encode_coordinate(coord: torch.Tensor, w: nn.Module) -> torch.Tensor:
     Returns:
         Encoded hidden states tensor for input to text model
     """
-    return linear(fourier_features(coord, w.coord_features), w.coord_encoder)
+    return w.coord_encoder(fourier_features(coord, w.coord_features))
 
 
 def decode_coordinate(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
@@ -64,7 +64,7 @@ def encode_size(size: torch.Tensor, w: nn.Module) -> torch.Tensor:
     Returns:
         Encoded hidden states tensor for input to text model
     """
-    return linear(fourier_features(size, w.size_features), w.size_encoder)
+    return w.size_encoder(fourier_features(size, w.size_features))
 
 
 def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
text.py CHANGED
@@ -2,9 +2,8 @@ import torch
 import torch.nn as nn
 
 from torch.nn import functional as F
-from bitblas.cache import OperatorCache
 
-from .layers import layer_norm, mlp, Linear
+from .layers import layer_norm, mlp, QuantizedLinear
 from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig
 
@@ -27,7 +26,6 @@ def attn(
     head_dim = d_model // n_heads
 
     qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
-
     q_dim = n_heads * head_dim
     kv_dim = n_kv_heads * head_dim
 
@@ -57,6 +55,71 @@ def attn(
     return out
 
 
+def _attn(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    freqs_cis: torch.Tensor,
+    attn_mask: torch.Tensor,
+    n_heads: int,
+    n_kv_heads: int,
+):
+    bsz, q_len, d_model = x.shape
+    head_dim = d_model // n_heads
+    pos = 0
+
+    qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
+    q_dim = n_heads * head_dim
+    kv_dim = n_kv_heads * head_dim
+
+    q = qkv_out[..., :q_dim].view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+    k = (
+        qkv_out[..., q_dim : q_dim + kv_dim]
+        .view(bsz, q_len, n_kv_heads, head_dim)
+        .transpose(1, 2)
+    )
+    v = (
+        qkv_out[..., q_dim + kv_dim :]
+        .view(bsz, q_len, n_kv_heads, head_dim)
+        .transpose(1, 2)
+    )
+
+    position_ids = torch.arange(pos, pos + q_len, dtype=torch.long)
+    q = apply_rotary_emb(q, freqs_cis, position_ids, n_heads)
+    k = apply_rotary_emb(k, freqs_cis, position_ids, n_kv_heads)
+    out = F.scaled_dot_product_attention(
+        q, k, v, attn_mask=attn_mask, enable_gqa=n_heads != n_kv_heads
+    )
+    out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
+    out = w.proj(out)
+    return out
+
+
+def _produce_hidden(inputs_embeds: torch.Tensor, w: nn.Module, config: TextConfig):
+    hidden_BTC = inputs_embeds
+
+    bsz, q_len, d_model = inputs_embeds.shape
+    attn_mask = torch.zeros(q_len, q_len)
+    attn_mask[:730, :730] = 1
+    for i in range(730, q_len):
+        attn_mask[i, : i + 1] = 1
+    attn_mask = attn_mask.to(dtype=torch.bool)
+
+    for i, block in enumerate(w.blocks):
+        l_in = layer_norm(hidden_BTC, block.ln)
+        l_attn = _attn(
+            x=l_in,
+            w=block.attn,
+            freqs_cis=w.freqs_cis,
+            attn_mask=attn_mask,
+            n_heads=config.n_heads,
+            n_kv_heads=config.n_kv_heads,
+        )
+        l_mlp = mlp(l_in, block.mlp)
+        hidden_BTC = hidden_BTC + l_attn + l_mlp
+
+    return hidden_BTC
+
+
 def text_decoder(
     x: torch.Tensor,
     w: nn.Module,
@@ -76,7 +139,6 @@ def text_decoder(
             n_kv_heads=config.n_kv_heads,
             position_ids=position_ids,
         )
-
         l_mlp = mlp(l_in, block.mlp)
         x = x + l_attn + l_mlp
 
@@ -90,30 +152,15 @@ def lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
     return logits
 
 
-def build_text_model(
-    config: TextConfig,
-    linear_dtype: torch.dtype = torch.float16,
-    layernorm_dtype: torch.dtype = torch.float16,
-) -> nn.Module:
-    # note : layernorm dtype is used for layernorm, lm_head and wte not just layernorm
-    print(
-        "Initializing quantized backend. This only has to run once, but may take a few minutes."
-    )
-    qkv_dim = int(config.dim * (1 + 2 * config.n_kv_heads / config.n_heads))
-
-    group_size = None
-    if linear_dtype == torch.int8:
-
-        group_size = config.group_size
-
-    def create_linear(in_features, out_features, dtype=linear_dtype):
-        # factory function for creating Linear layers so we dont have to pass everything again and again
-        return Linear(
-            in_features=in_features,
-            out_features=out_features,
-            dtype=dtype,
-            group_size=group_size,
-        )
-
+def _lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
+    hidden_BTC = layer_norm(hidden_BTC, w.post_ln)
+    logits = w.lm_head(hidden_BTC)
+    return logits
+
+
+def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
+    qkv_dim = int(config.dim * (1 + 2 * config.n_kv_heads / config.n_heads))
+    linear_cls = QuantizedLinear if config.group_size is not None else nn.Linear
+
     text = nn.ModuleDict(
         {
@@ -121,17 +168,23 @@ def build_text_model(
                 [
                     nn.ModuleDict(
                         {
-                            "ln": nn.LayerNorm(config.dim, dtype=layernorm_dtype),
+                            "ln": nn.LayerNorm(config.dim, dtype=dtype),
                             "attn": nn.ModuleDict(
                                 {
-                                    "qkv": create_linear(config.dim, qkv_dim),
-                                    "proj": create_linear(config.dim, config.dim),
+                                    "qkv": linear_cls(config.dim, qkv_dim, dtype=dtype),
+                                    "proj": linear_cls(
+                                        config.dim, config.dim, dtype=dtype
+                                    ),
                                 }
                             ),
                             "mlp": nn.ModuleDict(
                                 {
-                                    "fc1": create_linear(config.dim, config.ff_dim),
-                                    "fc2": create_linear(config.ff_dim, config.dim),
+                                    "fc1": linear_cls(
+                                        config.dim, config.ff_dim, dtype=dtype
+                                    ),
+                                    "fc2": linear_cls(
+                                        config.ff_dim, config.dim, dtype=dtype
+                                    ),
                                 }
                             ),
                         }
@@ -139,13 +192,11 @@
                     for _ in range(config.n_layers)
                 ]
             ),
-            "post_ln": nn.LayerNorm(config.dim, dtype=layernorm_dtype),
-            "lm_head": nn.Linear(config.dim, config.vocab_size, dtype=layernorm_dtype),
+            "post_ln": nn.LayerNorm(config.dim, dtype=dtype),
+            "lm_head": linear_cls(config.dim, config.vocab_size, dtype=dtype),
         }
     )
-    text.wte = nn.Parameter(
-        torch.empty(config.vocab_size, config.dim, dtype=layernorm_dtype)
-    )
+    text.wte = nn.Parameter(torch.empty(config.vocab_size, config.dim, dtype=dtype))
     text.register_buffer(
         "freqs_cis",
         precompute_freqs_cis(config.dim // (2 * config.n_heads), config.max_context),
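
The new _produce_hidden path (a cache-free, full-sequence forward) builds its attention mask inline: full attention within the first 730 positions (prefix_attn in TextConfig) and causal attention afterwards. A standalone sketch of just that mask, shrunk to a toy prefix so it prints legibly:

import torch


def prefix_causal_mask(q_len: int, prefix: int = 730) -> torch.Tensor:
    # Mirrors the mask construction in _produce_hidden above.
    mask = torch.zeros(q_len, q_len)
    mask[:prefix, :prefix] = 1      # prefix tokens attend to each other freely
    for i in range(prefix, q_len):
        mask[i, : i + 1] = 1        # later tokens attend causally
    return mask.to(dtype=torch.bool)


print(prefix_causal_mask(6, prefix=3).int())
# rows 0-2: the prefix block is fully visible; rows 3-5: lower-triangular (causal)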
vision.py CHANGED
@@ -6,7 +6,7 @@ import numpy as np
 from typing import Union, Tuple
 from PIL import Image
 
-from .layers import attn, layer_norm, linear, mlp
+from .layers import attn, layer_norm, mlp
 from .image_crops import overlap_crop_image
 from .config import VisionConfig
 
@@ -33,7 +33,7 @@ def prepare_crops(
     all_crops = np.transpose(all_crops, (0, 3, 1, 2))
     all_crops = (
         torch.from_numpy(all_crops)
-        .to(device=device, dtype=torch.float16)
+        .to(device=device, dtype=torch.bfloat16)
        .div_(255.0)
        .sub_(0.5)
        .div_(0.5)
@@ -64,7 +64,7 @@ def create_patches(x, patch_size):
 def vision_encoder(input_BCHW: torch.Tensor, w: nn.Module, config: VisionConfig):
     x = create_patches(input_BCHW, config.enc_patch_size)
 
-    x = linear(x, w.patch_emb)
+    x = w.patch_emb(x)
     x = x + w.pos_emb
     for block in w.blocks:
         x = x + attn(layer_norm(x, block.ln1), block.attn, n_heads=config.enc_n_heads)