Upload HfMoondream
Changed files:
- config.json +1 -1
- config.py +3 -0
- generation_config.json +1 -1
- hf_moondream.py +26 -7
- layers.py +2 -2
- model.safetensors +1 -1
- moondream.py +204 -136
- region.py +19 -12
- text.py +114 -76
- vision.py +21 -7
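For orientation, a minimal usage sketch of the checkpoint this commit uploads. It assumes the repository is loaded with trust_remote_code=True so that the HfMoondream class in hf_moondream.py is used; the repo id and image path below are placeholders, not taken from this commit.

    from PIL import Image
    from transformers import AutoModelForCausalLM

    # Placeholder repo id; substitute the repository this commit belongs to.
    model = AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2", trust_remote_code=True
    )

    image = Image.open("example.jpg")  # placeholder image path
    print(model.caption(image, length="short"))
    print(model.query(image, "What is in this image?")["answer"])
    print(model.detect(image, "face"))   # returns {"objects": [...]}
    print(model.point(image, "person"))  # returns {"points": [...]}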
config.json
CHANGED
@@ -9,5 +9,5 @@
   "config": {},
   "model_type": "moondream1",
   "torch_dtype": "float16",
-  "transformers_version": "4.
+  "transformers_version": "4.48.0"
 }
config.py
CHANGED
@@ -5,10 +5,12 @@ from typing import Dict, List, Optional
 @dataclass(frozen=True)
 class TextConfig:
     dim: int = 2048
+    ff_dim: int = 8192
     n_layers: int = 24
     vocab_size: int = 51200
     max_context: int = 2048
     n_heads: int = 32
+    n_kv_heads: int = 32
     prefix_attn: int = 730
 
 
@@ -46,6 +48,7 @@ class TokenizerConfig:
         "caption": {
             "short": [198, 198, 16438, 8305, 25],
             "normal": [198, 198, 24334, 1159, 25],
+            "long": [198, 198, 14617, 8305, 25],
         },
         "query": {"prefix": [198, 198, 24361, 25], "suffix": [198, 198, 33706, 25]},
         "detect": {"prefix": [198, 198, 47504, 25], "suffix": [628]},
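A quick check of what the new TextConfig fields imply, using the fused-QKV width formula that build_text_model applies later in this commit (standalone sketch, not part of the repo):

    # Values from this commit's TextConfig defaults.
    dim, n_heads, n_kv_heads, ff_dim = 2048, 32, 32, 8192

    head_dim = dim // n_heads                            # 64
    qkv_dim = int(dim * (1 + 2 * n_kv_heads / n_heads))  # 2048 * 3 = 6144
    print(head_dim, qkv_dim, ff_dim)                     # 64 6144 8192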
generation_config.json
CHANGED
@@ -1,4 +1,4 @@
 {
   "_from_model_config": true,
-  "transformers_version": "4.
+  "transformers_version": "4.48.0"
 }
hf_moondream.py
CHANGED
@@ -14,7 +14,7 @@ from .utils import *
 def extract_question(text):
     prefix = "<image>\n\nQuestion: "
     suffix = "\n\nAnswer:"
-
+
     if text.startswith(prefix) and text.endswith(suffix):
         return text[len(prefix) : -len(suffix)]
     else:
@@ -36,30 +36,44 @@ class HfMoondream(PreTrainedModel):
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = MoondreamModel(
+        self.model = MoondreamModel(
+            MoondreamConfig.from_dict(config.config), setup_caches=False
+        )
+        self._is_kv_cache_setup = False
+
+    def _setup_caches(self):
+        if not self._is_kv_cache_setup:
+            self.model._setup_caches()
+            self._is_kv_cache_setup = True
 
     @property
     def encode_image(self):
+        self._setup_caches()
         return self.model.encode_image
 
     @property
     def query(self):
+        self._setup_caches()
         return self.model.query
 
     @property
     def caption(self):
+        self._setup_caches()
        return self.model.caption
 
     @property
     def detect(self):
+        self._setup_caches()
         return self.model.detect
 
     @property
     def point(self):
+        self._setup_caches()
         return self.model.point
 
     @property
     def detect_gaze(self):
+        self._setup_caches()
         return self.model.detect_gaze
 
     def answer_question(
@@ -98,22 +112,27 @@ class HfMoondream(PreTrainedModel):
         """
         prompt_extracted = extract_question(prompt)
         if prompt_extracted is not None:
-            answer = self.model.query(
-            ]
+            answer = self.model.query(
+                image=image_embeds, question=prompt_extracted, stream=False
+            )["answer"]
         else:
             image_embeds = self.encode_image(image_embeds)
             prompt_tokens = torch.tensor(
                 [self.model.tokenizer.encode(prompt).ids],
                 device=self.device,
             )
+
             def generator():
                 for token in self.model._generate_text(
-                    prompt_tokens,
+                    prompt_tokens,
+                    image_embeds.kv_cache,
+                    image_embeds.pos,
+                    max_new_tokens,
                 ):
                     yield token
+
             answer = "".join(list(generator()))
-
+
         return [answer]
 
     def get_input_embeddings(self):
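The wrapper now defers KV-cache allocation until the first call that needs it. A minimal sketch of the same lazy-initialization pattern in isolation (names here are illustrative, not part of the repo):

    class LazyCaches:
        """Illustrative only: allocate an expensive buffer on first use."""

        def __init__(self):
            self._ready = False

        def _setup(self):
            if not self._ready:
                print("allocating caches once")
                self._ready = True

        @property
        def query(self):
            # Accessing the property guarantees the buffers exist before use.
            self._setup()
            return lambda q: f"answered: {q}"


    m = LazyCaches()
    print(m.query("hi"))     # triggers allocation
    print(m.query("again"))  # no re-allocation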
layers.py
CHANGED
@@ -37,9 +37,9 @@ class MLPWeights:
 
 
 def mlp(x: torch.Tensor, w: MLPWeights) -> torch.Tensor:
-    x =
+    x = w.fc1(x)
     x = gelu_approx(x)
-    x =
+    x = w.fc2(x)
     return x
 
 
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fadcffea8c17fe8a20ea68af3a013cf3184a63787ee4453cc9eb75206c7c1f9b
 size 3854538376
moondream.py
CHANGED
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import random
 
-from typing import Literal, Tuple, TypedDict, Union, Dict, Any, Optional
+from typing import Literal, Tuple, TypedDict, Union, Dict, Any, Optional, List
 from PIL import Image
 from dataclasses import dataclass
 from tokenizers import Tokenizer
@@ -10,7 +10,7 @@ from tokenizers import Tokenizer
 from .config import MoondreamConfig
 from .image_crops import reconstruct_from_crops
 from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
-from .text import build_text_model,
+from .text import build_text_model, text_encoder, lm_head, text_decoder
 from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
 from .utils import remove_outlier_points
 
@@ -21,53 +21,41 @@ SamplingSettings = TypedDict(
     total=False,
 )
 
-DEFAULT_MAX_TOKENS =
+DEFAULT_MAX_TOKENS = 768
 
 
 @dataclass(frozen=True)
 class EncodedImage:
     pos: int
-        sorted_indices_to_remove = torch.gather(
-            tokens_to_remove, dim=-1, index=sorted_indices
-        )
-        if min_tokens_to_keep > 1:
-            sorted_indices_to_remove[..., :min_tokens_to_keep] = False
-
-        indices_to_remove = sorted_indices_to_remove.scatter(
-            1, sorted_indices, sorted_indices_to_remove
-        )
-        logits = logits.masked_fill(indices_to_remove, filter_value)
-        token = torch.multinomial(logits, num_samples=1)
-        return token.squeeze(0)
+    caches: List[Tuple[torch.Tensor, torch.Tensor]]
+
+
+class KVCache(nn.Module):
+
+    def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
+        super().__init__()
+        cache_shape = (1, n_kv_heads, max_context, dim // n_heads)
+        self.register_buffer(
+            "k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
+        )
+        self.register_buffer(
+            "v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
+        )
+
+    def update(self, pos_ids, k, v):
+        kout, vout = self.k_cache, self.v_cache
+        kout[:, :, pos_ids, :] = k
+        vout[:, :, pos_ids, :] = v
+        return kout, vout
 
 
 class MoondreamModel(nn.Module):
-    def __init__(self, config: MoondreamConfig, dtype=torch.float16):
+    def __init__(self, config: MoondreamConfig, dtype=torch.float16, setup_caches=True):
         super().__init__()
         self.config = config
 
         self.tokenizer = Tokenizer.from_pretrained(
-            "vikhyatk/moondream2", revision="
+            "vikhyatk/moondream2", revision="2025-01-09"
         )
         self.vision = build_vision_model(config.vision, dtype)
         self.text = build_text_model(config.text, dtype)
@@ -114,35 +102,65 @@ class MoondreamModel(nn.Module):
             torch.empty(config.region.size_feat_dim // 2, 2, dtype=dtype).T
         )
 
+        attn_mask = torch.tril(
+            torch.ones(
+                1, 1, config.text.max_context, config.text.max_context, dtype=torch.bool
+            )
+        )
+        patch_w = config.vision.crop_size // config.vision.enc_patch_size
+        prefix_attn_len = 1 + patch_w**2
+        attn_mask[..., :prefix_attn_len, :prefix_attn_len] = 1
+        self.register_buffer("attn_mask", attn_mask, persistent=False)
+
+        # Initialize KV caches.
+        if setup_caches:
+            self._setup_caches()
+
+    def _setup_caches(self):
+        c = self.config.text
+        for b in self.text.blocks:
+            b.kv_cache = KVCache(
+                c.n_heads,
+                c.n_kv_heads,
+                c.max_context,
+                c.dim,
+                device=self.device,
+                dtype=self.vision.pos_emb.dtype,
+            )
 
     @property
     def device(self):
         return self.vision.pos_emb.device
 
+    def _vis_enc(self, x: torch.Tensor):
+        return vision_encoder(x, self.vision, self.config.vision)
+
+    def _vis_proj(self, g: torch.Tensor, r: torch.Tensor):
+        return vision_projection(g, r, self.vision, self.config.vision)
+
+    def _prefill(self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor):
+        return text_decoder(x, self.text, attn_mask, pos_ids, self.config.text)
+
+    def _decode_one_tok(
+        self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor
+    ):
+        hidden = text_decoder(x[None], self.text, attn_mask, pos_ids, self.config.text)
+        logits = lm_head(hidden, self.text)
+        return logits, hidden
+
     def compile(self):
-            # self.ops["vision_projection"], fullgraph=True
-            # )
-        self.ops["prefill"] = torch.compile(self.ops["prefill"], fullgraph=True)
-        self.ops["decode_one_token"] = torch.compile(
-            self.ops["decode_one_token"], fullgraph=True
+        # TODO: vision_projection is not being compiled
+        self._vis_enc = torch.compile(self._vis_enc, fullgraph=True)
+        self._prefill = torch.compile(self._prefill, fullgraph=True)
+        self._decode_one_tok = torch.compile(
+            self._decode_one_tok, fullgraph=True, mode="reduce-overhead"
         )
 
     def _run_vision_encoder(self, image: Image.Image) -> torch.Tensor:
         all_crops, tiling = prepare_crops(image, self.config.vision, device=self.device)
         torch._dynamo.mark_dynamic(all_crops, 0)
 
-        outputs = self.
+        outputs = self._vis_enc(all_crops)
 
         global_features = outputs[0]
         local_features = outputs[1:].view(
@@ -159,9 +177,7 @@ class MoondreamModel(nn.Module):
             overlap_margin=self.config.vision.overlap_margin,
         )
 
-        return self.
-            global_features, reconstructed, self.vision, self.config.vision
-        )
+        return self._vis_proj(global_features, reconstructed)
 
     def encode_image(self, image: Union[Image.Image, EncodedImage]) -> EncodedImage:
         if isinstance(image, EncodedImage):
@@ -171,34 +187,35 @@ class MoondreamModel(nn.Module):
 
         # Run through text model in addition to the vision encoder, to minimize
         # re-computation if multiple queries are performed on this image.
-            self.config.text.n_layers,
-            2,  # k, v
-            1,  # batch size
-            self.config.text.n_heads,
-            self.config.text.max_context,  # static cache
-            self.config.text.dim // self.config.text.n_heads,  # head dim
-            device=self.device,
-            dtype=torch.float16,
-        )
-        with torch.no_grad():
+        with torch.inference_mode():
             img_emb = self._run_vision_encoder(image)
             bos_emb = text_encoder(
                 torch.tensor([[self.config.tokenizer.bos_id]], device=self.device),
                 self.text,
            )
             inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
-            self.
+            mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
+            pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long)
+            self._prefill(inputs_embeds, mask, pos_ids)
+
+        return EncodedImage(
+            pos=inputs_embeds.size(1),
+            caches=[
+                (
+                    b.kv_cache.k_cache[:, :, : inputs_embeds.size(1), :].clone(),
+                    b.kv_cache.v_cache[:, :, : inputs_embeds.size(1), :].clone(),
+                )
+                for b in self.text.blocks
+            ],
+        )
 
-    def _prefill_prompt(
-    ):
-        with torch.no_grad():
+    def _prefill_prompt(self, prompt_tokens: torch.Tensor, pos: int):
+        with torch.inference_mode():
             prompt_emb = text_encoder(prompt_tokens, self.text)
-        )
+            torch._dynamo.mark_dynamic(prompt_emb, 1)
+            mask = self.attn_mask[:, :, pos : pos + prompt_emb.size(1), :]
+            pos_ids = torch.arange(pos, pos + prompt_emb.size(1), dtype=torch.long)
+            hidden = self._prefill(prompt_emb, mask, pos_ids)
             logits = lm_head(hidden, self.text)
             next_token = torch.argmax(logits, dim=-1)
             pos = pos + prompt_emb.size(1)
@@ -207,33 +224,67 @@ class MoondreamModel(nn.Module):
     def _generate_text(
         self,
         prompt_tokens: torch.Tensor,
-        kv_cache: torch.Tensor,
         pos: int,
         max_tokens: int,
     ):
-        _, _, next_token, pos = self._prefill_prompt(kv_cache, prompt_tokens, pos)
+        _, _, next_token, pos = self._prefill_prompt(prompt_tokens, pos)
 
         def generator(next_token, pos):
+            mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
+            mask[:, :, :pos] = 1
+            pos_ids = torch.tensor([pos], device=self.device, dtype=torch.long)
             generated_tokens = 0
 
+            # For properly handling token streaming with Unicode
+            token_cache = []
+            print_len = 0
+
             while (
                 next_token_id := next_token.item()
             ) != self.config.tokenizer.eos_id and generated_tokens < max_tokens:
+                # Add token to our cache
+                token_cache.append(next_token_id)
+
+                # Decode all tokens collected so far
+                text = self.tokenizer.decode(token_cache)
+
+                # After a newline, we flush the cache completely
+                if text.endswith("\n"):
+                    printable_text = text[print_len:]
+                    token_cache = []
+                    print_len = 0
+                    if printable_text:
+                        yield printable_text
+                # If the last token is a CJK character, we can safely print it
+                elif len(text) > 0 and _is_cjk_char(ord(text[-1])):
+                    printable_text = text[print_len:]
+                    print_len += len(printable_text)
+                    if printable_text:
+                        yield printable_text
+                # Otherwise, only print up to the last space to avoid cutting words
+                else:
+                    last_space_idx = text.rfind(" ", print_len)
+                    if last_space_idx >= print_len:
+                        printable_text = text[print_len : last_space_idx + 1]
+                        print_len += len(printable_text)
+                        if printable_text:
+                            yield printable_text
+
+                with torch.inference_mode():
                     next_emb = text_encoder(next_token, self.text)
-                )
-                kv_cache[:, :, :, :, pos : pos + kv_cache_update.size(-2), :] = (
-                    kv_cache_update
-                )
+                    mask[:, :, pos], pos_ids[0] = 1, pos
+                    logits, _ = self._decode_one_tok(next_emb, mask, pos_ids)
                     pos += 1
                     next_token = torch.argmax(logits, dim=-1)
                 generated_tokens += 1
 
+            # Flush any remaining text in the cache
+            if token_cache:
+                text = self.tokenizer.decode(token_cache)
+                printable_text = text[print_len:]
+                if printable_text:
+                    yield printable_text
+
         return generator(next_token, pos)
 
     def query(
@@ -247,10 +298,12 @@ class MoondreamModel(nn.Module):
             raise NotImplementedError("Model does not support querying.")
 
         image = self.encode_image(image)
+        self.load_encoded_image(image)
+
         prompt_tokens = torch.tensor(
             [
                 self.config.tokenizer.templates["query"]["prefix"]
-                + self.tokenizer.encode(question).ids
+                + self.tokenizer.encode(" " + question).ids
                 + self.config.tokenizer.templates["query"]["suffix"]
             ],
             device=self.device,
@@ -261,9 +314,7 @@ class MoondreamModel(nn.Module):
         max_tokens = settings.get("max_tokens", DEFAULT_MAX_TOKENS)
 
         def generator():
-            for token in self._generate_text(
-                prompt_tokens, image.kv_cache, image.pos, max_tokens
-            ):
+            for token in self._generate_text(prompt_tokens, image.pos, max_tokens):
                 yield token
 
         if stream:
@@ -271,10 +322,15 @@ class MoondreamModel(nn.Module):
         else:
             return {"answer": "".join(list(generator()))}
 
+    def load_encoded_image(self, encoded_image: EncodedImage):
+        for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
+            b.kv_cache.k_cache[:, :, : k.size(2), :] = k
+            b.kv_cache.v_cache[:, :, : v.size(2), :] = v
+
     def caption(
         self,
         image: Union[Image.Image, EncodedImage],
-        length: Literal["normal", "short"] = "normal",
+        length: Literal["normal", "short", "long"] = "normal",
         stream: bool = False,
         settings: Optional[SamplingSettings] = None,
     ):
@@ -284,6 +340,8 @@ class MoondreamModel(nn.Module):
             raise ValueError(f"Model does not support caption length '{length}'.")
 
         image = self.encode_image(image)
+        self.load_encoded_image(image)
+
         prompt_tokens = torch.tensor(
             [self.config.tokenizer.templates["caption"][length]], device=self.device
        )
@@ -293,9 +351,7 @@ class MoondreamModel(nn.Module):
         max_tokens = settings.get("max_tokens", DEFAULT_MAX_TOKENS)
 
         def generator():
-            for token in self._generate_text(
-                prompt_tokens, image.kv_cache, image.pos, max_tokens
-            ):
+            for token in self._generate_text(prompt_tokens, image.pos, max_tokens):
                 yield token
 
         if stream:
@@ -306,15 +362,17 @@ class MoondreamModel(nn.Module):
     def _generate_points(
         self,
         hidden: torch.Tensor,
-        kv_cache: torch.Tensor,
         next_token: torch.Tensor,
         pos: int,
         include_size: bool = True,
         max_points: int = 50,
     ):
         out = []
+        mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
+        mask[:, :, :pos] = 1
+        pos_ids = torch.tensor([pos], device=self.device, dtype=torch.long)
 
-        with torch.
+        with torch.inference_mode():
             while (
                 next_token.item() != self.config.tokenizer.eos_id
                 and len(out) < max_points
@@ -326,12 +384,8 @@ class MoondreamModel(nn.Module):
                 )
 
                 # Decode y-coordinate
-                )
-                kv_cache[:, :, :, :, pos : pos + kv_cache_update.size(-2), :] = (
-                    kv_cache_update
-                )
+                mask[:, :, pos], pos_ids[0] = 1, pos
+                _, hidden = self._decode_one_tok(next_emb, mask, pos_ids)
                 pos += 1
                 y_logits = decode_coordinate(hidden, self.region)
                 y_center = torch.argmax(y_logits, dim=-1) / y_logits.size(-1)
@@ -341,16 +395,20 @@ class MoondreamModel(nn.Module):
 
                 # Decode size
                 if include_size:
-                    )
-                    kv_cache[:, :, :, :, pos : pos + kv_cache_update.size(-2), :] = (
-                        kv_cache_update
-                    )
+                    mask[:, :, pos], pos_ids[0] = 1, pos
+                    logits, hidden = self._decode_one_tok(next_emb, mask, pos_ids)
                     pos += 1
                     size_logits = decode_size(hidden, self.region)
+
+                    # Get bin indices from the logits
+                    w_bin = torch.argmax(size_logits[0], dim=-1)
+                    h_bin = torch.argmax(size_logits[1], dim=-1)
+
+                    # Convert from bin indices to actual size values using the inverse of the log-scale mapping
+                    # Formula: size = 2^((bin / 1023.0) * 10.0 - 10.0)
+                    w = torch.pow(2.0, (w_bin.float() / 1023.0) * 10.0 - 10.0)
+                    h = torch.pow(2.0, (h_bin.float() / 1023.0) * 10.0 - 10.0)
+
                     next_emb = encode_size(
                         torch.tensor(
                             [w, h], device=self.device, dtype=size_logits.dtype
@@ -371,12 +429,8 @@ class MoondreamModel(nn.Module):
                 out.append({"x": x_center.item(), "y": y_center.item()})
 
                 # Decode next token (x-coordinate, or eos)
-                )
-                kv_cache[:, :, :, :, pos : pos + kv_cache_update.size(-2), :] = (
-                    kv_cache_update
-                )
+                mask[:, :, pos], pos_ids[0] = 1, pos
+                logits, hidden = self._decode_one_tok(next_emb, mask, pos_ids)
                 pos += 1
                 next_token = torch.argmax(logits, dim=-1)
 
@@ -392,23 +446,22 @@ class MoondreamModel(nn.Module):
             raise NotImplementedError("Model does not support object detection.")
 
         image = self.encode_image(image)
+        self.load_encoded_image(image)
+
         prompt_tokens = torch.tensor(
             [
                 self.config.tokenizer.templates["detect"]["prefix"]
-                + self.tokenizer.encode(object).ids
+                + self.tokenizer.encode(" " + object).ids
                 + self.config.tokenizer.templates["detect"]["suffix"]
             ],
             device=self.device,
        )
 
-        _, hidden, next_token, pos = self._prefill_prompt(
-            kv_cache, prompt_tokens, image.pos
-        )
+        _, hidden, next_token, pos = self._prefill_prompt(prompt_tokens, image.pos)
         hidden = hidden[:, -1:, :]
 
         objects = self._generate_points(
-            hidden,
+            hidden, next_token, pos, include_size=True, max_points=50
        )
 
         return {"objects": objects}
@@ -423,23 +476,22 @@ class MoondreamModel(nn.Module):
             raise NotImplementedError("Model does not support pointing.")
 
         image = self.encode_image(image)
+        self.load_encoded_image(image)
+
         prompt_tokens = torch.tensor(
             [
                 self.config.tokenizer.templates["point"]["prefix"]
-                + self.tokenizer.encode(object).ids
+                + self.tokenizer.encode(" " + object).ids
                 + self.config.tokenizer.templates["point"]["suffix"]
             ],
             device=self.device,
        )
 
-        _, hidden, next_token, pos = self._prefill_prompt(
-            kv_cache, prompt_tokens, image.pos
-        )
+        _, hidden, next_token, pos = self._prefill_prompt(prompt_tokens, image.pos)
         hidden = hidden[:, -1:, :]
 
         objects = self._generate_points(
-            hidden,
+            hidden, next_token, pos, include_size=False, max_points=50
        )
 
         return {"points": objects}
@@ -450,7 +502,7 @@ class MoondreamModel(nn.Module):
         source: Tuple[float, float],
         force_detect: bool = False,
     ):
-        with torch.
+        with torch.inference_mode():
             before_emb = text_encoder(
                 torch.tensor(
                     [self.tokenizer.encode("\n\nPoint:").ids], device=self.device
@@ -474,10 +526,13 @@ class MoondreamModel(nn.Module):
 
             prompt_emb = torch.cat([before_emb, x_emb, y_emb, after_emb], dim=1)
 
+            self.load_encoded_image(image)
+
+            mask = self.attn_mask[:, :, image.pos : image.pos + prompt_emb.size(1), :]
+            pos_ids = torch.arange(
+                image.pos, image.pos + prompt_emb.size(1), dtype=torch.long
             )
+            hidden = self._prefill(prompt_emb, mask, pos_ids)
             logits = lm_head(hidden, self.text)
             next_token = torch.argmax(logits, dim=-1)
             pos = image.pos + prompt_emb.size(1)
@@ -490,7 +545,7 @@ class MoondreamModel(nn.Module):
             return None
 
         gaze = self._generate_points(
-            hidden,
+            hidden, next_token, pos, include_size=False, max_points=1
        )
         return gaze[0]
 
@@ -584,3 +639,16 @@ class MoondreamModel(nn.Module):
        )
 
         return {"gaze": {"x": mean_gaze[0], "y": mean_gaze[1]}}
+
+
+def _is_cjk_char(cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    if (
+        (cp >= 0x4E00 and cp <= 0x9FFF)
+        or (cp >= 0x3400 and cp <= 0x4DBF)
+        or (cp >= 0x2F800 and cp <= 0x2FA1F)
+    ):
+        return True
+    return False
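A standalone sketch of the cache snapshot/restore idea behind EncodedImage.caches and load_encoded_image above; the shapes are illustrative (1 batch, 32 KV heads, 2048 context positions, 64 head dim), not read from the checkpoint:

    import torch

    k_cache = torch.zeros(1, 32, 2048, 64)
    v_cache = torch.zeros(1, 32, 2048, 64)

    # After prefilling the BOS token plus image embeddings up to position `pos`,
    # snapshot just that prefix so later prompts can reuse it without re-encoding.
    pos = 730
    snapshot = (k_cache[:, :, :pos, :].clone(), v_cache[:, :, :pos, :].clone())

    # Before answering another question about the same image, copy the prefix back in.
    k, v = snapshot
    k_cache[:, :, : k.size(2), :] = k
    v_cache[:, :, : v.size(2), :] = v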
region.py
CHANGED
@@ -1,7 +1,7 @@
 import torch
+import torch.nn as nn
 import math
 
-from .weights import RegionModel
 from .layers import linear, mlp
 
 
@@ -25,7 +25,7 @@ def fourier_features(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
     return torch.cat([f.cos(), f.sin()], dim=-1)
 
 
-def encode_coordinate(coord: torch.Tensor, w:
+def encode_coordinate(coord: torch.Tensor, w: nn.Module) -> torch.Tensor:
     """
     Takes as input a tensor containing a single float coordinate value (x or y)
     and encodes it into hidden states for input to the text model.
@@ -39,7 +39,7 @@ def encode_coordinate(coord: torch.Tensor, w: RegionModel) -> torch.Tensor:
     return linear(fourier_features(coord, w.coord_features), w.coord_encoder)
 
 
-def decode_coordinate(hidden_state: torch.Tensor, w:
+def decode_coordinate(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
     """
     Takes as input the last hidden state from the text model and outputs a single logit
     representing either an x or y coordinate prediction.
@@ -53,13 +53,13 @@ def decode_coordinate(hidden_state: torch.Tensor, w: RegionModel) -> torch.Tensor:
     return mlp(hidden_state, w.coord_decoder)
 
 
-def encode_size(size: torch.Tensor, w:
+def encode_size(size: torch.Tensor, w: nn.Module) -> torch.Tensor:
     """
-    Takes a tensor containing
+    Takes a tensor containing width and height values and encodes them into
+    hidden states for input to the text model.
 
     Args:
-        size: Tensor with two floats for width and height
+        size: Tensor with two floats for width and height
 
     Returns:
         Encoded hidden states tensor for input to text model
@@ -67,16 +67,23 @@ def encode_size(size: torch.Tensor, w: RegionModel) -> torch.Tensor:
     return linear(fourier_features(size, w.size_features), w.size_encoder)
 
 
-def decode_size(hidden_state: torch.Tensor, w:
+def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
     """
-    Takes as input the last hidden state from the text model and outputs
-    for width and height
+    Takes as input the last hidden state from the text model and outputs logits
+    for 1024 bins representing width and height in log-scale.
+
+    The bins are distributed according to the formula:
+    bin = (log2(size) + 10.0) / 10.0 * 1023.0
+    where size values are clamped to be at least 1/1024.
+
+    To convert from bin back to size:
+    size = 2^((bin / 1023.0) * 10.0 - 10.0)
 
     Args:
         hidden_state: The final hidden state tensor from the text model.
 
     Returns:
-        A tensor containing
+        A tensor containing logits for 1024 bins for width and height.
+        Shape is (2, 1024) where the first dimension corresponds to width and height.
     """
     return mlp(hidden_state, w.size_decoder).view(2, -1)
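A quick round-trip check of the log-scale size binning described in decode_size's docstring (standalone sketch, not part of the repo):

    import math

    def size_to_bin(size: float) -> float:
        size = max(size, 1 / 1024)  # clamp as described in the docstring
        return (math.log2(size) + 10.0) / 10.0 * 1023.0

    def bin_to_size(b: float) -> float:
        return 2.0 ** ((b / 1023.0) * 10.0 - 10.0)

    for s in (1 / 1024, 0.05, 0.5, 1.0):
        b = size_to_bin(s)
        # e.g. size=0.5000 -> bin ~920.70 -> size=0.5000
        print(f"size={s:.4f} -> bin={b:7.2f} -> size={bin_to_size(b):.4f}")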
text.py
CHANGED
@@ -1,10 +1,10 @@
 import torch
 import torch.nn as nn
+
 from torch.nn import functional as F
 
-from .layers import layer_norm,
+from .layers import layer_norm, mlp
 from .rope import apply_rotary_emb, precompute_freqs_cis
-from .weights import AttentionWeights
 from .config import TextConfig
 
 
@@ -14,106 +14,153 @@ def text_encoder(input_ids: torch.Tensor, w: nn.Module):
 
 def attn(
     x: torch.Tensor,
-    w:
+    w: nn.Module,
     freqs_cis: torch.Tensor,
+    kv_cache: nn.Module,
     attn_mask: torch.Tensor,
     n_heads: int,
+    n_kv_heads: int,
+    position_ids: torch.Tensor,
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
 
-    position_ids = torch.arange(pos, pos + q_len, dtype=torch.long)
+    qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
+    q_dim = n_heads * head_dim
+    kv_dim = n_kv_heads * head_dim
+
+    q = qkv_out[..., :q_dim].view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+    k = (
+        qkv_out[..., q_dim : q_dim + kv_dim]
+        .view(bsz, q_len, n_kv_heads, head_dim)
+        .transpose(1, 2)
+    )
+    v = (
+        qkv_out[..., q_dim + kv_dim :]
+        .view(bsz, q_len, n_kv_heads, head_dim)
+        .transpose(1, 2)
+    )
+
     q = apply_rotary_emb(q, freqs_cis, position_ids, n_heads)
-    k = apply_rotary_emb(k, freqs_cis, position_ids,
-    out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask).to(
-        # This type conversion isn't needed when running in PyTorch directly, but the
-        # ONNX export runs attention in float32 because the attention mask is cast to
-        # float32.
-        x.dtype
+    k = apply_rotary_emb(k, freqs_cis, position_ids, n_kv_heads)
+
+    if kv_cache is not None:
+        k, v = kv_cache.update(position_ids, k, v)
+
+    out = F.scaled_dot_product_attention(
+        q, k, v, attn_mask=attn_mask, enable_gqa=n_heads != n_kv_heads
     )
     out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
-    out =
+    out = w.proj(out)
     return out
 
 
-def
-    w:
+def _attn(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    freqs_cis: torch.Tensor,
+    attn_mask: torch.Tensor,
+    n_heads: int,
+    n_kv_heads: int,
 ):
+    bsz, q_len, d_model = x.shape
+    head_dim = d_model // n_heads
+    pos = 0
+
+    qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
+    q_dim = n_heads * head_dim
+    kv_dim = n_kv_heads * head_dim
+
+    q = qkv_out[..., :q_dim].view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+    k = (
+        qkv_out[..., q_dim : q_dim + kv_dim]
+        .view(bsz, q_len, n_kv_heads, head_dim)
+        .transpose(1, 2)
+    )
+    v = (
+        qkv_out[..., q_dim + kv_dim :]
+        .view(bsz, q_len, n_kv_heads, head_dim)
+        .transpose(1, 2)
+    )
+
+    position_ids = torch.arange(pos, pos + q_len, dtype=torch.long)
+    q = apply_rotary_emb(q, freqs_cis, position_ids, n_heads)
+    k = apply_rotary_emb(k, freqs_cis, position_ids, n_kv_heads)
+    out = F.scaled_dot_product_attention(
+        q, k, v, attn_mask=attn_mask, enable_gqa=n_heads != n_kv_heads
+    )
+    out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
+    out = w.proj(out)
+    return out
+
+
+def _produce_hidden(inputs_embeds: torch.Tensor, w: nn.Module, config: TextConfig):
     hidden_BTC = inputs_embeds
-    new_kv_cache = [torch.empty(0)] * len(w.blocks)
-    ]
+
+    bsz, q_len, d_model = inputs_embeds.shape
+    attn_mask = torch.zeros(q_len, q_len)
+    attn_mask[:730, :730] = 1
+    for i in range(730, q_len):
+        attn_mask[i, : i + 1] = 1
+    attn_mask = attn_mask.to(dtype=torch.bool)
 
     for i, block in enumerate(w.blocks):
         l_in = layer_norm(hidden_BTC, block.ln)
-        l_attn
+        l_attn = _attn(
+            x=l_in,
+            w=block.attn,
+            freqs_cis=w.freqs_cis,
+            attn_mask=attn_mask,
+            n_heads=config.n_heads,
+            n_kv_heads=config.n_kv_heads,
+        )
+        l_mlp = mlp(l_in, block.mlp)
+        hidden_BTC = hidden_BTC + l_attn + l_mlp
+
+    return hidden_BTC
+
+
+def text_decoder(
+    x: torch.Tensor,
+    w: nn.Module,
+    attn_mask: torch.Tensor,
+    position_ids: torch.Tensor,
+    config: TextConfig,
+):
+    for i, block in enumerate(w.blocks):
+        l_in = layer_norm(x, block.ln)
+        l_attn = attn(
             l_in,
             block.attn,
             freqs_cis=w.freqs_cis,
+            kv_cache=block.kv_cache,
             attn_mask=attn_mask,
             n_heads=config.n_heads,
+            n_kv_heads=config.n_kv_heads,
+            position_ids=position_ids,
        )
         l_mlp = mlp(l_in, block.mlp)
+        x = x + l_attn + l_mlp
 
-    return
+    return x
 
 
 def lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
     hidden_BC = hidden_BTC[:, -1, :]
     hidden_BC = layer_norm(hidden_BC, w.post_ln)
-    logits =
+    logits = w.lm_head(hidden_BC)
     return logits
 
 
-def
-    w: nn.Module,
-    config: TextConfig,
-):
-    # Updates kv_cache in-place
-    hidden, kv_cache[:, :, :, :, pos : pos + inputs_embeds.size(1), :] = text_decoder(
-        inputs_embeds, w, kv_cache, pos, config
-    )
-    return hidden
-
-
-def decode_one_token(
-    token_emb: torch.Tensor,
-    kv_cache: torch.Tensor,
-    pos: int,
-    w: nn.Module,
-    config: TextConfig,
-):
-    hidden, kv_cache_update = text_decoder(token_emb[None], w, kv_cache, pos, config)
-    logits = lm_head(hidden, w)
-    return logits, hidden, kv_cache_update
+def _lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
+    hidden_BTC = layer_norm(hidden_BTC, w.post_ln)
+    logits = w.lm_head(hidden_BTC)
+    return logits
 
 
 def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
+    qkv_dim = int(config.dim * (1 + 2 * config.n_kv_heads / config.n_heads))
+
     text = nn.ModuleDict(
         {
             "blocks": nn.ModuleList(
@@ -123,9 +170,7 @@ def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
                         "ln": nn.LayerNorm(config.dim, dtype=dtype),
                         "attn": nn.ModuleDict(
                             {
-                                "qkv": nn.Linear(
-                                    config.dim, 3 * config.dim, dtype=dtype
-                                ),
+                                "qkv": nn.Linear(config.dim, qkv_dim, dtype=dtype),
                                 "proj": nn.Linear(
                                     config.dim, config.dim, dtype=dtype
                                 ),
@@ -134,10 +179,10 @@ def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
                         "mlp": nn.ModuleDict(
                             {
                                 "fc1": nn.Linear(
-                                    config.dim,
+                                    config.dim, config.ff_dim, dtype=dtype
                                 ),
                                 "fc2": nn.Linear(
+                                    config.ff_dim, config.dim, dtype=dtype
                                 ),
                             }
                         ),
@@ -157,11 +202,4 @@ def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
         persistent=False,
     )
 
-    attn_mask = torch.tril(
-        torch.ones(1, 1, config.max_context, config.max_context, dtype=torch.bool)
-    )
-    if config.prefix_attn != 0:
-        attn_mask[..., : config.prefix_attn, : config.prefix_attn] = 1
-    text.register_buffer("attn_mask", attn_mask, persistent=False)
-
     return text
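A shape check of the fused-QKV split used by attn()/_attn() above (standalone sketch, not part of the repo). It uses a grouped-query setting (n_kv_heads=8) purely for illustration; the shipped config uses 32/32, and the enable_gqa argument to scaled_dot_product_attention requires a recent PyTorch (2.5+).

    import torch
    import torch.nn.functional as F

    bsz, q_len, dim, n_heads, n_kv_heads = 1, 8, 2048, 32, 8
    head_dim = dim // n_heads
    qkv_dim = int(dim * (1 + 2 * n_kv_heads / n_heads))  # 3072 for 32/8

    qkv_out = torch.randn(bsz, q_len, qkv_dim)
    q_dim, kv_dim = n_heads * head_dim, n_kv_heads * head_dim

    # Same slicing/reshaping as the functions above.
    q = qkv_out[..., :q_dim].view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
    k = qkv_out[..., q_dim : q_dim + kv_dim].view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
    v = qkv_out[..., q_dim + kv_dim :].view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)

    out = F.scaled_dot_product_attention(q, k, v, enable_gqa=n_heads != n_kv_heads)
    print(q.shape, k.shape, out.shape)
    # torch.Size([1, 32, 8, 64]) torch.Size([1, 8, 8, 64]) torch.Size([1, 32, 8, 64])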
vision.py
CHANGED
@@ -4,7 +4,6 @@ import torch.nn.functional as F
 import numpy as np
 
 from typing import Union, Tuple
-from einops import rearrange
 from PIL import Image
 
 from .layers import attn, layer_norm, linear, mlp
@@ -42,13 +41,28 @@ def prepare_crops(
     return all_crops, overlap_crops["tiling"]
 
 
+def create_patches(x, patch_size):
+    # Original shape: [B, C, H, W]
+    B, C, H, W = x.shape
+    P1 = P2 = patch_size
+
+    # Step 1: Split H and W dimensions into patches
+    # [B, C, H/P1, P1, W/P2, P2]
+    x = x.reshape(B, C, H // P1, P1, W // P2, P2)
+
+    # Step 2: Rearrange dimensions to match target shape
+    # [B, H/P1, W/P2, C, P1, P2]
+    x = x.permute(0, 2, 4, 1, 3, 5)
+
+    # Step 3: Combine dimensions to get final shape
+    # [B, (H/P1)*(W/P2), C*P1*P2]
+    x = x.reshape(B, (H // P1) * (W // P2), C * P1 * P2)
+
+    return x
+
+
 def vision_encoder(input_BCHW: torch.Tensor, w: nn.Module, config: VisionConfig):
-    x =
-        input_BCHW,
-        "b c (h p1) (w p2) -> b (h w) (c p1 p2)",
-        p1=config.enc_patch_size,
-        p2=config.enc_patch_size,
-    )  # B3HW -> B(HxW)(3xP1xP2), aka BTC
+    x = create_patches(input_BCHW, config.enc_patch_size)
 
     x = linear(x, w.patch_emb)
     x = x + w.pos_emb
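A quick equivalence check between the new create_patches and the einops rearrange pattern it replaces (standalone sketch; einops is only needed for this check, not by the repo anymore):

    import torch
    from einops import rearrange

    def create_patches(x, patch_size):
        B, C, H, W = x.shape
        P1 = P2 = patch_size
        x = x.reshape(B, C, H // P1, P1, W // P2, P2)
        x = x.permute(0, 2, 4, 1, 3, 5)
        return x.reshape(B, (H // P1) * (W // P2), C * P1 * P2)

    x = torch.randn(2, 3, 28, 28)
    a = create_patches(x, 14)
    b = rearrange(x, "b c (h p1) (w p2) -> b (h w) (c p1 p2)", p1=14, p2=14)
    print(a.shape, torch.equal(a, b))  # torch.Size([2, 4, 588]) True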