zhouzaida committed

Commit 73cd0c7 · Parent(s): 704b5c8

make vit work with cpu device

Files changed: modeling_kimi_vl.py (+29 -61)

modeling_kimi_vl.py CHANGED
@@ -44,7 +44,6 @@ import math
 import warnings
 from typing import List, Optional, Tuple, Union
 from copy import deepcopy
-from functools import cached_property
 from typing import Union, Tuple, Sequence, Optional, List

 import numpy as np
@@ -66,10 +65,7 @@ from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
 )
-from transformers.pytorch_utils import (
-    ALL_LAYERNORM_LAYERS,
-    is_torch_greater_or_equal_than_1_13,
-)
+from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -80,7 +76,7 @@ from transformers.utils import (
 )
 from transformers.utils.import_utils import is_torch_fx_available

-from 
+from configuration_kimi_vl import MoonViTConfig, DeepseekV3Config, KimiVLConfig


 if is_flash_attn_2_available():
@@ -280,18 +276,18 @@ class MoonVisionPatchEmbed(nn.Module):
             height=pos_emb_height, width=pos_emb_width, dim=out_dim
         )

-    def forward(self, x: torch.Tensor, 
+    def forward(self, x: torch.Tensor, grid_hws: torch.Tensor) -> torch.Tensor:
         """
         Args:
             x (L, Channels): input tensor
-
+            grid_hws (N, 2): grid height and width

         Returns:
             (L, Cout) tensor
         """
         x = self.proj(x).view(x.size(0), -1)
         # apply positional embedding
-        x = self.pos_emb(x, 
+        x = self.pos_emb(x, grid_hws)
         return x


@@ -317,22 +313,20 @@ class Rope2DPosEmb(nn.Module):
         device (str): the device to store the precomputed cis
     """

-    def __init__(
-        self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda"
-    ):
+    def __init__(self, dim: int, max_height: int, max_width: int, theta_base=10000):
         super().__init__()
         self.dim = dim
         assert self.dim % 4 == 0, "dim must be divisible by 4"
         self.max_height = max_height
         self.max_width = max_width
         self.theta_base = theta_base
-        self.device = device
+
+        self.freqs_cis = None

     def extra_repr(self):
         return f"dim={self.dim}, max_height={self.max_height}, max_width={self.max_width}, theta_base={self.theta_base}"

-    @cached_property
-    def precomputed_freqs_cis(self) -> torch.Tensor:
+    def _precompute_freqs_cis(self, device: torch.device) -> torch.Tensor:
         """Calculate the cis(freqs) for each position in the 2D grid.

         Return: complex tensor of shape (max_height, max_width, dim//2) and value:
@@ -341,11 +335,11 @@ class Rope2DPosEmb(nn.Module):
             note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
         """
         N = self.max_height * self.max_width
-        flat_pos = torch.arange(0, N).float().to(self.device)
+        flat_pos = torch.arange(0, N).float().to(device)
         x_pos = flat_pos % self.max_width
         y_pos = flat_pos // self.max_width
         dim_range = (
-            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(self.device)
+            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(device)
         )  # C/4
         freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
         x_freqs = torch.outer(x_pos, freqs).float()  # N, C/4
@@ -360,13 +354,17 @@ class Rope2DPosEmb(nn.Module):
         freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
         return freqs_cis

-    def 
+    def get_freqs_cis(self, grid_hws: torch.Tensor) -> torch.Tensor:
         """
         Args:
-            grid_hws (torch.Tensor): 
+            grid_hws (torch.Tensor): grid height and width
+
         Returns:
             freqs_cis: tensor of shape (sum(t * height * width), dim//2)
         """
+        if self.freqs_cis is None:
+            self.freqs_cis = self._precompute_freqs_cis(grid_hws.device)
+
         shapes = grid_hws.tolist()
         assert all(
             1 <= h <= self.max_height and 1 <= w <= self.max_width for h, w in shapes
@@ -376,41 +374,11 @@ class Rope2DPosEmb(nn.Module):
             self.max_width,
         )
         freqs_cis = torch.cat(
-            [
-                self.precomputed_freqs_cis[:h, :w].reshape(-1, self.dim // 2)
-                for h, w in shapes
-            ],
+            [self.freqs_cis[:h, :w].reshape(-1, self.dim // 2) for h, w in shapes],
             dim=0,
         )
         return freqs_cis

-    def get_freqs_cis_by_idx(
-        self, pos_idx: torch.Tensor, pos_idx_mask: torch.Tensor
-    ) -> torch.Tensor:
-        """
-        Args:
-            pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
-            pos_idx_mask: a mask of shape (...), the leading dimensions should be the same as pos_idx.
-                Rope will only be applied to the tokens with True mask. `freqs_cis` for the tokens with False mask with be ones.
-        Return:
-            freqs_cis: tensor of shape (..., dim//2)
-        """
-        assert (
-            pos_idx.shape[:-1] == pos_idx_mask.shape
-            and pos_idx.shape[-1] == 2
-            and pos_idx.ndim == pos_idx_mask.ndim + 1
-        ), (pos_idx.shape, pos_idx_mask.shape)
-        assert pos_idx_mask.dtype == torch.bool, pos_idx_mask.dtype
-
-        shp = pos_idx_mask.shape + (self.dim // 2,)  # ..., head_dim/2
-        freqs_cis = torch.ones(
-            shp, dtype=torch.complex64, device=self.device
-        )  # ..., head_dim/2
-        freqs_cis[pos_idx_mask] = self.precomputed_freqs_cis[
-            pos_idx[..., 0][pos_idx_mask], pos_idx[..., 1][pos_idx_mask]
-        ]
-        return freqs_cis
-

 class MLP2(nn.Module):
     """
@@ -537,14 +505,14 @@ class MoonVitEncoder(nn.Module):
         self.final_layernorm = nn.LayerNorm(hidden_dim)

     def forward(
-        self, hidden_states: torch.Tensor, 
+        self, hidden_states: torch.Tensor, grid_hws: torch.Tensor
     ) -> torch.Tensor:
-        rope_freqs_cis = self.rope_2d.
+        rope_freqs_cis = self.rope_2d.get_freqs_cis(grid_hws=grid_hws)

         lengths = torch.cat(
             (
-                torch.zeros(1, device=hidden_states.device, dtype=
-
+                torch.zeros(1, device=hidden_states.device, dtype=grid_hws.dtype),
+                grid_hws[:, 0] * grid_hws[:, 1],
             )
         )
         cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)
@@ -561,14 +529,14 @@

 def patch_merger(
     x: torch.Tensor,
-
+    grid_hws: torch.Tensor,
     merge_kernel_size: list[int, int] = (2, 2),
 ) -> List[torch.Tensor]:
     d_model = x.size(-1)

     outputs = []
     pre_sum = 0
-    for x_shape in 
+    for x_shape in grid_hws.tolist():
         height, width = x_shape[0], x_shape[1]
         # Get the current sequence
         seq = x[pre_sum : pre_sum + height * width]
@@ -2290,20 +2258,20 @@ class MoonVitPretrainedModel(PreTrainedModel):
         )

     def forward(
-        self, pixel_values: torch.Tensor, 
+        self, pixel_values: torch.Tensor, grid_hws: torch.Tensor
     ) -> torch.Tensor:
         """
         Args:
             pixel_values (torch.Tensor): The input pixel values.
-
+            grid_hws (torch.Tensor): The grid height and width.

         Returns:
             torch.Tensor: The output tokens.
         """
-        hidden_states = self.patch_embed(pixel_values, 
-        hidden_states = self.encoder(hidden_states, 
+        hidden_states = self.patch_embed(pixel_values, grid_hws)
+        hidden_states = self.encoder(hidden_states, grid_hws)
         hidden_states = patch_merger(
-            hidden_states, 
+            hidden_states, grid_hws, merge_kernel_size=self.merge_kernel_size
        )
         return hidden_states
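The substance of the commit is replacing the eager `precomputed_freqs_cis` table, which was built in `__init__` with a hard-coded `device="cuda"` default, with a `_precompute_freqs_cis(device)` that is materialized lazily on whatever device the first `grid_hws` tensor lives on. Below is a minimal, self-contained sketch of that pattern; the class name `LazyRope2D` and the concatenated x/y frequency layout are illustrative assumptions (the unchanged middle of `_precompute_freqs_cis` is not shown in this diff), not the repository's exact code.

import torch
import torch.nn as nn


class LazyRope2D(nn.Module):
    """Lazy, device-agnostic 2D RoPE table (sketch, not the repository file)."""

    def __init__(self, dim: int, max_height: int, max_width: int, theta_base: float = 10000.0):
        super().__init__()
        assert dim % 4 == 0, "dim must be divisible by 4"
        self.dim = dim
        self.max_height = max_height
        self.max_width = max_width
        self.theta_base = theta_base
        self.freqs_cis = None  # built on first use, on the caller's device

    def _precompute(self, device: torch.device) -> torch.Tensor:
        # cis(theta) = cos(theta) + i*sin(theta) for every (y, x) position in the max grid
        n = self.max_height * self.max_width
        flat = torch.arange(n, device=device).float()
        x_pos, y_pos = flat % self.max_width, flat // self.max_width
        freqs = 1.0 / (self.theta_base ** (torch.arange(0, self.dim, 4, device=device).float() / self.dim))
        # illustrative layout: x-angles then y-angles along the last dim -> (N, dim//2)
        angles = torch.cat((torch.outer(x_pos, freqs), torch.outer(y_pos, freqs)), dim=-1)
        cis = torch.polar(torch.ones_like(angles), angles)  # complex64, (N, dim//2)
        return cis.reshape(self.max_height, self.max_width, -1)

    def get_freqs_cis(self, grid_hws: torch.Tensor) -> torch.Tensor:
        if self.freqs_cis is None:  # lazy init: behaves the same on CPU, CUDA, or MPS
            self.freqs_cis = self._precompute(grid_hws.device)
        return torch.cat(
            [self.freqs_cis[:h, :w].reshape(-1, self.dim // 2) for h, w in grid_hws.tolist()],
            dim=0,
        )


# Quick CPU check of the sketch: two images with 14x14 and 7x21 patch grids.
rope = LazyRope2D(dim=64, max_height=64, max_width=64)
freqs = rope.get_freqs_cis(torch.tensor([[14, 14], [7, 21]]))
print(freqs.shape, freqs.device)  # torch.Size([343, 32]) cpu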

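For context, the `lengths`/`cu_seqlens` lines changed in `MoonVitEncoder.forward` simply turn `grid_hws` into cumulative per-image token boundaries, so that path needs no device-specific handling either. A small worked example (the two tensor lines mirror the diff; the grid values are illustrative):

import torch

grid_hws = torch.tensor([[14, 14], [7, 21]])  # per-image patch-grid heights and widths
lengths = torch.cat((torch.zeros(1, dtype=grid_hws.dtype), grid_hws[:, 0] * grid_hws[:, 1]))
cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)
print(cu_seqlens.tolist())  # [0, 196, 343] -> cumulative sequence lengths for varlen attention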