Upload HfMoondream
- layers.py +9 -6
- model.safetensors +1 -1
- packing.py +7 -32
layers.py
CHANGED
@@ -39,15 +39,17 @@ class QuantizedLinear(nn.Module):
             {
                 "packed": nn.Parameter(
                     torch.empty(
-                        out_features
+                        out_features * in_features // (128 * 2), 128, dtype=torch.uint8
                     ),
                     requires_grad=False,
                 ),
                 "scale": nn.Parameter(
-                    torch.empty(out_features
+                    torch.empty(out_features * in_features // 128, 1),
+                    requires_grad=False,
                 ),
                 "zero_point": nn.Parameter(
-                    torch.empty(out_features
+                    torch.empty(out_features * in_features // 128, 1),
+                    requires_grad=False,
                 ),
             }
         )
@@ -57,13 +59,13 @@ class QuantizedLinear(nn.Module):
     def unpack(self):
         if self.unpacked:
             return
+
         self.weight = nn.Parameter(
             dequantize_tensor(
                 self.weight["packed"],
                 self.weight["scale"],
                 self.weight["zero_point"],
-                (self.
-                128,
+                (self.out_features, self.in_features),
                 torch.bfloat16,
             )
         )
@@ -75,10 +77,11 @@ class QuantizedLinear(nn.Module):
         self.linear.bias = nn.Parameter(
             self.bias.to(torch.bfloat16), requires_grad=False
         )
+
         del self.weight, self.bias
         quantize_(self, int4_weight_only(group_size=128))
-        torch.cuda.empty_cache()
         self.unpacked = True
+        torch.cuda.empty_cache()

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if not self.unpacked:
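For orientation, here is a quick sketch of the shape arithmetic behind the new parameter declarations in __init__. The layer sizes below are made-up examples; only the buffer shapes, dtypes, and the group size of 128 come from the diff. Two 4-bit values are packed per uint8 byte, so packed covers exactly out_features * in_features weights, while scale and zero_point hold one entry per group of 128 weights.

import torch

# Hypothetical layer sizes, chosen only to illustrate the shapes declared above.
out_features, in_features = 2048, 768
group_size = 128

packed = torch.empty(
    out_features * in_features // (group_size * 2), group_size, dtype=torch.uint8
)
scale = torch.empty(out_features * in_features // group_size, 1)
zero_point = torch.empty(out_features * in_features // group_size, 1)

# Two nibbles per byte: the packed buffer covers the full weight matrix.
assert packed.numel() * 2 == out_features * in_features
# One (scale, zero_point) pair per 128-weight group, i.e. two pairs per packed row.
assert scale.shape[0] == zero_point.shape[0] == 2 * packed.shape[0]

In unpack(), these buffers are dequantized into a bfloat16 weight of shape (out_features, in_features), the layer is re-quantized in place via torchao's quantize_(self, int4_weight_only(group_size=128)), and the CUDA cache flush now runs after unpacked is set.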
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b839cdbd6716eef6242536929c05243d58af49929a12c198d3913caa05c7c3ee
 size 2032380848
packing.py
CHANGED
@@ -1,35 +1,10 @@
 import torch


-def
-
-
-
-
-
-
-    lower = flat_bytes & 0xF
-    upper = (flat_bytes >> 4) & 0xF
-    unpacked = torch.stack([lower, upper], dim=1).reshape(batch_size, last_dim * 2)
-    unpacked = unpacked[:, :original_length]
-    unpacked = unpacked.reshape(*batch_shape, original_length)
-    return unpacked.to(torch.int8)
-
-
-def dequantize_tensor(
-    packed: torch.Tensor,
-    scales: torch.Tensor,
-    zero_points: torch.Tensor,
-    orig_shape: torch.Size,
-    block_size: int,
-    dtype: torch.dtype = torch.bfloat16,
-):
-    out_features, num_blocks, _ = packed.shape
-    unpacked = unpack_int4(packed, block_size)
-    scales_view = scales.unsqueeze(2)  # Shape: [out_features, num_blocks, 1]
-    zero_points_view = zero_points.unsqueeze(2)  # Shape: [out_features, num_blocks, 1]
-    dequantized = (unpacked.float() - zero_points_view) * scales_view
-    dequantized = dequantized.reshape(out_features, num_blocks * block_size)
-    dequantized = dequantized[:, : orig_shape[1]]
-    dequantized = dequantized.reshape(orig_shape)
-    return dequantized.to(dtype)
+def dequantize_tensor(W_q, scale, zero, orig_shape, dtype=torch.bfloat16):
+    _step = W_q.shape[0]
+    W_r = torch.empty([2 * _step, W_q.shape[1]], dtype=dtype, device=W_q.device)
+    W_r[:_step] = (W_q & 0b11110000) >> 4
+    W_r[_step:] = W_q & 0b00001111
+    W_r.sub_(zero).mul_(scale)
+    return W_r.reshape(orig_shape)
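The rewritten dequantize_tensor assumes each uint8 row of the packed tensor carries two groups of weights: the high nibbles reconstruct the first half of the output rows and the low nibbles the second half, after which zero_point and scale are applied per row and the result is reshaped to the original weight shape. The packing routine itself is not part of this commit, so the round-trip below uses a guessed-at compatible inverse; pack_int4, the import path, and all tensor sizes are assumptions made only for illustration.

import torch

from packing import dequantize_tensor  # the function shown in the diff above (import path assumed)

def pack_int4(W, group_size=128):
    # Hypothetical inverse of the new dequantize_tensor, written only to exercise it;
    # the packing actually used to produce model.safetensors is not shown in this commit.
    G = W.reshape(-1, group_size).float()            # one row per group of 128 weights
    g_min = G.min(dim=1, keepdim=True).values
    g_max = G.max(dim=1, keepdim=True).values
    scale = (g_max - g_min).clamp(min=1e-8) / 15.0   # asymmetric 4-bit range 0..15
    zero = -g_min / scale
    q = (G / scale + zero).round().clamp(0, 15).to(torch.uint8)
    step = q.shape[0] // 2
    packed = (q[:step] << 4) | q[step:]              # high nibbles hold the first half of the groups
    # scale/zero are returned in bfloat16 to match W_r, which dequantize_tensor updates in place.
    return packed, scale.to(torch.bfloat16), zero.to(torch.bfloat16)

W = torch.randn(256, 512, dtype=torch.bfloat16)
packed, scale, zero = pack_int4(W)
W_hat = dequantize_tensor(packed, scale, zero, W.shape)
# Reconstruction error should stay within roughly half a quantization step per group.
print((W_hat.float() - W.float()).abs().max())

Compared with the previous block-wise implementation, the new version skips the intermediate int8 unpack and per-block slicing, doing the whole dequantization with two nibble masks and a pair of in-place ops.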