kuleshov-group
/

bd3lm-owt-block_size16

Text Generation

language-modeling

Model card · Files and versions

marriola committed on Mar 4

Commit

00c5b41

·

verified ·

1 Parent(s): e58f74c

Upload BD3LM

Files changed (1) hide show

modeling_bd3lm.py +6 -8

modeling_bd3lm.py CHANGED Viewed

@@ -267,10 +267,11 @@ def regular_attention_multi_headed(qkv):
 class DDiTBlock(nn.Module):
-  def __init__(self, n, dim, n_heads, cond_dim, mlp_ratio=4,
                dropout=0.1, attn_backend='flash_attn'):
     super().__init__()
     self.n = n
     self.n_heads = n_heads
     self.attn_backend = attn_backend
     self.kv_cache = None
@@ -302,19 +303,15 @@ class DDiTBlock(nn.Module):
   def get_qkv(self, x, rotary_cos_sin, store_kv=False):
     # compute qkv (potentially use cache)
     if self.kv_cache is not None:
-      block_len = x.shape[1] - self.kv_cache.shape[1]
-      new_qkv = self.attn_qkv(x[:, -block_len:])
       qkv = torch.cat((self.kv_cache, new_qkv), dim=1)
     else:
       qkv = self.attn_qkv(x)
     # store kv cache in a sliding window (can't exceed context len)
     if store_kv:
-      if self.kv_cache is not None:
-        cache_len = min(x.shape[1], self.n - block_len)
-        self.kv_cache = qkv[:, -cache_len:]
-      else:
-        self.kv_cache = qkv
     qkv = einops.rearrange(
       qkv,
       'b s (three h d) -> b s three h d',
@@ -440,6 +437,7 @@ class DITBackbone(nn.Module):
     blocks = []
     for _ in range(config.n_blocks):
       blocks.append(DDiTBlock(self.n,
                               config.hidden_dim,
                               config.n_heads,
                               config.cond_dim,

 class DDiTBlock(nn.Module):
+  def __init__(self, n, block_size, dim, n_heads, cond_dim, mlp_ratio=4,
                dropout=0.1, attn_backend='flash_attn'):
     super().__init__()
     self.n = n
+    self.block_size = block_size
     self.n_heads = n_heads
     self.attn_backend = attn_backend
     self.kv_cache = None
   def get_qkv(self, x, rotary_cos_sin, store_kv=False):
     # compute qkv (potentially use cache)
     if self.kv_cache is not None:
+      new_qkv = self.attn_qkv(x[:, -self.block_size:])
       qkv = torch.cat((self.kv_cache, new_qkv), dim=1)
     else:
       qkv = self.attn_qkv(x)
     # store kv cache in a sliding window (can't exceed context len)
     if store_kv:
+      self.kv_cache = qkv
+      self.kv_cache = self.kv_cache[:, -self.n:]
     qkv = einops.rearrange(
       qkv,
       'b s (three h d) -> b s three h d',
     blocks = []
     for _ in range(config.n_blocks):
       blocks.append(DDiTBlock(self.n,
+                              self.block_size,
                               config.hidden_dim,
                               config.n_heads,
                               config.cond_dim,