kuleshov-group
/

bd3lm-owt-block_size8

Text Generation

language-modeling

Model card Files Files and versions

marriola committed on Mar 4

Commit

66201c3

·

verified ·

1 Parent(s): 3d9f895

Upload BD3LM

Files changed (1) hide show

modeling_bd3lm.py +2 -4

modeling_bd3lm.py CHANGED Viewed

@@ -299,7 +299,6 @@ class DDiTBlock(nn.Module):
     else:
       return bias_dropout_add_scale_fused_inference
   def get_qkv(self, x, rotary_cos_sin, store_kv=False):
     # compute qkv (potentially use cache)
     if self.kv_cache is not None:
@@ -307,11 +306,10 @@ class DDiTBlock(nn.Module):
       qkv = torch.cat((self.kv_cache, new_qkv), dim=1)
     else:
       qkv = self.attn_qkv(x)
     # store kv cache in a sliding window (can't exceed context len)
     if store_kv:
-      self.kv_cache = qkv
-      self.kv_cache = self.kv_cache[:, -self.n:]
     qkv = einops.rearrange(
       qkv,
       'b s (three h d) -> b s three h d',

     else:
       return bias_dropout_add_scale_fused_inference
   def get_qkv(self, x, rotary_cos_sin, store_kv=False):
     # compute qkv (potentially use cache)
     if self.kv_cache is not None:
       qkv = torch.cat((self.kv_cache, new_qkv), dim=1)
     else:
       qkv = self.attn_qkv(x)
     # store kv cache in a sliding window (can't exceed context len)
     if store_kv:
+      self.kv_cache = qkv[:, -(self.n-self.block_size):]
     qkv = einops.rearrange(
       qkv,
       'b s (three h d) -> b s three h d',