Upload DogeForCausalLM
modeling_doge.py (+1, -8)
@@ -864,14 +864,7 @@ class DogeModel(DogePreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool,
     ):
-
-        if attention_mask is not None and (attention_mask == 0.0).any():
-            return attention_mask
-        return None
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
+        # We have to provide attention_mask for dynamic mask computation
         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
         using_static_cache = isinstance(past_key_values, StaticCache)
 
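For context: the removed lines returned the raw 2D padding mask (or None) immediately, mirroring the flash-attention branch of the standard transformers _update_causal_mask, which made the dense-mask construction after them unreachable. Doge's dynamic mask computation needs that dense mask, hence the removal. Below is a minimal sketch of how such a dense 4D mask is typically assembled in transformers-style models; the function name, signature, and helper logic are assumptions for illustration, not this repository's exact code.

import torch

def prepare_4d_causal_mask(attention_mask, sequence_length, target_length, dtype, cache_position, batch_size):
    """Hypothetical helper: build the dense (batch, 1, query, key) mask that
    _update_causal_mask now always computes instead of returning early."""
    min_dtype = torch.finfo(dtype).min
    device = cache_position.device
    # Start fully masked, then keep min_dtype only above the diagonal (future keys).
    causal_mask = torch.full((sequence_length, target_length), min_dtype, dtype=dtype, device=device)
    if sequence_length != 1:
        causal_mask = torch.triu(causal_mask, diagonal=1)
    # Zero out (i.e. allow) key columns at or before each query's absolute position,
    # which handles the cached-prefix case where target_length > sequence_length.
    causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
    if attention_mask is not None:
        # Fold the 2D padding mask into the 4D causal mask.
        causal_mask = causal_mask.clone()
        mask_length = attention_mask.shape[-1]
        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
            padding_mask == 0, min_dtype
        )
    return causal_mask

With the early return gone, the retained past_seen_tokens and using_static_cache lines presumably feed the target_length calculation for a mask along these lines.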