Upload DogeForCausalLM
modeling_doge.py (+1, -8)
@@ -864,14 +864,7 @@ class DogeModel(DogePreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool,
     ):
-
-        if attention_mask is not None and (attention_mask == 0.0).any():
-            return attention_mask
-        return None
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
+        # We have to provide attention_mask for dynamic mask computation
         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
         using_static_cache = isinstance(past_key_values, StaticCache)
 
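For context: the removed lines returned the raw 2D padding mask (or None) immediately, mirroring the flash-attention branch of the standard transformers _update_causal_mask, which made the dense-mask construction after them unreachable. Doge's dynamic mask computation needs that dense mask, hence the removal. Below is a minimal sketch of how such a dense 4D mask is typically assembled in transformers-style models; the function name, signature, and helper logic are assumptions for illustration, not this repository's exact code.

import torch

def prepare_4d_causal_mask(attention_mask, sequence_length, target_length, dtype, cache_position, batch_size):
    """Hypothetical helper: build the dense (batch, 1, query, key) mask that
    _update_causal_mask now always computes instead of returning early."""
    min_dtype = torch.finfo(dtype).min
    device = cache_position.device
    # Start fully masked, then keep min_dtype only above the diagonal (future keys).
    causal_mask = torch.full((sequence_length, target_length), min_dtype, dtype=dtype, device=device)
    if sequence_length != 1:
        causal_mask = torch.triu(causal_mask, diagonal=1)
    # Zero out (i.e. allow) key columns at or before each query's absolute position,
    # which handles the cached-prefix case where target_length > sequence_length.
    causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
    if attention_mask is not None:
        # Fold the 2D padding mask into the 4D causal mask.
        causal_mask = causal_mask.clone()
        mask_length = attention_mask.shape[-1]
        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
            padding_mask == 0, min_dtype
        )
    return causal_mask

With the early return gone, the retained past_seen_tokens and using_static_cache lines presumably feed the target_length calculation for a mask along these lines.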