loua19 committed
Commit 0c2eef3 · 1 Parent(s): 42d10df
Files changed (3)
  1. config.json +1 -1
  2. model.safetensors +0 -3
  3. modeling_aria.py +25 -135
config.json CHANGED
@@ -10,7 +10,7 @@
   "model_type": "aria",
   "num_attention_heads": 24,
   "num_hidden_layers": 16,
-  "torch_dtype": "bfloat16",
+  "torch_dtype": "float32",
   "transformers_version": "4.45.0",
   "use_cache": true,
   "vocab_size": 17727,
model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9057480d90c91e0b9000f365ceafcbd7e21cd1940dc4bb25f1bd328cbe26c28f
-size 2634170640
modeling_aria.py CHANGED
@@ -180,13 +180,11 @@ class TransformerBlock(nn.Module):
                 xk, xv, self.layer_idx, cache_kwargs
             )

-        # scaled_dot_product_attention expects: (b_sz, n_head, s_len, d_head)
         att = F.scaled_dot_product_attention(
             query=xq,
             key=xk,
             value=xv,
-            attn_mask=attention_mask,
-            # is_causal=True,
+            attn_mask=attention_mask[..., : xk.shape[2]],
         )

         # Reshape for out: (b_sz, s_len, n_head, d_head)
@@ -215,6 +213,7 @@ class AriaModel(AriaPreTrainedModel):
         super().__init__(model_config)
         self.model_config = model_config
         self.freqs_cis = None
+        self.causal_mask = None

         self.tok_embeddings = nn.Embedding(
             num_embeddings=model_config.vocab_size,
@@ -341,13 +340,10 @@ class AriaModel(AriaPreTrainedModel):
         position_ids = cache_position.unsqueeze(0)
         hidden_states = inputs_embeds

-        causal_mask = self._update_causal_mask(
-            attention_mask,
-            inputs_embeds,
-            cache_position,
-            past_key_values,
-            output_attentions,
-        )
+        if self.causal_mask is None:
+            self.causal_mask = precompute_causal_mask(
+                max_seq_len=self.model_config.max_seq_len,
+            ).to(input_ids.device)

         if self.freqs_cis is None:
             self.freqs_cis = precompute_freqs_cis(
@@ -360,6 +356,19 @@ class AriaModel(AriaPreTrainedModel):

         freqs_cis = self.freqs_cis[cache_position]

+        if use_cache is True:
+            causal_mask = self.causal_mask[None, None, cache_position]
+        else:
+            causal_mask = self.causal_mask[None, None, :seq_length, :seq_length]
+
+        if attention_mask is not None:
+            pad_len = causal_mask.shape[3] - attention_mask.shape[1]
+            padded_attention_mask = F.pad(attention_mask, (0, pad_len), value=1)
+            padded_attention_mask = padded_attention_mask[:, None, None, :]
+            padded_attention_mask = padded_attention_mask.bool()
+
+            causal_mask = causal_mask & padded_attention_mask
+
         kwargs = {
             "position_ids": position_ids,
             "past_key_values": past_key_values,
@@ -432,130 +441,6 @@ class AriaModel(AriaPreTrainedModel):
             attentions=all_attentions,
         )

-    def _update_causal_mask(
-        self,
-        attention_mask: torch.Tensor,
-        input_tensor: torch.Tensor,
-        cache_position: torch.Tensor,
-        past_key_values: Cache,
-        output_attentions: bool,
-    ):
-        if self.model_config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and (attention_mask == 0.0).any():
-                return attention_mask
-            return None
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
-        past_seen_tokens = (
-            past_key_values.get_seq_length()
-            if past_key_values is not None
-            else 0
-        )
-        using_static_cache = isinstance(past_key_values, StaticCache)
-
-        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if (
-            self.model_config._attn_implementation == "sdpa"
-            and not using_static_cache
-            and not output_attentions
-        ):
-            if AttentionMaskConverter._ignore_causal_mask_sdpa(
-                attention_mask,
-                inputs_embeds=input_tensor,
-                past_key_values_length=past_seen_tokens,
-                is_training=self.training,
-            ):
-                return None
-
-        dtype, device = input_tensor.dtype, input_tensor.device
-        sequence_length = input_tensor.shape[1]
-        if using_static_cache:
-            target_length = past_key_values.get_max_cache_shape()
-        else:
-            target_length = (
-                attention_mask.shape[-1]
-                if isinstance(attention_mask, torch.Tensor)
-                else past_seen_tokens + sequence_length + 1
-            )
-
-        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-        causal_mask = (
-            self._prepare_4d_causal_attention_mask_with_cache_position(
-                attention_mask,
-                sequence_length=sequence_length,
-                target_length=target_length,
-                dtype=dtype,
-                device=device,
-                cache_position=cache_position,
-                batch_size=input_tensor.shape[0],
-            )
-        )
-
-        if (
-            self.model_config._attn_implementation == "sdpa"
-            and attention_mask is not None
-            and attention_mask.device.type == "cuda"
-            and not output_attentions
-        ):
-            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-            # Details: https://github.com/pytorch/pytorch/issues/110213
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = AttentionMaskConverter._unmask_unattended(
-                causal_mask, min_dtype
-            )
-
-        return causal_mask
-
-    @staticmethod
-    # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
-    def _prepare_4d_causal_attention_mask_with_cache_position(
-        attention_mask: torch.Tensor,
-        sequence_length: int,
-        target_length: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        cache_position: torch.Tensor,
-        batch_size: int,
-        **kwargs,
-    ):
-        if attention_mask is not None and attention_mask.dim() == 4:
-            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-            causal_mask = attention_mask
-        else:
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = torch.full(
-                (sequence_length, target_length),
-                fill_value=min_dtype,
-                dtype=dtype,
-                device=device,
-            )
-            if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(
-                target_length, device=device
-            ) > cache_position.reshape(-1, 1)
-            causal_mask = causal_mask[None, None, :, :].expand(
-                batch_size, 1, -1, -1
-            )
-            if attention_mask is not None:
-                causal_mask = (
-                    causal_mask.clone()
-                )  # copy to contiguous memory for in-place edit
-                mask_length = attention_mask.shape[-1]
-                padding_mask = (
-                    causal_mask[:, :, :, :mask_length]
-                    + attention_mask[:, None, None, :]
-                )
-                padding_mask = padding_mask == 0
-                causal_mask[:, :, :, :mask_length] = causal_mask[
-                    :, :, :, :mask_length
-                ].masked_fill(padding_mask, min_dtype)
-
-        return causal_mask
-

 class AriaForCausalLM(AriaPreTrainedModel, GenerationMixin):
     """Transformer decoder with head for language modelling.
@@ -732,6 +617,12 @@ class AriaForSequenceEmbedding(AriaPreTrainedModel):
         )


+def precompute_causal_mask(max_seq_len: int):
+    return torch.tril(
+        torch.ones(max_seq_len, max_seq_len, dtype=torch.bool)
+    ).cuda()
+
+
 def precompute_freqs_cis(
     seq_len: int,
     n_elem: int,
@@ -749,7 +640,6 @@ def precompute_freqs_cis(
     return cache.to(dtype=dtype)


-@torch.jit.script
 def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
     """
     In-place RoPE. Credits to Katherine Crowson:
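
Taken together, the modeling_aria.py changes replace the transformers-style _update_causal_mask / _prepare_4d_causal_attention_mask_with_cache_position machinery with a precomputed boolean mask: a lower-triangular (max_seq_len, max_seq_len) buffer is built once, its rows are selected by cache_position when the KV cache is in use (or a square seq_length slice otherwise), the 2D padding attention_mask is right-padded and ANDed in, and the result is cropped to the key length before scaled_dot_product_attention. A standalone sketch of that flow follows; the concrete shapes, batch size, and position values are chosen for illustration only and are not taken from the file (the real code also moves the mask to the model's device, e.g. via .cuda()):

# Standalone sketch of the new masking path; shapes and values below are illustrative.
import torch
import torch.nn.functional as F


def precompute_causal_mask(max_seq_len: int) -> torch.Tensor:
    # Boolean lower-triangular mask: True means "may attend".
    return torch.tril(torch.ones(max_seq_len, max_seq_len, dtype=torch.bool))


max_seq_len, b_sz, seq_length, kv_len, d_head = 16, 2, 4, 8, 8
causal = precompute_causal_mask(max_seq_len)

# Cached decoding: keep only the rows for the positions currently being decoded.
cache_position = torch.arange(kv_len - seq_length, kv_len)  # e.g. positions 4..7
mask = causal[None, None, cache_position]                   # (1, 1, seq_length, max_seq_len)

# Fold in the 2D padding mask (1 = real token, 0 = padding), right-padded with 1s.
attention_mask = torch.ones(b_sz, kv_len, dtype=torch.long)
pad_len = mask.shape[3] - attention_mask.shape[1]
padded = F.pad(attention_mask, (0, pad_len), value=1)[:, None, None, :].bool()
mask = mask & padded                                         # (b_sz, 1, seq_length, max_seq_len)

# The attention call then slices the mask to the key length, as in the new attn_mask line.
q = torch.randn(b_sz, 1, seq_length, d_head)
k = torch.randn(b_sz, 1, kv_len, d_head)
v = torch.randn(b_sz, 1, kv_len, d_head)
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask[..., : k.shape[2]])
print(out.shape)  # torch.Size([2, 1, 4, 8])

Because indexing with cache_position keeps all max_seq_len key columns, the final slice down to k.shape[2] is what keeps the mask aligned with the number of cached keys actually present.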