aria-dev committed
Commit fb5de81
1 Parent(s): 6a21e23

sync changes from github

Files changed (5)
  1. configuration_aria.py +17 -5
  2. modeling_aria.py +53 -264
  3. moe_lm.py +1 -1
  4. processing_aria.py +23 -1
  5. vision_processor.py +1 -1
configuration_aria.py CHANGED
@@ -17,11 +17,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import logging
+
 from transformers.configuration_utils import PretrainedConfig
 
 from .moe_lm import AriaMoELMConfig
 from .vision_encoder import AriaVisionConfig
 
+logger = logging.getLogger(__name__)
+
 
 # adapted from transformers.models.llava.configuration_llava.LlavaConfig
 class AriaConfig(PretrainedConfig):
@@ -69,6 +73,7 @@ class AriaConfig(PretrainedConfig):
         self.image_token_index = image_token_index
 
         attn_implementation = kwargs.pop("attn_implementation", None)
+        self._attn_implementation = attn_implementation
 
         # Convert the keys and values of projector_patch_to_query_dict to integers
         # This ensures consistency even if they were provided as strings
@@ -78,11 +83,15 @@ class AriaConfig(PretrainedConfig):
 
         if isinstance(vision_config, dict) and "model_type" in vision_config:
             vision_config = AriaVisionConfig(**vision_config)
-        vision_attn_implementation = (
-            "flash_attention_2"
-            if attn_implementation is None
-            else attn_implementation
-        )
+        if attn_implementation is None:
+            vision_attn_implementation = "flash_attention_2"
+        elif attn_implementation == "sdpa":
+            logger.warning(
+                "SDPA is not supported for vit, using flash_attention_2 instead"
+            )
+            vision_attn_implementation = "flash_attention_2"
+        else:
+            vision_attn_implementation = attn_implementation
         vision_config._attn_implementation = vision_attn_implementation
 
         self.vision_config = vision_config
@@ -95,3 +104,6 @@ class AriaConfig(PretrainedConfig):
         text_config._attn_implementation = text_attn_implementation
 
         self.text_config = text_config
+
+        # This is needed for the static kv cache
+        self.num_hidden_layers = self.text_config.num_hidden_layers
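
The functional core of this file's change is the new attention-implementation fallback: the vision tower only supports FlashAttention 2, so both an unset and an "sdpa" attn_implementation resolve to "flash_attention_2", with a warning in the SDPA case. A minimal standalone sketch of that selection rule (illustrative only, not the repository's code):

import logging
from typing import Optional

logger = logging.getLogger(__name__)


def resolve_vision_attn_implementation(attn_implementation: Optional[str]) -> str:
    # Mirrors the branch added in the diff: the ViT path only supports
    # flash_attention_2, so None and "sdpa" both fall back to it.
    if attn_implementation is None:
        return "flash_attention_2"
    if attn_implementation == "sdpa":
        logger.warning("SDPA is not supported for vit, using flash_attention_2 instead")
        return "flash_attention_2"
    return attn_implementation


assert resolve_vision_attn_implementation(None) == "flash_attention_2"
assert resolve_vision_attn_implementation("sdpa") == "flash_attention_2"
assert resolve_vision_attn_implementation("eager") == "eager"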
modeling_aria.py CHANGED
@@ -24,7 +24,6 @@ import torch
 import torch.nn as nn
 from torch import nn
 from transformers import PreTrainedModel
-from transformers.cache_utils import Cache
 from transformers.modeling_outputs import ModelOutput
 from transformers.utils import logging
 
@@ -48,6 +47,7 @@ class AriaPretrainedModel(PreTrainedModel):
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_cache_class = True
+    _supports_static_cache = True
 
     @property
     def _supports_sdpa(self):
@@ -183,138 +183,6 @@ class AriaForConditionalGeneration(AriaPretrainedModel):
         """
         self.language_model.set_aux_loss_coeff(value)
 
-    # copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration
-    def _merge_input_ids_with_image_features(
-        self, image_features, inputs_embeds, input_ids, attention_mask, labels
-    ):
-        """
-        Merge input IDs with image features to create a combined input representation.
-
-        This method handles the complex logic of interleaving text and image tokens,
-        adjusting attention masks and labels accordingly.
-
-        Args:
-            image_features (torch.Tensor): Processed image features.
-            inputs_embeds (torch.Tensor): Text input embeddings.
-            input_ids (torch.Tensor): Input token IDs.
-            attention_mask (torch.Tensor): Attention mask for input tokens.
-            labels (torch.Tensor, optional): Labels for language modeling.
-
-        Returns:
-            tuple: Contains the merged embeddings, updated attention mask,
-                updated labels, and position IDs.
-        """
-        num_images, num_image_patches, embed_dim = image_features.shape
-        batch_size, sequence_length = input_ids.shape
-        left_padding = not torch.sum(
-            input_ids[:, -1] == torch.tensor(self.pad_token_id)
-        )
-        # 1. Create a mask to know where special image tokens are
-        special_image_token_mask = input_ids == self.config.image_token_index
-        num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
-        # Compute the maximum embed dimension
-        max_embed_dim = (
-            num_special_image_tokens.max() * (num_image_patches - 1)
-        ) + sequence_length
-        batch_indices, non_image_indices = torch.where(
-            input_ids != self.config.image_token_index
-        )
-
-        # 2. Compute the positions where text should be written
-        # Calculate new positions for text tokens in merged image-text sequence.
-        # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
-        # `torch.cumsum` computes how each image token shifts subsequent text token positions.
-        # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
-        new_token_positions = (
-            torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1)
-            - 1
-        )
-        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
-        if left_padding:
-            new_token_positions += nb_image_pad[:, None]  # offset for left padding
-        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
-
-        # 3. Create the full embedding, already padded to the maximum position
-        final_embedding = torch.zeros(
-            batch_size,
-            max_embed_dim,
-            embed_dim,
-            dtype=inputs_embeds.dtype,
-            device=inputs_embeds.device,
-        )
-        final_attention_mask = torch.zeros(
-            batch_size,
-            max_embed_dim,
-            dtype=attention_mask.dtype,
-            device=inputs_embeds.device,
-        )
-        if labels is not None:
-            final_labels = torch.full(
-                (batch_size, max_embed_dim),
-                self.config.ignore_index,
-                dtype=input_ids.dtype,
-                device=input_ids.device,
-            )
-        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
-        # set the corresponding tensors into their correct target device.
-        target_device = inputs_embeds.device
-        batch_indices, non_image_indices, text_to_overwrite = (
-            batch_indices.to(target_device),
-            non_image_indices.to(target_device),
-            text_to_overwrite.to(target_device),
-        )
-        attention_mask = attention_mask.to(target_device)
-
-        # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
-        # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
-        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[
-            batch_indices, non_image_indices
-        ]
-        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[
-            batch_indices, non_image_indices
-        ]
-        if labels is not None:
-            final_labels[batch_indices, text_to_overwrite] = labels[
-                batch_indices, non_image_indices
-            ]
-
-        # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
-        image_to_overwrite = torch.full(
-            (batch_size, max_embed_dim),
-            True,
-            dtype=torch.bool,
-            device=inputs_embeds.device,
-        )
-        image_to_overwrite[batch_indices, text_to_overwrite] = False
-        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[
-            :, None
-        ].to(target_device)
-
-        if image_to_overwrite.sum() != image_features.shape[:-1].numel():
-            raise ValueError(
-                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
-                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
-            )
-
-        final_embedding[image_to_overwrite] = (
-            image_features.contiguous().reshape(-1, embed_dim).to(target_device)
-        )
-        final_attention_mask |= image_to_overwrite
-        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_(
-            (final_attention_mask == 0), 1
-        )
-
-        # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
-        batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id)
-        indices_to_mask = new_token_positions[batch_indices, pad_indices]
-
-        final_embedding[batch_indices, indices_to_mask] = 0
-
-        if labels is None:
-            final_labels = None
-
-        return final_embedding, final_attention_mask, final_labels, position_ids
-
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -329,6 +197,8 @@ class AriaForConditionalGeneration(AriaPretrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
     ) -> Union[Tuple, AriaCausalLMOutputWithPast]:
         """
         Forward pass of the AriaForConditionalGeneration model.
@@ -371,69 +241,38 @@ class AriaForConditionalGeneration(AriaPretrainedModel):
         # 1. Extra the input embeddings
         inputs_embeds = self.get_input_embeddings()(input_ids)
 
-        # 2. Merge text and images
-        if pixel_values is not None and input_ids.shape[1] != 1:
-            image_outputs, image_attn_mask = self.vision_tower(
-                pixel_values,
-                pixel_mask=pixel_mask,
-            )
-            selected_image_feature = image_outputs.last_hidden_state
-
-            image_features = self.multi_modal_projector(
-                selected_image_feature, attn_mask=image_attn_mask
-            )
-
-            inputs_embeds = inputs_embeds.to(image_features.dtype)
-            (
-                inputs_embeds,
-                attention_mask,
-                labels,
-                position_ids,
-            ) = self._merge_input_ids_with_image_features(
-                image_features, inputs_embeds, input_ids, attention_mask, labels
-            )
-
-        # In case input_ids.shape[1] == 1 & pixel_values != None & past_key_values != None, we are in the case of
-        # generation with cache
-        elif (
-            past_key_values is not None
-            and pixel_values is not None
-            and input_ids.shape[1] == 1
-        ):
-            # Retrieve the first layer to inspect the logits and mask out the hidden states
-            # that are set to 0
-            first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-            # Sum all dimensions of head_dim (-2) to avoid random errors
-            # such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-            batch_index, non_attended_tokens = torch.where(
-                first_layer_past_key_value.float().sum(-2) == 0
-            )
-
-            # Get the target length
-            target_length = input_ids.shape[1]
-            past_length = first_layer_past_key_value.shape[-1]
-
-            extended_attention_mask = torch.ones(
-                (attention_mask.shape[0], past_length),
-                dtype=attention_mask.dtype,
-                device=attention_mask.device,
-            )
-
-            # Filter out only the tokens that can be un-attended, this can happen
-            # if one uses Llava + Fused modules where the cache on the
-            # first iteration is already big enough, or if one passes custom cache
-            valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-            new_batch_index = batch_index[valid_indices]
-            new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-            # Zero-out the places where we don't need to attend
-            extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-
-            attention_mask = torch.cat(
-                (extended_attention_mask, attention_mask[:, -target_length:]), dim=1
-            )
-            position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+        image_features = None
+        if pixel_values is not None:
+            image_outputs, image_attn_mask = self.vision_tower(
+                pixel_values,
+                pixel_mask=pixel_mask,
+            )
+
+            selected_image_feature = image_outputs.last_hidden_state
+            image_features = self.multi_modal_projector(
+                selected_image_feature, attn_mask=image_attn_mask
+            )
+
+        if image_features is not None:
+            n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+            n_image_features = image_features.shape[0] * image_features.shape[1]
+
+            if n_image_tokens != n_image_features:
+                raise ValueError(
+                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+                )
+            special_image_mask = (
+                (input_ids == self.config.image_token_index)
+                .unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            image_features = image_features.to(
+                inputs_embeds.device, inputs_embeds.dtype
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(
+                special_image_mask, image_features
+            )
 
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -444,6 +283,8 @@ class AriaForConditionalGeneration(AriaPretrainedModel):
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            cache_position=cache_position,
+            num_logits_to_keep=num_logits_to_keep,
         )
 
         logits = outputs[0]
@@ -452,7 +293,11 @@ class AriaForConditionalGeneration(AriaPretrainedModel):
         if labels is not None:
             # Shift so that tokens < n predict n
             if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(
+                    logits.device
+                )
                 shift_logits = logits[..., :-1, :][
                     shift_attention_mask.to(logits.device) != 0
                 ].contiguous()
@@ -487,80 +332,24 @@ class AriaForConditionalGeneration(AriaPretrainedModel):
         past_key_values=None,
         inputs_embeds=None,
         pixel_values=None,
-        pixel_mask=None,
         attention_mask=None,
+        cache_position=None,
+        num_logits_to_keep=None,
         **kwargs,
     ):
-        """
-        Prepare inputs for generation step.
-
-        This method prepares the inputs for the generation step, handling both
-        text and image inputs, and managing the model's cache mechanism.
-
-        Args:
-            input_ids (torch.LongTensor): Input token ids.
-            past_key_values (Cache or List[torch.FloatTensor], optional): Past key values for efficient processing.
-            inputs_embeds (torch.FloatTensor, optional): Input embeddings.
-            pixel_values (torch.FloatTensor, optional): Pixel values of the images.
-            pixel_mask (torch.LongTensor, optional): Mask for the pixel values.
-            attention_mask (torch.Tensor, optional): Attention mask.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            dict: A dictionary containing the prepared inputs for the generation step.
-        """
-        if past_key_values is not None:
-            if isinstance(past_key_values, Cache):
-                cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-            else:
-                cache_length = past_length = past_key_values[0][0].shape[2]
-
-            # Keep only the unprocessed tokens:
-            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
-            # input)
-            if (
-                attention_mask is not None
-                and attention_mask.shape[1] > input_ids.shape[1]
-            ):
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
-            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
-            # input_ids based on the past_length.
-            elif past_length < input_ids.shape[1]:
-                input_ids = input_ids[:, past_length:]
-            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-            elif self.config.image_token_index in input_ids:
-                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
-            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
-            # older attention values, as their corresponding values are not part of the input.
-            if cache_length < past_length and attention_mask is not None:
-                attention_mask = attention_mask[
-                    :, -(cache_length + input_ids.shape[1]) :
-                ]
-
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-                "pixel_values": pixel_values,
-                "pixel_mask": pixel_mask,
-            }
-        )
+        model_inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            num_logits_to_keep=num_logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+
         return model_inputs
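
The rewritten forward pass no longer interleaves tokens itself: it assumes the processor has already expanded each image placeholder to one token per visual feature, so the merge reduces to a masked_scatter over the placeholder positions. A self-contained toy illustration of that pattern (the token id, shapes, and values below are invented for the example):

import torch

image_token_index = 9  # hypothetical id of the image placeholder token
embed_dim = 4

# one sequence of 6 tokens, two of which are image placeholders
input_ids = torch.tensor([[1, 9, 9, 2, 3, 4]])
inputs_embeds = torch.zeros(1, 6, embed_dim)

# one "image" producing 2 patch features of size embed_dim
image_features = torch.arange(2 * embed_dim, dtype=torch.float32).reshape(1, 2, embed_dim)

# same consistency check as the diff: placeholders must match feature vectors
n_image_tokens = (input_ids == image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
assert n_image_tokens == n_image_features

special_image_mask = (
    (input_ids == image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
inputs_embeds = inputs_embeds.masked_scatter(
    special_image_mask, image_features.to(inputs_embeds.dtype)
)

# positions 1 and 2 now carry the image features; the rest are untouched
print(inputs_embeds)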
moe_lm.py CHANGED
@@ -146,7 +146,7 @@ def switch_load_balancing_loss_func(
     topk: int,
     moe_aux_loss_coeff: float,
 ):
-    """Calculate the auxiliary loss for better load balacing.
+    """Calculate the auxiliary loss for better load balancing.
     Please refer to the Switch Transformer paper (https://arxiv.org/abs/2101.03961) for details.
 
     Args:
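
For context on the docstring fix: the referenced Switch Transformer auxiliary loss encourages balanced routing by multiplying, per expert, the fraction of tokens dispatched to it with the mean router probability it receives. A rough standalone sketch of that formula (argument names, the top-k dispatch convention, and the normalization are assumptions; the repository's switch_load_balancing_loss_func may differ in detail):

import torch


def switch_aux_loss_sketch(router_probs: torch.Tensor, topk: int, coeff: float) -> torch.Tensor:
    # router_probs: [num_tokens, num_experts], softmax outputs of the router.
    num_tokens, num_experts = router_probs.shape
    # fraction of routed (token, slot) pairs that land on each expert: f_e
    topk_idx = router_probs.topk(topk, dim=-1).indices            # [num_tokens, topk]
    dispatch = torch.zeros_like(router_probs).scatter_(-1, topk_idx, 1.0)
    tokens_per_expert = dispatch.sum(dim=0) / (num_tokens * topk)
    # mean router probability assigned to each expert: P_e
    mean_probs = router_probs.mean(dim=0)
    # L = coeff * num_experts * sum_e f_e * P_e
    return coeff * num_experts * torch.sum(tokens_per_expert * mean_probs)


probs = torch.softmax(torch.randn(16, 4), dim=-1)
print(switch_aux_loss_sketch(probs, topk=2, coeff=1e-2))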
processing_aria.py CHANGED
@@ -94,6 +94,7 @@ class AriaProcessor(ProcessorMixin):
         max_image_size: Optional[int] = 980,
         split_image: Optional[bool] = False,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        return_final_prompts: Optional[bool] = False,
     ) -> BatchFeature:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). Please refer to the doctsring
@@ -168,6 +169,24 @@ class AriaProcessor(ProcessorMixin):
                 )
             )
 
+            max_image_size = (
+                max_image_size
+                if max_image_size is not None
+                else self.image_processor.max_image_size
+            )
+            if max_image_size == 490:
+                num_image_tokens = 128
+            elif max_image_size == 980:
+                num_image_tokens = 256
+            else:
+                raise ValueError(
+                    f"max_image_size must be either 490 or 980, got {max_image_size}"
+                )
+            prompt_strings = [
+                sample.replace(self.image_token, self.image_token * num_image_tokens)
+                for sample in prompt_strings
+            ]
+
         else:
             image_inputs = {}
             prompt_strings = text
@@ -180,7 +199,10 @@ class AriaProcessor(ProcessorMixin):
             max_length=max_length,
         )
 
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        if return_final_prompts:
+            return BatchFeature(data={**text_inputs, **image_inputs}), prompt_strings
+        else:
+            return BatchFeature(data={**text_inputs, **image_inputs})
 
     @staticmethod
     def _extract_kwargs(func: callable, **kwargs) -> dict:
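
The added block ties the number of repeated image placeholders in the prompt to max_image_size (128 tokens at 490 px, 256 at 980 px), which is what lets the model-side masked_scatter line up one text token per visual feature. A standalone sketch of that expansion rule (the placeholder string below is invented; the real processor uses self.image_token):

from typing import List

IMAGE_TOKEN = "<image>"  # hypothetical placeholder string


def expand_image_tokens(prompts: List[str], max_image_size: int) -> List[str]:
    # Same rule as the diff: 490 px -> 128 tokens, 980 px -> 256, anything else rejected.
    if max_image_size == 490:
        num_image_tokens = 128
    elif max_image_size == 980:
        num_image_tokens = 256
    else:
        raise ValueError(f"max_image_size must be either 490 or 980, got {max_image_size}")
    return [p.replace(IMAGE_TOKEN, IMAGE_TOKEN * num_image_tokens) for p in prompts]


expanded = expand_image_tokens([f"describe {IMAGE_TOKEN} please"], max_image_size=980)
assert expanded[0].count(IMAGE_TOKEN) == 256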
vision_processor.py CHANGED
@@ -45,7 +45,7 @@ def _select_best_resolution(
     aspect_ratio = img_width / img_height
     best_ratio_diff = float("inf")
     best_ratio_w, best_ratio_h = 1, 1
-    area = np.int32(img_height) * np.int32(img_height)
+    area = np.int32(img_width) * np.int32(img_height)
     for ratio in target_ratios:
         target_aspect_ratio = ratio[0] / ratio[1]
         ratio_diff = abs(aspect_ratio - target_aspect_ratio)
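
This one-token fix computes the image area as width * height instead of height squared; for non-square inputs the old value skewed whatever downstream comparison in _select_best_resolution uses the area (such as breaking ties between equally close aspect ratios). A tiny check of the corrected expression (values are arbitrary):

import numpy as np

img_width, img_height = 1920, 1080

buggy_area = np.int32(img_height) * np.int32(img_height)  # 1080 * 1080
fixed_area = np.int32(img_width) * np.int32(img_height)   # 1920 * 1080

assert int(fixed_area) == img_width * img_height
assert int(buggy_area) != int(fixed_area)  # the two only agree for square images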