Convert to HF format

#70
by cyrilvallez (HF Staff)
chat_template.jinja ADDED
@@ -0,0 +1 @@
1
+ {% for message in messages %}{{ '<|' + message['role'] + '|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'audio' %}{{ '<|audio|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% endif %}{{ '<|end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
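Note: the template above wraps every message as `<|role|>…<|end|>`, expands structured content entries into `<|image|>` / `<|audio|>` placeholders, and appends `<|assistant|>` only when a generation prompt is requested. A minimal sketch of how it could be exercised (assuming the converted repo is loaded through `AutoProcessor` with remote code enabled; the loading flags are illustrative, not prescriptive):

```python
from transformers import AutoProcessor

# Illustrative only: load the converted repo and let the processor pick up chat_template.jinja.
processor = AutoProcessor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# Expected rendering (roughly):
# <|system|>You are a helpful assistant.<|end|><|user|><|image|>Describe this image.<|end|><|assistant|>
prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
```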
config.json CHANGED
@@ -1,82 +1,47 @@
1
  {
2
- "_name_or_path": "Phi-4-multimodal-instruct",
3
  "architectures": [
4
- "Phi4MMForCausalLM"
5
  ],
6
  "attention_bias": false,
7
  "attention_dropout": 0.0,
8
- "audio_processor": {
9
- "config": {
10
- "activation": "swish",
11
- "activation_checkpointing": {
12
- "interval": 1,
13
- "module": "transformer",
14
- "offload": false
15
- },
16
- "attention_dim": 1024,
17
- "attention_heads": 16,
18
- "batch_norm": false,
19
- "bias_in_glu": true,
20
- "causal": true,
21
- "chunk_size": -1,
22
- "cnn_layer_norm": true,
23
- "conv_activation": "swish",
24
- "conv_glu_type": "swish",
25
- "depthwise_multiplier": 1,
26
- "depthwise_seperable_out_channel": 1024,
27
- "dropout_rate": 0.0,
28
- "encoder_embedding_config": {
29
- "input_size": 80
30
- },
31
- "ext_pw_kernel_size": 1,
32
- "ext_pw_out_channel": 1024,
33
- "input_layer": "nemo_conv",
34
- "input_size": 80,
35
- "kernel_size": 3,
36
- "left_chunk": 18,
37
- "linear_units": 1536,
38
- "nemo_conv_settings": {
39
- "conv_channels": 1024
40
- },
41
- "num_blocks": 24,
42
- "relative_attention_bias_args": {
43
- "t5_bias_max_distance": 500,
44
- "type": "t5"
45
- },
46
- "time_reduction": 8
47
- },
48
- "name": "cascades"
49
- },
50
- "auto_map": {
51
- "AutoConfig": "configuration_phi4mm.Phi4MMConfig",
52
- "AutoModelForCausalLM": "modeling_phi4mm.Phi4MMForCausalLM",
53
- "AutoTokenizer": "Xenova/gpt-4o"
54
  },
55
  "bos_token_id": 199999,
56
- "embd_layer": {
57
- "audio_embd_layer": {
58
- "compression_rate": 8,
59
- "downsample_rate": 1,
60
- "embedding_cls": "audio",
61
- "enable_gradient_checkpointing": true,
62
- "projection_cls": "mlp",
63
- "use_conv_downsample": false,
64
- "use_qformer": false
65
- },
66
- "embedding_cls": "image_audio",
67
- "image_embd_layer": {
68
- "crop_size": 448,
69
- "embedding_cls": "tune_image",
70
- "enable_gradient_checkpointing": true,
71
- "hd_transform_order": "sub_glb",
72
- "image_token_compression_cls": "avg_pool_2d",
73
- "projection_cls": "mlp",
74
- "use_hd_transform": true,
75
- "with_learnable_separator": true
76
- }
77
- },
78
  "embd_pdrop": 0.0,
79
- "eos_token_id": 199999,
80
  "full_attn_mod": 1,
81
  "hidden_act": "silu",
82
  "hidden_size": 3072,
@@ -84,21 +49,9 @@
84
  "intermediate_size": 8192,
85
  "interpolate_factor": 1,
86
  "lm_head_bias": false,
87
- "vision_lora": {
88
- "dp": 0.0,
89
- "layer": "layers.*((self_attn\\.(qkv_proj|o_proj))|(mlp\\.(gate_up|down)_proj))",
90
- "lora_alpha": 512,
91
- "r": 256
92
- },
93
- "speech_lora": {
94
- "dp": 0.01,
95
- "layer": "((layers.*self_attn\\.(qkv|o)_proj)|(layers.*mlp\\.(gate_up|down)_proj))",
96
- "lora_alpha": 640,
97
- "r": 320
98
- },
99
  "max_position_embeddings": 131072,
100
  "mlp_bias": false,
101
- "model_type": "phi4mm",
102
  "num_attention_heads": 24,
103
  "num_hidden_layers": 32,
104
  "num_key_value_heads": 8,
@@ -214,8 +167,23 @@
214
  "sliding_window": 262144,
215
  "tie_word_embeddings": true,
216
  "torch_dtype": "bfloat16",
217
- "transformers_version": "4.46.1",
218
  "use_cache": true,
219
- "vocab_size": 200064,
220
- "_attn_implementation": "flash_attention_2"
221
  }
 
1
  {
2
+ "auto_map": {
3
+ "AutoConfig": "configuration_phi4_multimodal.Phi4MultimodalConfig",
4
+ "AutoModelForCausalLM": "modeling_phi4_multimodal.Phi4MultimodalForCausalLM"
5
+ },
6
  "architectures": [
7
+ "Phi4MultimodalForCausalLM"
8
  ],
9
  "attention_bias": false,
10
  "attention_dropout": 0.0,
11
+ "audio_config": {
12
+ "activation": "swish",
13
+ "audio_token_id": 200011,
14
+ "bias_max_distance": 500,
15
+ "bias_symmetric": false,
16
+ "chunk_size": -1,
17
+ "conv_activation": "swish",
18
+ "conv_glu_type": "swish",
19
+ "depthwise_multiplier": 1,
20
+ "depthwise_seperable_out_channel": 1024,
21
+ "downsample_rate": 1,
22
+ "dropout_rate": 0.0,
23
+ "ext_pw_out_channel": 1024,
24
+ "feature_layer": -2,
25
+ "hidden_size": 1024,
26
+ "initializer_range": 0.02,
27
+ "input_size": 80,
28
+ "intermediate_size": 1536,
29
+ "kernel_size": 3,
30
+ "left_chunk": 18,
31
+ "model_type": "phi4_multimodal_audio",
32
+ "nemo_activation": "relu",
33
+ "nemo_conv_channels": 1024,
34
+ "nemo_final_size": 10,
35
+ "num_attention_heads": 16,
36
+ "num_blocks": 24,
37
+ "time_reduction": 8
38
  },
39
  "bos_token_id": 199999,
40
  "embd_pdrop": 0.0,
41
+ "eos_token_id": [
42
+ 199999,
43
+ 200020
44
+ ],
45
  "full_attn_mod": 1,
46
  "hidden_act": "silu",
47
  "hidden_size": 3072,
 
49
  "intermediate_size": 8192,
50
  "interpolate_factor": 1,
51
  "lm_head_bias": false,
52
  "max_position_embeddings": 131072,
53
  "mlp_bias": false,
54
+ "model_type": "phi4_multimodal",
55
  "num_attention_heads": 24,
56
  "num_hidden_layers": 32,
57
  "num_key_value_heads": 8,
 
167
  "sliding_window": 262144,
168
  "tie_word_embeddings": true,
169
  "torch_dtype": "bfloat16",
170
+ "transformers_version": "4.52.0.dev0",
171
  "use_cache": true,
172
+ "vision_config": {
173
+ "attention_dropout": 0.0,
174
+ "crop_size": 448,
175
+ "feature_layer": -2,
176
+ "hidden_act": "gelu_pytorch_tanh",
177
+ "hidden_size": 1152,
178
+ "image_size": 448,
179
+ "image_token_id": 200010,
180
+ "intermediate_size": 4304,
181
+ "layer_norm_eps": 1e-06,
182
+ "model_type": "phi4_multimodal_vision",
183
+ "num_attention_heads": 16,
184
+ "num_channels": 3,
185
+ "num_hidden_layers": 27,
186
+ "patch_size": 14
187
+ },
188
+ "vocab_size": 200064
189
  }
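In short, the converted `config.json` drops the old flat `audio_processor` / `embd_layer` / LoRA blocks and nests the modality settings under `vision_config` and `audio_config`, with `model_type` now `phi4_multimodal`. A quick inspection sketch (assuming the converted revision is the one being loaded):

```python
from transformers import AutoConfig

# Illustrative check of the reshaped config: sub-configs are now proper nested objects.
config = AutoConfig.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)

print(config.model_type)                # "phi4_multimodal"
print(config.eos_token_id)              # [199999, 200020]
print(config.vision_config.image_size)  # 448
print(config.vision_config.patch_size)  # 14
print(config.audio_config.hidden_size)  # 1024
```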
configuration_phi4mm.py → configuration_phi4_multimodal.py RENAMED
@@ -1,5 +1,10 @@
1
- # coding=utf-8
2
- # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
@@ -13,27 +18,243 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
- """ Phi-4-MM model configuration"""
17
 
18
  from transformers.configuration_utils import PretrainedConfig
19
- from transformers.utils import logging
20
 
21
 
22
- logger = logging.get_logger(__name__)
23
 
24
 
25
- class Phi4MMConfig(PretrainedConfig):
26
  r"""
27
- This is the configuration class to store the configuration of a [`Phi4MMModel`]. It is used to instantiate a Phi-4-MM
28
- model according to the specified arguments, defining the model architecture.
29
 
30
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
  documentation from [`PretrainedConfig`] for more information.
32
 
33
  Args:
34
  vocab_size (`int`, *optional*, defaults to 200064):
35
- Vocabulary size of the Phi-4-MM model. Defines the number of different tokens that can be represented by the
36
- `inputs_ids` passed when calling [`Phi4MMModel`].
37
  hidden_size (`int`, *optional*, defaults to 3072):
38
  Dimension of the hidden representations.
39
  intermediate_size (`int`, *optional*, defaults to 8192):
@@ -42,7 +263,7 @@ class Phi4MMConfig(PretrainedConfig):
42
  Number of hidden layers in the Transformer decoder.
43
  num_attention_heads (`int`, *optional*, defaults to 32):
44
  Number of attention heads for each attention layer in the Transformer decoder.
45
- num_key_value_heads (`int`, *optional*):
46
  This is the number of key_value heads that should be used to implement Grouped Query Attention. If
47
  `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
48
  `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -58,11 +279,8 @@ class Phi4MMConfig(PretrainedConfig):
58
  The dropout ratio after computing the attention scores.
59
  hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
60
  The non-linear activation function (function or string) in the decoder.
61
- max_position_embeddings (`int`, *optional*, defaults to 4096):
62
  The maximum sequence length that this model might ever be used with.
63
- original_max_position_embeddings (`int`, *optional*, defaults to 4096):
64
- The maximum sequence length that this model was trained with. This is used to determine the size of the
65
- original RoPE embeddings when using long scaling.
66
  initializer_range (`float`, *optional*, defaults to 0.02):
67
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
68
  rms_norm_eps (`float`, *optional*, defaults to 1e-05):
@@ -79,34 +297,58 @@ class Phi4MMConfig(PretrainedConfig):
79
  contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
80
  the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
81
  divided by the number of attention heads divided by 2.
82
- partial_rotary_factor (`float`, *optional*, defaults to 0.5):
83
- Percentage of the query and keys which will have rotary embedding.
84
  bos_token_id (`int`, *optional*, defaults to 199999):
85
  The id of the "beginning-of-sequence" token.
86
- eos_token_id (`int`, *optional*, defaults to 199999):
87
  The id of the "end-of-sequence" token.
88
  pad_token_id (`int`, *optional*, defaults to 199999):
89
  The id of the padding token.
90
  sliding_window (`int`, *optional*):
91
  Sliding window attention window size. If `None`, no sliding window is applied.
92
 
93
  Example:
94
 
95
  ```python
96
- >>> from transformers import Phi4MMModel, Phi4MMConfig
97
 
98
- >>> # Initializing a Phi-4-MM style configuration
99
- >>> configuration = Phi4MMConfig.from_pretrained("TBA")
100
 
101
  >>> # Initializing a model from the configuration
102
- >>> model = Phi4MMModel(configuration)
103
 
104
  >>> # Accessing the model configuration
105
  >>> configuration = model.config
106
  ```"""
107
 
108
- model_type = "phi4mm"
109
  keys_to_ignore_at_inference = ["past_key_values"]
110
 
111
  def __init__(
112
  self,
@@ -115,13 +357,12 @@ class Phi4MMConfig(PretrainedConfig):
115
  intermediate_size=8192,
116
  num_hidden_layers=32,
117
  num_attention_heads=32,
118
- num_key_value_heads=None,
119
  resid_pdrop=0.0,
120
  embd_pdrop=0.0,
121
  attention_dropout=0.0,
122
  hidden_act="silu",
123
- max_position_embeddings=4096,
124
- original_max_position_embeddings=4096,
125
  initializer_range=0.02,
126
  rms_norm_eps=1e-5,
127
  use_cache=True,
@@ -130,22 +371,21 @@ class Phi4MMConfig(PretrainedConfig):
130
  rope_scaling=None,
131
  partial_rotary_factor=1,
132
  bos_token_id=199999,
133
- eos_token_id=199999,
134
  pad_token_id=199999,
 
135
  sliding_window=None,
136
- embd_layer: str = "default",
137
- img_processor=None,
138
- audio_processor=None,
139
- vision_lora=None,
140
- speech_lora=None,
141
  **kwargs,
142
  ):
143
- self.embd_layer = embd_layer
144
- self.img_processor = img_processor
145
- self.audio_processor = audio_processor
146
- self.vision_lora = vision_lora
147
- self.speech_lora = speech_lora
148
-
 
149
  self.vocab_size = vocab_size
150
  self.hidden_size = hidden_size
151
  self.intermediate_size = intermediate_size
@@ -172,13 +412,17 @@ class Phi4MMConfig(PretrainedConfig):
172
  self._rope_scaling_validation()
173
  self.sliding_window = sliding_window
174
 
175
- super().__init__(
176
- bos_token_id=bos_token_id,
177
- eos_token_id=eos_token_id,
178
- pad_token_id=pad_token_id,
179
- tie_word_embeddings=tie_word_embeddings,
180
- **kwargs,
181
- )
182
 
183
  def _rope_scaling_adjustment(self):
184
  """
@@ -233,3 +477,8 @@ class Phi4MMConfig(PretrainedConfig):
233
  raise ValueError(
234
  f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}"
235
  )
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_phi4_multimodal.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
8
  #
9
  # Licensed under the Apache License, Version 2.0 (the "License");
10
  # you may not use this file except in compliance with the License.
 
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
 
21
+ import math
22
 
23
  from transformers.configuration_utils import PretrainedConfig
 
24
 
25
 
26
+ class Phi4MultimodalVisionConfig(PretrainedConfig):
27
+ r"""
28
+ This is the configuration class to store the configuration of a [`Phi4MultimodalVisionModel`]. It is used to instantiate a
29
+ Phi4Multimodal vision encoder according to the specified arguments, defining the model architecture. Instantiating a
30
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of
31
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+ Args:
37
+ hidden_size (`int`, *optional*, defaults to 1152):
38
+ Dimensionality of the encoder layers and the pooler layer.
39
+ intermediate_size (`int`, *optional*, defaults to 4304):
40
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
41
+ num_hidden_layers (`int`, *optional*, defaults to 27):
42
+ Number of hidden layers in the Transformer encoder.
43
+ num_attention_heads (`int`, *optional*, defaults to 16):
44
+ Number of attention heads for each attention layer in the Transformer encoder.
45
+ num_channels (`int`, *optional*, defaults to 3):
46
+ Number of channels in the input images.
47
+ image_size (`int`, *optional*, defaults to 448):
48
+ The size (resolution) of each image.
49
+ patch_size (`int`, *optional*, defaults to 14):
50
+ The size (resolution) of each patch.
51
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
52
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
53
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
54
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
55
+ The epsilon used by the layer normalization layers.
56
+ attention_dropout (`float`, *optional*, defaults to 0.0):
57
+ The dropout ratio for the attention probabilities.
58
+ crop_size (`int`, *optional*, defaults to 448):
59
+ Crop size for the input images.
60
+ image_token_id (`int`, *optional*, defaults to 200010):
61
+ The image token id.
62
+ feature_layer (`int`, *optional*, defaults to -2):
63
+ The index of the layer of the encoder from which to extract image features.
64
+
65
+ Example:
66
+
67
+ ```python
68
+ >>> from transformers import Phi4MultimodalVisionConfig
69
+
70
+ >>> # Initializing a Phi4MultimodalVisionConfig with microsoft/Phi-4-multimodal-instruct style configuration
71
+ >>> configuration = Phi4MultimodalVisionConfig()
72
+ ```"""
73
+
74
+ model_type = "phi4_multimodal_vision"
75
+ base_config_key = "vision_config"
76
+
77
+ def __init__(
78
+ self,
79
+ hidden_size=1152,
80
+ intermediate_size=4304,
81
+ num_hidden_layers=27,
82
+ num_attention_heads=16,
83
+ num_channels=3,
84
+ image_size=448,
85
+ patch_size=14,
86
+ hidden_act="gelu_pytorch_tanh",
87
+ layer_norm_eps=1e-6,
88
+ attention_dropout=0.0,
89
+ crop_size: int = 448,
90
+ image_token_id: int = 200010,
91
+ feature_layer: int = -2,
92
+ **kwargs,
93
+ ):
94
+ super().__init__(**kwargs)
95
+
96
+ self.hidden_size = hidden_size
97
+ self.intermediate_size = intermediate_size
98
+ self.num_hidden_layers = num_hidden_layers
99
+ self.num_attention_heads = num_attention_heads
100
+ self.num_channels = num_channels
101
+ self.patch_size = patch_size
102
+ self.image_size = image_size
103
+ self.attention_dropout = attention_dropout
104
+ self.layer_norm_eps = layer_norm_eps
105
+ self.hidden_act = hidden_act
106
+ self.crop_size = crop_size
107
+ self.image_token_id = image_token_id
108
+ self.feature_layer = feature_layer
109
+
110
+
111
+ class Phi4MultimodalAudioConfig(PretrainedConfig):
112
+ r"""
113
+ This is the configuration class to store the configuration of a [`Phi4MultimodalAudioModel`]. It is used to instantiate a
114
+ Phi4Multimodal audio encoder according to the specified arguments, defining the model architecture. Instantiating a
115
+ configuration with the defaults will yield a similar configuration to that of the audio encoder of
116
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
117
+
118
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
119
+ documentation from [`PretrainedConfig`] for more information.
120
+
121
+ Args:
122
+ hidden_size (`int`, *optional*, defaults to 1024):
123
+ Dimensionality of the encoder layers.
124
+ intermediate_size (`int`, *optional*, defaults to 1536):
125
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
126
+ num_blocks (`int`, *optional*, defaults to 24):
127
+ Number of hidden layers in the Transformer encoder.
128
+ num_attention_heads (`int`, *optional*, defaults to 16):
129
+ Number of attention heads for each attention layer in the Transformer encoder.
130
+ activation (`str`, *optional*, defaults to `"swish"`):
131
+ The non-linear activation function in the MLPs.
132
+ chunk_size (`int`, *optional*, defaults to -1):
133
+ The chunk size to create the masks.
134
+ left_chunk (`int`, *optional*, defaults to 18):
135
+ The left chunk to create the masks.
136
+ dropout_rate (`float`, *optional*, defaults to 0.0):
137
+ The dropout ratio.
138
+ ext_pw_out_channel (`int`, *optional*, defaults to 1024):
139
+ Number of out channels in the point-wise conv modules.
140
+ depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
141
+ Number of out channels in the depth-wise separable conv modules.
142
+ depthwise_multiplier (`int`, *optional*, defaults to 1):
143
+ Input size multiplier for the depth-wise separable conv modules.
144
+ kernel_size (`int`, *optional*, defaults to 3):
145
+ Kernel size for the depth-wise separable conv modules.
146
+ conv_activation (`str`, *optional*, defaults to `"swish"`):
147
+ The non-linear activation function in the conv modules.
148
+ input_size (`int`, *optional*, defaults to 80):
149
+ Input size for the audio model.
150
+ conv_glu_type (`str`, *optional*, defaults to `"swish"`):
151
+ The non-linear activation function in the point-wise conv modules.
152
+ time_reduction (`int`, *optional*, defaults to 8):
153
+ Time reduction (subsampling factor).
154
+ bias_max_distance (`int`, *optional*, defaults to 1000):
155
+ Max distance for the relative attention bias module.
156
+ bias_symmetric (`bool`, *optional*, defaults to `False`):
157
+ Whether the relative attention bias should be symmetric or not.
158
+ nemo_activation (`str`, *optional*, defaults to `"relu"`):
159
+ The non-linear activation function in the nemo conv modules.
160
+ nemo_conv_channels (`int`, *optional*, defaults to 1024):
161
+ Number of channels in the nemo conv modules.
162
+ downsample_rate (`int`, *optional*, defaults to 1):
163
+ Downsample rate for the audio feature extractor.
164
+ initializer_range (`float`, *optional*, defaults to 0.02):
165
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
166
+ audio_token_id (`int`, *optional*, defaults to 200011):
167
+ The audio token id.
168
+ feature_layer (`int`, *optional*, defaults to -2):
169
+ The index of the layer of the encoder from which to extract audio features.
170
+
171
+ Example:
172
+
173
+ ```python
174
+ >>> from transformers import Phi4MultimodalAudioConfig
175
 
176
+ >>> # Initializing a Phi4MultimodalAudioConfig with microsoft/Phi-4-multimodal-instruct style configuration
177
+ >>> configuration = Phi4MultimodalAudioConfig()
178
+ ```"""
179
+
180
+ model_type = "phi4_multimodal_audio"
181
+
182
+ def __init__(
183
+ self,
184
+ hidden_size: int = 1024,
185
+ intermediate_size: int = 1536,
186
+ num_blocks: int = 24,
187
+ num_attention_heads: int = 16,
188
+ activation: str = "swish",
189
+ chunk_size: int = -1,
190
+ left_chunk: int = 18,
191
+ dropout_rate: float = 0.0,
192
+ ext_pw_out_channel: int = 1024,
193
+ depthwise_seperable_out_channel: int = 1024,
194
+ depthwise_multiplier: int = 1,
195
+ kernel_size: int = 3,
196
+ conv_activation: str = "swish",
197
+ input_size: int = 80,
198
+ conv_glu_type: str = "swish",
199
+ time_reduction: int = 8,
200
+ bias_max_distance: int = 1000,
201
+ bias_symmetric: bool = False,
202
+ nemo_activation: str = "relu",
203
+ nemo_conv_channels: int = 1024,
204
+ downsample_rate: int = 1,
205
+ initializer_range: float = 0.02,
206
+ audio_token_id: int = 200011,
207
+ feature_layer: int = -2,
208
+ **kwargs,
209
+ ):
210
+ super().__init__(**kwargs)
211
+ self.hidden_size = hidden_size
212
+ self.num_attention_heads = num_attention_heads
213
+ self.intermediate_size = intermediate_size
214
+ self.activation = activation
215
+ self.chunk_size = chunk_size
216
+ self.left_chunk = left_chunk
217
+ self.num_blocks = num_blocks
218
+ self.dropout_rate = dropout_rate
219
+ self.ext_pw_out_channel = ext_pw_out_channel
220
+ self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
221
+ self.depthwise_multiplier = depthwise_multiplier
222
+ self.kernel_size = kernel_size
223
+ self.conv_activation = conv_activation
224
+ self.input_size = input_size
225
+ self.conv_glu_type = conv_glu_type
226
+ self.time_reduction = time_reduction
227
+ self.bias_max_distance = bias_max_distance
228
+ self.bias_symmetric = bias_symmetric
229
+ self.nemo_activation = nemo_activation
230
+ self.nemo_conv_channels = nemo_conv_channels
231
+ self.downsample_rate = downsample_rate
232
+ self.audio_token_id = audio_token_id
233
+ self.initializer_range = initializer_range
234
+ self.feature_layer = feature_layer
235
 
236
+ if time_reduction % 2 != 0:
237
+ raise ValueError("`time_reduction` should be a multiple of 2!")
238
+ length = input_size
239
+ for _ in range(int(math.log(time_reduction, 2))):
240
+ length = math.floor((length - 1) / 2 + 1)
241
+ self.nemo_final_size = length
242
+
243
+
244
+ class Phi4MultimodalConfig(PretrainedConfig):
245
  r"""
246
+ This is the configuration class to store the configuration of a [`Phi4MultimodalModel`]. It is used to instantiate a
247
+ Phi4Multimodal model according to the specified arguments, defining the model architecture. Instantiating a configuration
248
+ with the defaults will yield a similar configuration to that of the
249
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
250
 
251
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
252
  documentation from [`PretrainedConfig`] for more information.
253
 
254
  Args:
255
  vocab_size (`int`, *optional*, defaults to 200064):
256
+ Vocabulary size of the Phi4Multimodal model. Defines the number of different tokens that can be represented by the
257
+ `input_ids` passed when calling [`Phi4MultimodalModel`].
258
  hidden_size (`int`, *optional*, defaults to 3072):
259
  Dimension of the hidden representations.
260
  intermediate_size (`int`, *optional*, defaults to 8192):
 
263
  Number of hidden layers in the Transformer decoder.
264
  num_attention_heads (`int`, *optional*, defaults to 32):
265
  Number of attention heads for each attention layer in the Transformer decoder.
266
+ num_key_value_heads (`int`, *optional*, defaults to 8):
267
  This is the number of key_value heads that should be used to implement Grouped Query Attention. If
268
  `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
269
  `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
 
279
  The dropout ratio after computing the attention scores.
280
  hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
281
  The non-linear activation function (function or string) in the decoder.
282
+ max_position_embeddings (`int`, *optional*, defaults to 131072):
283
  The maximum sequence length that this model might ever be used with.
284
  initializer_range (`float`, *optional*, defaults to 0.02):
285
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
286
  rms_norm_eps (`float`, *optional*, defaults to 1e-05):
 
297
  contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
298
  the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
299
  divided by the number of attention heads divided by 2.
300
+ partial_rotary_factor (`float`, *optional*, defaults to `1.0`):
301
+ Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0.
302
  bos_token_id (`int`, *optional*, defaults to 199999):
303
  The id of the "beginning-of-sequence" token.
304
+ eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`):
305
  The id of the "end-of-sequence" token.
306
  pad_token_id (`int`, *optional*, defaults to 199999):
307
  The id of the padding token.
308
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
309
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
310
+ original RoPE embeddings when using long scaling.
311
  sliding_window (`int`, *optional*):
312
  Sliding window attention window size. If `None`, no sliding window is applied.
313
+ vision_config (`Phi4MultimodalVisionConfig` or `dict`, *optional*):
314
+ The vision config for the underlying image embedding model. If not provided, will default to the configuration
315
+ used to instantiate a model similar in architecture as
316
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
317
+ audio_config (`Phi4MultimodalAudioConfig` or `dict`, *optional*):
318
+ The audio config for the underlying audio embedding model. If not provided, will default to the configuration
319
+ used to instantiate a model similar in architecture as
320
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
321
 
322
  Example:
323
 
324
  ```python
325
+ >>> from transformers import Phi4MultimodalModel, Phi4MultimodalConfig
326
 
327
+ >>> # Initializing a Phi4Multimodal style configuration
328
+ >>> configuration = Phi4MultimodalConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct")
329
 
330
  >>> # Initializing a model from the configuration
331
+ >>> model = Phi4MultimodalModel(configuration)
332
 
333
  >>> # Accessing the model configuration
334
  >>> configuration = model.config
335
  ```"""
336
 
337
+ model_type = "phi4_multimodal"
338
  keys_to_ignore_at_inference = ["past_key_values"]
339
+ base_model_tp_plan = {
340
+ "layers.*.self_attn.qkv_proj": "colwise_rep", # we need to replicate here due to the slicing of qkv
341
+ "layers.*.self_attn.o_proj": "rowwise_rep", # we need to replicate here due to the slicing of qkv
342
+ "layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
343
+ "layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
344
+ }
345
+ base_model_pp_plan = {
346
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
347
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
348
+ "norm": (["hidden_states"], ["hidden_states"]),
349
+ }
350
+
351
+ sub_configs = {"audio_config": Phi4MultimodalAudioConfig, "vision_config": Phi4MultimodalVisionConfig}
352
 
353
  def __init__(
354
  self,
 
357
  intermediate_size=8192,
358
  num_hidden_layers=32,
359
  num_attention_heads=32,
360
+ num_key_value_heads=8,
361
  resid_pdrop=0.0,
362
  embd_pdrop=0.0,
363
  attention_dropout=0.0,
364
  hidden_act="silu",
365
+ max_position_embeddings=131072,
 
366
  initializer_range=0.02,
367
  rms_norm_eps=1e-5,
368
  use_cache=True,
 
371
  rope_scaling=None,
372
  partial_rotary_factor=1,
373
  bos_token_id=199999,
374
+ eos_token_id=[199999, 200020],
375
  pad_token_id=199999,
376
+ original_max_position_embeddings=4096,
377
  sliding_window=None,
378
+ vision_config=None,
379
+ audio_config=None,
380
  **kwargs,
381
  ):
382
+ super().__init__(
383
+ bos_token_id=bos_token_id,
384
+ eos_token_id=eos_token_id,
385
+ pad_token_id=pad_token_id,
386
+ tie_word_embeddings=tie_word_embeddings,
387
+ **kwargs,
388
+ )
389
  self.vocab_size = vocab_size
390
  self.hidden_size = hidden_size
391
  self.intermediate_size = intermediate_size
 
412
  self._rope_scaling_validation()
413
  self.sliding_window = sliding_window
414
 
415
+ if isinstance(vision_config, dict):
416
+ vision_config = Phi4MultimodalVisionConfig(**vision_config)
417
+ elif vision_config is None:
418
+ vision_config = Phi4MultimodalVisionConfig()
419
+ self.vision_config = vision_config
420
+
421
+ if isinstance(audio_config, dict):
422
+ audio_config = Phi4MultimodalAudioConfig(**audio_config)
423
+ elif audio_config is None:
424
+ audio_config = Phi4MultimodalAudioConfig()
425
+ self.audio_config = audio_config
426
 
427
  def _rope_scaling_adjustment(self):
428
  """
 
477
  raise ValueError(
478
  f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}"
479
  )
480
+
481
+
482
+ __all__ = ["Phi4MultimodalVisionConfig", "Phi4MultimodalAudioConfig", "Phi4MultimodalConfig"]
483
+
484
+ Phi4MultimodalConfig.register_for_auto_class()
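One detail worth noting: `Phi4MultimodalAudioConfig` derives `nemo_final_size` by halving `input_size` once per factor of 2 in `time_reduction`, which is where the `"nemo_final_size": 10` in the converted `config.json` comes from (80 → 40 → 20 → 10). A small construction sketch (assuming the file above is importable locally as `configuration_phi4_multimodal`):

```python
from configuration_phi4_multimodal import (
    Phi4MultimodalAudioConfig,
    Phi4MultimodalConfig,
    Phi4MultimodalVisionConfig,
)

audio = Phi4MultimodalAudioConfig(input_size=80, time_reduction=8)
# 80 -> 40 -> 20 -> 10 after log2(8) = 3 halving steps
assert audio.nemo_final_size == 10

# Sub-configs can be passed as plain dicts (as they appear in config.json) or as config objects.
config = Phi4MultimodalConfig(
    vision_config={"image_size": 448, "patch_size": 14},
    audio_config=audio.to_dict(),
)
assert isinstance(config.vision_config, Phi4MultimodalVisionConfig)
assert isinstance(config.audio_config, Phi4MultimodalAudioConfig)
```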
feature_extraction_phi4_multimodal.py ADDED
@@ -0,0 +1,353 @@
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Feature extractor class for Phi4Multimodal
17
+ """
18
+
19
+ from typing import Optional, Union, List, Tuple
20
+
21
+ import numpy as np
22
+
23
+ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
24
+ from transformers.image_processing_utils import BatchFeature
25
+ from transformers.utils import TensorType, is_torch_available, logging
26
+
27
+
28
+ if is_torch_available():
29
+ import torch
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ AudioInput = Union[
35
+ np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821
36
+ ]
37
+
38
+
39
+ # TODO: @eustlb, remove this once #36603 is merged.
40
+ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
41
+ """Create a Mel filter-bank the same as SpeechLib FbankFC.
42
+
43
+ Args:
44
+ sample_rate (int): Sample rate in Hz. number > 0 [scalar]
45
+ n_fft (int): FFT size. int > 0 [scalar]
46
+ n_mels (int): Mel filter size. int > 0 [scalar]
47
+ fmin (float): lowest frequency (in Hz). If None use 0.0.
48
+ float >= 0 [scalar]
49
+ fmax: highest frequency (in Hz). If None use sample_rate / 2.
50
+ float >= 0 [scalar]
51
+
52
+ Returns
53
+ out (numpy.ndarray): Mel transform matrix
54
+ [shape=(n_mels, 1 + n_fft/2)]
55
+ """
56
+
57
+ bank_width = int(n_fft // 2 + 1)
58
+ if fmax is None:
59
+ fmax = sample_rate / 2
60
+ if fmin is None:
61
+ fmin = 0
62
+ assert fmin >= 0, "fmin cannot be negative"
63
+ assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"
64
+
65
+ def mel(f):
66
+ return 1127.0 * np.log(1.0 + f / 700.0)
67
+
68
+ def bin2mel(fft_bin):
69
+ return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))
70
+
71
+ def f2bin(f):
72
+ return int((f * n_fft / sample_rate) + 0.5)
73
+
74
+ # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
75
+ klo = f2bin(fmin) + 1
76
+ khi = f2bin(fmax)
77
+
78
+ khi = max(khi, klo)
79
+
80
+ # Spec 2: SpeechLib uses triangles in Mel space
81
+ mlo = mel(fmin)
82
+ mhi = mel(fmax)
83
+ m_centers = np.linspace(mlo, mhi, n_mels + 2)
84
+ ms = (mhi - mlo) / (n_mels + 1)
85
+
86
+ matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
87
+ for m in range(0, n_mels):
88
+ left = m_centers[m]
89
+ center = m_centers[m + 1]
90
+ right = m_centers[m + 2]
91
+ for fft_bin in range(klo, khi):
92
+ mbin = bin2mel(fft_bin)
93
+ if left < mbin < right:
94
+ matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms
95
+
96
+ return matrix
97
+
98
+
99
+ class Phi4MultimodalFeatureExtractor(SequenceFeatureExtractor):
100
+ model_input_names = ["audio_input_features", "audio_embed_sizes", "audio_attention_mask"]
101
+
102
+ def __init__(
103
+ self,
104
+ feature_size: int = 80,
105
+ sampling_rate: int = 16000,
106
+ hop_length: int = 160,
107
+ n_fft: int = 512,
108
+ win_length: int = 400,
109
+ preemphasis: float = 0.97,
110
+ padding_value: float = 0.0,
111
+ audio_compression_rate: int = 8,
112
+ audio_downsample_rate: int = 1,
113
+ audio_feat_stride: int = 1,
114
+ mel_min_frequency: float = 0,
115
+ mel_max_frequency: float = 7690,
116
+ **kwargs,
117
+ ):
118
+ super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
119
+
120
+ self.hop_length = hop_length
121
+ self.n_fft = n_fft
122
+ self.win_length = win_length
123
+ self.preemphasis = preemphasis
124
+ self.padding_value = padding_value
125
+ self.audio_compression_rate = audio_compression_rate
126
+ self.audio_downsample_rate = audio_downsample_rate
127
+ self.audio_feat_stride = audio_feat_stride
128
+
129
+ # TODO: @eustlb, uncomment and remove speechlib_mel once #36603 is merged.
130
+ # self.mel_filters = mel_filter_bank(
131
+ # num_frequency_bins=self.n_fft // 2 + 1,
132
+ # num_mel_filters=self.feature_size,
133
+ # min_frequency=mel_min_frequency,
134
+ # max_frequency=mel_max_frequency,
135
+ # sampling_rate=self.sampling_rate,
136
+ # triangularize_in_mel_space=True,
137
+ # mel_scale="kaldi",
138
+ # )
139
+ self.mel_filters = speechlib_mel(
140
+ self.sampling_rate, self.n_fft, self.feature_size, mel_min_frequency, mel_max_frequency
141
+ ).T
142
+
143
+ def __call__(
144
+ self,
145
+ raw_speech: AudioInput,
146
+ sampling_rate: Optional[int] = None,
147
+ pad_to_multiple_of: Optional[int] = None,
148
+ padding: Optional[str] = "longest",
149
+ max_length: Optional[int] = None,
150
+ truncation: bool = False,
151
+ return_tensors: Optional[Union[str, TensorType]] = None,
152
+ return_attention_mask: Optional[bool] = True,
153
+ device: Optional[str] = "cpu",
154
+ **kwargs,
155
+ ) -> BatchFeature:
156
+ """
157
+ Main method to featurize and prepare for the model one or several audio sequence(s). Implementation uses PyTorch for
158
+ the STFT computation if available, otherwise a slower NumPy based one.
159
+
160
+ Args:
161
+ raw_speech (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
162
+ The sequence or batch of sequences to be processed. Each sequence can be a numpy array or PyTorch tensor.
163
+ For batched inputs, sequences can be a list of numpy arrays or PyTorch tensors, or a single numpy array or
164
+ PyTorch tensor with first dimension being the batch size.
165
+ sampling_rate (`int`, *optional*):
166
+ The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
167
+ `sampling_rate` at the forward call to prevent silent errors.
168
+ pad_to_multiple_of (`int`, *optional*, defaults to None):
169
+ If set will pad the sequence to a multiple of the provided value.
170
+ padding (`str`, *optional*, defaults to "longest"):
171
+ Padding strategy. Can be "longest" to pad to the longest sequence in the batch, or a specific length.
172
+ max_length (`int`, *optional*):
173
+ Maximum length of the returned list and optionally padding length.
174
+ truncation (`bool`, *optional*, defaults to False):
175
+ Activates truncation to cut input sequences longer than *max_length* to *max_length*.
176
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
177
+ If set, will return tensors instead of numpy arrays. Acceptable values are:
178
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
179
+ - `'np'`: Return Numpy `np.ndarray` objects.
180
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
181
+ return_attention_mask (`bool`, *optional*, defaults to `True`):
182
+ Whether to return the extracted audio input features' attention mask.
183
+ device (`str`, *optional*, defaults to "cpu"):
184
+ Specifies the device for computation of the audio features. (e.g., "cpu", "cuda")
185
+
186
+ Returns:
187
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
188
+ - **audio_input_features** -- Audio features extracted from the raw audio input, shape (batch_size, max_feature_length, feature_size).
189
+ - **audio_lengths** -- Length of each audio sample in the batch, shape (batch_size,).
190
+ - **audio_attention_mask** -- Attention mask for the audio input, shape (batch_size, max_feature_length).
191
+ If `return_tensors` is not specified, the fields will be PyTorch tensors if PyTorch is available, otherwise NumPy arrays.
192
+ """
193
+ if sampling_rate is not None:
194
+ if sampling_rate != self.sampling_rate:
195
+ raise ValueError(
196
+ f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
197
+ f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
198
+ f" was sampled with {self.sampling_rate} and not {sampling_rate}."
199
+ )
200
+ else:
201
+ logger.warning(
202
+ f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
203
+ "Failing to do so can result in silent errors that might be hard to debug."
204
+ )
205
+
206
+ # Convert to torch tensor
207
+ if isinstance(raw_speech, np.ndarray):
208
+ raw_speech = torch.tensor(raw_speech)
209
+ elif isinstance(raw_speech, (list, tuple)) and isinstance(raw_speech[0], np.ndarray):
210
+ raw_speech = [torch.tensor(speech) for speech in raw_speech]
211
+
212
+ is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
213
+ if is_batched_torch and len(raw_speech.shape) > 2:
214
+ logger.warning(
215
+ f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
216
+ "We will take the mean of the channels to convert to mono."
217
+ )
218
+ raw_speech = raw_speech.mean(-1)
219
+
220
+ is_batched_sequence = isinstance(raw_speech, (list, tuple))
221
+ if is_batched_sequence:
222
+ for speech in raw_speech:
223
+ if len(speech.shape) > 1:
224
+ logger.warning(
225
+ f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
226
+ "We will take the mean of the channels to convert to mono."
227
+ )
228
+ speech = speech.mean(-1)
229
+
230
+ if is_batched_torch or is_batched_sequence:
231
+ raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
232
+ else:
233
+ raw_speech = [raw_speech[:, None].to(torch.float32)]
234
+
235
+ audio_lengths = [len(speech) for speech in raw_speech]
236
+
237
+ # convert into correct format for padding
238
+ batched_speech = BatchFeature(data={"audio_input_features": raw_speech, "audio_lengths": audio_lengths})
239
+ padded_inputs = self.pad(
240
+ batched_speech,
241
+ padding=padding,
242
+ max_length=max_length,
243
+ truncation=truncation,
244
+ pad_to_multiple_of=pad_to_multiple_of,
245
+ return_tensors="pt",
246
+ )
247
+ input_features = padded_inputs.audio_input_features.squeeze(-1)
248
+ audio_lengths = padded_inputs.audio_lengths
249
+
250
+ input_features = self._torch_extract_fbank_features(input_features, audio_lengths, device)
251
+
252
+ feature_lengths = (audio_lengths - self.win_length) // self.hop_length + 1
253
+ feature_lengths = feature_lengths * self.audio_feat_stride
254
+ audio_embed_sizes = self._compute_audio_embed_size(feature_lengths)
255
+
256
+ feature_attention_mask = (
257
+ torch.arange(0, feature_lengths.max()) if is_torch_available() else np.arange(0, feature_lengths.max())
258
+ )
259
+ feature_attention_mask = (
260
+ feature_attention_mask[None, :] < feature_lengths[:, None] if len(feature_lengths) > 1 else None
261
+ )
262
+
263
+ data = {
264
+ "audio_input_features": input_features,
265
+ "audio_embed_sizes": audio_embed_sizes,
266
+ }
267
+ if feature_attention_mask is not None and return_attention_mask:
268
+ data["audio_attention_mask"] = feature_attention_mask
269
+
270
+ return BatchFeature(data=data, tensor_type=return_tensors)
271
+
272
+ # TODO: @eustlb, move this to audio_utils in a general spectrogram_batch function that handles torch and numpy
273
+ def _torch_extract_fbank_features(
274
+ self, waveform: "torch.FloatTensor", audio_lengths: "torch.Tensor", device: str = "cpu"
275
+ ) -> "torch.FloatTensor":
276
+ """
277
+ Compute the log mel-scaled spectrogram of batched waveforms using PyTorch's FFT implementation.
278
+
279
+ Args:
280
+ waveform (torch.FloatTensor` of shape `(batch_size, max_audio_length)`):
281
+ The batched waveforms.
282
+ audio_lengths (`torch.Tensor` of shape `(batch_size,)`):
283
+ The lengths of the waveforms along the max_audio_length dimension.
284
+ device (`str`, *optional*, defaults to "cpu"):
285
+ The device to run the computation on. (e.g., "cpu", "cuda")
286
+
287
+ Returns:
288
+ `torch.FloatTensor` of shape `(batch_size, max_feature_length, feature_size)`:
289
+ The log mel-scaled spectrogram of the batched waveforms.
290
+ """
291
+ fft_window = torch.hamming_window(self.win_length, periodic=False, device=device, dtype=torch.float64)
292
+
293
+ # batched implementation
294
+ batch_size = waveform.shape[0]
295
+ frames = waveform.unfold(-1, self.win_length, self.hop_length)
296
+
297
+ # ---
298
+ # the unbatched (and unpadded) original implementation skips the last few audio values that can't be included in a frame
299
+ # we need to ensure that the corresponding frames for the padded input also mask these values
300
+ if batch_size > 1:
301
+ frames = frames.clone()
302
+ # concerned batch indices
303
+ to_mask_batch_idxs = torch.arange(batch_size)[audio_lengths != audio_lengths.max()]
304
+ if to_mask_batch_idxs.numel() > 0:
305
+ batch_idxs_down = (audio_lengths[to_mask_batch_idxs] - self.win_length) // self.hop_length + 1
306
+ batch_idxs_up = audio_lengths[to_mask_batch_idxs] // self.hop_length + 1
307
+ offset_idx = batch_idxs_down.min()
308
+ max_idx = batch_idxs_up.max()
309
+
310
+ mask = torch.arange(max_idx - offset_idx, device=device).expand(to_mask_batch_idxs.shape[0], -1)
311
+ mask = ((batch_idxs_down - offset_idx).unsqueeze(1) <= mask) & (
312
+ mask < (batch_idxs_up - offset_idx).unsqueeze(1)
313
+ )
314
+ mask = mask.unsqueeze(-1).expand(-1, -1, self.win_length)
315
+ masked_frames = frames[to_mask_batch_idxs, offset_idx:max_idx].masked_fill_(mask, 0)
316
+ frames[to_mask_batch_idxs, offset_idx:max_idx] = masked_frames
317
+ # ---
318
+
319
+ # apply pre-emphasis first order filter on fft windows
320
+ frames_prev = torch.roll(frames, 1, dims=-1)
321
+ frames_prev[:, :, 0] = frames_prev[:, :, 1]
322
+ frames = (frames - self.preemphasis * frames_prev) * 32768
323
+
324
+ # apply fft
325
+ S = torch.fft.rfft(fft_window * frames.view(-1, self.win_length), n=self.n_fft, dim=1)
326
+ S = S.view(frames.shape[0], -1, S.shape[-1])
327
+ S = S.to(torch.complex64)
328
+
329
+ spec = torch.abs(S)
330
+ spec_power = spec**2
331
+
332
+ # apply triangular mel filter bank
333
+ mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
334
+ log_spec = torch.clamp(spec_power @ mel_filters, min=1.0)
335
+ log_spec = torch.log(log_spec)
336
+
337
+ return log_spec
338
+
339
+ def _compute_audio_embed_size(self, audio_frames):
340
+ integer = audio_frames // self.audio_compression_rate
341
+ remainder = audio_frames % self.audio_compression_rate
342
+ result = integer + (remainder > 0).to(integer.dtype)
343
+
344
+ integer = result // self.audio_downsample_rate
345
+ remainder = result % self.audio_downsample_rate
346
+ result = integer + (remainder > 0).to(integer.dtype) # qformer compression
347
+
348
+ return result
349
+
350
+
351
+ __all__ = ["Phi4MultimodalFeatureExtractor"]
352
+
353
+ Phi4MultimodalFeatureExtractor.register_for_auto_class()
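A rough usage sketch for the feature extractor above (assuming the file is importable locally as `feature_extraction_phi4_multimodal` and PyTorch is installed; the defaults are an 80-bin mel filter bank, 400-sample windows and a 160-sample hop at 16 kHz):

```python
import numpy as np

from feature_extraction_phi4_multimodal import Phi4MultimodalFeatureExtractor

feature_extractor = Phi4MultimodalFeatureExtractor()

# Two mono clips of different lengths, as 16 kHz float waveforms.
clips = [
    np.random.randn(16_000).astype(np.float32),  # 1 s
    np.random.randn(24_000).astype(np.float32),  # 1.5 s
]

features = feature_extractor(clips, sampling_rate=16_000, return_tensors="pt")
print(features.audio_input_features.shape)  # (2, num_frames_of_longest_clip, 80)
print(features.audio_embed_sizes)           # audio placeholder count per clip
print(features.audio_attention_mask.shape)  # (2, num_frames_of_longest_clip)
```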
generation_config.json CHANGED
@@ -2,10 +2,9 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 199999,
4
  "eos_token_id": [
5
- 200020,
6
- 199999
7
  ],
8
  "pad_token_id": 199999,
9
- "transformers_version": "4.46.1",
10
- "use_cache": true
11
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 199999,
4
  "eos_token_id": [
5
+ 199999,
6
+ 200020
7
  ],
8
  "pad_token_id": 199999,
9
+ "transformers_version": "4.52.0.dev0"
 
10
  }
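The regenerated generation config mirrors the main config: both `199999` and `200020` are listed as end-of-sequence ids, with `199999` also serving as bos/pad. A one-line check (assuming the converted revision is being loaded):

```python
from transformers import GenerationConfig

generation_config = GenerationConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct")
print(generation_config.eos_token_id)  # expected: [199999, 200020]
print(generation_config.pad_token_id)  # expected: 199999
```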
image_processing_phi4_multimodal_fast.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Image processor class for Phi4Multimodal
17
+ """
18
+
19
+ import math
20
+ from typing import List, Optional, Union, TypedDict
21
+
22
+ import torch
23
+ from torchvision.transforms import functional as F
24
+
25
+ from transformers.image_processing_utils_fast import (
26
+ BaseImageProcessorFast,
27
+ BatchFeature,
28
+ Unpack,
29
+ convert_to_rgb,
30
+ ChannelDimension
31
+ )
32
+ from transformers.image_utils import ImageInput, make_flat_list_of_images, valid_images
33
+ from transformers.utils import TensorType, logging
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ class DefaultFastImageProcessorKwargs(TypedDict, total=False):
40
+ do_resize: Optional[bool]
41
+ size: Optional[dict[str, int]]
42
+ default_to_square: Optional[bool]
43
+ resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
44
+ do_center_crop: Optional[bool]
45
+ crop_size: Optional[dict[str, int]]
46
+ do_rescale: Optional[bool]
47
+ rescale_factor: Optional[Union[int, float]]
48
+ do_normalize: Optional[bool]
49
+ image_mean: Optional[Union[float, list[float]]]
50
+ image_std: Optional[Union[float, list[float]]]
51
+ do_convert_rgb: Optional[bool]
52
+ return_tensors: Optional[Union[str, TensorType]]
53
+ data_format: Optional[ChannelDimension]
54
+ input_data_format: Optional[Union[str, ChannelDimension]]
55
+ device: Optional["torch.device"]
56
+
57
+
58
+ class Phi4MultimodalFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
59
+ image_size: Optional[int]
60
+ patch_size: Optional[int]
61
+ dynamic_hd: Optional[int]
62
+
63
+
64
+ class Phi4MultimodalImageProcessorFast(BaseImageProcessorFast):
65
+ r"""
66
+ Constructs a Phi4Multimodal image processor.
67
+ """
68
+
69
+ image_size = 448
70
+ patch_size = 14
71
+ dynamic_hd = 36
72
+ image_mean = [0.5, 0.5, 0.5]
73
+ image_std = [0.5, 0.5, 0.5]
74
+ valid_init_kwargs = Phi4MultimodalFastImageProcessorKwargs
75
+ model_input_names = ["image_pixel_values", "image_sizes", "image_attention_mask"]
76
+
77
+ def __init__(self, **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs]):
78
+ super().__init__(**kwargs)
79
+
80
+ def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height):
81
+ best_ratio_diff = float("inf")
82
+ best_ratio = (1, 1)
83
+ area = width * height
84
+ for ratio in target_ratios:
85
+ target_aspect_ratio = ratio[0] / ratio[1]
86
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
87
+ if ratio_diff < best_ratio_diff:
88
+ best_ratio_diff = ratio_diff
89
+ best_ratio = ratio
90
+ elif ratio_diff == best_ratio_diff:
91
+ if area > 0.5 * self.image_size * self.image_size * ratio[0] * ratio[1]:
92
+ best_ratio = ratio
93
+ return best_ratio
94
+
95
+ def dynamic_preprocess(self, image, max_num=36, min_num=1):
96
+ image_size = self.image_size
97
+ patch_size = self.patch_size
98
+ mask_size = image_size // patch_size
99
+ orig_width, orig_height = image.size
100
+
101
+ w_crop_num = math.ceil(orig_width / float(image_size))
102
+ h_crop_num = math.ceil(orig_height / float(image_size))
103
+ if w_crop_num * h_crop_num > max_num:
104
+ aspect_ratio = orig_width / orig_height
105
+
106
+ # calculate the existing image aspect ratio
107
+ target_ratios = {
108
+ (i, j)
109
+ for n in range(min_num, max_num + 1)
110
+ for i in range(1, n + 1)
111
+ for j in range(1, n + 1)
112
+ if i * j <= max_num and i * j >= min_num
113
+ }
114
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
115
+
116
+ # find the closest aspect ratio to the target
117
+ target_aspect_ratio = self.find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height)
118
+
119
+ # calculate the target width and height
120
+ target_width = image_size * target_aspect_ratio[0]
121
+ target_height = image_size * target_aspect_ratio[1]
122
+ else:
123
+ target_width = image_size * w_crop_num
124
+ target_height = image_size * h_crop_num
125
+ target_aspect_ratio = (w_crop_num, h_crop_num)
126
+
127
+ # Calculate the ratio
128
+ ratio_width = target_width / orig_width
129
+ ratio_height = target_height / orig_height
130
+ if ratio_width < ratio_height:
131
+ new_size = (target_width, int(orig_height * ratio_width))
132
+ padding_width = 0
133
+ padding_height = target_height - int(orig_height * ratio_width)
134
+ else:
135
+ new_size = (int(orig_width * ratio_height), target_height)
136
+ padding_width = target_width - int(orig_width * ratio_height)
137
+ padding_height = 0
138
+
139
+ attention_mask = torch.ones((int(mask_size * target_aspect_ratio[1]), int(mask_size * target_aspect_ratio[0])))
140
+ if padding_width >= patch_size:
141
+ attention_mask[:, -math.floor(padding_width / patch_size) :] = 0
142
+ if padding_height >= patch_size:
143
+ attention_mask[-math.floor(padding_height / patch_size) :, :] = 0
144
+
145
+ if min(new_size[1], target_height) < 10 or min(new_size[0], target_width) < 10:
146
+ raise ValueError(f"the aspect ratio is very extreme {new_size}")
147
+
148
+ image = F.resize(image, [new_size[1], new_size[0]])
149
+ resized_img = F.pad(image, [0, 0, padding_width, padding_height], fill=[255, 255, 255])
150
+
151
+ return resized_img, attention_mask
152
+
153
+ def pad_to_max_num_crops(self, images, max_crops=5):
154
+ """
155
+ images: B x 3 x H x W, B<=max_crops
156
+ """
157
+ B, _, H, W = images.shape
158
+ if B < max_crops:
159
+ pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
160
+ images = torch.cat([images, pad], dim=0)
161
+ return images
162
+
163
+ def pad_mask_to_max_num_crops(self, masks, max_crops=5):
164
+ B, H, W = masks.shape
165
+ if B < max_crops:
166
+ pad = torch.ones(max_crops - B, H, W, dtype=masks.dtype, device=masks.device)
167
+ masks = torch.cat([masks, pad], dim=0)
168
+ return masks
169
+
170
+ def preprocess(
171
+ self,
172
+ images: ImageInput,
173
+ image_mean: Optional[Union[float, List[float]]] = None,
174
+ image_std: Optional[Union[float, List[float]]] = None,
175
+ return_tensors: Optional[Union[str, TensorType]] = None,
176
+ ):
177
+ """
178
+ Args:
179
+ images (`ImageInput`):
180
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
181
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
182
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
183
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
184
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
185
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
186
+ return_tensors (`str` or `TensorType`, *optional*):
187
+ The type of tensors to return. Can be one of:
188
+ - Unset: Return a list of `np.ndarray`.
189
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
190
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
191
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
192
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
193
+ """
194
+ image_mean = image_mean if image_mean is not None else self.image_mean
195
+ image_std = image_std if image_std is not None else self.image_std
196
+
197
+ images = make_flat_list_of_images(images)
198
+ if not valid_images(images):
199
+ raise ValueError(
200
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
201
+ "torch.Tensor, tf.Tensor or jax.ndarray."
202
+ )
203
+ images = [convert_to_rgb(image) for image in images]
204
+
205
+ image_size = self.image_size
206
+ patch_size = self.patch_size
207
+ mask_size = image_size // patch_size
208
+ imgs_and_masks = [self.dynamic_preprocess(image, max_num=self.dynamic_hd) for image in images]
209
+ images, image_attention_masks = [x[0] for x in imgs_and_masks], [x[1] for x in imgs_and_masks]
210
+
211
+ images = [F.to_tensor(image) for image in images]
212
+ hd_images = [F.normalize(image, image_mean, image_std) for image in images]
213
+ global_image = [
214
+ torch.nn.functional.interpolate(
215
+ image.unsqueeze(0).float(),
216
+ size=(image_size, image_size),
217
+ mode="bicubic",
218
+ ).to(image.dtype)
219
+ for image in hd_images
220
+ ]
221
+
222
+ shapes = [[image.size(1), image.size(2)] for image in hd_images]
223
+ mask_shapes = [[mask.size(0), mask.size(1)] for mask in image_attention_masks]
224
+ global_attention_mask = [torch.ones((1, mask_size, mask_size)) for _ in hd_images]
225
+
226
+ hd_images_reshape = []
227
+ for im, (h, w) in zip(hd_images, shapes):
228
+ im = im.reshape(1, 3, h // image_size, image_size, w // image_size, image_size)
229
+ im = im.permute(0, 2, 4, 1, 3, 5)
230
+ im = im.reshape(-1, 3, image_size, image_size)
231
+ hd_images_reshape.append(im.contiguous())
232
+
233
+ attention_masks_reshape = []
234
+ for mask, (h, w) in zip(image_attention_masks, mask_shapes):
235
+ mask = mask.reshape(h // mask_size, mask_size, w // mask_size, mask_size)
236
+ mask = mask.transpose(1, 2)
237
+ mask = mask.reshape(-1, mask_size, mask_size)
238
+ attention_masks_reshape.append(mask.contiguous())
239
+
240
+ downsample_attention_masks = []
241
+ for mask, (h, w) in zip(attention_masks_reshape, mask_shapes):
242
+ mask = mask[:, 0::2, 0::2]
243
+ mask = mask.reshape(
244
+ h // mask_size, w // mask_size, mask_size // 2 + mask_size % 2, mask_size // 2 + mask_size % 2
245
+ )
246
+ mask = mask.transpose(1, 2)
247
+ mask = mask.reshape(mask.size(0) * mask.size(1), mask.size(2) * mask.size(3))
248
+ downsample_attention_masks.append(mask)
249
+
250
+ num_img_tokens = [
251
+ 256 + 1 + int(mask.sum().item()) + int(mask[:, 0].sum().item()) + 16 for mask in downsample_attention_masks
252
+ ]
253
+
254
+ hd_images_reshape = [
255
+ torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)
256
+ ]
257
+ hd_masks_reshape = [
258
+ torch.cat([_global_mask] + [_mask], dim=0)
259
+ for _global_mask, _mask in zip(global_attention_mask, attention_masks_reshape)
260
+ ]
261
+ max_crops = max([img.size(0) for img in hd_images_reshape])
262
+ image_transformed = [self.pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape]
263
+ image_transformed = torch.stack(image_transformed, dim=0)
264
+ mask_transformed = [self.pad_mask_to_max_num_crops(mask, max_crops) for mask in hd_masks_reshape]
265
+ mask_transformed = torch.stack(mask_transformed, dim=0)
266
+
267
+ returned_input_image_embeds = image_transformed
268
+ returned_image_sizes = torch.tensor(shapes, dtype=torch.long)
269
+ returned_image_attention_mask = mask_transformed
270
+ returned_num_img_tokens = num_img_tokens
271
+
272
+ data = {
273
+ "image_pixel_values": returned_input_image_embeds,
274
+ "image_sizes": returned_image_sizes,
275
+ "image_attention_mask": returned_image_attention_mask,
276
+ "num_img_tokens": returned_num_img_tokens,
277
+ }
278
+
279
+ return BatchFeature(data=data, tensor_type=return_tensors)
280
+
281
+
282
+ __all__ = ["Phi4MultimodalImageProcessorFast"]
283
+
284
+ Phi4MultimodalImageProcessorFast.register_for_auto_class()
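
A minimal usage sketch for the fast image processor registered above. The repo id and the example image URL are assumptions for illustration only; calling the processor routes to the `preprocess` method added in this file, so the output keys are the ones in its returned `data` dict (`image_pixel_values`, `image_sizes`, `image_attention_mask`, `num_img_tokens`).

    from PIL import Image
    import requests
    from transformers import AutoImageProcessor

    repo_id = "microsoft/Phi-4-multimodal-instruct"  # assumed repo id for this conversion
    image_processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)

    url = "https://www.ilankelman.org/stopsigns/australia.jpg"  # example image reused from the processor docstring
    image = Image.open(requests.get(url, stream=True).raw)

    out = image_processor(image, return_tensors="pt")
    # Expected layout: (num_images, max_crops, 3, image_size, image_size)
    print(out["image_pixel_values"].shape)
    print(out["image_sizes"], out["num_img_tokens"])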
speech-lora/tokenizer.json → model-00001-of-00002.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:382cc235b56c725945e149cc25f191da667c836655efd0857b004320e90e91ea
3
- size 15524095
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57b93f5d0c9422c0b76b68119660187989bd8bb47848994376be3ac53eb61a95
3
+ size 4903637712
model-00001-of-00003.safetensors → model-00002-of-00002.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c46bb03332d82f6a3eaf85bd20af388dd4d4d68b198c2203c965c7381a466094
3
- size 4997504848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967
3
+ size 4584575136
model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3e812c0c8acef4e7f5e34d6c9f77a7640ee4a2b93ea351921365ac62f19918d
3
- size 4952333128
 
model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7be96b7339303752634b202d3f377bcf312a03046586eca6cea23347ace1e65a
3
- size 1199389232
 
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
modeling_phi4mm.py → modeling_phi4_multimodal.py RENAMED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -1,14 +1,21 @@
1
  {
2
  "auto_map": {
3
- "AutoProcessor": "processing_phi4mm.Phi4MMProcessor",
4
- "AutoImageProcessor": "processing_phi4mm.Phi4MMImageProcessor",
5
- "AutoFeatureExtractor": "processing_phi4mm.Phi4MMAudioFeatureExtractor"
6
  },
7
- "image_processor_type": "Phi4MMImageProcessor",
8
- "processor_class": "Phi4MMProcessor",
9
- "feature_extractor_type": "Phi4MMAudioFeatureExtractor",
10
  "audio_compression_rate": 8,
11
  "audio_downsample_rate": 1,
12
  "audio_feat_stride": 1,
13
- "dynamic_hd": 36
14
  }
 
1
  {
2
  "auto_map": {
3
+ "AutoProcessor": "processing_phi4_multimodal.Phi4MultimodalProcessor",
4
+ "AutoImageProcessor": "image_processing_phi4_multimodal_fast.Phi4MultimodalImageProcessorFast",
5
+ "AutoFeatureExtractor": "feature_extraction_phi4_multimodal.Phi4MultimodalFeatureExtractor"
6
  },
7
  "audio_compression_rate": 8,
8
  "audio_downsample_rate": 1,
9
  "audio_feat_stride": 1,
10
+ "feature_extractor_type": "Phi4MultimodalFeatureExtractor",
11
+ "feature_size": 80,
12
+ "hop_length": 160,
13
+ "n_fft": 512,
14
+ "padding_side": "right",
15
+ "padding_value": 0.0,
16
+ "preemphasis": 0.97,
17
+ "processor_class": "Phi4MultimodalProcessor",
18
+ "return_attention_mask": true,
19
+ "sampling_rate": 16000,
20
+ "win_length": 400
21
  }
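
The updated `auto_map` above routes `AutoProcessor`, `AutoImageProcessor`, and `AutoFeatureExtractor` to the new remote-code modules. A quick loading sketch, assuming this repo id and that the feature extractor exposes the audio front-end settings from this config as attributes:

    from transformers import AutoFeatureExtractor, AutoProcessor

    repo_id = "microsoft/Phi-4-multimodal-instruct"  # assumed repo id for this conversion
    processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id, trust_remote_code=True)

    print(type(processor).__name__)          # Phi4MultimodalProcessor
    print(type(feature_extractor).__name__)  # Phi4MultimodalFeatureExtractor
    # Audio front-end parameters taken from the config above (attribute names assumed)
    print(feature_extractor.sampling_rate, feature_extractor.n_fft, feature_extractor.hop_length)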
processing_phi4_multimodal.py ADDED
@@ -0,0 +1,541 @@
1
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Processor class for Phi4Multimodal
17
+ """
18
+
19
+ import re
20
+ import os
21
+ import requests
22
+ import base64
23
+ from io import BytesIO
24
+ from typing import List, Optional, Union, TypedDict
25
+
26
+ import librosa
27
+ import numpy as np
28
+ import PIL.Image
+ import PIL.ImageOps
29
+
30
+ from transformers.image_processing_utils import BatchFeature
31
+ from transformers.image_utils import ImageInput
32
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
33
+ from transformers.tokenization_utils_base import TextInput
34
+ from transformers.utils import logging
35
+
36
+
37
+ from .feature_extraction_phi4_multimodal import AudioInput
38
+
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+
43
+ class ChatTemplateLoadKwargs(TypedDict, total=False):
44
+ """
45
+ Keyword arguments used to load multimodal data in processor chat templates.
46
+
47
+ num_frames (`int`, *optional*):
48
+ Number of frames to sample uniformly. If not passed, the whole video is loaded.
49
+ video_load_backend (`str`, *optional*, defaults to `"pyav"`):
50
+ The backend to use when loading the video which will be used only when there are videos in the conversation.
51
+ Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav" because it is the only backend
52
+ that supports all types of sources to load from.
53
+ video_fps (`int`, *optional*):
54
+ Number of frames to sample per second. Should be passed only when `num_frames=None`.
55
+ If not specified and `num_frames==None`, all frames are sampled.
56
+ sample_indices_fn (`Callable`, *optional*):
57
+ A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
58
+ a different sampling technique than the one provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
59
+ If not provided, simple uniform sampling with fps is performed; otherwise `sample_indices_fn` has priority over the other args.
60
+ The function expects as input all the args and kwargs passed to `load_video` and should output valid
61
+ indices at which the video should be sampled. For example:
62
+
63
+ def sample_indices_fn(num_frames, fps, metadata, **kwargs):
64
+ # add your sampling logic here ...
65
+ return np.linspace(start_idx, end_idx, num_frames, dtype=int)
66
+ """
67
+
68
+ num_frames: Optional[int] = None
69
+ video_load_backend: Optional[str] = "pyav"
70
+ video_fps: Optional[int] = None
71
+ sampling_rate: Optional[int] = 16_000
72
+ load_audio_from_video: Optional[bool] = False
73
+
74
+
75
+ class AllKwargsForChatTemplate(
76
+ TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
77
+ ):
78
+ processor_kwargs: ProcessingKwargs = {
79
+ **ProcessingKwargs.__annotations__,
80
+ }
81
+ mm_load_kwargs: ChatTemplateLoadKwargs = {
82
+ **TextKwargs.__annotations__,
83
+ }
84
+ template_kwargs: ProcessorChatTemplateKwargs = {
85
+ **ProcessorChatTemplateKwargs.__annotations__,
86
+ }
87
+
88
+
89
+ class Phi4MultimodalProcessorKwargs(ProcessingKwargs, total=False):
90
+ _defaults = {
91
+ "audio_kwargs": {
92
+ "device": "cpu",
93
+ },
94
+ }
95
+
96
+
97
+ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray:
98
+ """
99
+ Loads `audio` to an np.ndarray object.
100
+
101
+ Args:
102
+ audio (`str` or `np.ndarray`):
103
+ The audio to be loaded into the numpy array format.
104
+ sampling_rate (`int`, *optional*, defaults to 16000):
105
+ The sampling rate to be used when loading the audio. It should be the same as the
106
+ sampling rate that the model you will be using was trained with.
107
+ timeout (`float`, *optional*):
108
+ The timeout value in seconds for the URL request.
109
+
110
+ Returns:
111
+ `np.ndarray`: A numpy array representing the audio.
112
+ """
113
+
114
+ if isinstance(audio, str):
115
+ # Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav)
116
+ if audio.startswith("http://") or audio.startswith("https://"):
117
+ audio = librosa.load(BytesIO(requests.get(audio, timeout=timeout).content), sr=sampling_rate)[0]
118
+ elif os.path.isfile(audio):
119
+ audio = librosa.load(audio, sr=sampling_rate)[0]
120
+ elif isinstance(audio, np.ndarray):
121
+ audio = audio
122
+ else:
123
+ raise TypeError(
124
+ "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array."
125
+ )
126
+ return audio
127
+
128
+
129
+ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
130
+ """
131
+ Loads `image` to a PIL Image.
132
+
133
+ Args:
134
+ image (`str` or `PIL.Image.Image`):
135
+ The image to convert to the PIL Image format.
136
+ timeout (`float`, *optional*):
137
+ The timeout value in seconds for the URL request.
138
+
139
+ Returns:
140
+ `PIL.Image.Image`: A PIL Image.
141
+ """
142
+ if isinstance(image, str):
143
+ if image.startswith("http://") or image.startswith("https://"):
144
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
145
+ # like http_huggingface_co.png
146
+ image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
147
+ elif os.path.isfile(image):
148
+ image = PIL.Image.open(image)
149
+ else:
150
+ if image.startswith("data:image/"):
151
+ image = image.split(",")[1]
152
+
153
+ # Try to load as base64
154
+ try:
155
+ b64 = base64.decodebytes(image.encode())
156
+ image = PIL.Image.open(BytesIO(b64))
157
+ except Exception as e:
158
+ raise ValueError(
159
+ f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
160
+ )
161
+ elif isinstance(image, PIL.Image.Image):
162
+ image = image
163
+ else:
164
+ raise TypeError(
165
+ "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
166
+ )
167
+ image = PIL.ImageOps.exif_transpose(image)
168
+ image = image.convert("RGB")
169
+ return image
170
+
171
+
172
+ class Phi4MultimodalProcessor(ProcessorMixin):
173
+ r"""
174
+ Constructs a Phi4Multimodal processor which wraps an image processor, an audio processor, and a GPT tokenizer into a single processor.
175
+
176
+ [`Phi4MultimodalProcessor`] offers all the functionalities of [`Phi4MultimodalImageProcessorFast`] and [`GPT2Tokenizer`]. See the
177
+ [`~Phi4MultimodalProcessor.__call__`] and [`~Phi4MultimodalProcessor.decode`] for more information.
178
+
179
+ Args:
180
+ image_processor (`Phi4MultimodalImageProcessorFast`):
181
+ The image processor to use for images.
182
+ audio_processor (`Phi4MultimodalFeatureExtractor`):
183
+ The audio processor to use for audio inputs.
184
+ tokenizer (`GPT2TokenizerFast`):
185
+ The tokenizer to use for text.
186
+ fake_image_token_pattern (`str`, *optional*, defaults to `r"<\|image_\d+\|>"`):
187
+ The fake image token pattern.
188
+ fake_audio_token_pattern (`str`, *optional*, defaults to `r"<\|audio_\d+\|>"`):
189
+ The fake audio token pattern.
190
+ """
191
+
192
+ attributes = ["image_processor", "audio_processor", "tokenizer"]
193
+ tokenizer_class = "GPT2TokenizerFast"
194
+ image_processor_class = "AutoImageProcessor"
195
+ audio_processor_class = "AutoFeatureExtractor"
196
+ valid_kwargs = ["chat_template"]
197
+
198
+ def __init__(
199
+ self,
200
+ image_processor,
201
+ audio_processor,
202
+ tokenizer,
203
+ **kwargs,
204
+ ):
205
+ self.image_token = tokenizer.image_token
206
+ self.image_token_id = tokenizer.image_token_id
207
+ self.audio_token = tokenizer.audio_token
208
+ self.audio_token_id = tokenizer.audio_token_id
209
+ super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
210
+
211
+ def __call__(
212
+ self,
213
+ text: Union[TextInput, List[TextInput]],
214
+ images: Optional[ImageInput] = None,
215
+ audio: Optional[AudioInput] = None,
216
+ **kwargs: Unpack[ProcessingKwargs],
217
+ ) -> BatchFeature:
218
+ """
219
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
220
+ and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
221
+ the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
222
+ Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the docstring
223
+ of the above two methods for more information.
224
+
225
+ Args:
226
+ text (`str`, `List[str]`, `List[List[str]]`):
227
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
228
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
229
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
230
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
231
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
232
+ tensor. Both channels-first and channels-last formats are supported.
233
+ audio (`List[Union[np.ndarray, torch.Tensor]]`):
234
+ List of the audios to be prepared.
235
+
236
+ Returns:
237
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
238
+
239
+ - **input_ids** -- List of token ids to be fed to a model.
240
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
241
+ - **image_pixel_values** -- Pixel values to be fed to a model.
242
+ - **image_sizes** -- List of tuples specifying the size of each image in `image_pixel_values`.
243
+ - **image_attention_mask** -- List of attention masks for each image in `image_pixel_values`.
244
+ - **input_audio_embeds** -- Audio embeddings to be fed to a model.
245
+ - **audio_embed_sizes** -- List of integers specifying the size of each audio in `input_audio_embeds`.
246
+ """
247
+
248
+ output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
249
+ image_kwargs = output_kwargs["images_kwargs"]
250
+ audio_kwargs = output_kwargs["audio_kwargs"]
251
+
252
+ image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
253
+ audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
254
+
255
+ # We pop here for images as we don't need it later
256
+ num_img_tokens = image_inputs.pop("num_img_tokens", [])
257
+ audio_embed_sizes = audio_inputs.get("audio_embed_sizes", [])
258
+
259
+ # Replace certain special tokens for compatibility
260
+ if isinstance(text, str):
261
+ text = [text]
262
+ elif not isinstance(text, list) and not isinstance(text[0], str):
263
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
264
+
265
+ image_token = self.tokenizer.image_token
266
+ audio_token = self.tokenizer.audio_token
267
+
268
+ # Check that the number of special tokens is sound
269
+ concatenated_prompt = "".join(text)
270
+ if concatenated_prompt.count(image_token) != len(num_img_tokens):
271
+ raise ValueError(
272
+ "You should add as many image tokens `<|image|>` in your prompt as you pass `images` to the processor. "
273
+ f"Input contains {concatenated_prompt.count(image_token)} tokens != {len(num_img_tokens)} images",
274
+ )
275
+ if concatenated_prompt.count(audio_token) != len(audio_embed_sizes):
276
+ raise ValueError(
277
+ "You should add as many audio tokens `<|audio|>` in your prompt as you pass `audios` to the processor. "
278
+ f"Input contains {concatenated_prompt.count(audio_token)} tokens != {len(audio_embed_sizes)} audios"
279
+ )
280
+
281
+ # Add the appropriate number of image/audio tokens (note that the number of replacements is dynamic)
282
+ image_count_iter = iter(num_img_tokens)
283
+ audio_count_iter = iter(audio_embed_sizes)
284
+ processed_text = [
285
+ re.sub(re.escape(image_token), lambda _: image_token * next(image_count_iter), t) for t in text
286
+ ]
287
+ processed_text = [
288
+ re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
289
+ ]
290
+
291
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
292
+ text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"])
293
+ self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"])
294
+
295
+ # prepare batch feature
296
+ data = {
297
+ **text_inputs,
298
+ **image_inputs,
299
+ **audio_inputs,
300
+ }
301
+
302
+ return BatchFeature(data=data, tensor_type=return_tensors)
303
+
304
+ def batch_decode(self, *args, **kwargs):
305
+ """
306
+ This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
307
+ refer to the docstring of this method for more information.
308
+ """
309
+ return self.tokenizer.batch_decode(*args, **kwargs)
310
+
311
+ def decode(self, *args, **kwargs):
312
+ """
313
+ This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
314
+ the docstring of this method for more information.
315
+ """
316
+ return self.tokenizer.decode(*args, **kwargs)
317
+
318
+ @property
319
+ def model_input_names(self):
320
+ tokenizer_input_names = self.tokenizer.model_input_names
321
+ image_processor_input_names = self.image_processor.model_input_names
322
+ audio_processor_input_names = self.audio_processor.model_input_names
323
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
324
+
325
+ def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
326
+ """
327
+ Checks that the number of special tokens in the text and the processed text is the same. The count can be different
328
+ if the tokenized text was truncated, leading to issues in the model code.
329
+ """
330
+ for modality in modalities:
331
+ token_str = getattr(self, f"{modality}_token")
332
+ token_id = getattr(self, f"{modality}_token_id")
333
+ ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
334
+ text_count = [sample.count(token_str) for sample in text]
335
+
336
+ if ids_count != text_count:
337
+ raise ValueError(
338
+ f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
339
+ "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
340
+ )
341
+
342
+ def apply_chat_template(
343
+ self,
344
+ conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
345
+ chat_template: Optional[str] = None,
346
+ **kwargs: Unpack[AllKwargsForChatTemplate],
347
+ ) -> str:
348
+ """
349
+ Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
350
+ conversations to turn them into a single tokenizable string.
351
+
352
+ The input is expected to be in the following format, where each message content is a list consisting of text and
353
+ optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
354
+ `pixel_values` when `return_dict=True`. If not provided, one will only get the formatted text, optionally tokenized.
355
+
356
+ conversation = [
357
+ {
358
+ "role": "user",
359
+ "content": [
360
+ {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
361
+ {"type": "text", "text": "Please describe this image in detail."},
362
+ ],
363
+ },
364
+ ]
365
+
366
+ Args:
367
+ conversation (`Union[List[Dict[str, str]], List[List[Dict[str, str]]]]`):
368
+ The conversation to format.
369
+ chat_template (`Optional[str]`, *optional*):
370
+ The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
371
+ chat template is used.
372
+ """
373
+
374
+ if chat_template is None:
375
+ if isinstance(self.chat_template, dict) and "default" in self.chat_template:
376
+ chat_template = self.chat_template["default"]
377
+ elif isinstance(self.chat_template, dict):
378
+ raise ValueError(
379
+ 'The processor has multiple chat templates but none of them are named "default". You need to specify'
380
+ " which one to use by passing the `chat_template` argument. Available templates are: "
381
+ f"{', '.join(self.chat_template.keys())}"
382
+ )
383
+ elif self.chat_template is not None:
384
+ chat_template = self.chat_template
385
+ else:
386
+ raise ValueError(
387
+ "Cannot use apply_chat_template because this processor does not have a chat template."
388
+ )
389
+ else:
390
+ if isinstance(self.chat_template, dict) and chat_template in self.chat_template:
391
+ # It's the name of a template, not a full template string
392
+ chat_template = self.chat_template[chat_template]
393
+ else:
394
+ # It's a template string, render it directly
395
+ chat_template = chat_template
396
+
397
+ # Fill sets of kwargs that should be used by different parts of template
398
+ processed_kwargs = {
399
+ "mm_load_kwargs": {},
400
+ "template_kwargs": {},
401
+ }
402
+
403
+ for kwarg_type in processed_kwargs:
404
+ for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys():
405
+ kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
406
+ default_value = getattr(kwarg_type_defaults, key, None)
407
+ value = kwargs.pop(key, default_value)
408
+ if value is not None and not isinstance(value, dict):
409
+ processed_kwargs[kwarg_type][key] = value
410
+
411
+ if isinstance(conversation, (list, tuple)) and (
412
+ isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
413
+ ):
414
+ is_batched = True
415
+ conversations = conversation
416
+ else:
417
+ is_batched = False
418
+ conversations = [conversation]
419
+
420
+ tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
421
+ return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
422
+ mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
423
+
424
+ if tokenize:
425
+ batch_images, batch_videos = [], []
426
+ batch_audios = []
427
+ batch_video_metadata = []
428
+ for conversation in conversations:
429
+ images, videos = [], []
430
+ video_metadata = []
431
+ for message in conversation:
432
+ visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
433
+ audio_fnames = [
434
+ content[key]
435
+ for content in message["content"]
436
+ for key in ["audio", "url", "path"]
437
+ if key in content and content["type"] == "audio"
438
+ ]
439
+ image_fnames = [
440
+ vision_info[key]
441
+ for vision_info in visuals
442
+ for key in ["image", "url", "path", "base64"]
443
+ if key in vision_info and vision_info["type"] == "image"
444
+ ]
445
+ video_fnames = [
446
+ vision_info[key]
447
+ for vision_info in visuals
448
+ for key in ["video", "url", "path"]
449
+ if key in vision_info and vision_info["type"] == "video"
450
+ ]
451
+
452
+ for fname in image_fnames:
453
+ images.append(load_image(fname))
454
+
455
+ # Audio models do not accept a nested list of audios (yet!) so we construct a flat input audio list
456
+ if not mm_load_kwargs["load_audio_from_video"]:
457
+ for fname in audio_fnames:
458
+ batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
459
+ else:
460
+ for fname in video_fnames:
461
+ batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
462
+
463
+ for fname in video_fnames:
464
+ if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
465
+ video = [np.array(load_image(image_fname)) for image_fname in fname]
466
+ # create a 4D video because `load_video` always returns a 4D array
467
+ video = np.stack(video)
468
+ metadata = None
469
+ logger.warning(
470
+ "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
471
+ "If your model uses this metadata during processing, please load the whole video and let the model sample frames instead."
472
+ )
473
+ else:
474
+ # TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added
475
+ video, metadata = self._load_video_for_model(
476
+ fname,
477
+ num_frames=mm_load_kwargs.get("num_frames", None),
478
+ fps=mm_load_kwargs.get("video_fps", None),
479
+ backend=mm_load_kwargs["video_load_backend"],
480
+ **kwargs,
481
+ )
482
+ videos.append(video)
483
+ video_metadata.append(metadata)
484
+
485
+ # Currently all processors can accept nested list of batches, but not flat list of visuals
486
+ # So we'll make a batched list of images and let the processor handle it
487
+ if images:
488
+ batch_images.append(images)
489
+ if videos:
490
+ batch_videos.append(videos)
491
+ batch_video_metadata.append(video_metadata)
492
+
493
+ # Process conversation with video/image information if needed. Then convert into a prompt using Jinja template
494
+ conversations = self._process_messages_for_chat_template(
495
+ conversations,
496
+ batch_images=batch_images,
497
+ batch_videos=batch_videos,
498
+ batch_video_metadata=batch_video_metadata,
499
+ **processed_kwargs["mm_load_kwargs"],
500
+ )
501
+
502
+ prompt = self.tokenizer.apply_chat_template(
503
+ conversations,
504
+ chat_template=chat_template,
505
+ tokenize=False,
506
+ return_dict=False,
507
+ **processed_kwargs["template_kwargs"],
508
+ )
509
+
510
+ if not is_batched:
511
+ prompt = prompt[0]
512
+
513
+ if tokenize:
514
+ # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
515
+ # But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
516
+ # and pass it to the processor. Users thus never worried about special tokens relying on processor handling
517
+ # everything internally. The below line is to keep BC for that and be able to work with models that have
518
+ # special tokens in the template (consistent with tokenizers). We don't want to raise a warning, as it would flood the command line
519
+ # without an actionable solution for users
520
+ single_prompt = prompt[0] if is_batched else prompt
521
+ if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
522
+ kwargs["add_special_tokens"] = False
523
+
524
+ out = self(
525
+ text=prompt,
526
+ images=batch_images if batch_images else None,
527
+ videos=batch_videos if batch_videos else None,
528
+ audio=batch_audios if batch_audios else None,
529
+ **kwargs,
530
+ )
531
+ if return_dict:
532
+ return out
533
+ else:
534
+ return out["input_ids"]
535
+ return prompt
536
+
537
+
538
+ __all__ = ["Phi4MultimodalProcessor"]
539
+
540
+
541
+ Phi4MultimodalProcessor.register_for_auto_class()
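
A short end-to-end sketch tying the new processor together, reusing the conversation example from the `apply_chat_template` docstring above. The repo id is assumed; with the default `tokenize=False` the template only renders the prompt string, and the processor call then expands the `<|image|>` placeholder, so the exact output keys are those merged from the tokenizer, image processor, and feature extractor in `__call__`.

    from PIL import Image
    import requests
    from transformers import AutoProcessor

    repo_id = "microsoft/Phi-4-multimodal-instruct"  # assumed repo id for this conversion
    processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
                {"type": "text", "text": "Please describe this image in detail."},
            ],
        },
    ]

    # Render the conversation to a prompt string containing a single <|image|> placeholder
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Load the image ourselves and let __call__ expand the placeholder to num_img_tokens tokens
    image = Image.open(requests.get(conversation[0]["content"][0]["image"], stream=True).raw)
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    print(inputs.keys())  # e.g. input_ids, attention_mask, image_pixel_values, image_sizes, image_attention_mask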
processing_phi4mm.py DELETED
@@ -1,733 +0,0 @@
1
- # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- """
16
- Processor class for Phi4MM
17
- """
18
- import re
19
- from typing import List, Optional, Tuple, Union
20
- import math
21
- from enum import Enum
22
-
23
- import numpy as np
24
- import scipy
25
- import torch
26
- import torchvision
27
-
28
- from transformers import AutoFeatureExtractor, AutoImageProcessor
29
- from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
30
- from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
31
- from transformers.image_utils import (
32
- ImageInput,
33
- make_list_of_images,
34
- valid_images,
35
- )
36
- from transformers.processing_utils import ProcessorMixin
37
- from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
38
- from transformers.utils import TensorType, logging
39
- from torch.nn.utils.rnn import pad_sequence
40
-
41
-
42
- logger = logging.get_logger(__name__)
43
-
44
- # Special tokens
45
- _COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN = r'<\|image_\d+\|>' # For backward compatibility
46
- _COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN = r'<\|audio_\d+\|>' # For backward compatibility
47
- _IMAGE_SPECIAL_TOKEN = '<|endoftext10|>'
48
- _AUDIO_SPECIAL_TOKEN = '<|endoftext11|>'
49
- _IMAGE_SPECIAL_TOKEN_ID = 200010 # '<|endoftext10|>', or we can better name it (in `tokenizer_config.json`)
50
- _AUDIO_SPECIAL_TOKEN_ID = 200011 # '<|endoftext11|>'
51
-
52
-
53
- class InputMode(Enum):
54
- LANGUAGE = 0
55
- VISION = 1
56
- SPEECH = 2
57
- VISION_SPEECH = 3
58
-
59
-
60
- class Phi4MMImageProcessor(BaseImageProcessor):
61
- r"""
62
- Constructs a Phi4MM image processor.
63
- """
64
- model_input_names = ["input_image_embeds", "image_sizes", "image_attention_mask"]
65
-
66
- def __init__(
67
- self,
68
- dynamic_hd,
69
- **kwargs,
70
- ) -> None:
71
- super().__init__(**kwargs)
72
- self.dynamic_hd = dynamic_hd
73
-
74
- def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
75
- best_ratio_diff = float('inf')
76
- best_ratio = (1, 1)
77
- area = width * height
78
- for ratio in target_ratios:
79
- target_aspect_ratio = ratio[0] / ratio[1]
80
- ratio_diff = abs(aspect_ratio - target_aspect_ratio)
81
- if ratio_diff < best_ratio_diff:
82
- best_ratio_diff = ratio_diff
83
- best_ratio = ratio
84
- elif ratio_diff == best_ratio_diff:
85
- if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
86
- best_ratio = ratio
87
- return best_ratio
88
-
89
- def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=384, mask_size=27, use_thumbnail=True):
90
- orig_width, orig_height = image.size
91
-
92
- w_crop_num = math.ceil(orig_width/float(image_size))
93
- h_crop_num = math.ceil(orig_height/float(image_size))
94
- if w_crop_num * h_crop_num > max_num:
95
-
96
- aspect_ratio = orig_width / orig_height
97
-
98
- # calculate the existing image aspect ratio
99
- target_ratios = set(
100
- (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
101
- i * j <= max_num and i * j >= min_num)
102
- target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
103
-
104
- # find the closest aspect ratio to the target
105
- target_aspect_ratio = self.find_closest_aspect_ratio(
106
- aspect_ratio, target_ratios, orig_width, orig_height, image_size)
107
-
108
- # calculate the target width and height
109
- target_width = image_size * target_aspect_ratio[0]
110
- target_height = image_size * target_aspect_ratio[1]
111
- else:
112
- target_width = image_size * w_crop_num
113
- target_height = image_size * h_crop_num
114
- target_aspect_ratio = (w_crop_num, h_crop_num)
115
-
116
- # Calculate the ratio
117
- ratio_width = target_width / orig_width
118
- ratio_height = target_height / orig_height
119
- if ratio_width < ratio_height:
120
- new_size = (target_width, int(orig_height * ratio_width))
121
- padding_width = 0
122
- padding_height = target_height - int(orig_height * ratio_width)
123
- else:
124
- new_size = (int(orig_width * ratio_height), target_height)
125
- padding_width = target_width - int(orig_width * ratio_height)
126
- padding_height = 0
127
-
128
- attention_mask = torch.ones((int(mask_size*target_aspect_ratio[1]), int(mask_size*target_aspect_ratio[0])))
129
- if padding_width >= 14:
130
- attention_mask[:, -math.floor(padding_width/14):] = 0
131
- if padding_height >= 14:
132
- attention_mask[-math.floor(padding_height/14):,:] = 0
133
- assert attention_mask.sum() > 0
134
-
135
- if min(new_size[1], target_height) < 10 or min(new_size[0], target_width) < 10:
136
- raise ValueError(f'the aspect ratio is very extreme {new_size}')
137
-
138
- image = torchvision.transforms.functional.resize(image, [new_size[1], new_size[0]],)
139
-
140
- resized_img = torchvision.transforms.functional.pad(image, [0, 0, padding_width, padding_height], fill=[255,255,255])
141
-
142
- return resized_img, attention_mask
143
-
144
- def pad_to_max_num_crops(self, images, max_crops=5):
145
- """
146
- images: B x 3 x H x W, B<=max_crops
147
- """
148
- B, _, H, W = images.shape
149
- if B < max_crops:
150
- pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
151
- images = torch.cat([images, pad], dim=0)
152
- return images
153
-
154
- def pad_mask_to_max_num_crops(self, masks, max_crops=5):
155
- B, H, W = masks.shape
156
- if B < max_crops:
157
- pad = torch.ones(max_crops - B, H, W, dtype=masks.dtype, device=masks.device)
158
- masks = torch.cat([masks, pad], dim=0)
159
- return masks
160
-
161
- def preprocess(
162
- self,
163
- images: ImageInput,
164
- return_tensors: Optional[Union[str, TensorType]] = None,
165
- ):
166
- """
167
- Args:
168
- images (`ImageInput`):
169
- Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
170
- passing in images with pixel values between 0 and 1, set `do_rescale=False`.
171
- return_tensors (`str` or `TensorType`, *optional*):
172
- The type of tensors to return. Can be one of:
173
- - Unset: Return a list of `np.ndarray`.
174
- - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
175
- - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
176
- - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
177
- - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
178
- """
179
- images = make_list_of_images(images)
180
-
181
- if not valid_images(images):
182
- raise ValueError(
183
- "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
184
- "torch.Tensor, tf.Tensor or jax.ndarray."
185
- )
186
-
187
- # Basic settings.
188
- img_processor = torchvision.transforms.Compose([
189
- torchvision.transforms.ToTensor(),
190
- torchvision.transforms.Normalize(
191
- (0.5, 0.5, 0.5),
192
- (0.5, 0.5, 0.5)
193
- ),
194
- ])
195
- dyhd_base_resolution = 448
196
-
197
- # Dynamic HD
198
- base_resolution = dyhd_base_resolution
199
- images = [image.convert('RGB') for image in images]
200
- # cover 384 and 448 resolution
201
- mask_resolution = base_resolution // 14
202
- elems, image_attention_masks = [], []
203
- for im in images:
204
- elem, attention_mask = self.dynamic_preprocess(im, max_num=self.dynamic_hd, image_size=base_resolution, mask_size=mask_resolution)
205
- elems.append(elem)
206
- image_attention_masks.append(attention_mask)
207
- hd_images = [img_processor(im) for im in elems]
208
- global_image = [torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(base_resolution, base_resolution), mode='bicubic',).to(im.dtype) for im in hd_images]
209
- shapes = [[im.size(1), im.size(2)] for im in hd_images]
210
- mask_shapes = [[mask.size(0), mask.size(1)] for mask in image_attention_masks]
211
- global_attention_mask = [torch.ones((1, mask_resolution, mask_resolution)) for _ in hd_images]
212
- hd_images_reshape = [im.reshape(1, 3,
213
- h//base_resolution,
214
- base_resolution,
215
- w//base_resolution,
216
- base_resolution
217
- ).permute(0,2,4,1,3,5).reshape(-1, 3, base_resolution, base_resolution).contiguous() for im, (h, w) in zip(hd_images, shapes)]
218
- attention_masks_reshape = [mask.reshape(1,
219
- h//mask_resolution,
220
- mask_resolution,
221
- w//mask_resolution,
222
- mask_resolution
223
- ).permute(0,1,3,2,4).reshape(-1, mask_resolution, mask_resolution).contiguous() for mask, (h, w) in zip(image_attention_masks, mask_shapes)]
224
- downsample_attention_masks = [mask[:,0::2,0::2].reshape(1,
225
- h//mask_resolution,
226
- w//mask_resolution,
227
- mask_resolution//2+mask_resolution%2,
228
- mask_resolution//2+mask_resolution%2
229
- ).permute(0,1,3,2,4) for mask, (h,w) in zip(attention_masks_reshape, mask_shapes)]
230
- downsample_attention_masks = [mask.reshape(mask.size(1)*mask.size(2), mask.size(3)*mask.size(4))for mask in downsample_attention_masks]
231
- num_img_tokens = [256 + 1 + int(mask.sum().item()) + int(mask[:,0].sum().item()) + 16 for mask in downsample_attention_masks]
232
-
233
- hd_images_reshape = [torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)]
234
- hd_masks_reshape = [torch.cat([_global_mask] + [_mask], dim=0) for _global_mask, _mask in zip(global_attention_mask, attention_masks_reshape)]
235
- max_crops = max([img.size(0) for img in hd_images_reshape])
236
- image_transformed = [self.pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape]
237
- image_transformed = torch.stack(image_transformed, dim=0)
238
- mask_transformed = [self.pad_mask_to_max_num_crops(mask, max_crops) for mask in hd_masks_reshape]
239
- mask_transformed = torch.stack(mask_transformed, dim=0)
240
-
241
- returned_input_image_embeds = image_transformed
242
- returned_image_sizes = torch.tensor(shapes, dtype=torch.long)
243
- returned_image_attention_mask = mask_transformed
244
- returned_num_img_tokens = num_img_tokens
245
-
246
- data = {
247
- "input_image_embeds": returned_input_image_embeds,
248
- "image_sizes": returned_image_sizes,
249
- "image_attention_mask": returned_image_attention_mask,
250
- "num_img_tokens": returned_num_img_tokens,
251
- }
252
-
253
- return BatchFeature(data=data, tensor_type=return_tensors)
254
-
255
-
256
- AudioInput = Tuple[Union[np.ndarray, torch.Tensor], int]
257
- AudioInputs = List[AudioInput]
258
-
259
-
260
- def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
261
- """Create a Mel filter-bank the same as SpeechLib FbankFC.
262
-
263
- Args:
264
- sample_rate (int): Sample rate in Hz. number > 0 [scalar]
265
- n_fft (int): FFT size. int > 0 [scalar]
266
- n_mel (int): Mel filter size. int > 0 [scalar]
267
- fmin (float): lowest frequency (in Hz). If None use 0.0.
268
- float >= 0 [scalar]
269
- fmax: highest frequency (in Hz). If None use sample_rate / 2.
270
- float >= 0 [scalar]
271
-
272
- Returns
273
- out (numpy.ndarray): Mel transform matrix
274
- [shape=(n_mels, 1 + n_fft/2)]
275
- """
276
-
277
- bank_width = int(n_fft // 2 + 1)
278
- if fmax is None:
279
- fmax = sample_rate / 2
280
- if fmin is None:
281
- fmin = 0
282
- assert fmin >= 0, "fmin cannot be negtive"
283
- assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"
284
-
285
- def mel(f):
286
- return 1127.0 * np.log(1.0 + f / 700.0)
287
-
288
- def bin2mel(fft_bin):
289
- return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))
290
-
291
- def f2bin(f):
292
- return int((f * n_fft / sample_rate) + 0.5)
293
-
294
- # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
295
- klo = f2bin(fmin) + 1
296
- khi = f2bin(fmax)
297
-
298
- khi = max(khi, klo)
299
-
300
- # Spec 2: SpeechLib uses trianges in Mel space
301
- mlo = mel(fmin)
302
- mhi = mel(fmax)
303
- m_centers = np.linspace(mlo, mhi, n_mels + 2)
304
- ms = (mhi - mlo) / (n_mels + 1)
305
-
306
- matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
307
- for m in range(0, n_mels):
308
- left = m_centers[m]
309
- center = m_centers[m + 1]
310
- right = m_centers[m + 2]
311
- for fft_bin in range(klo, khi):
312
- mbin = bin2mel(fft_bin)
313
- if left < mbin < right:
314
- matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms
315
-
316
- return matrix
317
-
318
-
319
- class Phi4MMAudioFeatureExtractor(SequenceFeatureExtractor):
320
- model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
321
-
322
- def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
323
- feature_size = 80
324
- sampling_rate = 16000
325
- padding_value = 0.0
326
- super().__init__(feature_size, sampling_rate, padding_value, **kwargs)
327
-
328
- self.compression_rate = audio_compression_rate
329
- self.qformer_compression_rate = audio_downsample_rate
330
- self.feat_stride = audio_feat_stride
331
-
332
- self._eightk_method = "fillzero"
333
- self._mel = speechlib_mel(16000, 512, 80, fmin=None, fmax=7690).T
334
-
335
- self._hamming400 = np.hamming(400) # for 16k audio
336
- self._hamming200 = np.hamming(200) # for 8k audio
337
-
338
- def duration_to_frames(self, duration):
339
- """duration in s, estimated frames"""
340
- frame_rate = 10
341
-
342
- num_frames = duration * 1000 // frame_rate
343
- return num_frames
344
-
345
- def __call__(
346
- self,
347
- audios: List[AudioInput],
348
- return_tensors: Optional[Union[str, TensorType]] = None,
349
- ):
350
- # Ref: https://github.com/huggingface/transformers/blob/v4.47.0/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py#L161
351
- returned_input_audio_embeds = []
352
- returned_audio_embed_sizes = []
353
- audio_frames_list = []
354
-
355
- for audio_data, sample_rate in audios:
356
- audio_embeds = self._extract_features(audio_data, sample_rate)
357
- audio_frames = len(audio_embeds) * self.feat_stride
358
- audio_embed_size = self._compute_audio_embed_size(audio_frames)
359
-
360
- returned_input_audio_embeds.append(torch.tensor(audio_embeds))
361
- returned_audio_embed_sizes.append(torch.tensor(audio_embed_size).long())
362
- audio_frames_list.append(audio_frames)
363
-
364
- returned_input_audio_embeds = pad_sequence(
365
- returned_input_audio_embeds, batch_first=True
366
- )
367
- returned_audio_embed_sizes = torch.stack(returned_audio_embed_sizes, dim=0)
368
- audio_frames = torch.tensor(audio_frames_list)
369
- returned_audio_attention_mask = torch.arange(0, audio_frames.max()).unsqueeze(0) < audio_frames.unsqueeze(1) if len(audios) > 1 else None
370
-
371
- data = {
372
- "input_audio_embeds": returned_input_audio_embeds,
373
- "audio_embed_sizes": returned_audio_embed_sizes,
374
- }
375
- if returned_audio_attention_mask is not None:
376
- data["audio_attention_mask"] = returned_audio_attention_mask
377
-
378
- return BatchFeature(data=data, tensor_type=return_tensors)
379
-
380
- def _extract_spectrogram(self, wav, fs):
381
- """Extract spectrogram features from waveform.
382
- Args:
383
- wav (1D array): waveform of the input
384
- fs (int): sampling rate of the waveform, 16000 or 8000.
385
- If fs=8000, the waveform will be resampled to 16000Hz.
386
- Output:
387
- log_fbank (2D array): a TxD matrix of log Mel filterbank features.
388
- D=80, and T is the number of frames.
389
- """
390
- if wav.ndim > 1:
391
- wav = np.squeeze(wav)
392
-
393
- # by default, we extract the mean if stereo
394
- if len(wav.shape) == 2:
395
- wav = wav.mean(1)
396
-
397
- # Resample to 16000 or 8000 if needed
398
- if fs > 16000:
399
- wav = scipy.signal.resample_poly(wav, 1, fs // 16000)
400
- fs = 16000
401
- elif 8000 < fs < 16000:
402
- wav = scipy.signal.resample_poly(wav, 1, fs // 8000)
403
- fs = 8000
404
- elif fs < 8000:
405
- raise RuntimeError(f"Unsupported sample rate {fs}")
406
-
407
- if fs == 8000:
408
- if self._eightk_method == "resample":
409
- # Input audio is 8 kHz. Convert to 16 kHz before feature
410
- # extraction
411
- wav = scipy.signal.resample_poly(wav, 2, 1)
412
- fs = 16000
413
- # Do nothing here for fillzero method
414
- elif fs != 16000:
415
- # Input audio is not a supported sample rate.
416
- raise RuntimeError(f"Input data using an unsupported sample rate: {fs}")
417
-
418
- preemphasis = 0.97
419
-
420
- if fs == 8000:
421
- n_fft = 256
422
- win_length = 200
423
- hop_length = 80
424
- fft_window = self._hamming200
425
- elif fs == 16000:
426
- n_fft = 512
427
- win_length = 400
428
- hop_length = 160
429
- fft_window = self._hamming400
430
-
431
- # Spec 1: SpeechLib cut remaining sample insufficient for a hop
432
- n_batch = (wav.shape[0] - win_length) // hop_length + 1
433
- # Here we don't use stride_tricks since the input array may not satisfy
434
- # memory layout requirement and we need writeable output
435
- # Here we only use list of views before copy to desination
436
- # so it is more efficient than broadcasting
437
- y_frames = np.array(
438
- [wav[_stride : _stride + win_length] for _stride in range(0, hop_length * n_batch, hop_length)],
439
- dtype=np.float32,
440
- )
441
-
442
- # Spec 2: SpeechLib applies preemphasis within each batch
443
- y_frames_prev = np.roll(y_frames, 1, axis=1)
444
- y_frames_prev[:, 0] = y_frames_prev[:, 1]
445
- y_frames = (y_frames - preemphasis * y_frames_prev) * 32768
446
-
447
- S = np.fft.rfft(fft_window * y_frames, n=n_fft, axis=1).astype(np.complex64)
448
-
449
- if fs == 8000:
450
- # Need to pad the output to look like 16 kHz data but with zeros in
451
- # the 4 to 8 kHz bins.
452
- frames, bins = S.shape
453
- padarray = np.zeros((frames, bins))
454
- S = np.concatenate((S[:, 0:-1], padarray), axis=1) # Nyquist bin gets set to zero
455
-
456
- spec = np.abs(S).astype(np.float32)
457
- return spec
458
-
459
- def _extract_features(self, wav, fs):
460
- """Extract log filterbank features from waveform.
461
- Args:
462
- wav (1D array): waveform of the input
463
- fs (int): sampling rate of the waveform, 16000 or 8000.
464
- If fs=8000, the waveform will be resampled to 16000Hz.
465
- Output:
466
- log_fbank (2D array): a TxD matrix of log Mel filterbank features.
467
- D=80, and T is the number of frames.
468
- """
469
- spec = self._extract_spectrogram(wav, fs)
470
- spec_power = spec**2
471
-
472
- fbank_power = np.clip(spec_power.dot(self._mel), 1.0, None)
473
- log_fbank = np.log(fbank_power).astype(np.float32)
474
-
475
- return log_fbank
476
-
477
- def _compute_audio_embed_size(self, audio_frames):
478
- integer = audio_frames // self.compression_rate
479
- remainder = audio_frames % self.compression_rate
480
-
481
- result = integer if remainder == 0 else integer + 1
482
-
483
- integer = result // self.qformer_compression_rate
484
- remainder = result % self.qformer_compression_rate
485
- result = integer if remainder == 0 else integer + 1 # qformer compression
486
-
487
- return result
488
-
489
-
490
- class Phi4MMProcessor(ProcessorMixin):
491
- r"""
492
- Constructs a Phi4MM processor which raps an image processor, a audio processor, and a GPT tokenizer into a single processor.
493
-
494
- [`Phi4MMProcessor`] offers all the functionalities of [`Phi4MMImageProcessor`] and [`GPT2Tokenizer`]. See the
495
- [`~Phi4MMProcessor.__call__`] and [`~Phi4MMProcessor.decode`] for more information.
496
-
497
- Args:
498
- image_processor ([`Phi4MMImageProcessor`], *optional*):
499
- The image processor is a required input.
500
- tokenizer ([`GPT2Tokenizer`], *optional*):
501
- The tokenizer is a required input.
502
- """
503
-
504
- attributes = ["image_processor", "audio_processor", "tokenizer"]
505
- tokenizer_class = "GPT2TokenizerFast"
506
- image_processor_class = "AutoImageProcessor" # Phi4MMImageProcessor will be registered later
507
- audio_processor_class = "AutoFeatureExtractor" # Phi4MMAudioFeatureExtractor will be registered later
508
-
509
- def __init__(self, image_processor, audio_processor, tokenizer):
510
- self.image_processor = image_processor
511
- self.audio_processor = audio_processor
512
- self.tokenizer = tokenizer
513
-
514
- def __call__(
515
- self,
516
- text: Union[TextInput, List[TextInput]],
517
- images: Optional[ImageInput] = None,
518
- audios: Optional[AudioInputs] = None,
519
- padding: Union[bool, str, PaddingStrategy] = False,
520
- truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
521
- max_length=None,
522
- return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
523
- ) -> BatchFeature:
524
- """
525
- Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text`
526
- and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
527
- the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
528
- Phi4MMImageProcessor's [`~Phi4MMImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
529
- of the above two methods for more information.
530
-
531
- Args:
532
- text (`str`, `List[str]`, `List[List[str]]`):
533
- The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
534
- (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
535
- `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
536
- images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
537
- The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
538
- tensor. Both channels-first and channels-last formats are supported.
539
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
540
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
541
- index) among:
542
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
543
- sequence if provided).
544
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
545
- acceptable input length for the model if that argument is not provided.
546
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
547
- lengths).
548
- max_length (`int`, *optional*):
549
- Maximum length of the returned list and optionally padding length (see above).
550
- truncation (`bool`, *optional*):
551
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
552
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
553
- If set, will return tensors of a particular framework. Acceptable values are:
554
-
555
- - `'tf'`: Return TensorFlow `tf.constant` objects.
556
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
557
- - `'np'`: Return NumPy `np.ndarray` objects.
558
- - `'jax'`: Return JAX `jnp.ndarray` objects.
559
-
560
- Returns:
561
- [`BatchFeature`]: A [`BatchFeature`] with the following fields:
562
-
563
- - **input_ids** -- List of token ids to be fed to a model.
564
- - **input_image_embeds** -- Pixel values to be fed to a model.
565
- - **image_sizes** -- List of tuples specifying the size of each image in `input_image_embeds`.
566
- - **image_attention_mask** -- List of attention masks for each image in `input_image_embeds`.
567
- - **input_audio_embeds** -- Audio embeddings to be fed to a model.
568
- - **audio_embed_sizes** -- List of integers specifying the size of each audio in `input_audio_embeds`.
569
- - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
570
- """
571
- image_inputs = self.image_processor(images, return_tensors=return_tensors) if images is not None else {}
572
- audio_inputs = self.audio_processor(audios, return_tensors=return_tensors) if audios is not None else {}
573
- inputs = self._convert_images_audios_text_to_inputs(
574
- image_inputs,
575
- audio_inputs,
576
- text,
577
- padding=padding,
578
- truncation=truncation,
579
- max_length=max_length,
580
- return_tensors=return_tensors,
581
- )
582
-
583
- # idenfity the input mode
584
- if len(image_inputs) > 0 and len(audio_inputs) > 0:
585
- input_mode = InputMode.VISION_SPEECH
586
- elif len(image_inputs) > 0:
587
- input_mode = InputMode.VISION
588
- elif len(audio_inputs) > 0:
589
- input_mode = InputMode.SPEECH
590
- else:
591
- input_mode = InputMode.LANGUAGE
592
- inputs["input_mode"] = torch.tensor([input_mode.value], dtype=torch.long)
593
-
594
- return inputs
595
-
596
- @property
597
- def special_image_token_id(self):
598
- return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
599
-
600
- def get_special_image_token_id(self):
601
- return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
602
-
603
- @property
604
- def chat_template(self):
605
- return self.tokenizer.chat_template
606
-
607
- def _convert_images_audios_text_to_inputs(
608
- self, images, audios, text, padding=False, truncation=None, max_length=None, return_tensors=None
609
- ):
610
- # prepare image id to image input ids
611
- if len(images) > 0:
612
- input_image_embeds = images["input_image_embeds"]
613
- image_sizes = images["image_sizes"]
614
- image_attention_mask = images["image_attention_mask"]
615
- num_img_tokens = images['num_img_tokens']
616
- else:
617
- input_image_embeds = torch.tensor([])
618
- image_sizes = torch.tensor([])
619
- image_attention_mask = torch.tensor([])
620
- num_img_tokens = []
621
-
622
- # prepare audio id to audio input ids
623
- if len(audios) > 0:
624
- input_audio_embeds = audios["input_audio_embeds"]
625
- audio_embed_sizes = audios["audio_embed_sizes"]
626
- audio_attention_mask = audios.get("audio_attention_mask", None)
627
- else:
628
- input_audio_embeds = torch.tensor([])
629
- audio_embed_sizes = torch.tensor([])
630
- audio_attention_mask = None
631
-
632
- # Replace certain special tokens for compatibility
633
- # Ref: https://stackoverflow.com/questions/11475885/python-replace-regex
634
- if isinstance(text, str):
635
- text = [text]
636
- assert isinstance(text, list)
637
- processed_text = [re.sub(_COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN, _IMAGE_SPECIAL_TOKEN, t) for t in text]
638
- processed_text = [re.sub(_COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN, _AUDIO_SPECIAL_TOKEN, t) for t in processed_text]
639
-
640
- input_ids_list = [self.tokenizer(t).input_ids for t in processed_text]
641
-
642
- img_cnt, audio_cnt = 0, 0 # only needed for later assertion
643
- image_token_count_iter = iter(num_img_tokens)
644
- audio_embed_size_iter = iter(audio_embed_sizes.tolist())
645
- new_input_ids_list = []
646
- for input_ids in input_ids_list:
647
- i = 0
648
- while i < len(input_ids):
649
- token_id = input_ids[i]
650
- if token_id == _AUDIO_SPECIAL_TOKEN_ID:
651
- token_count = next(audio_embed_size_iter)
652
- audio_cnt += 1
653
- elif token_id == _IMAGE_SPECIAL_TOKEN_ID:
654
- token_count = next(image_token_count_iter)
655
- img_cnt += 1
656
- else:
657
- i += 1
658
- continue
659
- tokens = [token_id] * token_count
660
- input_ids = input_ids[:i] + tokens + input_ids[i + 1:]
661
- i += token_count
662
- input_ids = torch.tensor(input_ids, dtype=torch.long)
663
- new_input_ids_list.append(input_ids)
664
- lengths = torch.tensor([len(input_ids) for input_ids in new_input_ids_list])
665
- max_len = lengths.max()
666
- input_ids = input_ids.new_full((len(new_input_ids_list), max_len), self.tokenizer.pad_token_id)
667
- # batched inference requires left padding
668
- for i in range(len(new_input_ids_list)):
669
- input_ids[i, max_len - len(new_input_ids_list[i]):] = new_input_ids_list[i]
670
-
671
- # If the assertion below fails, the pure-text input messages may
672
- # literally contain the image/audio special tokens
673
- # (<|endoftext10|>, <|endoftext11|>).
674
- assert (
675
- img_cnt == len(num_img_tokens)
676
- ), (
677
- f"Number of image tokens in prompt_token_ids ({img_cnt}) "
678
- f"does not match number of images ({len(num_img_tokens)})"
679
- )
680
- assert (
681
- audio_cnt == len(audio_embed_sizes)
682
- ), (
683
- f"Number of audio tokens in prompt_token_ids ({audio_cnt}) "
684
- f"does not match number of audios ({len(audio_embed_sizes)})"
685
- )
686
-
687
- # prepare attention mask
688
- seq_range = torch.arange(max_len - 1, -1, -1)
689
- attention_mask = seq_range.unsqueeze(0) < lengths.unsqueeze(1)
690
-
691
- # prepare batch feature
692
- data = {
693
- "input_ids": input_ids,
694
- "input_image_embeds": input_image_embeds,
695
- "image_sizes": image_sizes,
696
- "image_attention_mask": image_attention_mask,
697
- "input_audio_embeds": input_audio_embeds,
698
- "audio_embed_sizes": audio_embed_sizes,
699
- "audio_attention_mask": audio_attention_mask,
700
- "attention_mask": attention_mask,
701
- }
702
-
703
- return BatchFeature(
704
- data=data
705
- )
706
-
707
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
708
- def batch_decode(self, *args, **kwargs):
709
- """
710
- This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
711
- refer to the docstring of this method for more information.
712
- """
713
- return self.tokenizer.batch_decode(*args, **kwargs)
714
-
715
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
716
- def decode(self, *args, **kwargs):
717
- """
718
- This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
719
- the docstring of this method for more information.
720
- """
721
- return self.tokenizer.decode(*args, **kwargs)
722
-
723
- @property
724
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
725
- def model_input_names(self):
726
- tokenizer_input_names = self.tokenizer.model_input_names
727
- image_processor_input_names = self.image_processor.model_input_names
728
- audio_processor_input_names = self.audio_processor.model_input_names
729
- return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
730
-
731
-
732
- AutoImageProcessor.register("Phi4MMImageProcessor", Phi4MMImageProcessor)
733
- AutoFeatureExtractor.register("Phi4MMAudioFeatureExtractor", Phi4MMAudioFeatureExtractor)
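For reference, a minimal usage sketch of the remote-code processor whose tail is deleted above, showing the BatchFeature fields its __call__ docstring lists. It assumes the pre-conversion checkpoint is loaded with trust_remote_code=True; the checkpoint id, image URL, and audio file name are reused from the sample scripts below and are illustrative only.

import requests
import soundfile
from PIL import Image
from transformers import AutoProcessor

# Load the legacy remote-code processor (assumption: pre-conversion checkpoint id).
processor = AutoProcessor.from_pretrained('microsoft/Phi-4-multimodal-instruct', trust_remote_code=True)

# Prompt with one image and one audio placeholder, in the <|user|>...<|end|><|assistant|> format.
prompt = '<|user|><|image_1|><|audio_1|>What is shown in this image?<|end|><|assistant|>'
image = Image.open(requests.get('https://www.ilankelman.org/stopsigns/australia.jpg', stream=True).raw)
audio = soundfile.read('examples/what_is_shown_in_this_image.wav')  # (array, sampling_rate) tuple

inputs = processor(text=prompt, images=[image], audios=[audio], return_tensors='pt')

# Fields of the returned BatchFeature, as documented in the docstring above.
print(inputs.input_ids.shape)           # token ids with image/audio placeholders expanded
print(inputs.input_image_embeds.shape)  # pixel values for the image crops
print(inputs.image_sizes)               # size of each image
print(inputs.audio_embed_sizes)         # number of audio tokens per clip
print(inputs.input_mode)                # input mode selector (vision + speech here)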
processor_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "auto_map": {
3
- "AutoProcessor": "processing_phi4mm.Phi4MMProcessor"
4
- },
5
- "processor_class": "Phi4MMProcessor"
6
- }
 
 
 
 
 
 
 
sample_finetune_speech.py DELETED
@@ -1,478 +0,0 @@
1
- """
2
- finetune Phi-4-multimodal-instruct on a speech task
3
-
4
- scipy==1.15.1
5
- peft==0.13.2
6
- backoff==2.2.1
7
- transformers==4.46.1
8
- accelerate==1.3.0
9
- """
10
-
11
- import argparse
12
- import json
13
- import os
14
- from pathlib import Path
15
-
16
- import torch
17
- import sacrebleu
18
- from accelerate import Accelerator
19
- from accelerate.utils import gather_object
20
- from datasets import load_dataset
21
- from torch.utils.data import Dataset
22
- from tqdm import tqdm
23
- from transformers import (
24
- AutoModelForCausalLM,
25
- AutoProcessor,
26
- BatchFeature,
27
- Trainer,
28
- TrainingArguments,
29
- StoppingCriteria,
30
- StoppingCriteriaList,
31
- )
32
-
33
-
34
- INSTSRUCTION = {
35
- "en_zh-CN": "Translate the audio to Mandarin.",
36
- "en_id": "Translate the audio to Indonesian.",
37
- "en_sl": "Translate the audio to Slovenian.",
38
- }
39
- TOKENIZER = {
40
- "en_zh-CN": "zh",
41
- "en_ja": "ja-mecab",
42
- }
43
- ANSWER_SUFFIX = "<|end|><|endoftext|>"
44
- _IGNORE_INDEX = -100
45
- _TRAIN_SIZE = 50000
46
- _EVAL_SIZE = 200
47
-
48
- class MultipleTokenBatchStoppingCriteria(StoppingCriteria):
49
- """Stopping criteria capable of receiving multiple stop-tokens and handling batched inputs."""
50
-
51
- def __init__(self, stop_tokens: torch.LongTensor, batch_size: int = 1) -> None:
52
- """Initialize the multiple token batch stopping criteria.
53
-
54
- Args:
55
- stop_tokens: Stop-tokens.
56
- batch_size: Batch size.
57
-
58
- """
59
-
60
- self.stop_tokens = stop_tokens
61
- self.max_stop_tokens = stop_tokens.shape[-1]
62
- self.stop_tokens_idx = torch.zeros(batch_size, dtype=torch.long, device=stop_tokens.device)
63
-
64
- def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
65
- # Only gather the maximum number of inputs compatible with stop tokens
66
- # and checks whether generated inputs are equal to `stop_tokens`
67
- generated_inputs = torch.eq(input_ids[:, -self.max_stop_tokens :].unsqueeze(1), self.stop_tokens)
68
- equal_generated_inputs = torch.all(generated_inputs, dim=2)
69
-
70
- # Mark the position where a stop token has been produced for each input in the batch,
71
- # but only if the corresponding entry is not already set
72
- sequence_idx = torch.any(equal_generated_inputs, dim=1)
73
- sequence_set_mask = self.stop_tokens_idx == 0
74
- self.stop_tokens_idx[sequence_idx & sequence_set_mask] = input_ids.shape[-1]
75
-
76
- return torch.all(self.stop_tokens_idx)
77
-
78
- class CoVoSTDataset(Dataset):
79
- def __init__(self, processor, data_dir, split,
80
- lang="en_zh-CN", rank=0, world_size=1):
81
-
82
- self.data = load_dataset("facebook/covost2",
83
- lang,
84
- data_dir=data_dir,
85
- split=split,
86
- trust_remote_code=True
87
- )
88
- self.training = "train" in split
89
- self.processor = processor
90
- self.instruction = INSTSRUCTION[lang]
91
-
92
- if world_size > 1:
93
- self.data = self.data.shard(world_size, rank)
94
-
95
- def __len__(self):
96
- return len(self.data)
97
-
98
- def __getitem__(self, idx):
99
- """
100
- {'client_id': '0013037a1d45cc33460806cc3f8ecee9d536c45639ba4cbbf1564f1c051f53ff3c9f89ef2f1bf04badf55b3a2e7654c086f903681a7b6299616cff6f67598eff',
101
- 'file': '{data_dir}/clips/common_voice_en_699711.mp3',
102
- 'audio': {'path': '{data_dir}/clips/common_voice_en_699711.mp3',
103
- 'array': array([-1.28056854e-09, -1.74622983e-09, -1.16415322e-10, ...,
104
- 3.92560651e-10, 6.62794264e-10, -3.89536581e-09]),
105
- 'sampling_rate': 16000},
106
- 'sentence': '"She\'ll be all right."',
107
- 'translation': '她会没事的。',
108
- 'id': 'common_voice_en_699711'}
109
- """
110
- data = self.data[idx]
111
- user_message = {
112
- 'role': 'user',
113
- 'content': '<|audio_1|>\n' + self.instruction,
114
- }
115
- prompt = self.processor.tokenizer.apply_chat_template(
116
- [user_message], tokenize=False, add_generation_prompt=True
117
- )
118
- inputs = self.processor(text=prompt, audios=[(data["audio"]["array"], data["audio"]["sampling_rate"])], return_tensors='pt')
119
-
120
- answer = f"{data['translation']}{ANSWER_SUFFIX}"
121
- answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
122
- if self.training:
123
- input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
124
- labels = torch.full_like(input_ids, _IGNORE_INDEX)
125
- labels[:, -answer_ids.shape[1] :] = answer_ids
126
- else:
127
- input_ids = inputs.input_ids
128
- labels = answer_ids
129
-
130
- return {
131
- 'input_ids': input_ids,
132
- 'labels': labels,
133
- 'input_audio_embeds': inputs.input_audio_embeds,
134
- 'audio_embed_sizes': inputs.audio_embed_sizes,
135
- }
136
-
137
- def pad_sequence(sequences, padding_side='right', padding_value=0):
138
- """
139
- Pad a list of sequences to the same length.
140
- sequences: list of tensors in [seq_len, *] shape
141
- """
142
- assert padding_side in ['right', 'left']
143
- max_size = sequences[0].size()
144
- trailing_dims = max_size[1:]
145
- max_len = max(len(seq) for seq in sequences)
146
- batch_size = len(sequences)
147
- output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
148
- for i, seq in enumerate(sequences):
149
- length = seq.size(0)
150
- if padding_side == 'right':
151
- output.data[i, :length] = seq
152
- else:
153
- output.data[i, -length:] = seq
154
- return output
155
-
156
-
157
- def cat_with_pad(tensors, dim, padding_value=0):
158
- """
159
- cat along dim, while pad to max for all other dims
160
- """
161
- ndim = tensors[0].dim()
162
- assert all(
163
- t.dim() == ndim for t in tensors[1:]
164
- ), 'All tensors must have the same number of dimensions'
165
-
166
- out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
167
- out_size[dim] = sum(t.shape[dim] for t in tensors)
168
- output = tensors[0].new_full(out_size, padding_value)
169
-
170
- index = 0
171
- for t in tensors:
172
- # Create a slice list where every dimension except dim is full slice
173
- slices = [slice(0, t.shape[d]) for d in range(ndim)]
174
- # Update only the concat dimension slice
175
- slices[dim] = slice(index, index + t.shape[dim])
176
-
177
- output[slices] = t
178
- index += t.shape[dim]
179
-
180
- return output
181
-
182
-
183
- def covost_collate_fn(batch):
184
- input_ids_list = []
185
- labels_list = []
186
- input_audio_embeds_list = []
187
- audio_embed_sizes_list = []
188
- audio_attention_mask_list = []
189
- for inputs in batch:
190
- input_ids_list.append(inputs['input_ids'][0])
191
- labels_list.append(inputs['labels'][0])
192
- input_audio_embeds_list.append(inputs['input_audio_embeds'])
193
- audio_embed_sizes_list.append(inputs['audio_embed_sizes'])
194
- audio_attention_mask_list.append(
195
- inputs['input_audio_embeds'].new_full((inputs['input_audio_embeds'].size(1),), True, dtype=torch.bool)
196
- )
197
-
198
- try:
199
- input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
200
- labels = pad_sequence(labels_list, padding_side='left', padding_value=0)
201
- audio_attention_mask = (
202
- pad_sequence(audio_attention_mask_list, padding_side='right', padding_value=False)
203
- if len(audio_attention_mask_list) > 1
204
- else None
205
- )
206
- except Exception as e:
207
- print(e)
208
- print(input_ids_list)
209
- print(labels_list)
210
- raise
211
- attention_mask = (input_ids != 0).long()
212
- input_audio_embeds = cat_with_pad(input_audio_embeds_list, dim=0)
213
- audio_embed_sizes = torch.cat(audio_embed_sizes_list)
214
-
215
- return BatchFeature(
216
- {
217
- 'input_ids': input_ids,
218
- 'labels': labels,
219
- 'attention_mask': attention_mask,
220
- 'input_audio_embeds': input_audio_embeds,
221
- 'audio_embed_sizes': audio_embed_sizes,
222
- 'audio_attention_mask': audio_attention_mask,
223
- 'input_mode': 2, # speech mode
224
- }
225
- )
226
-
227
-
228
-
229
- def create_model(model_name_or_path, use_flash_attention=False):
230
- model = AutoModelForCausalLM.from_pretrained(
231
- model_name_or_path,
232
- torch_dtype=torch.bfloat16 if use_flash_attention else torch.float32,
233
- _attn_implementation='flash_attention_2' if use_flash_attention else 'sdpa',
234
- trust_remote_code=True,
235
- ).to('cuda')
236
-
237
- return model
238
-
239
-
240
- @torch.no_grad()
241
- def evaluate(
242
- model, processor, eval_dataset, save_path=None, disable_tqdm=False, eval_batch_size=1
243
- ):
244
- rank = int(os.environ.get('RANK', 0))
245
- local_rank = int(os.environ.get('LOCAL_RANK', 0))
246
-
247
- model.eval()
248
- all_generated_texts = []
249
- all_labels = []
250
-
251
- eval_dataloader = torch.utils.data.DataLoader(
252
- eval_dataset,
253
- batch_size=eval_batch_size,
254
- collate_fn=covost_collate_fn,
255
- shuffle=False,
256
- drop_last=False,
257
- num_workers=8,
258
- prefetch_factor=2,
259
- pin_memory=True,
260
- )
261
- stop_tokens = ["<|end|>", processor.tokenizer.eos_token]
262
- stop_tokens_ids = processor.tokenizer(stop_tokens, add_special_tokens=False, padding="longest", return_tensors="pt")["input_ids"]
263
- stop_tokens_ids = stop_tokens_ids.to(f'cuda:{local_rank}')
264
-
265
- for inputs in tqdm(
266
- eval_dataloader, disable=(rank != 0) or disable_tqdm, desc='running eval'
267
- ):
268
- stopping_criteria=StoppingCriteriaList([MultipleTokenBatchStoppingCriteria(stop_tokens_ids, batch_size=inputs.input_ids.size(0))])
269
- inputs = inputs.to(f'cuda:{local_rank}')
270
- generated_ids = model.generate(
271
- **inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=64,
272
- stopping_criteria=stopping_criteria,
273
- )
274
-
275
- stop_tokens_idx = stopping_criteria[0].stop_tokens_idx.reshape(inputs.input_ids.size(0), -1)[:, 0]
276
-
277
- stop_tokens_idx = torch.where(
278
- stop_tokens_idx > 0,
279
- stop_tokens_idx - stop_tokens_ids.shape[-1],
280
- generated_ids.shape[-1],
281
- )
282
- generated_text = [
283
- processor.decode(_pred_ids[inputs["input_ids"].shape[1] : _stop_tokens_idx], skip_special_tokens=True, clean_up_tokenization_spaces=False)
284
- for _pred_ids, _stop_tokens_idx in zip(generated_ids, stop_tokens_idx)
285
- ]
286
- all_generated_texts.extend(generated_text)
287
- labels = [processor.decode(_label_ids[_label_ids != 0]).removesuffix(ANSWER_SUFFIX) for _label_ids in inputs["labels"]]
288
- all_labels.extend(labels)
289
-
290
- all_generated_texts = gather_object(all_generated_texts)
291
- all_labels = gather_object(all_labels)
292
-
293
- if rank == 0:
294
- assert len(all_generated_texts) == len(all_labels)
295
- bleu = sacrebleu.corpus_bleu(all_generated_texts, [all_labels])
296
- print(bleu)
297
- if save_path:
298
- with open(save_path, 'w') as f:
299
- save_dict = {
300
- 'all_generated_texts': all_generated_texts,
301
- 'all_labels': all_labels,
302
- 'score': bleu.score,
303
- }
304
- json.dump(save_dict, f)
305
-
306
- return bleu.score
307
- return None
308
-
309
-
310
- def main():
311
- parser = argparse.ArgumentParser()
312
- parser.add_argument(
313
- '--model_name_or_path',
314
- type=str,
315
- default='microsoft/Phi-4-multimodal-instruct',
316
- help='Model name or path to load from',
317
- )
318
- parser.add_argument(
319
- "--common_voice_dir",
320
- type=str,
321
- default="CommonVoice/EN",
322
- help="Unzipped Common Voice Audio dataset directory, refer to https://commonvoice.mozilla.org/en/datasets, version 4.0",
323
- )
324
- parser.add_argument(
325
- "--lang",
326
- type=str,
327
- default="en_sl",
328
- help="Language pair for translation.",
329
- )
330
- parser.add_argument('--use_flash_attention', action='store_true', help='Use Flash Attention')
331
- parser.add_argument('--output_dir', type=str, default='./output/', help='Output directory')
332
- parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
333
- parser.add_argument(
334
- '--batch_size_per_gpu',
335
- type=int,
336
- default=32,
337
- help='Batch size per GPU (adjust this to fit in GPU memory)',
338
- )
339
- parser.add_argument(
340
- '--num_train_epochs', type=int, default=1, help='Number of training epochs'
341
- )
342
- parser.add_argument('--learning_rate', type=float, default=4.0e-5, help='Learning rate')
343
- parser.add_argument('--wd', type=float, default=0.01, help='Weight decay')
344
- parser.add_argument('--no-tqdm', dest='tqdm', action='store_false', help='Disable tqdm')
345
- args = parser.parse_args()
346
-
347
- accelerator = Accelerator()
348
-
349
- with accelerator.local_main_process_first():
350
- processor = AutoProcessor.from_pretrained(
351
- args.model_name_or_path,
352
- trust_remote_code=True,
353
- )
354
- model = create_model(
355
- args.model_name_or_path,
356
- use_flash_attention=args.use_flash_attention,
357
- )
358
-
359
- model.set_lora_adapter('speech')
360
-
361
-
362
- rank = int(os.environ.get('RANK', 0))
363
- world_size = int(os.environ.get('WORLD_SIZE', 1))
364
-
365
- eval_dataset = CoVoSTDataset(processor,
366
- data_dir=args.common_voice_dir,
367
- split=f'test[:{_EVAL_SIZE}]',
368
- lang=args.lang,
369
- rank=rank,
370
- world_size=world_size)
371
-
372
- train_dataset = CoVoSTDataset(processor,
373
- data_dir=args.common_voice_dir,
374
- split=f'train[:{_TRAIN_SIZE}]',
375
- lang=args.lang)
376
-
377
- num_gpus = accelerator.num_processes
378
- print(f'training on {num_gpus} GPUs')
379
- assert (
380
- args.batch_size % (num_gpus * args.batch_size_per_gpu) == 0
381
- ), 'Batch size must be divisible by the number of GPUs'
382
- gradient_accumulation_steps = args.batch_size // (num_gpus * args.batch_size_per_gpu)
383
-
384
- if args.use_flash_attention:
385
- fp16 = False
386
- bf16 = True
387
- else:
388
- fp16 = True
389
- bf16 = False
390
-
391
- # hard coded training args
392
- training_args = TrainingArguments(
393
- num_train_epochs=args.num_train_epochs,
394
- per_device_train_batch_size=args.batch_size_per_gpu,
395
- gradient_checkpointing=True,
396
- gradient_checkpointing_kwargs={'use_reentrant': False},
397
- gradient_accumulation_steps=gradient_accumulation_steps,
398
- optim='adamw_torch',
399
- adam_beta1=0.9,
400
- adam_beta2=0.95,
401
- adam_epsilon=1e-7,
402
- learning_rate=args.learning_rate,
403
- weight_decay=args.wd,
404
- max_grad_norm=1.0,
405
- lr_scheduler_type='linear',
406
- warmup_steps=50,
407
- logging_steps=10,
408
- output_dir=args.output_dir,
409
- save_strategy='no',
410
- save_total_limit=10,
411
- save_only_model=True,
412
- bf16=bf16,
413
- fp16=fp16,
414
- remove_unused_columns=False,
415
- report_to='none',
416
- deepspeed=None,
417
- disable_tqdm=not args.tqdm,
418
- dataloader_num_workers=4,
419
- ddp_find_unused_parameters=True, # for unused SigLIP layers
420
- )
421
-
422
- # eval before fine-tuning
423
- out_path = Path(training_args.output_dir)
424
- out_path.mkdir(parents=True, exist_ok=True)
425
-
426
- score = evaluate(
427
- model,
428
- processor,
429
- eval_dataset,
430
- save_path=out_path / 'eval_before.json',
431
- disable_tqdm=not args.tqdm,
432
- eval_batch_size=args.batch_size_per_gpu,
433
- )
434
- if accelerator.is_main_process:
435
- print(f'BLEU Score before finetuning: {score}')
436
-
437
- trainer = Trainer(
438
- model=model,
439
- args=training_args,
440
- data_collator=covost_collate_fn,
441
- train_dataset=train_dataset,
442
- )
443
-
444
- trainer.train()
445
- trainer.save_model()
446
- if accelerator.is_main_process:
447
- processor.save_pretrained(training_args.output_dir)
448
- accelerator.wait_for_everyone()
449
-
450
- # eval after fine-tuning (load saved checkpoint)
451
- # first try to clear GPU memory
452
- del model
453
- del trainer
454
- __import__('gc').collect()
455
- torch.cuda.empty_cache()
456
-
457
- # reload the model for inference
458
- model = AutoModelForCausalLM.from_pretrained(
459
- training_args.output_dir,
460
- torch_dtype=torch.bfloat16 if args.use_flash_attention else torch.float32,
461
- trust_remote_code=True,
462
- _attn_implementation='flash_attention_2' if args.use_flash_attention else 'sdpa',
463
- ).to('cuda')
464
-
465
- score = evaluate(
466
- model,
467
- processor,
468
- eval_dataset,
469
- save_path=out_path / 'eval_after.json',
470
- disable_tqdm=not args.tqdm,
471
- eval_batch_size=args.batch_size_per_gpu,
472
- )
473
- if accelerator.is_main_process:
474
- print(f'BLEU Score after finetuning: {score}')
475
-
476
-
477
- if __name__ == '__main__':
478
- main()
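For readers skimming the deleted fine-tuning script above: covost_collate_fn left-pads input_ids with pad id 0 and then derives the attention mask from the pad id. A small self-contained sketch of that behaviour, using made-up token ids:

import torch

# Two toy tokenized prompts of unequal length (ids are made up).
seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
max_len = max(len(s) for s in seqs)

# Left padding with pad value 0, as pad_sequence(..., padding_side='left') does above.
batch = torch.zeros(len(seqs), max_len, dtype=torch.long)
for i, s in enumerate(seqs):
    batch[i, max_len - len(s):] = s
# batch == tensor([[5, 6, 7],
#                  [0, 8, 9]])

# The collate_fn then builds the attention mask directly from the pad id.
attention_mask = (batch != 0).long()
# attention_mask == tensor([[1, 1, 1],
#                           [0, 1, 1]])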
sample_finetune_vision.py DELETED
@@ -1,556 +0,0 @@
1
- """
2
- finetune Phi-4-multimodal-instruct on an image task
3
-
4
- scipy==1.15.1
5
- peft==0.13.2
6
- backoff==2.2.1
7
- transformers==4.47.0
8
- accelerate==1.3.0
9
- """
10
-
11
- import argparse
12
- import json
13
- import os
14
- import tempfile
15
- import zipfile
16
- from pathlib import Path
17
-
18
- import torch
19
- from accelerate import Accelerator
20
- from accelerate.utils import gather_object
21
- from datasets import load_dataset
22
- from huggingface_hub import hf_hub_download
23
- from PIL import Image
24
- from torch.utils.data import Dataset
25
- from tqdm import tqdm
26
- from transformers import (
27
- AutoModelForCausalLM,
28
- AutoProcessor,
29
- BatchFeature,
30
- Trainer,
31
- TrainingArguments,
32
- )
33
-
34
- DEFAULT_INSTSRUCTION = "Answer with the option's letter from the given choices directly."
35
- _IGNORE_INDEX = -100
36
- _TRAIN_SIZE = 8000
37
- _EVAL_SIZE = 500
38
- _MAX_TRAINING_LENGTH = 8192
39
-
40
-
41
- class PmcVqaTrainDataset(Dataset):
42
- def __init__(self, processor, data_size, instruction=DEFAULT_INSTSRUCTION):
43
- # Download the file
44
- file_path = hf_hub_download(
45
- repo_id='xmcmic/PMC-VQA', # repository name
46
- filename='images_2.zip', # file to download
47
- repo_type='dataset', # specify it's a dataset repo
48
- )
49
-
50
- # file_path will be the local path where the file was downloaded
51
- print(f'File downloaded to: {file_path}')
52
-
53
- # unzip to temp folder
54
- self.image_folder = Path(tempfile.mkdtemp())
55
- with zipfile.ZipFile(file_path, 'r') as zip_ref:
56
- zip_ref.extractall(self.image_folder)
57
-
58
- data_files = {
59
- 'train': 'https://huggingface.co/datasets/xmcmic/PMC-VQA/resolve/main/train_2.csv',
60
- }
61
- split = 'train' if data_size is None else f'train[:{data_size}]'
62
- self.annotations = load_dataset('xmcmic/PMC-VQA', data_files=data_files, split=split)
63
- self.processor = processor
64
- self.instruction = instruction
65
-
66
- def __len__(self):
67
- return len(self.annotations)
68
-
69
- def __getitem__(self, idx):
70
- """
71
- {'index': 35,
72
- 'Figure_path': 'PMC8253797_Fig4_11.jpg',
73
- 'Caption': 'A slightly altered cell . (c-c‴) A highly altered cell as seen from 4 different angles . Note mitochondria/mitochondrial networks (green), Golgi complexes (red), cell nuclei (light blue) and the cell outline (yellow).',
74
- 'Question': ' What color is used to label the Golgi complexes in the image?',
75
- 'Choice A': ' A: Green ',
76
- 'Choice B': ' B: Red ',
77
- 'Choice C': ' C: Light blue ',
78
- 'Choice D': ' D: Yellow',
79
- 'Answer': 'B',
80
- 'split': 'train'}
81
- """
82
- annotation = self.annotations[idx]
83
- image = Image.open(self.image_folder / 'figures' / annotation['Figure_path'])
84
- question = annotation['Question']
85
- choices = [annotation[f'Choice {chr(ord("A") + i)}'] for i in range(4)]
86
- user_message = {
87
- 'role': 'user',
88
- 'content': '<|image_1|>' + '\n'.join([question] + choices + [self.instruction]),
89
- }
90
- prompt = self.processor.tokenizer.apply_chat_template(
91
- [user_message], tokenize=False, add_generation_prompt=True
92
- )
93
- answer = f'{annotation["Answer"]}<|end|><|endoftext|>'
94
- inputs = self.processor(prompt, images=[image], return_tensors='pt')
95
-
96
- answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
97
-
98
- input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
99
- labels = torch.full_like(input_ids, _IGNORE_INDEX)
100
- labels[:, -answer_ids.shape[1] :] = answer_ids
101
-
102
- if input_ids.size(1) > _MAX_TRAINING_LENGTH:
103
- input_ids = input_ids[:, :_MAX_TRAINING_LENGTH]
104
- labels = labels[:, :_MAX_TRAINING_LENGTH]
105
- if torch.all(labels == _IGNORE_INDEX).item():
106
- # workaround to make sure loss compute won't fail
107
- labels[:, -1] = self.processor.tokenizer.eos_token_id
108
-
109
- return {
110
- 'input_ids': input_ids,
111
- 'labels': labels,
112
- 'input_image_embeds': inputs.input_image_embeds,
113
- 'image_attention_mask': inputs.image_attention_mask,
114
- 'image_sizes': inputs.image_sizes,
115
- }
116
-
117
- def __del__(self):
118
- __import__('shutil').rmtree(self.image_folder)
119
-
120
-
121
- class PmcVqaEvalDataset(Dataset):
122
- def __init__(
123
- self, processor, data_size, instruction=DEFAULT_INSTSRUCTION, rank=0, world_size=1
124
- ):
125
- # Download the file
126
- file_path = hf_hub_download(
127
- repo_id='xmcmic/PMC-VQA', # repository name
128
- filename='images_2.zip', # file to download
129
- repo_type='dataset', # specify it's a dataset repo
130
- )
131
-
132
- # file_path will be the local path where the file was downloaded
133
- print(f'File downloaded to: {file_path}')
134
-
135
- # unzip to temp folder
136
- self.image_folder = Path(tempfile.mkdtemp())
137
- with zipfile.ZipFile(file_path, 'r') as zip_ref:
138
- zip_ref.extractall(self.image_folder)
139
-
140
- data_files = {
141
- 'test': 'https://huggingface.co/datasets/xmcmic/PMC-VQA/resolve/main/test_2.csv',
142
- }
143
- split = 'test' if data_size is None else f'test[:{data_size}]'
144
- self.annotations = load_dataset(
145
- 'xmcmic/PMC-VQA', data_files=data_files, split=split
146
- ).shard(num_shards=world_size, index=rank)
147
- self.processor = processor
148
- self.instruction = instruction
149
-
150
- def __len__(self):
151
- return len(self.annotations)
152
-
153
- def __getitem__(self, idx):
154
- """
155
- {'index': 62,
156
- 'Figure_path': 'PMC8253867_Fig2_41.jpg',
157
- 'Caption': 'CT pulmonary angiogram reveals encasement and displacement of the left anterior descending coronary artery ( blue arrows ).',
158
- 'Question': ' What is the name of the artery encased and displaced in the image? ',
159
- 'Choice A': ' A: Right Coronary Artery ',
160
- 'Choice B': ' B: Left Anterior Descending Coronary Artery ',
161
- 'Choice C': ' C: Circumflex Coronary Artery ',
162
- 'Choice D': ' D: Superior Mesenteric Artery ',
163
- 'Answer': 'B',
164
- 'split': 'test'}
165
- """
166
- annotation = self.annotations[idx]
167
- image = Image.open(self.image_folder / 'figures' / annotation['Figure_path'])
168
- question = annotation['Question']
169
- choices = [annotation[f'Choice {chr(ord("A") + i)}'] for i in range(4)]
170
- user_message = {
171
- 'role': 'user',
172
- 'content': '<|image_1|>' + '\n'.join([question] + choices + [self.instruction]),
173
- }
174
- prompt = self.processor.tokenizer.apply_chat_template(
175
- [user_message], tokenize=False, add_generation_prompt=True
176
- )
177
- answer = annotation['Answer']
178
- inputs = self.processor(prompt, images=[image], return_tensors='pt')
179
-
180
- unique_id = f'{annotation["index"]:010d}'
181
- return {
182
- 'id': unique_id,
183
- 'input_ids': inputs.input_ids,
184
- 'input_image_embeds': inputs.input_image_embeds,
185
- 'image_attention_mask': inputs.image_attention_mask,
186
- 'image_sizes': inputs.image_sizes,
187
- 'answer': answer,
188
- }
189
-
190
- def __del__(self):
191
- __import__('shutil').rmtree(self.image_folder)
192
-
193
-
194
- def pad_sequence(sequences, padding_side='right', padding_value=0):
195
- """
196
- Pad a list of sequences to the same length.
197
- sequences: list of tensors in [seq_len, *] shape
198
- """
199
- assert padding_side in ['right', 'left']
200
- max_size = sequences[0].size()
201
- trailing_dims = max_size[1:]
202
- max_len = max(len(seq) for seq in sequences)
203
- batch_size = len(sequences)
204
- output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
205
- for i, seq in enumerate(sequences):
206
- length = seq.size(0)
207
- if padding_side == 'right':
208
- output.data[i, :length] = seq
209
- else:
210
- output.data[i, -length:] = seq
211
- return output
212
-
213
-
214
- def cat_with_pad(tensors, dim, padding_value=0):
215
- """
216
- cat along dim, while pad to max for all other dims
217
- """
218
- ndim = tensors[0].dim()
219
- assert all(
220
- t.dim() == ndim for t in tensors[1:]
221
- ), 'All tensors must have the same number of dimensions'
222
-
223
- out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
224
- out_size[dim] = sum(t.shape[dim] for t in tensors)
225
- output = tensors[0].new_full(out_size, padding_value)
226
-
227
- index = 0
228
- for t in tensors:
229
- # Create a slice list where every dimension except dim is full slice
230
- slices = [slice(0, t.shape[d]) for d in range(ndim)]
231
- # Update only the concat dimension slice
232
- slices[dim] = slice(index, index + t.shape[dim])
233
-
234
- output[slices] = t
235
- index += t.shape[dim]
236
-
237
- return output
238
-
239
-
240
- def pmc_vqa_collate_fn(batch):
241
- input_ids_list = []
242
- labels_list = []
243
- input_image_embeds_list = []
244
- image_attention_mask_list = []
245
- image_sizes_list = []
246
- for inputs in batch:
247
- input_ids_list.append(inputs['input_ids'][0])
248
- labels_list.append(inputs['labels'][0])
249
- input_image_embeds_list.append(inputs['input_image_embeds'])
250
- image_attention_mask_list.append(inputs['image_attention_mask'])
251
- image_sizes_list.append(inputs['image_sizes'])
252
-
253
- input_ids = pad_sequence(input_ids_list, padding_side='right', padding_value=0)
254
- labels = pad_sequence(labels_list, padding_side='right', padding_value=0)
255
- attention_mask = (input_ids != 0).long()
256
- input_image_embeds = cat_with_pad(input_image_embeds_list, dim=0)
257
- image_attention_mask = cat_with_pad(image_attention_mask_list, dim=0)
258
- image_sizes = torch.cat(image_sizes_list)
259
-
260
- return BatchFeature(
261
- {
262
- 'input_ids': input_ids,
263
- 'labels': labels,
264
- 'attention_mask': attention_mask,
265
- 'input_image_embeds': input_image_embeds,
266
- 'image_attention_mask': image_attention_mask,
267
- 'image_sizes': image_sizes,
268
- 'input_mode': 1, # vision mode
269
- }
270
- )
271
-
272
-
273
- def pmc_vqa_eval_collate_fn(batch):
274
- input_ids_list = []
275
- input_image_embeds_list = []
276
- image_attention_mask_list = []
277
- image_sizes_list = []
278
- all_unique_ids = []
279
- all_answers = []
280
- for inputs in batch:
281
- input_ids_list.append(inputs['input_ids'][0])
282
- input_image_embeds_list.append(inputs['input_image_embeds'])
283
- image_attention_mask_list.append(inputs['image_attention_mask'])
284
- image_sizes_list.append(inputs['image_sizes'])
285
- all_unique_ids.append(inputs['id'])
286
- all_answers.append(inputs['answer'])
287
-
288
- input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
289
- attention_mask = (input_ids != 0).long()
290
- input_image_embeds = cat_with_pad(input_image_embeds_list, dim=0)
291
- image_attention_mask = cat_with_pad(image_attention_mask_list, dim=0)
292
- image_sizes = torch.cat(image_sizes_list)
293
-
294
- return (
295
- all_unique_ids,
296
- all_answers,
297
- BatchFeature(
298
- {
299
- 'input_ids': input_ids,
300
- 'attention_mask': attention_mask,
301
- 'input_image_embeds': input_image_embeds,
302
- 'image_attention_mask': image_attention_mask,
303
- 'image_sizes': image_sizes,
304
- 'input_mode': 1, # vision mode
305
- }
306
- ),
307
- )
308
-
309
-
310
- def create_model(model_name_or_path, use_flash_attention=False):
311
- model = AutoModelForCausalLM.from_pretrained(
312
- model_name_or_path,
313
- torch_dtype=torch.bfloat16 if use_flash_attention else torch.float32,
314
- _attn_implementation='flash_attention_2' if use_flash_attention else 'sdpa',
315
- trust_remote_code=True,
316
- ).to('cuda')
317
- # remove parameters irrelevant to vision tasks
318
- del model.model.embed_tokens_extend.audio_embed # remove audio encoder
319
- for layer in model.model.layers:
320
- # remove audio lora
321
- del layer.mlp.down_proj.lora_A.speech
322
- del layer.mlp.down_proj.lora_B.speech
323
- del layer.mlp.gate_up_proj.lora_A.speech
324
- del layer.mlp.gate_up_proj.lora_B.speech
325
- del layer.self_attn.o_proj.lora_A.speech
326
- del layer.self_attn.o_proj.lora_B.speech
327
- del layer.self_attn.qkv_proj.lora_A.speech
328
- del layer.self_attn.qkv_proj.lora_B.speech
329
-
330
- # TODO remove unused vision layers?
331
-
332
- return model
333
-
334
-
335
- @torch.no_grad()
336
- def evaluate(
337
- model, processor, eval_dataset, save_path=None, disable_tqdm=False, eval_batch_size=1
338
- ):
339
- rank = int(os.environ.get('RANK', 0))
340
- local_rank = int(os.environ.get('LOCAL_RANK', 0))
341
-
342
- model.eval()
343
- all_answers = []
344
- all_generated_texts = []
345
-
346
- eval_dataloader = torch.utils.data.DataLoader(
347
- eval_dataset,
348
- batch_size=eval_batch_size,
349
- collate_fn=pmc_vqa_eval_collate_fn,
350
- shuffle=False,
351
- drop_last=False,
352
- num_workers=4,
353
- prefetch_factor=2,
354
- pin_memory=True,
355
- )
356
- for ids, answers, inputs in tqdm(
357
- eval_dataloader, disable=(rank != 0) or disable_tqdm, desc='running eval'
358
- ):
359
- all_answers.extend({'id': i, 'answer': a.strip().lower()} for i, a in zip(ids, answers))
360
-
361
- inputs = inputs.to(f'cuda:{local_rank}')
362
- generated_ids = model.generate(
363
- **inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=64
364
- )
365
-
366
- input_len = inputs.input_ids.size(1)
367
- generated_texts = processor.batch_decode(
368
- generated_ids[:, input_len:],
369
- skip_special_tokens=True,
370
- clean_up_tokenization_spaces=False,
371
- )
372
- all_generated_texts.extend(
373
- {'id': i, 'generated_text': g.strip().lower()} for i, g in zip(ids, generated_texts)
374
- )
375
-
376
- # gather outputs from all ranks
377
- all_answers = gather_object(all_answers)
378
- all_generated_texts = gather_object(all_generated_texts)
379
-
380
- if rank == 0:
381
- assert len(all_answers) == len(all_generated_texts)
382
- acc = sum(
383
- a['answer'] == g['generated_text'] for a, g in zip(all_answers, all_generated_texts)
384
- ) / len(all_answers)
385
- if save_path:
386
- with open(save_path, 'w') as f:
387
- save_dict = {
388
- 'answers_unique': all_answers,
389
- 'generated_texts_unique': all_generated_texts,
390
- 'accuracy': acc,
391
- }
392
- json.dump(save_dict, f)
393
-
394
- return acc
395
- return None
396
-
397
-
398
- def main():
399
- parser = argparse.ArgumentParser()
400
- parser.add_argument(
401
- '--model_name_or_path',
402
- type=str,
403
- default='microsoft/Phi-4-multimodal-instruct',
404
- help='Model name or path to load from',
405
- )
406
- parser.add_argument('--use_flash_attention', action='store_true', help='Use Flash Attention')
407
- parser.add_argument('--output_dir', type=str, default='./output/', help='Output directory')
408
- parser.add_argument('--batch_size', type=int, default=16, help='Batch size')
409
- parser.add_argument(
410
- '--batch_size_per_gpu',
411
- type=int,
412
- default=1,
413
- help='Batch size per GPU (adjust this to fit in GPU memory)',
414
- )
415
- parser.add_argument(
416
- '--dynamic_hd',
417
- type=int,
418
- default=36,
419
- help='Number of maximum image crops',
420
- )
421
- parser.add_argument(
422
- '--num_train_epochs', type=int, default=1, help='Number of training epochs'
423
- )
424
- parser.add_argument('--learning_rate', type=float, default=4.0e-5, help='Learning rate')
425
- parser.add_argument('--wd', type=float, default=0.01, help='Weight decay')
426
- parser.add_argument('--no_tqdm', dest='tqdm', action='store_false', help='Disable tqdm')
427
- parser.add_argument('--full_run', action='store_true', help='Run the full training and eval')
428
- args = parser.parse_args()
429
-
430
- accelerator = Accelerator()
431
-
432
- with accelerator.local_main_process_first():
433
- processor = AutoProcessor.from_pretrained(
434
- args.model_name_or_path,
435
- trust_remote_code=True,
436
- dynamic_hd=args.dynamic_hd,
437
- )
438
- model = create_model(
439
- args.model_name_or_path,
440
- use_flash_attention=args.use_flash_attention,
441
- )
442
- # tune vision encoder and lora
443
- model.set_lora_adapter('vision')
444
- for param in model.model.embed_tokens_extend.image_embed.parameters():
445
- param.requires_grad = True
446
-
447
- rank = int(os.environ.get('RANK', 0))
448
- world_size = int(os.environ.get('WORLD_SIZE', 1))
449
-
450
- train_dataset = PmcVqaTrainDataset(processor, data_size=None if args.full_run else _TRAIN_SIZE)
451
- eval_dataset = PmcVqaEvalDataset(
452
- processor,
453
- data_size=None if args.full_run else _EVAL_SIZE,
454
- rank=rank,
455
- world_size=world_size,
456
- )
457
-
458
- num_gpus = accelerator.num_processes
459
- print(f'training on {num_gpus} GPUs')
460
- assert (
461
- args.batch_size % (num_gpus * args.batch_size_per_gpu) == 0
462
- ), 'Batch size must be divisible by the number of GPUs'
463
- gradient_accumulation_steps = args.batch_size // (num_gpus * args.batch_size_per_gpu)
464
-
465
- if args.use_flash_attention:
466
- fp16 = False
467
- bf16 = True
468
- else:
469
- fp16 = True
470
- bf16 = False
471
-
472
- # hard coded training args
473
- training_args = TrainingArguments(
474
- num_train_epochs=args.num_train_epochs,
475
- per_device_train_batch_size=args.batch_size_per_gpu,
476
- gradient_checkpointing=True,
477
- gradient_checkpointing_kwargs={'use_reentrant': False},
478
- gradient_accumulation_steps=gradient_accumulation_steps,
479
- optim='adamw_torch',
480
- adam_beta1=0.9,
481
- adam_beta2=0.95,
482
- adam_epsilon=1e-7,
483
- learning_rate=args.learning_rate,
484
- weight_decay=args.wd,
485
- max_grad_norm=1.0,
486
- lr_scheduler_type='linear',
487
- warmup_steps=50,
488
- logging_steps=10,
489
- output_dir=args.output_dir,
490
- save_strategy='no',
491
- save_total_limit=10,
492
- save_only_model=True,
493
- bf16=bf16,
494
- fp16=fp16,
495
- remove_unused_columns=False,
496
- report_to='none',
497
- deepspeed=None,
498
- disable_tqdm=not args.tqdm,
499
- dataloader_num_workers=4,
500
- ddp_find_unused_parameters=True, # for unused SigLIP layers
501
- )
502
-
503
- # eval before fine-tuning
504
- out_path = Path(training_args.output_dir)
505
- out_path.mkdir(parents=True, exist_ok=True)
506
-
507
- acc = evaluate(
508
- model,
509
- processor,
510
- eval_dataset,
511
- save_path=out_path / 'eval_before.json',
512
- disable_tqdm=not args.tqdm,
513
- eval_batch_size=args.batch_size_per_gpu,
514
- )
515
- if accelerator.is_main_process:
516
- print(f'Accuracy before finetuning: {acc}')
517
-
518
- trainer = Trainer(
519
- model=model,
520
- args=training_args,
521
- data_collator=pmc_vqa_collate_fn,
522
- train_dataset=train_dataset,
523
- )
524
- trainer.train()
525
- trainer.save_model()
526
- accelerator.wait_for_everyone()
527
-
528
- # eval after fine-tuning (load saved checkpoint)
529
- # first try to clear GPU memory
530
- del model
531
- del trainer
532
- __import__('gc').collect()
533
- torch.cuda.empty_cache()
534
-
535
- # reload the model for inference
536
- model = AutoModelForCausalLM.from_pretrained(
537
- training_args.output_dir,
538
- torch_dtype=torch.bfloat16 if args.use_flash_attention else torch.float32,
539
- trust_remote_code=True,
540
- _attn_implementation='flash_attention_2' if args.use_flash_attention else 'sdpa',
541
- ).to('cuda')
542
-
543
- acc = evaluate(
544
- model,
545
- processor,
546
- eval_dataset,
547
- save_path=out_path / 'eval_after.json',
548
- disable_tqdm=not args.tqdm,
549
- eval_batch_size=args.batch_size_per_gpu,
550
- )
551
- if accelerator.is_main_process:
552
- print(f'Accuracy after finetuning: {acc}')
553
-
554
-
555
- if __name__ == '__main__':
556
- main()
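One detail of the deleted vision fine-tuning script worth calling out: only the answer tokens are supervised, because the labels are filled with -100 (ignored by the language-modelling loss) everywhere except the answer span. A toy sketch with made-up ids:

import torch

_IGNORE_INDEX = -100  # same sentinel the script uses; positions with this value are ignored by the loss

# Toy prompt and answer ids (made up); the prompt occupies the first four positions.
prompt_ids = torch.tensor([[11, 12, 13, 14]])
answer_ids = torch.tensor([[21, 22]])

input_ids = torch.cat([prompt_ids, answer_ids], dim=1)
labels = torch.full_like(input_ids, _IGNORE_INDEX)
labels[:, -answer_ids.shape[1]:] = answer_ids
# labels == tensor([[-100, -100, -100, -100, 21, 22]])
# Only the answer tokens contribute to the loss; the prompt (and any image placeholders) are masked out.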
sample_inference_phi4mm.py DELETED
@@ -1,243 +0,0 @@
1
- import os
2
- import requests
3
- import torch
4
- from PIL import Image
5
- import soundfile
6
- from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
7
-
8
- model_path = './'
9
-
10
- kwargs = {}
11
- kwargs['torch_dtype'] = torch.bfloat16
12
-
13
- processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
14
- print(processor.tokenizer)
15
-
16
- model = AutoModelForCausalLM.from_pretrained(
17
- model_path,
18
- trust_remote_code=True,
19
- torch_dtype='auto',
20
- _attn_implementation='flash_attention_2',
21
- ).cuda()
22
- print("model.config._attn_implementation:", model.config._attn_implementation)
23
-
24
- generation_config = GenerationConfig.from_pretrained(model_path, 'generation_config.json')
25
-
26
- user_prompt = '<|user|>'
27
- assistant_prompt = '<|assistant|>'
28
- prompt_suffix = '<|end|>'
29
-
30
- #################################################### text-only ####################################################
31
- prompt = f'{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}'
32
- print(f'>>> Prompt\n{prompt}')
33
- inputs = processor(prompt, images=None, return_tensors='pt').to('cuda:0')
34
-
35
- generate_ids = model.generate(
36
- **inputs,
37
- max_new_tokens=1000,
38
- generation_config=generation_config,
39
- )
40
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
41
- response = processor.batch_decode(
42
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
43
- )[0]
44
-
45
- print(f'>>> Response\n{response}')
46
-
47
- #################################################### vision (single-turn) ####################################################
48
- # single-image prompt
49
- prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
50
- url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
51
- print(f'>>> Prompt\n{prompt}')
52
- image = Image.open(requests.get(url, stream=True).raw)
53
- inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
54
- generate_ids = model.generate(
55
- **inputs,
56
- max_new_tokens=1000,
57
- generation_config=generation_config,
58
- )
59
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
60
- response = processor.batch_decode(
61
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
62
- )[0]
63
- print(f'>>> Response\n{response}')
64
-
65
- #################################################### vision (multi-turn) ####################################################
66
- # chat template
67
- chat = [
68
- {'role': 'user', 'content': f'<|image_1|>What is shown in this image?'},
69
- {
70
- 'role': 'assistant',
71
- 'content': "The image depicts a street scene with a prominent red stop sign in the foreground. The background showcases a building with traditional Chinese architecture, characterized by its red roof and ornate decorations. There are also several statues of lions, which are common in Chinese culture, positioned in front of the building. The street is lined with various shops and businesses, and there's a car passing by.",
72
- },
73
- {'role': 'user', 'content': 'What is so special about this image'},
74
- ]
75
- url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
76
- image = Image.open(requests.get(url, stream=True).raw)
77
- prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
78
- # remove the trailing <|endoftext|> if present; it is used for training, not inference. For training, make sure to append <|endoftext|> at the end.
79
- if prompt.endswith('<|endoftext|>'):
80
- prompt = prompt.removesuffix('<|endoftext|>')
81
-
82
- print(f'>>> Prompt\n{prompt}')
83
-
84
- inputs = processor(prompt, [image], return_tensors='pt').to('cuda:0')
85
- generate_ids = model.generate(
86
- **inputs,
87
- max_new_tokens=1000,
88
- generation_config=generation_config,
89
- )
90
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
91
- response = processor.batch_decode(
92
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
93
- )[0]
94
- print(f'>>> Response\n{response}')
95
-
96
- ########################### vision (multi-frame) ################################
97
- images = []
98
- placeholder = ''
99
- for i in range(1, 5):
100
- url = f'https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-{i}-2048.jpg'
101
- images.append(Image.open(requests.get(url, stream=True).raw))
102
- placeholder += f'<|image_{i}|>'
103
-
104
- messages = [
105
- {'role': 'user', 'content': placeholder + 'Summarize the deck of slides.'},
106
- ]
107
-
108
- prompt = processor.tokenizer.apply_chat_template(
109
- messages, tokenize=False, add_generation_prompt=True
110
- )
111
-
112
- print(f'>>> Prompt\n{prompt}')
113
-
114
- inputs = processor(prompt, images, return_tensors='pt').to('cuda:0')
115
-
116
- generation_args = {
117
- 'max_new_tokens': 1000,
118
- 'temperature': 0.0,
119
- 'do_sample': False,
120
- }
121
-
122
- generate_ids = model.generate(
123
- **inputs, **generation_args, generation_config=generation_config,
124
- )
125
-
126
- # remove input tokens
127
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
128
- response = processor.batch_decode(
129
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
130
- )[0]
131
-
132
- print(response)
133
-
134
- # NOTE: Please prepare the audio file 'examples/what_is_the_traffic_sign_in_the_image.wav'
135
- # and audio file 'examples/what_is_shown_in_this_image.wav' before running the following code
136
- # Basically you can record your own voice for the question "What is the traffic sign in the image?" in "examples/what_is_the_traffic_sign_in_the_image.wav".
137
- # And you can record your own voice for the question "What is shown in this image?" in "examples/what_is_shown_in_this_image.wav".
138
-
139
- AUDIO_FILE_1 = 'examples/what_is_the_traffic_sign_in_the_image.wav'
140
- AUDIO_FILE_2 = 'examples/what_is_shown_in_this_image.wav'
141
-
142
- if not os.path.exists(AUDIO_FILE_1):
143
- raise FileNotFoundError(f'Please prepare the audio file {AUDIO_FILE_1} before running the following code.')
144
- ########################## vision-speech ################################
145
- prompt = f'{user_prompt}<|image_1|><|audio_1|>{prompt_suffix}{assistant_prompt}'
146
- url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
147
- print(f'>>> Prompt\n{prompt}')
148
- image = Image.open(requests.get(url, stream=True).raw)
149
- audio = soundfile.read(AUDIO_FILE_1)
150
- inputs = processor(text=prompt, images=[image], audios=[audio], return_tensors='pt').to('cuda:0')
151
- generate_ids = model.generate(
152
- **inputs,
153
- max_new_tokens=1000,
154
- generation_config=generation_config,
155
- )
156
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
157
- response = processor.batch_decode(
158
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
159
- )[0]
160
- print(f'>>> Response\n{response}')
161
-
162
- ########################## speech only ################################
163
- speech_prompt = "Based on the attached audio, generate a comprehensive text transcription of the spoken content."
164
- prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
165
-
166
- print(f'>>> Prompt\n{prompt}')
167
- audio = soundfile.read(AUDIO_FILE_1)
168
- inputs = processor(text=prompt, audios=[audio], return_tensors='pt').to('cuda:0')
169
- generate_ids = model.generate(
170
- **inputs,
171
- max_new_tokens=1000,
172
- generation_config=generation_config,
173
- )
174
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
175
- response = processor.batch_decode(
176
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
177
- )[0]
178
- print(f'>>> Response\n{response}')
179
-
180
- if not os.path.exists(AUDIO_FILE_2):
181
- raise FileNotFoundError(f'Please prepare the audio file {AUDIO_FILE_2} before running the following code.')
182
- ########################### speech only (multi-turn) ################################
183
- audio_1 = soundfile.read(AUDIO_FILE_2)
184
- audio_2 = soundfile.read(AUDIO_FILE_1)
185
- chat = [
186
- {'role': 'user', 'content': f'<|audio_1|>Based on the attached audio, generate a comprehensive text transcription of the spoken content.'},
187
- {
188
- 'role': 'assistant',
189
- 'content': "What is shown in this image.",
190
- },
191
- {'role': 'user', 'content': f'<|audio_2|>Based on the attached audio, generate a comprehensive text transcription of the spoken content.'},
192
- ]
193
- prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
194
- # remove the trailing <|endoftext|> if present; it is used for training, not inference. For training, make sure to append <|endoftext|> at the end.
195
- if prompt.endswith('<|endoftext|>'):
196
- prompt = prompt.removesuffix('<|endoftext|>')
197
-
198
- print(f'>>> Prompt\n{prompt}')
199
-
200
- inputs = processor(text=prompt, audios=[audio_1, audio_2], return_tensors='pt').to('cuda:0')
201
- generate_ids = model.generate(
202
- **inputs,
203
- max_new_tokens=1000,
204
- generation_config=generation_config,
205
- )
206
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
207
- response = processor.batch_decode(
208
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
209
- )[0]
210
- print(f'>>> Response\n{response}')
211
-
212
- #################################################### vision-speech (multi-turn) ####################################################
213
- # chat template
214
- audio_1 = soundfile.read(AUDIO_FILE_2)
215
- audio_2 = soundfile.read(AUDIO_FILE_1)
216
- chat = [
217
- {'role': 'user', 'content': f'<|image_1|><|audio_1|>'},
218
- {
219
- 'role': 'assistant',
220
- 'content': "The image depicts a street scene with a prominent red stop sign in the foreground. The background showcases a building with traditional Chinese architecture, characterized by its red roof and ornate decorations. There are also several statues of lions, which are common in Chinese culture, positioned in front of the building. The street is lined with various shops and businesses, and there's a car passing by.",
221
- },
222
- {'role': 'user', 'content': f'<|audio_2|>'},
223
- ]
224
- url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
225
- image = Image.open(requests.get(url, stream=True).raw)
226
- prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
227
- # remove the trailing <|endoftext|> if present; it is used for training, not inference. For training, make sure to append <|endoftext|> at the end.
228
- if prompt.endswith('<|endoftext|>'):
229
- prompt = prompt.removesuffix('<|endoftext|>')
230
-
231
- print(f'>>> Prompt\n{prompt}')
232
-
233
- inputs = processor(text=prompt, images=[image], audios=[audio_1, audio_2], return_tensors='pt').to('cuda:0')
234
- generate_ids = model.generate(
235
- **inputs,
236
- max_new_tokens=1000,
237
- generation_config=generation_config,
238
- )
239
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
240
- response = processor.batch_decode(
241
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
242
- )[0]
243
- print(f'>>> Response\n{response}')
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "<|endoftext|>",
 
 
 
 
 
 
17
  "unk_token": {
18
  "content": "<|endoftext|>",
19
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
  "unk_token": {
24
  "content": "<|endoftext|>",
25
  "lstrip": false,
speech-lora/adapter_config.json CHANGED
@@ -1,23 +1,31 @@
1
  {
 
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "TBA",
4
  "bias": "none",
 
 
 
5
  "fan_in_fan_out": false,
6
- "inference_mode": true,
7
  "init_lora_weights": true,
 
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
 
10
  "lora_alpha": 640,
 
11
  "lora_dropout": 0.01,
12
- "modules_to_save": [],
 
 
13
  "peft_type": "LORA",
14
  "r": 320,
 
15
  "revision": null,
16
- "target_modules": [
17
- "qkv_proj",
18
- "o_proj",
19
- "gate_up_proj",
20
- "down_proj"
21
- ],
22
- "task_type": "CAUSAL_LM"
23
  }
 
1
  {
2
+ "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
  "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
  "init_lora_weights": true,
12
+ "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
+ "loftq_config": {},
16
  "lora_alpha": 640,
17
+ "lora_bias": false,
18
  "lora_dropout": 0.01,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "r": 320,
24
+ "rank_pattern": {},
25
  "revision": null,
26
+ "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
27
+ "task_type": "CAUSAL_LM",
28
+ "trainable_token_indices": null,
29
+ "use_dora": false,
30
+ "use_rslora": false
 
 
31
  }
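The speech adapter config is regenerated with a current PEFT version: `base_model_name_or_path` and `modules_to_save` lose their placeholder values, newer serialization fields (`alpha_pattern`, `loftq_config`, `use_dora`, `use_rslora`, ...) appear, and `target_modules` changes from a bare module-name list to a regex over full module paths, so only the decoder layers' attention and MLP projections are wrapped. A sketch of the `LoraConfig` this file serializes (purely illustrative; it says nothing about how the model wires the adapter in at load time):

```python
# Sketch of the LoraConfig behind the updated speech-lora/adapter_config.json.
# Values are copied from the diff above; everything else is left at PEFT defaults.
from peft import LoraConfig

speech_lora = LoraConfig(
    r=320,
    lora_alpha=640,
    lora_dropout=0.01,
    bias='none',
    task_type='CAUSAL_LM',
    # PEFT treats a string target_modules as a regex that must fully match the
    # module path, so only the qkv/o and gate_up/down projections inside
    # model.layers.<n> are adapted.
    target_modules=r'model.layers.\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))',
)
```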
speech-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c2237461a4d1f9292cd128147bd3f0f70326a48d5d79c8e0f7583b26c095b30
3
- size 922782296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f70b0aba566f6c30e67a11e90033453e9375d102e031cec40956a2a0e9771e
3
+ size 922777944
speech-lora/added_tokens.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "<|/tool_call|>": 200026,
3
- "<|/tool|>": 200024,
4
- "<|assistant|>": 200019,
5
- "<|end|>": 200020,
6
- "<|system|>": 200022,
7
- "<|tag|>": 200028,
8
- "<|tool_call|>": 200025,
9
- "<|tool_response|>": 200027,
10
- "<|tool|>": 200023,
11
- "<|user|>": 200021
12
- }
speech-lora/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<|endoftext|>",
17
- "unk_token": {
18
- "content": "<|endoftext|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
speech-lora/tokenizer_config.json DELETED
@@ -1,125 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "200010": {
5
- "content": "<|endoftext10|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "200011": {
13
- "content": "<|endoftext11|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "199999": {
21
- "content": "<|endoftext|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "200018": {
29
- "content": "<|endofprompt|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "200019": {
37
- "content": "<|assistant|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": true,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "200020": {
45
- "content": "<|end|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": true,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "200021": {
53
- "content": "<|user|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": true,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "200022": {
61
- "content": "<|system|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": true,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "200023": {
69
- "content": "<|tool|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": true,
73
- "single_word": false,
74
- "special": false
75
- },
76
- "200024": {
77
- "content": "<|/tool|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": true,
81
- "single_word": false,
82
- "special": false
83
- },
84
- "200025": {
85
- "content": "<|tool_call|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": true,
89
- "single_word": false,
90
- "special": false
91
- },
92
- "200026": {
93
- "content": "<|/tool_call|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": true,
97
- "single_word": false,
98
- "special": false
99
- },
100
- "200027": {
101
- "content": "<|tool_response|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": true,
105
- "single_word": false,
106
- "special": false
107
- },
108
- "200028": {
109
- "content": "<|tag|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": true,
113
- "single_word": false,
114
- "special": true
115
- }
116
- },
117
- "bos_token": "<|endoftext|>",
118
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
119
- "clean_up_tokenization_spaces": false,
120
- "eos_token": "<|endoftext|>",
121
- "model_max_length": 128000,
122
- "pad_token": "<|endoftext|>",
123
- "tokenizer_class": "GPT2TokenizerFast",
124
- "unk_token": "<|endoftext|>"
125
- }
speech-lora/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
speech_conformer_encoder.py DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1b9f641d4f8b7247b8d5007dd3b6a9f6a87cb5123134fe0d326f14d10c0585
3
- size 15524479
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57589a5827b578065aecc0a91cc1e4e9a0bac0a17fb02539bea63bb9beb889a2
3
+ size 13303259
tokenizer_config.json CHANGED
@@ -1,125 +1 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "200010": {
5
- "content": "<|endoftext10|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "200011": {
13
- "content": "<|endoftext11|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "199999": {
21
- "content": "<|endoftext|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "200018": {
29
- "content": "<|endofprompt|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "200019": {
37
- "content": "<|assistant|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": true,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "200020": {
45
- "content": "<|end|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": true,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "200021": {
53
- "content": "<|user|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": true,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "200022": {
61
- "content": "<|system|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": true,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "200023": {
69
- "content": "<|tool|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": true,
73
- "single_word": false,
74
- "special": false
75
- },
76
- "200024": {
77
- "content": "<|/tool|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": true,
81
- "single_word": false,
82
- "special": false
83
- },
84
- "200025": {
85
- "content": "<|tool_call|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": true,
89
- "single_word": false,
90
- "special": false
91
- },
92
- "200026": {
93
- "content": "<|/tool_call|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": true,
97
- "single_word": false,
98
- "special": false
99
- },
100
- "200027": {
101
- "content": "<|tool_response|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": true,
105
- "single_word": false,
106
- "special": false
107
- },
108
- "200028": {
109
- "content": "<|tag|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": true,
113
- "single_word": false,
114
- "special": true
115
- }
116
- },
117
- "bos_token": "<|endoftext|>",
118
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
119
- "clean_up_tokenization_spaces": false,
120
- "eos_token": "<|endoftext|>",
121
- "model_max_length": 131072,
122
- "pad_token": "<|endoftext|>",
123
- "tokenizer_class": "GPT2TokenizerFast",
124
- "unk_token": "<|endoftext|>"
125
- }
 
1
+ {"add_prefix_space": false, "added_tokens_decoder": {"199999": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200010": {"content": "<|image|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200011": {"content": "<|audio|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200018": {"content": "<|endofprompt|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200019": {"content": "<|assistant|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200020": {"content": "<|end|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200021": {"content": "<|user|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200022": {"content": "<|system|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200023": {"content": "<|tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200024": {"content": "<|/tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200025": {"content": "<|tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200026": {"content": "<|/tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200027": {"content": "<|tool_response|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200028": {"content": "<|tag|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}}, "audio_token": "<|audio|>", "bos_token": "<|endoftext|>", "clean_up_tokenization_spaces": false, "eos_token": "<|endoftext|>", "extra_special_tokens": {"audio_token": "<|audio|>", "image_token": "<|image|>"}, "image_token": "<|image|>", "model_max_length": 131072, "pad_token": "<|endoftext|>", "processor_class": "Phi4MultimodalProcessor", "tokenizer_class": "GPT2Tokenizer", "unk_token": "<|endoftext|>"}
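The tokenizer config is rewritten as the single line above: token ids 200010 and 200011 become the dedicated `<|image|>` and `<|audio|>` placeholders (previously the `<|endoftext10|>`/`<|endoftext11|>` aliases), both are exposed through `extra_special_tokens`, the inline `chat_template` entry is dropped from this file, and the config now points at `Phi4MultimodalProcessor`. A quick sanity check, assuming the usual repo id:

```python
# Minimal check of the new special-token wiring (repo id is an assumption).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('microsoft/Phi-4-multimodal-instruct')
assert tok.convert_tokens_to_ids('<|image|>') == 200010
assert tok.convert_tokens_to_ids('<|audio|>') == 200011
assert tok.pad_token == tok.eos_token == '<|endoftext|>'
assert tok.model_max_length == 131072
```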
vision-lora/adapter_config.json CHANGED
@@ -1,23 +1,31 @@
1
  {
 
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "TBA",
4
  "bias": "none",
 
 
 
5
  "fan_in_fan_out": false,
6
- "inference_mode": true,
7
  "init_lora_weights": true,
 
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
 
10
  "lora_alpha": 512,
 
11
  "lora_dropout": 0.0,
12
- "modules_to_save": [],
 
 
13
  "peft_type": "LORA",
14
  "r": 256,
 
15
  "revision": null,
16
- "target_modules": [
17
- "qkv_proj",
18
- "o_proj",
19
- "gate_up_proj",
20
- "down_proj"
21
- ],
22
- "task_type": "CAUSAL_LM"
23
  }
 
1
  {
2
+ "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
  "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
  "init_lora_weights": true,
12
+ "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
+ "loftq_config": {},
16
  "lora_alpha": 512,
17
+ "lora_bias": false,
18
  "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "r": 256,
24
+ "rank_pattern": {},
25
  "revision": null,
26
+ "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
27
+ "task_type": "CAUSAL_LM",
28
+ "trainable_token_indices": null,
29
+ "use_dora": false,
30
+ "use_rslora": false
 
 
31
  }
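The vision adapter gets the same PEFT-format refresh, with its own rank and scaling (r=256, lora_alpha=512, lora_dropout=0.0) over the identical target-module regex. If you ever attach the two adapters to a base model manually with PEFT (outside whatever loading path the converted model uses internally), the multi-adapter API looks roughly like this; the repo id and local adapter paths are assumptions:

```python
# Hypothetical sketch: loading both refreshed adapters with PEFT and switching
# between them. Repo id and adapter paths are assumptions; the converted model
# may handle adapter loading internally instead.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained('microsoft/Phi-4-multimodal-instruct')
model = PeftModel.from_pretrained(base, './speech-lora', adapter_name='speech')
model.load_adapter('./vision-lora', adapter_name='vision')

model.set_adapter('vision')   # vision projections active (r=256, alpha=512)
model.set_adapter('speech')   # switch back to the speech projections (r=320, alpha=640)
```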
vision-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1620b16722edf701038bf66e3cd46412c7cc5458e58df89e9f92cedb71fcbde8
3
- size 738232904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76facf464ca0246e9f5dc409520e83764e0b73fa66fdb561526e064133728f8a
3
+ size 738228552
vision-lora/added_tokens.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "<|/tool_call|>": 200026,
3
- "<|/tool|>": 200024,
4
- "<|assistant|>": 200019,
5
- "<|end|>": 200020,
6
- "<|system|>": 200022,
7
- "<|tag|>": 200028,
8
- "<|tool_call|>": 200025,
9
- "<|tool_response|>": 200027,
10
- "<|tool|>": 200023,
11
- "<|user|>": 200021
12
- }
vision-lora/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<|endoftext|>",
17
- "unk_token": {
18
- "content": "<|endoftext|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
vision-lora/tokenizer.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:382cc235b56c725945e149cc25f191da667c836655efd0857b004320e90e91ea
3
- size 15524095
vision-lora/tokenizer_config.json DELETED
@@ -1,125 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "200010": {
5
- "content": "<|endoftext10|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "200011": {
13
- "content": "<|endoftext11|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "199999": {
21
- "content": "<|endoftext|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "200018": {
29
- "content": "<|endofprompt|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "200019": {
37
- "content": "<|assistant|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": true,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "200020": {
45
- "content": "<|end|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": true,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "200021": {
53
- "content": "<|user|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": true,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "200022": {
61
- "content": "<|system|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": true,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "200023": {
69
- "content": "<|tool|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": true,
73
- "single_word": false,
74
- "special": false
75
- },
76
- "200024": {
77
- "content": "<|/tool|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": true,
81
- "single_word": false,
82
- "special": false
83
- },
84
- "200025": {
85
- "content": "<|tool_call|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": true,
89
- "single_word": false,
90
- "special": false
91
- },
92
- "200026": {
93
- "content": "<|/tool_call|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": true,
97
- "single_word": false,
98
- "special": false
99
- },
100
- "200027": {
101
- "content": "<|tool_response|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": true,
105
- "single_word": false,
106
- "special": false
107
- },
108
- "200028": {
109
- "content": "<|tag|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": true,
113
- "single_word": false,
114
- "special": true
115
- }
116
- },
117
- "bos_token": "<|endoftext|>",
118
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
119
- "clean_up_tokenization_spaces": false,
120
- "eos_token": "<|endoftext|>",
121
- "model_max_length": 128000,
122
- "pad_token": "<|endoftext|>",
123
- "tokenizer_class": "GPT2TokenizerFast",
124
- "unk_token": "<|endoftext|>"
125
- }
vision-lora/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
vision_siglip_navit.py DELETED
@@ -1,1717 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """ Siglip model configuration"""
16
-
17
- import os
18
- from typing import Union
19
-
20
- from transformers.configuration_utils import PretrainedConfig
21
- from transformers.utils import logging
22
-
23
-
24
- logger = logging.get_logger(__name__)
25
-
26
- SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27
- "google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/config.json",
28
- }
29
-
30
-
31
- class SiglipTextConfig(PretrainedConfig):
32
- r"""
33
- This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
34
- Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
35
- configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
36
- [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
37
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
38
- documentation from [`PretrainedConfig`] for more information.
39
- Args:
40
- vocab_size (`int`, *optional*, defaults to 32000):
41
- Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented by
42
- the `inputs_ids` passed when calling [`SiglipModel`].
43
- hidden_size (`int`, *optional*, defaults to 768):
44
- Dimensionality of the encoder layers and the pooler layer.
45
- intermediate_size (`int`, *optional*, defaults to 3072):
46
- Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
47
- num_hidden_layers (`int`, *optional*, defaults to 12):
48
- Number of hidden layers in the Transformer encoder.
49
- num_attention_heads (`int`, *optional*, defaults to 12):
50
- Number of attention heads for each attention layer in the Transformer encoder.
51
- max_position_embeddings (`int`, *optional*, defaults to 64):
52
- The maximum sequence length that this model might ever be used with. Typically set this to something large
53
- just in case (e.g., 512 or 1024 or 2048).
54
- hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
55
- The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
56
- `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
57
- layer_norm_eps (`float`, *optional*, defaults to 1e-06):
58
- The epsilon used by the layer normalization layers.
59
- attention_dropout (`float`, *optional*, defaults to 0.0):
60
- The dropout ratio for the attention probabilities.
61
- pad_token_id (`int`, *optional*, defaults to 1):
62
- The id of the padding token in the vocabulary.
63
- bos_token_id (`int`, *optional*, defaults to 49406):
64
- The id of the beginning-of-sequence token in the vocabulary.
65
- eos_token_id (`int`, *optional*, defaults to 49407):
66
- The id of the end-of-sequence token in the vocabulary.
67
- Example:
68
- ```python
69
- >>> from transformers import SiglipTextConfig, SiglipTextModel
70
- >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration
71
- >>> configuration = SiglipTextConfig()
72
- >>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration
73
- >>> model = SiglipTextModel(configuration)
74
- >>> # Accessing the model configuration
75
- >>> configuration = model.config
76
- ```"""
77
-
78
- model_type = "siglip_text_model"
79
-
80
- def __init__(
81
- self,
82
- vocab_size=32000,
83
- hidden_size=768,
84
- intermediate_size=3072,
85
- num_hidden_layers=12,
86
- num_attention_heads=12,
87
- max_position_embeddings=64,
88
- hidden_act="gelu_pytorch_tanh",
89
- layer_norm_eps=1e-6,
90
- attention_dropout=0.0,
91
- # This differs from `CLIPTokenizer`'s default and from openai/siglip
92
- # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
93
- pad_token_id=1,
94
- bos_token_id=49406,
95
- eos_token_id=49407,
96
- _flash_attn_2_enabled=True,
97
- **kwargs,
98
- ):
99
- super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
100
-
101
- self.vocab_size = vocab_size
102
- self.hidden_size = hidden_size
103
- self.intermediate_size = intermediate_size
104
- self.num_hidden_layers = num_hidden_layers
105
- self.num_attention_heads = num_attention_heads
106
- self.max_position_embeddings = max_position_embeddings
107
- self.layer_norm_eps = layer_norm_eps
108
- self.hidden_act = hidden_act
109
- self.attention_dropout = attention_dropout
110
- self._flash_attn_2_enabled = _flash_attn_2_enabled
111
-
112
- @classmethod
113
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
114
- cls._set_token_in_kwargs(kwargs)
115
-
116
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
117
-
118
- # get the text config dict if we are loading from SiglipConfig
119
- if config_dict.get("model_type") == "siglip":
120
- config_dict = config_dict["text_config"]
121
-
122
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
123
- logger.warning(
124
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
125
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
126
- )
127
-
128
- return cls.from_dict(config_dict, **kwargs)
129
-
130
-
131
- class SiglipVisionConfig(PretrainedConfig):
132
- r"""
133
- This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
134
- Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
135
- configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
136
- [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
137
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
138
- documentation from [`PretrainedConfig`] for more information.
139
- Args:
140
- hidden_size (`int`, *optional*, defaults to 768):
141
- Dimensionality of the encoder layers and the pooler layer.
142
- intermediate_size (`int`, *optional*, defaults to 3072):
143
- Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
144
- num_hidden_layers (`int`, *optional*, defaults to 12):
145
- Number of hidden layers in the Transformer encoder.
146
- num_attention_heads (`int`, *optional*, defaults to 12):
147
- Number of attention heads for each attention layer in the Transformer encoder.
148
- num_channels (`int`, *optional*, defaults to 3):
149
- Number of channels in the input images.
150
- image_size (`int`, *optional*, defaults to 224):
151
- The size (resolution) of each image.
152
- patch_size (`int`, *optional*, defaults to 16):
153
- The size (resolution) of each patch.
154
- hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
155
- The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
156
- `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
157
- layer_norm_eps (`float`, *optional*, defaults to 1e-06):
158
- The epsilon used by the layer normalization layers.
159
- attention_dropout (`float`, *optional*, defaults to 0.0):
160
- The dropout ratio for the attention probabilities.
161
- Example:
162
- ```python
163
- >>> from transformers import SiglipVisionConfig, SiglipVisionModel
164
- >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
165
- >>> configuration = SiglipVisionConfig()
166
- >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
167
- >>> model = SiglipVisionModel(configuration)
168
- >>> # Accessing the model configuration
169
- >>> configuration = model.config
170
- ```"""
171
-
172
- model_type = "siglip_vision_model"
173
-
174
- def __init__(
175
- self,
176
- hidden_size=768,
177
- intermediate_size=3072,
178
- num_hidden_layers=12,
179
- num_attention_heads=12,
180
- num_channels=3,
181
- image_size=224,
182
- patch_size=16,
183
- hidden_act="gelu_pytorch_tanh",
184
- layer_norm_eps=1e-6,
185
- attention_dropout=0.0,
186
- _flash_attn_2_enabled=True,
187
- **kwargs,
188
- ):
189
- super().__init__(**kwargs)
190
-
191
- self.hidden_size = hidden_size
192
- self.intermediate_size = intermediate_size
193
- self.num_hidden_layers = num_hidden_layers
194
- self.num_attention_heads = num_attention_heads
195
- self.num_channels = num_channels
196
- self.patch_size = patch_size
197
- self.image_size = image_size
198
- self.attention_dropout = attention_dropout
199
- self.layer_norm_eps = layer_norm_eps
200
- self.hidden_act = hidden_act
201
- self._flash_attn_2_enabled = _flash_attn_2_enabled
202
-
203
- @classmethod
204
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
205
- cls._set_token_in_kwargs(kwargs)
206
-
207
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
208
-
209
- # get the vision config dict if we are loading from SiglipConfig
210
- if config_dict.get("model_type") == "siglip":
211
- config_dict = config_dict["vision_config"]
212
-
213
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
214
- logger.warning(
215
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
216
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
217
- )
218
-
219
- return cls.from_dict(config_dict, **kwargs)
220
-
221
-
222
- class SiglipConfig(PretrainedConfig):
223
- r"""
224
- [`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
225
- instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs.
226
- Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip
227
- [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
228
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
229
- documentation from [`PretrainedConfig`] for more information.
230
- Args:
231
- text_config (`dict`, *optional*):
232
- Dictionary of configuration options used to initialize [`SiglipTextConfig`].
233
- vision_config (`dict`, *optional*):
234
- Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
235
- kwargs (*optional*):
236
- Dictionary of keyword arguments.
237
- Example:
238
- ```python
239
- >>> from transformers import SiglipConfig, SiglipModel
240
- >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
241
- >>> configuration = SiglipConfig()
242
- >>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
243
- >>> model = SiglipModel(configuration)
244
- >>> # Accessing the model configuration
245
- >>> configuration = model.config
246
- >>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
247
- >>> from transformers import SiglipTextConfig, SiglipVisionConfig
248
- >>> # Initializing a SiglipText and SiglipVision configuration
249
- >>> config_text = SiglipTextConfig()
250
- >>> config_vision = SiglipVisionConfig()
251
- >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
252
- ```"""
253
-
254
- model_type = "siglip"
255
-
256
- def __init__(self, text_config=None, vision_config=None, **kwargs):
257
- super().__init__(**kwargs)
258
-
259
- if text_config is None:
260
- text_config = {}
261
- logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")
262
-
263
- if vision_config is None:
264
- vision_config = {}
265
- logger.info("`vision_config` is `None`. initializing the `SiglipVisionConfig` with default values.")
266
-
267
- self.text_config = SiglipTextConfig(**text_config)
268
- self.vision_config = SiglipVisionConfig(**vision_config)
269
-
270
- self.initializer_factor = 1.0
271
-
272
- @classmethod
273
- def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
274
- r"""
275
- Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
276
- model configuration.
277
- Returns:
278
- [`SiglipConfig`]: An instance of a configuration object
279
- """
280
-
281
- return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
282
-
283
- # coding=utf-8
284
- # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
285
- #
286
- # Licensed under the Apache License, Version 2.0 (the "License");
287
- # you may not use this file except in compliance with the License.
288
- # You may obtain a copy of the License at
289
- #
290
- # http://www.apache.org/licenses/LICENSE-2.0
291
- #
292
- # Unless required by applicable law or agreed to in writing, software
293
- # distributed under the License is distributed on an "AS IS" BASIS,
294
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
295
- # See the License for the specific language governing permissions and
296
- # limitations under the License.
297
- """ PyTorch Siglip model."""
298
-
299
-
300
- import math
301
- import warnings
302
- from dataclasses import dataclass
303
- from typing import Any, Optional, Tuple, Union
304
-
305
- import numpy as np
306
- import torch
307
- import torch.nn.functional as F
308
- import torch.utils.checkpoint
309
- from torch import nn
310
- from torch.nn.init import _calculate_fan_in_and_fan_out
311
-
312
- from transformers.activations import ACT2FN
313
- from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
314
- from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
315
- from transformers.modeling_utils import PreTrainedModel
316
- from transformers.utils import (
317
- ModelOutput,
318
- add_start_docstrings,
319
- add_start_docstrings_to_model_forward,
320
- is_flash_attn_2_available,
321
- logging,
322
- replace_return_docstrings,
323
- )
324
-
325
- logger = logging.get_logger(__name__)
326
-
327
- _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
328
-
329
- SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
330
- "google/siglip-base-patch16-224",
331
- # See all SigLIP models at https://huggingface.co/models?filter=siglip
332
- ]
333
-
334
- if is_flash_attn_2_available():
335
- from flash_attn import flash_attn_func, flash_attn_varlen_func
336
- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
337
-
338
-
339
- # Copied from transformers.models.llama.modeling_llama._get_unpad_data
340
- def _get_unpad_data(attention_mask):
341
- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
342
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
343
- max_seqlen_in_batch = seqlens_in_batch.max().item()
344
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
345
- return (
346
- indices,
347
- cu_seqlens,
348
- max_seqlen_in_batch,
349
- )
350
-
351
-
352
- def _trunc_normal_(tensor, mean, std, a, b):
353
- # Cut & paste from PyTorch official master until it's in a few official releases - RW
354
- # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
355
- def norm_cdf(x):
356
- # Computes standard normal cumulative distribution function
357
- return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
358
-
359
- if (mean < a - 2 * std) or (mean > b + 2 * std):
360
- warnings.warn(
361
- "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
362
- "The distribution of values may be incorrect.",
363
- stacklevel=2,
364
- )
365
-
366
- # Values are generated by using a truncated uniform distribution and
367
- # then using the inverse CDF for the normal distribution.
368
- # Get upper and lower cdf values
369
- l = norm_cdf((a - mean) / std)
370
- u = norm_cdf((b - mean) / std)
371
-
372
- # Uniformly fill tensor with values from [l, u], then translate to
373
- # [2l-1, 2u-1].
374
- tensor.uniform_(2 * l - 1, 2 * u - 1)
375
-
376
- # Use inverse cdf transform for normal distribution to get truncated
377
- # standard normal
378
- if tensor.dtype in [torch.float16, torch.bfloat16]:
379
- # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
380
- og_dtype = tensor.dtype
381
- tensor = tensor.to(torch.float32)
382
- tensor.erfinv_()
383
- tensor = tensor.to(og_dtype)
384
- else:
385
- tensor.erfinv_()
386
-
387
- # Transform to proper mean, std
388
- tensor.mul_(std * math.sqrt(2.0))
389
- tensor.add_(mean)
390
-
391
- # Clamp to ensure it's in the proper range
392
- if tensor.dtype == torch.float16:
393
- # The `clamp_` op is not (yet?) defined in float16+cpu
394
- tensor = tensor.to(torch.float32)
395
- tensor.clamp_(min=a, max=b)
396
- tensor = tensor.to(torch.float16)
397
- else:
398
- tensor.clamp_(min=a, max=b)
399
-
400
-
401
- def trunc_normal_tf_(
402
- tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
403
- ) -> torch.Tensor:
404
- """Fills the input Tensor with values drawn from a truncated
405
- normal distribution. The values are effectively drawn from the
406
- normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
407
- with values outside :math:`[a, b]` redrawn until they are within
408
- the bounds. The method used for generating the random values works
409
- best when :math:`a \\leq \text{mean} \\leq b`.
410
- NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
411
- bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
412
- and the result is subsquently scaled and shifted by the mean and std args.
413
- Args:
414
- tensor: an n-dimensional `torch.Tensor`
415
- mean: the mean of the normal distribution
416
- std: the standard deviation of the normal distribution
417
- a: the minimum cutoff value
418
- b: the maximum cutoff value
419
- """
420
- with torch.no_grad():
421
- _trunc_normal_(tensor, 0, 1.0, a, b)
422
- tensor.mul_(std).add_(mean)
423
-
424
-
425
- def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
426
- fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
427
- if mode == "fan_in":
428
- denom = fan_in
429
- elif mode == "fan_out":
430
- denom = fan_out
431
- elif mode == "fan_avg":
432
- denom = (fan_in + fan_out) / 2
433
-
434
- variance = scale / denom
435
-
436
- if distribution == "truncated_normal":
437
- # constant is stddev of standard normal truncated to (-2, 2)
438
- trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
439
- elif distribution == "normal":
440
- with torch.no_grad():
441
- tensor.normal_(std=math.sqrt(variance))
442
- elif distribution == "uniform":
443
- bound = math.sqrt(3 * variance)
444
- with torch.no_grad():
445
- tensor.uniform_(-bound, bound)
446
- else:
447
- raise ValueError(f"invalid distribution {distribution}")
448
-
449
-
450
- def lecun_normal_(tensor):
451
- variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
452
-
453
-
454
- def default_flax_embed_init(tensor):
455
- variance_scaling_(tensor, mode="fan_in", distribution="normal")
456
-
457
-
458
- @dataclass
459
- # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
460
- class SiglipVisionModelOutput(ModelOutput):
461
- """
462
- Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
463
- Args:
464
- image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
465
- The image embeddings obtained by applying the projection layer to the pooler_output.
466
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
467
- Sequence of hidden-states at the output of the last layer of the model.
468
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
469
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
470
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
471
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
472
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
473
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
474
- sequence_length)`.
475
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
476
- heads.
477
- """
478
-
479
- image_embeds: Optional[torch.FloatTensor] = None
480
- last_hidden_state: torch.FloatTensor = None
481
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
482
- attentions: Optional[Tuple[torch.FloatTensor]] = None
483
-
484
-
485
- @dataclass
486
- # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Siglip
487
- class SiglipTextModelOutput(ModelOutput):
488
- """
489
- Base class for text model's outputs that also contains a pooling of the last hidden states.
490
- Args:
491
- text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
492
- The text embeddings obtained by applying the projection layer to the pooler_output.
493
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
494
- Sequence of hidden-states at the output of the last layer of the model.
495
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
496
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
497
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
498
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
499
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
500
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
501
- sequence_length)`.
502
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
503
- heads.
504
- """
505
-
506
- text_embeds: Optional[torch.FloatTensor] = None
507
- last_hidden_state: torch.FloatTensor = None
508
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
509
- attentions: Optional[Tuple[torch.FloatTensor]] = None
510
-
511
-
512
- @dataclass
513
- # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Siglip
514
- class SiglipOutput(ModelOutput):
515
- """
516
- Args:
517
- loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
518
- Contrastive loss for image-text similarity.
519
- logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
520
- The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
521
- similarity scores.
522
- logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
523
- The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
524
- similarity scores.
525
- text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
526
- The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
527
- image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
528
- The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
529
- text_model_output(`BaseModelOutputWithPooling`):
530
- The output of the [`SiglipTextModel`].
531
- vision_model_output(`BaseModelOutputWithPooling`):
532
- The output of the [`SiglipVisionModel`].
533
- """
534
-
535
- loss: Optional[torch.FloatTensor] = None
536
- logits_per_image: torch.FloatTensor = None
537
- logits_per_text: torch.FloatTensor = None
538
- text_embeds: torch.FloatTensor = None
539
- image_embeds: torch.FloatTensor = None
540
- text_model_output: BaseModelOutputWithPooling = None
541
- vision_model_output: BaseModelOutputWithPooling = None
542
-
543
- def to_tuple(self) -> Tuple[Any]:
544
- return tuple(
545
- self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
546
- for k in self.keys()
547
- )
548
-
549
-
550
- class SiglipVisionEmbeddings(nn.Module):
551
- def __init__(self, config: SiglipVisionConfig):
552
- super().__init__()
553
- self.config = config
554
- self.embed_dim = config.hidden_size
555
- self.image_size = config.image_size
556
- self.patch_size = config.patch_size
557
-
558
- self.patch_embedding = nn.Conv2d(
559
- in_channels=config.num_channels,
560
- out_channels=self.embed_dim,
561
- kernel_size=self.patch_size,
562
- stride=self.patch_size,
563
- padding="valid",
564
- )
565
-
566
- self.num_patches_per_side = self.image_size // self.patch_size
567
- self.num_patches = self.num_patches_per_side**2
568
- self.num_positions = self.num_patches
569
- self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
570
-
571
- def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
572
- batch_size = pixel_values.size(0)
573
-
574
- patch_embeds = self.patch_embedding(pixel_values)
575
- embeddings = patch_embeds.flatten(2).transpose(1, 2)
576
-
577
- max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
578
- max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
579
- boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
580
- position_ids = torch.full(
581
- size=(
582
- batch_size,
583
- max_nb_patches_h * max_nb_patches_w,
584
- ),
585
- fill_value=0,
586
- )
587
-
588
- for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
589
- nb_patches_h = p_attn_mask[:, 0].sum()
590
- nb_patches_w = p_attn_mask[0].sum()
591
-
592
- fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
593
- fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
594
-
595
- bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
596
- bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
597
-
598
- pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
599
- position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
600
-
601
- position_ids = position_ids.to(self.position_embedding.weight.device)
602
-
603
- embeddings = embeddings + self.position_embedding(position_ids)
604
- return embeddings
605
-
606
-
607
- # Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip
608
- class SiglipTextEmbeddings(nn.Module):
609
- def __init__(self, config: SiglipTextConfig):
610
- super().__init__()
611
- embed_dim = config.hidden_size
612
-
613
- self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
614
- self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
615
-
616
- # position_ids (1, len position emb) is contiguous in memory and exported when serialized
617
- self.register_buffer(
618
- "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
619
- )
620
-
621
- def forward(
622
- self,
623
- input_ids: Optional[torch.LongTensor] = None,
624
- position_ids: Optional[torch.LongTensor] = None,
625
- inputs_embeds: Optional[torch.FloatTensor] = None,
626
- ) -> torch.Tensor:
627
- seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
628
-
629
- if position_ids is None:
630
- position_ids = self.position_ids[:, :seq_length]
631
-
632
- if inputs_embeds is None:
633
- inputs_embeds = self.token_embedding(input_ids)
634
-
635
- position_embeddings = self.position_embedding(position_ids)
636
- embeddings = inputs_embeds + position_embeddings
637
-
638
- return embeddings
639
-
640
-
641
- class SiglipAttention(nn.Module):
642
- """Multi-headed attention from 'Attention Is All You Need' paper"""
643
-
644
- # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
645
- def __init__(self, config):
646
- super().__init__()
647
- self.config = config
648
- self.embed_dim = config.hidden_size
649
- self.num_heads = config.num_attention_heads
650
- self.head_dim = self.embed_dim // self.num_heads
651
- if self.head_dim * self.num_heads != self.embed_dim:
652
- raise ValueError(
653
- f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
654
- f" {self.num_heads})."
655
- )
656
- self.scale = self.head_dim**-0.5
657
- self.dropout = config.attention_dropout
658
-
659
- self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
660
- self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
661
- self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
662
- self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
663
-
664
- def forward(
665
- self,
666
- hidden_states: torch.Tensor,
667
- attention_mask: Optional[torch.Tensor] = None,
668
- output_attentions: Optional[bool] = False,
669
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
670
- """Input shape: Batch x Time x Channel"""
671
-
672
- batch_size, q_len, _ = hidden_states.size()
673
-
674
- query_states = self.q_proj(hidden_states)
675
- key_states = self.k_proj(hidden_states)
676
- value_states = self.v_proj(hidden_states)
677
-
678
- query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
679
- key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
680
- value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
681
-
682
- k_v_seq_len = key_states.shape[-2]
683
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
684
-
685
- if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
686
- raise ValueError(
687
- f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
688
- f" {attn_weights.size()}"
689
- )
690
-
691
- if attention_mask is not None:
692
- if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
693
- raise ValueError(
694
- f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
695
- )
696
- attn_weights = attn_weights + attention_mask
697
-
698
- # upcast attention to fp32
699
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
700
- attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
701
- attn_output = torch.matmul(attn_weights, value_states)
702
-
703
- if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
704
- raise ValueError(
705
- f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
706
- f" {attn_output.size()}"
707
- )
708
-
709
- attn_output = attn_output.transpose(1, 2).contiguous()
710
- attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
711
-
712
- attn_output = self.out_proj(attn_output)
713
-
714
- return attn_output, attn_weights
715
-
716
-
717
- class SiglipFlashAttention2(SiglipAttention):
718
- """
719
- Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
720
- untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
721
- flash attention and deal with padding tokens in case the input contains any of them.
722
- """
723
-
724
- def __init__(self, *args, **kwargs):
725
- super().__init__(*args, **kwargs)
726
- self.is_causal = False # Hack to make sure we don't use a causal mask
727
-
728
- def forward(
729
- self,
730
- hidden_states: torch.Tensor,
731
- attention_mask: Optional[torch.LongTensor] = None,
732
- position_ids: Optional[torch.LongTensor] = None,
733
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
734
- output_attentions: bool = False,
735
- use_cache: bool = False,
736
- **kwargs,
737
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
738
- output_attentions = False
739
-
740
- bsz, q_len, _ = hidden_states.size()
741
-
742
- query_states = self.q_proj(hidden_states)
743
- key_states = self.k_proj(hidden_states)
744
- value_states = self.v_proj(hidden_states)
745
-
746
- # Flash attention requires the input to have the shape
747
- # batch_size x seq_length x head_dim x hidden_dim
748
- # therefore we just need to keep the original shape
749
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
750
- key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
751
- value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
752
-
753
- kv_seq_len = key_states.shape[-2]
754
- if past_key_value is not None:
755
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
756
- # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
757
- # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
758
-
759
- # if past_key_value is not None:
760
- # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
761
- # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
762
-
763
- # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
764
- # to be able to avoid many of these transpose/reshape/view.
765
- query_states = query_states.transpose(1, 2)
766
- key_states = key_states.transpose(1, 2)
767
- value_states = value_states.transpose(1, 2)
768
-
769
- dropout_rate = self.dropout if self.training else 0.0
770
-
771
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
772
- # therefore the input hidden states gets silently casted in float32. Hence, we need
773
- # cast them back in the correct dtype just to be sure everything works as expected.
774
- # This might slowdown training & inference so it is recommended to not cast the LayerNorms
775
- # in fp32. (LlamaRMSNorm handles it correctly)
776
-
777
- input_dtype = query_states.dtype
778
- if input_dtype == torch.float32:
779
- if torch.is_autocast_enabled():
780
- target_dtype = torch.get_autocast_gpu_dtype()
781
- # Handle the case where the model is quantized
782
- elif hasattr(self.config, "_pre_quantization_dtype"):
783
- target_dtype = self.config._pre_quantization_dtype
784
- else:
785
- target_dtype = self.q_proj.weight.dtype
786
-
787
- logger.warning_once(
788
- "The input hidden states seems to be silently casted in float32, this might be related to the fact"
789
- " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
790
- f" {target_dtype}."
791
- )
792
-
793
- query_states = query_states.to(target_dtype)
794
- key_states = key_states.to(target_dtype)
795
- value_states = value_states.to(target_dtype)
796
-
797
- attn_output = self._flash_attention_forward(
798
- query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
799
- )
800
-
801
- attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
802
- attn_output = self.out_proj(attn_output)
803
-
804
- if not output_attentions:
805
- attn_weights = None
806
-
807
- return attn_output, attn_weights
808
-
809
- def _flash_attention_forward(
810
- self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
811
- ):
812
- """
813
- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
814
- it first unpads the input, then computes the attention scores, and finally re-pads the attention output.
815
- Args:
816
- query_states (`torch.Tensor`):
817
- Input query states to be passed to Flash Attention API
818
- key_states (`torch.Tensor`):
819
- Input key states to be passed to Flash Attention API
820
- value_states (`torch.Tensor`):
821
- Input value states to be passed to Flash Attention API
822
- attention_mask (`torch.Tensor`):
823
- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
824
- position of padding tokens and 1 for the position of non-padding tokens.
825
- dropout (`float`, *optional*):
826
- Attention dropout probability
827
- softmax_scale (`float`, *optional*):
828
- The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
829
- """
830
-
831
- # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
832
- causal = self.is_causal and query_length != 1
833
-
834
- # Contains at least one padding token in the sequence
835
- if attention_mask is not None:
836
- batch_size = query_states.shape[0]
837
- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
838
- query_states, key_states, value_states, attention_mask, query_length
839
- )
840
-
841
- cu_seqlens_q, cu_seqlens_k = cu_seq_lens
842
- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
843
-
844
- attn_output_unpad = flash_attn_varlen_func(
845
- query_states,
846
- key_states,
847
- value_states,
848
- cu_seqlens_q=cu_seqlens_q,
849
- cu_seqlens_k=cu_seqlens_k,
850
- max_seqlen_q=max_seqlen_in_batch_q,
851
- max_seqlen_k=max_seqlen_in_batch_k,
852
- dropout_p=dropout,
853
- softmax_scale=softmax_scale,
854
- causal=causal,
855
- )
856
-
857
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
858
- else:
859
- attn_output = flash_attn_func(
860
- query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
861
- )
862
-
863
- return attn_output
864
-
865
- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
866
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
867
- batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
868
-
869
- key_layer = index_first_axis(
870
- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
871
- )
872
- value_layer = index_first_axis(
873
- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
874
- )
875
- if query_length == kv_seq_len:
876
- query_layer = index_first_axis(
877
- query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
878
- )
879
- cu_seqlens_q = cu_seqlens_k
880
- max_seqlen_in_batch_q = max_seqlen_in_batch_k
881
- indices_q = indices_k
882
- elif query_length == 1:
883
- max_seqlen_in_batch_q = 1
884
- cu_seqlens_q = torch.arange(
885
- batch_size + 1, dtype=torch.int32, device=query_layer.device
886
- ) # There is a memcpy here, that is very bad.
887
- indices_q = cu_seqlens_q[:-1]
888
- query_layer = query_layer.squeeze(1)
889
- else:
890
- # The -q_len: slice assumes left padding.
891
- attention_mask = attention_mask[:, -query_length:]
892
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
893
-
894
- return (
895
- query_layer,
896
- key_layer,
897
- value_layer,
898
- indices_q,
899
- (cu_seqlens_q, cu_seqlens_k),
900
- (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
901
- )
902
-
903
-
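The `_flash_attention_forward`/`_upad_input` pair above relies on a `_get_unpad_data` helper (defined earlier in this file) to turn a padding mask into the flattened token indices and cumulative sequence lengths that `flash_attn_varlen_func` expects. A minimal torch-only sketch of that bookkeeping, with a toy mask as an assumed input:

```python
import torch
import torch.nn.functional as F

def get_unpad_data(attention_mask: torch.Tensor):
    # attention_mask: (batch, seq_len) with 1 for real tokens and 0 for padding
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    # cumulative sequence lengths, prepended with 0: the "cu_seqlens" tensor
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
indices, cu_seqlens, max_len = get_unpad_data(mask)
print(indices.tolist())     # [0, 1, 2, 4, 5] -> positions of non-padding tokens
print(cu_seqlens.tolist())  # [0, 3, 5]       -> per-sequence boundaries after unpadding
print(max_len)              # 3
```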
904
- # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
905
- class SiglipMLP(nn.Module):
906
- def __init__(self, config):
907
- super().__init__()
908
- self.config = config
909
- self.activation_fn = ACT2FN[config.hidden_act]
910
- self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
911
- self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
912
-
913
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
914
- hidden_states = self.fc1(hidden_states)
915
- hidden_states = self.activation_fn(hidden_states)
916
- hidden_states = self.fc2(hidden_states)
917
- return hidden_states
918
-
919
-
920
- # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
921
- class SiglipEncoderLayer(nn.Module):
922
- def __init__(self, config: SiglipConfig):
923
- super().__init__()
924
- self.embed_dim = config.hidden_size
925
- self.self_attn = (
926
- SiglipAttention(config)
927
- if not getattr(config, "_flash_attn_2_enabled", False)
928
- else SiglipFlashAttention2(config)
929
- )
930
- self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
931
- self.mlp = SiglipMLP(config)
932
- self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
933
-
934
- def forward(
935
- self,
936
- hidden_states: torch.Tensor,
937
- attention_mask: torch.Tensor,
938
- output_attentions: Optional[bool] = False,
939
- ) -> Tuple[torch.FloatTensor]:
940
- """
941
- Args:
942
- hidden_states (`torch.FloatTensor`):
943
- Input to the layer of shape `(batch, seq_len, embed_dim)`.
944
- attention_mask (`torch.FloatTensor`):
945
- Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
946
- output_attentions (`bool`, *optional*, defaults to `False`):
947
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
948
- returned tensors for more detail.
949
- """
950
- residual = hidden_states
951
-
952
- hidden_states = self.layer_norm1(hidden_states)
953
- hidden_states, attn_weights = self.self_attn(
954
- hidden_states=hidden_states,
955
- attention_mask=attention_mask,
956
- output_attentions=output_attentions,
957
- )
958
- hidden_states = residual + hidden_states
959
-
960
- residual = hidden_states
961
- hidden_states = self.layer_norm2(hidden_states)
962
- hidden_states = self.mlp(hidden_states)
963
- hidden_states = residual + hidden_states
964
-
965
- outputs = (hidden_states,)
966
-
967
- if output_attentions:
968
- outputs += (attn_weights,)
969
-
970
- return outputs
971
-
972
-
973
- class SiglipPreTrainedModel(PreTrainedModel):
974
- """
975
- An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
976
- models.
977
- """
978
-
979
- config_class = SiglipConfig
980
- base_model_prefix = "siglip"
981
- supports_gradient_checkpointing = True
982
-
983
- def _init_weights(self, module):
984
- """Initialize the weights"""
985
-
986
- if isinstance(module, SiglipVisionEmbeddings):
987
- width = (
988
- self.config.vision_config.hidden_size
989
- if isinstance(self.config, SiglipConfig)
990
- else self.config.hidden_size
991
- )
992
- nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
993
- elif isinstance(module, nn.Embedding):
994
- default_flax_embed_init(module.weight)
995
- elif isinstance(module, SiglipAttention):
996
- nn.init.normal_(module.q_proj.weight)
997
- nn.init.normal_(module.k_proj.weight)
998
- nn.init.normal_(module.v_proj.weight)
999
- nn.init.normal_(module.out_proj.weight)
1000
- nn.init.zeros_(module.q_proj.bias)
1001
- nn.init.zeros_(module.k_proj.bias)
1002
- nn.init.zeros_(module.v_proj.bias)
1003
- nn.init.zeros_(module.out_proj.bias)
1004
- elif isinstance(module, SiglipMLP):
1005
- nn.init.normal_(module.fc1.weight)
1006
- nn.init.normal_(module.fc2.weight)
1007
- nn.init.normal_(module.fc1.bias, std=1e-6)
1008
- nn.init.normal_(module.fc2.bias, std=1e-6)
1009
- elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
1010
- nn.init.normal_(module.probe.data)
1011
- nn.init.normal_(module.attention.in_proj_weight.data)
1012
- nn.init.zeros_(module.attention.in_proj_bias.data)
1013
- elif isinstance(module, SiglipModel):
1014
- logit_scale_init = torch.tensor(0.0)
1015
- module.logit_scale.data.fill_(logit_scale_init)
1016
- module.logit_bias.data.zero_()
1017
- elif isinstance(module, (nn.Linear, nn.Conv2d)):
1018
- lecun_normal_(module.weight)
1019
- if module.bias is not None:
1020
- nn.init.zeros_(module.bias)
1021
- elif isinstance(module, nn.LayerNorm):
1022
- module.bias.data.zero_()
1023
- module.weight.data.fill_(1.0)
1024
-
1025
-
1026
- SIGLIP_START_DOCSTRING = r"""
1027
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1028
- library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
1029
- etc.)
1030
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1031
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
1032
- and behavior.
1033
- Parameters:
1034
- config ([`SiglipConfig`]): Model configuration class with all the parameters of the model.
1035
- Initializing with a config file does not load the weights associated with the model, only the
1036
- configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1037
- """
1038
-
1039
- SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
1040
- Args:
1041
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1042
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1043
- it.
1044
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1045
- [`PreTrainedTokenizer.__call__`] for details.
1046
- [What are input IDs?](../glossary#input-ids)
1047
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1048
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1049
- - 1 for tokens that are **not masked**,
1050
- - 0 for tokens that are **masked**.
1051
- [What are attention masks?](../glossary#attention-mask)
1052
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1053
- Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
1054
- config.max_position_embeddings - 1]`.
1055
- [What are position IDs?](../glossary#position-ids)
1056
- output_attentions (`bool`, *optional*):
1057
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1058
- tensors for more detail.
1059
- output_hidden_states (`bool`, *optional*):
1060
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1061
- more detail.
1062
- return_dict (`bool`, *optional*):
1063
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1064
- """
1065
-
1066
- SIGLIP_VISION_INPUTS_DOCSTRING = r"""
1067
- Args:
1068
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
1069
- Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
1070
- [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
1071
- output_attentions (`bool`, *optional*):
1072
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1073
- tensors for more detail.
1074
- output_hidden_states (`bool`, *optional*):
1075
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1076
- more detail.
1077
- return_dict (`bool`, *optional*):
1078
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1079
- """
1080
-
1081
- SIGLIP_INPUTS_DOCSTRING = r"""
1082
- Args:
1083
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1084
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1085
- it.
1086
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1087
- [`PreTrainedTokenizer.__call__`] for details.
1088
- [What are input IDs?](../glossary#input-ids)
1089
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1090
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1091
- - 1 for tokens that are **not masked**,
1092
- - 0 for tokens that are **masked**.
1093
- [What are attention masks?](../glossary#attention-mask)
1094
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1095
- Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
1096
- config.max_position_embeddings - 1]`.
1097
- [What are position IDs?](../glossary#position-ids)
1098
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
1099
- Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
1100
- [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
1101
- return_loss (`bool`, *optional*):
1102
- Whether or not to return the contrastive loss.
1103
- output_attentions (`bool`, *optional*):
1104
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1105
- tensors for more detail.
1106
- output_hidden_states (`bool`, *optional*):
1107
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1108
- more detail.
1109
- return_dict (`bool`, *optional*):
1110
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1111
- """
1112
-
1113
-
1114
- # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
1115
- class SiglipEncoder(nn.Module):
1116
- """
1117
- Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
1118
- [`SiglipEncoderLayer`].
1119
- Args:
1120
- config: SiglipConfig
1121
- """
1122
-
1123
- def __init__(self, config: SiglipConfig):
1124
- super().__init__()
1125
- self.config = config
1126
- self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
1127
- self.gradient_checkpointing = False
1128
-
1129
- # Ignore copy
1130
- def forward(
1131
- self,
1132
- inputs_embeds,
1133
- attention_mask: Optional[torch.Tensor] = None,
1134
- output_attentions: Optional[bool] = None,
1135
- output_hidden_states: Optional[bool] = None,
1136
- return_dict: Optional[bool] = None,
1137
- ) -> Union[Tuple, BaseModelOutput]:
1138
- r"""
1139
- Args:
1140
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
1141
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
1142
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
1143
- than the model's internal embedding lookup matrix.
1144
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1145
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1146
- - 1 for tokens that are **not masked**,
1147
- - 0 for tokens that are **masked**.
1148
- [What are attention masks?](../glossary#attention-mask)
1149
- output_attentions (`bool`, *optional*):
1150
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1151
- returned tensors for more detail.
1152
- output_hidden_states (`bool`, *optional*):
1153
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1154
- for more detail.
1155
- return_dict (`bool`, *optional*):
1156
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1157
- """
1158
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1159
- output_hidden_states = (
1160
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1161
- )
1162
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1163
-
1164
- encoder_states = () if output_hidden_states else None
1165
- all_attentions = () if output_attentions else None
1166
-
1167
- hidden_states = inputs_embeds
1168
- for encoder_layer in self.layers:
1169
- if output_hidden_states:
1170
- encoder_states = encoder_states + (hidden_states,)
1171
- if self.gradient_checkpointing and self.training:
1172
- layer_outputs = self._gradient_checkpointing_func(
1173
- encoder_layer.__call__,
1174
- hidden_states,
1175
- attention_mask,
1176
- output_attentions,
1177
- )
1178
- else:
1179
- layer_outputs = encoder_layer(
1180
- hidden_states,
1181
- attention_mask,
1182
- output_attentions=output_attentions,
1183
- )
1184
-
1185
- hidden_states = layer_outputs[0]
1186
-
1187
- if output_attentions:
1188
- all_attentions = all_attentions + (layer_outputs[1],)
1189
-
1190
- if output_hidden_states:
1191
- encoder_states = encoder_states + (hidden_states,)
1192
-
1193
- if not return_dict:
1194
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
1195
- return BaseModelOutput(
1196
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
1197
- )
1198
-
1199
-
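The training branch in the encoder above routes each layer call through `self._gradient_checkpointing_func`, so intermediate activations are recomputed during the backward pass instead of being stored. A small stand-alone sketch of the same idea using the plain PyTorch checkpoint API (the linear layer is just a stand-in for an encoder layer):

```python
import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(16, 16)                  # stand-in for a SiglipEncoderLayer
hidden_states = torch.randn(2, 8, 16, requires_grad=True)

# activations inside `layer` are recomputed in backward rather than kept in memory
out = checkpoint(layer, hidden_states, use_reentrant=False)
out.sum().backward()
print(hidden_states.grad.shape)  # torch.Size([2, 8, 16])
```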
1200
- class SiglipTextTransformer(nn.Module):
1201
- def __init__(self, config: SiglipTextConfig):
1202
- super().__init__()
1203
- self.config = config
1204
- embed_dim = config.hidden_size
1205
- self.embeddings = SiglipTextEmbeddings(config)
1206
- self.encoder = SiglipEncoder(config)
1207
- self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1208
-
1209
- self.head = nn.Linear(embed_dim, embed_dim)
1210
-
1211
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
1212
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
1213
- def forward(
1214
- self,
1215
- input_ids: Optional[torch.Tensor] = None,
1216
- attention_mask: Optional[torch.Tensor] = None,
1217
- position_ids: Optional[torch.Tensor] = None,
1218
- output_attentions: Optional[bool] = None,
1219
- output_hidden_states: Optional[bool] = None,
1220
- return_dict: Optional[bool] = None,
1221
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
1222
- r"""
1223
- Returns:
1224
- """
1225
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1226
- output_hidden_states = (
1227
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1228
- )
1229
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1230
-
1231
- if input_ids is None:
1232
- raise ValueError("You have to specify input_ids")
1233
-
1234
- input_shape = input_ids.size()
1235
- input_ids = input_ids.view(-1, input_shape[-1])
1236
-
1237
- hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
1238
-
1239
- # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
1240
- # expand attention_mask
1241
- if attention_mask is not None:
1242
- # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
1243
- attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
1244
-
1245
- encoder_outputs = self.encoder(
1246
- inputs_embeds=hidden_states,
1247
- attention_mask=attention_mask,
1248
- output_attentions=output_attentions,
1249
- output_hidden_states=output_hidden_states,
1250
- return_dict=return_dict,
1251
- )
1252
-
1253
- last_hidden_state = encoder_outputs[0]
1254
- last_hidden_state = self.final_layer_norm(last_hidden_state)
1255
-
1256
- # Assuming "sticky" EOS tokenization, last token is always EOS.
1257
- pooled_output = last_hidden_state[:, -1, :]
1258
- pooled_output = self.head(pooled_output)
1259
-
1260
- if not return_dict:
1261
- return (last_hidden_state, pooled_output) + encoder_outputs[1:]
1262
-
1263
- return BaseModelOutputWithPooling(
1264
- last_hidden_state=last_hidden_state,
1265
- pooler_output=pooled_output,
1266
- hidden_states=encoder_outputs.hidden_states,
1267
- attentions=encoder_outputs.attentions,
1268
- )
1269
-
1270
-
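As the comment above notes, the text transformer pools by taking the representation at the last position (assumed to be EOS when inputs are padded to `max_length`) and passing it through a linear head. A toy illustration, with shapes chosen purely for the example:

```python
import torch
import torch.nn as nn

embed_dim, batch, seq_len = 768, 2, 64              # assumed toy sizes
head = nn.Linear(embed_dim, embed_dim)

last_hidden_state = torch.randn(batch, seq_len, embed_dim)
pooled_output = head(last_hidden_state[:, -1, :])   # take the final (EOS) position
print(pooled_output.shape)                           # torch.Size([2, 768])
```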
1271
- @add_start_docstrings(
1272
- """The text model from SigLIP without any head or projection on top.""",
1273
- SIGLIP_START_DOCSTRING,
1274
- )
1275
- class SiglipTextModel(SiglipPreTrainedModel):
1276
- config_class = SiglipTextConfig
1277
-
1278
- _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"]
1279
-
1280
- def __init__(self, config: SiglipTextConfig):
1281
- super().__init__(config)
1282
- self.text_model = SiglipTextTransformer(config)
1283
- # Initialize weights and apply final processing
1284
- self.post_init()
1285
-
1286
- def get_input_embeddings(self) -> nn.Module:
1287
- return self.text_model.embeddings.token_embedding
1288
-
1289
- def set_input_embeddings(self, value):
1290
- self.text_model.embeddings.token_embedding = value
1291
-
1292
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
1293
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
1294
- def forward(
1295
- self,
1296
- input_ids: Optional[torch.Tensor] = None,
1297
- attention_mask: Optional[torch.Tensor] = None,
1298
- position_ids: Optional[torch.Tensor] = None,
1299
- output_attentions: Optional[bool] = None,
1300
- output_hidden_states: Optional[bool] = None,
1301
- return_dict: Optional[bool] = None,
1302
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
1303
- r"""
1304
- Returns:
1305
- Examples:
1306
- ```python
1307
- >>> from transformers import AutoTokenizer, SiglipTextModel
1308
- >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
1309
- >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
1310
- >>> # important: make sure to set padding="max_length" as that's how the model was trained
1311
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
1312
- >>> outputs = model(**inputs)
1313
- >>> last_hidden_state = outputs.last_hidden_state
1314
- >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
1315
- ```"""
1316
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1317
-
1318
- return self.text_model(
1319
- input_ids=input_ids,
1320
- attention_mask=attention_mask,
1321
- position_ids=position_ids,
1322
- output_attentions=output_attentions,
1323
- output_hidden_states=output_hidden_states,
1324
- return_dict=return_dict,
1325
- )
1326
-
1327
-
1328
- class SiglipVisionTransformer(nn.Module):
1329
- def __init__(self, config: SiglipVisionConfig):
1330
- super().__init__()
1331
- self.config = config
1332
- embed_dim = config.hidden_size
1333
-
1334
- self.embeddings = SiglipVisionEmbeddings(config)
1335
- self.encoder = SiglipEncoder(config)
1336
- self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1337
- self.head = SiglipMultiheadAttentionPoolingHead(config)
1338
-
1339
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1340
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
1341
- def forward(
1342
- self,
1343
- pixel_values,
1344
- patch_attention_mask: Optional[torch.BoolTensor] = None,
1345
- output_attentions: Optional[bool] = None,
1346
- output_hidden_states: Optional[bool] = None,
1347
- return_dict: Optional[bool] = None,
1348
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
1349
- r"""
1350
- Returns:
1351
- """
1352
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1353
- output_hidden_states = (
1354
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1355
- )
1356
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1357
-
1358
- batch_size = pixel_values.size(0)
1359
- if patch_attention_mask is None:
1360
- patch_attention_mask = torch.ones(
1361
- size=(
1362
- batch_size,
1363
- pixel_values.size(2) // self.config.patch_size,
1364
- pixel_values.size(3) // self.config.patch_size,
1365
- ),
1366
- dtype=torch.bool,
1367
- device=pixel_values.device,
1368
- )
1369
-
1370
- hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
1371
-
1372
- patch_attention_mask = patch_attention_mask.view(batch_size, -1)
1373
- # The call to `_upad_input` in `_flash_attention_forward` is expensive,
1374
- # so when `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence)
1375
- # we avoid passing an attention_mask altogether, which is equivalent to attending to the full sequence
1376
- if not torch.any(~patch_attention_mask):
1377
- attention_mask = None
1378
- else:
1379
- attention_mask = (
1380
- _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
1381
- if not self.config._flash_attn_2_enabled
1382
- else patch_attention_mask
1383
- )
1384
-
1385
- encoder_outputs = self.encoder(
1386
- inputs_embeds=hidden_states,
1387
- attention_mask=attention_mask,
1388
- output_attentions=output_attentions,
1389
- output_hidden_states=output_hidden_states,
1390
- return_dict=return_dict,
1391
- )
1392
-
1393
- last_hidden_state = encoder_outputs[0]
1394
- last_hidden_state = self.post_layernorm(last_hidden_state)
1395
-
1396
- pooled_output = self.head(
1397
- hidden_state=last_hidden_state,
1398
- attention_mask=patch_attention_mask,
1399
- )
1400
-
1401
- if not return_dict:
1402
- return (last_hidden_state, pooled_output) + encoder_outputs[1:]
1403
-
1404
- return BaseModelOutputWithPooling(
1405
- last_hidden_state=last_hidden_state,
1406
- pooler_output=pooled_output,
1407
- hidden_states=encoder_outputs.hidden_states,
1408
- attentions=encoder_outputs.attentions,
1409
- )
1410
-
1411
-
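The vision transformer above builds a boolean `patch_attention_mask` over the patch grid and skips passing any mask when every patch is attended to, since that is equivalent to full attention and avoids the expensive unpadding path. A toy check of that shortcut (image and patch sizes are assumptions matching the config built at the end of this file):

```python
import torch

batch, image_size, patch_size = 2, 448, 14
patch_attention_mask = torch.ones(
    batch, image_size // patch_size, image_size // patch_size, dtype=torch.bool
)

flat_mask = patch_attention_mask.view(batch, -1)          # (batch, num_patches)
attention_mask = None if not torch.any(~flat_mask) else flat_mask
print(flat_mask.shape, attention_mask)                     # torch.Size([2, 1024]) None
```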
1412
- class SiglipMultiheadAttentionPoolingHead(nn.Module):
1413
- """Multihead Attention Pooling."""
1414
-
1415
- def __init__(self, config: SiglipVisionConfig):
1416
- super().__init__()
1417
-
1418
- self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
1419
- self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
1420
- self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1421
- self.mlp = SiglipMLP(config)
1422
-
1423
- def forward(self, hidden_state, attention_mask):
1424
- batch_size = hidden_state.shape[0]
1425
- probe = self.probe.repeat(batch_size, 1, 1)
1426
-
1427
- hidden_state = self.attention(
1428
- query=probe, key=hidden_state, value=hidden_state, key_padding_mask=~attention_mask
1429
- )[0]
1430
-
1431
- residual = hidden_state
1432
- hidden_state = self.layernorm(hidden_state)
1433
- hidden_state = residual + self.mlp(hidden_state)
1434
-
1435
- return hidden_state[:, 0]
1436
-
1437
-
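`SiglipMultiheadAttentionPoolingHead` pools the patch sequence by attending from a single learned probe query to all patch features. A minimal sketch of that pattern with `nn.MultiheadAttention` (sizes are assumptions):

```python
import torch
import torch.nn as nn

hidden_size, num_heads, batch, num_patches = 1152, 16, 2, 1024
probe = nn.Parameter(torch.randn(1, 1, hidden_size))       # learned query
attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

patches = torch.randn(batch, num_patches, hidden_size)      # encoder output
query = probe.expand(batch, -1, -1)                          # one probe per image
pooled, _ = attn(query, patches, patches)                    # (batch, 1, hidden_size)
print(pooled[:, 0].shape)                                     # torch.Size([2, 1152])
```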
1438
- @add_start_docstrings(
1439
- """The vision model from SigLIP without any head or projection on top.""",
1440
- SIGLIP_START_DOCSTRING,
1441
- )
1442
- class SiglipVisionModel(SiglipPreTrainedModel):
1443
- config_class = SiglipVisionConfig
1444
- main_input_name = "pixel_values"
1445
-
1446
- def __init__(self, config: SiglipVisionConfig):
1447
- super().__init__(config)
1448
-
1449
- self.vision_model = SiglipVisionTransformer(config)
1450
-
1451
- # Initialize weights and apply final processing
1452
- self.post_init()
1453
-
1454
- def get_input_embeddings(self) -> nn.Module:
1455
- return self.vision_model.embeddings.patch_embedding
1456
-
1457
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1458
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
1459
- def forward(
1460
- self,
1461
- pixel_values,
1462
- patch_attention_mask: Optional[torch.BoolTensor] = None,
1463
- output_attentions: Optional[bool] = None,
1464
- output_hidden_states: Optional[bool] = None,
1465
- return_dict: Optional[bool] = None,
1466
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
1467
- r"""
1468
- Returns:
1469
- Examples:
1470
- ```python
1471
- >>> from PIL import Image
1472
- >>> import requests
1473
- >>> from transformers import AutoProcessor, SiglipVisionModel
1474
- >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
1475
- >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1476
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1477
- >>> image = Image.open(requests.get(url, stream=True).raw)
1478
- >>> inputs = processor(images=image, return_tensors="pt")
1479
- >>> outputs = model(**inputs)
1480
- >>> last_hidden_state = outputs.last_hidden_state
1481
- >>> pooled_output = outputs.pooler_output # pooled features
1482
- ```"""
1483
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1484
-
1485
- return self.vision_model(
1486
- pixel_values=pixel_values,
1487
- patch_attention_mask=patch_attention_mask,
1488
- output_attentions=output_attentions,
1489
- output_hidden_states=output_hidden_states,
1490
- return_dict=return_dict,
1491
- )
1492
-
1493
-
1494
- @add_start_docstrings(SIGLIP_START_DOCSTRING)
1495
- class SiglipModel(SiglipPreTrainedModel):
1496
- config_class = SiglipConfig
1497
-
1498
- def __init__(self, config: SiglipConfig):
1499
- super().__init__(config)
1500
-
1501
- if not isinstance(config.text_config, SiglipTextConfig):
1502
- raise ValueError(
1503
- "config.text_config is expected to be of type SiglipTextConfig but is of type"
1504
- f" {type(config.text_config)}."
1505
- )
1506
-
1507
- if not isinstance(config.vision_config, SiglipVisionConfig):
1508
- raise ValueError(
1509
- "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
1510
- f" {type(config.vision_config)}."
1511
- )
1512
-
1513
- text_config = config.text_config
1514
- vision_config = config.vision_config
1515
-
1516
- self.text_model = SiglipTextTransformer(text_config)
1517
- self.vision_model = SiglipVisionTransformer(vision_config)
1518
-
1519
- self.logit_scale = nn.Parameter(torch.randn(1))
1520
- self.logit_bias = nn.Parameter(torch.randn(1))
1521
-
1522
- # Initialize weights and apply final processing
1523
- self.post_init()
1524
-
1525
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
1526
- def get_text_features(
1527
- self,
1528
- input_ids: Optional[torch.Tensor] = None,
1529
- attention_mask: Optional[torch.Tensor] = None,
1530
- position_ids: Optional[torch.Tensor] = None,
1531
- output_attentions: Optional[bool] = None,
1532
- output_hidden_states: Optional[bool] = None,
1533
- return_dict: Optional[bool] = None,
1534
- ) -> torch.FloatTensor:
1535
- r"""
1536
- Returns:
1537
- text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1538
- applying the projection layer to the pooled output of [`SiglipTextModel`].
1539
- Examples:
1540
- ```python
1541
- >>> from transformers import AutoTokenizer, AutoModel
1542
- >>> import torch
1543
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1544
- >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
1545
- >>> # important: make sure to set padding="max_length" as that's how the model was trained
1546
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
1547
- >>> with torch.no_grad():
1548
- ... text_features = model.get_text_features(**inputs)
1549
- ```"""
1550
- # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1551
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1552
- output_hidden_states = (
1553
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1554
- )
1555
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1556
-
1557
- text_outputs = self.text_model(
1558
- input_ids=input_ids,
1559
- attention_mask=attention_mask,
1560
- position_ids=position_ids,
1561
- output_attentions=output_attentions,
1562
- output_hidden_states=output_hidden_states,
1563
- return_dict=return_dict,
1564
- )
1565
-
1566
- pooled_output = text_outputs[1]
1567
-
1568
- return pooled_output
1569
-
1570
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1571
- def get_image_features(
1572
- self,
1573
- pixel_values: Optional[torch.FloatTensor] = None,
1574
- output_attentions: Optional[bool] = None,
1575
- output_hidden_states: Optional[bool] = None,
1576
- return_dict: Optional[bool] = None,
1577
- ) -> torch.FloatTensor:
1578
- r"""
1579
- Returns:
1580
- image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1581
- applying the projection layer to the pooled output of [`SiglipVisionModel`].
1582
- Examples:
1583
- ```python
1584
- >>> from PIL import Image
1585
- >>> import requests
1586
- >>> from transformers import AutoProcessor, AutoModel
1587
- >>> import torch
1588
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1589
- >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1590
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1591
- >>> image = Image.open(requests.get(url, stream=True).raw)
1592
- >>> inputs = processor(images=image, return_tensors="pt")
1593
- >>> with torch.no_grad():
1594
- ... image_features = model.get_image_features(**inputs)
1595
- ```"""
1596
- # Use SiglipModel's config for some fields (if specified) instead of those of vision & text components.
1597
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1598
- output_hidden_states = (
1599
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1600
- )
1601
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1602
-
1603
- vision_outputs = self.vision_model(
1604
- pixel_values=pixel_values,
1605
- output_attentions=output_attentions,
1606
- output_hidden_states=output_hidden_states,
1607
- return_dict=return_dict,
1608
- )
1609
-
1610
- pooled_output = vision_outputs[1]
1611
-
1612
- return pooled_output
1613
-
1614
- @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
1615
- @replace_return_docstrings(output_type=SiglipOutput, config_class=SiglipConfig)
1616
- def forward(
1617
- self,
1618
- input_ids: Optional[torch.LongTensor] = None,
1619
- pixel_values: Optional[torch.FloatTensor] = None,
1620
- attention_mask: Optional[torch.Tensor] = None,
1621
- position_ids: Optional[torch.LongTensor] = None,
1622
- return_loss: Optional[bool] = None,
1623
- output_attentions: Optional[bool] = None,
1624
- output_hidden_states: Optional[bool] = None,
1625
- return_dict: Optional[bool] = None,
1626
- ) -> Union[Tuple, SiglipOutput]:
1627
- r"""
1628
- Returns:
1629
- Examples:
1630
- ```python
1631
- >>> from PIL import Image
1632
- >>> import requests
1633
- >>> from transformers import AutoProcessor, AutoModel
1634
- >>> import torch
1635
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1636
- >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1637
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1638
- >>> image = Image.open(requests.get(url, stream=True).raw)
1639
- >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
1640
- >>> # important: we pass `padding=max_length` since the model was trained with this
1641
- >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
1642
- >>> with torch.no_grad():
1643
- ... outputs = model(**inputs)
1644
- >>> logits_per_image = outputs.logits_per_image
1645
- >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
1646
- >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
1647
- 31.9% that image 0 is 'a photo of 2 cats'
1648
- ```"""
1649
- # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1650
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1651
- output_hidden_states = (
1652
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1653
- )
1654
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1655
-
1656
- vision_outputs = self.vision_model(
1657
- pixel_values=pixel_values,
1658
- output_attentions=output_attentions,
1659
- output_hidden_states=output_hidden_states,
1660
- return_dict=return_dict,
1661
- )
1662
-
1663
- text_outputs = self.text_model(
1664
- input_ids=input_ids,
1665
- attention_mask=attention_mask,
1666
- position_ids=position_ids,
1667
- output_attentions=output_attentions,
1668
- output_hidden_states=output_hidden_states,
1669
- return_dict=return_dict,
1670
- )
1671
-
1672
- image_embeds = vision_outputs[1]
1673
- text_embeds = text_outputs[1]
1674
-
1675
- # normalized features
1676
- image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
1677
- text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
1678
-
1679
- # cosine similarity as logits
1680
- logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale.exp() + self.logit_bias
1681
- logits_per_image = logits_per_text.t()
1682
-
1683
- loss = None
1684
- if return_loss:
1685
- raise NotImplementedError("SigLIP loss to be implemented")
1686
-
1687
- if not return_dict:
1688
- output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
1689
- return ((loss,) + output) if loss is not None else output
1690
-
1691
- return SiglipOutput(
1692
- loss=loss,
1693
- logits_per_image=logits_per_image,
1694
- logits_per_text=logits_per_text,
1695
- text_embeds=text_embeds,
1696
- image_embeds=image_embeds,
1697
- text_model_output=text_outputs,
1698
- vision_model_output=vision_outputs,
1699
- )
1700
-
1701
-
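Unlike CLIP's batch-level softmax, the scoring above treats every (text, image) pair independently: the cosine similarity is scaled, shifted by a learned bias, and pushed through a sigmoid. A toy recomputation of that rule (the scale and bias values are assumptions, not the trained parameters):

```python
import torch
import torch.nn.functional as F

text_embeds = F.normalize(torch.randn(4, 768), dim=-1)
image_embeds = F.normalize(torch.randn(4, 768), dim=-1)
logit_scale, logit_bias = torch.tensor(2.3), torch.tensor(-10.0)  # assumed values

logits_per_text = text_embeds @ image_embeds.t() * logit_scale.exp() + logit_bias
probs = torch.sigmoid(logits_per_text)   # each pair scored independently in [0, 1]
print(probs.shape)                        # torch.Size([4, 4])
```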
1702
- def get_siglip_vision_model(_flash_attn_2_enabled=True, **kwargs):
1703
- siglip_vision_config = {
1704
- "hidden_size": 1152,
1705
- "image_size": 448,
1706
- "intermediate_size": 4304,
1707
- "model_type": "siglip_vision_model",
1708
- "num_attention_heads": 16,
1709
- "num_hidden_layers": 27,
1710
- "patch_size": 14,
1711
- }
1712
-
1713
- model_config = SiglipVisionConfig(**siglip_vision_config, _flash_attn_2_enabled=_flash_attn_2_enabled, **kwargs)
1714
-
1715
- vision_model = SiglipVisionModel(model_config).vision_model
1716
-
1717
- return vision_model
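A hypothetical call to the helper above, assuming it and the Siglip classes defined in this file are in scope; with `image_size=448` and `patch_size=14` the encoder sees a 32x32 patch grid:

```python
import torch

# assumes get_siglip_vision_model (defined above) is importable from this module
vision_model = get_siglip_vision_model(_flash_attn_2_enabled=False).eval()

pixel_values = torch.randn(1, 3, 448, 448)
with torch.no_grad():
    outputs = vision_model(pixel_values)
print(outputs.last_hidden_state.shape)  # expected torch.Size([1, 1024, 1152]): (448 / 14) ** 2 patches
```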
vocab.json CHANGED
The diff for this file is too large to render. See raw diff