Upload 3 files
Fixing the AutoTokenizer issue, changing docstrings
- config.json +1 -4
- configuration_phi3_small.py +78 -40
- modeling_phi3_small.py +4 -4
config.json
CHANGED

@@ -7,10 +7,7 @@
   "auto_map": {
     "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
     "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
-    "AutoTokenizer": [
-      "tokenization_phi3_small.Phi3SmallTokenizer",
-      "tokenization_phi3_small.Phi3SmallTokenizer"
-    ]
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
   },
   "blocksparse_block_size": 64,
   "blocksparse_homo_head_pattern": false,
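The `auto_map` block is what the `Auto*` classes consult when a repository is loaded with `trust_remote_code=True`; with `AutoTokenizer` now pointing at a single `tokenization_phi3_small.Phi3SmallTokenizer` reference, tokenizer loading can resolve the custom class shipped in the repo. A minimal loading sketch; the repository id below is a placeholder for illustration, not something named in this commit:

```python
from transformers import AutoConfig, AutoTokenizer

# Placeholder repo id; substitute the actual Phi-3-small checkpoint you are using.
repo_id = "microsoft/Phi-3-small-8k-instruct"

# trust_remote_code=True lets transformers import configuration_phi3_small.py and
# tokenization_phi3_small.py from the repository, following the "auto_map" entries
# in config.json shown above.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

print(type(tokenizer).__name__)  # expected: Phi3SmallTokenizer
```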
configuration_phi3_small.py
CHANGED

@@ -29,49 +29,89 @@ def next_mult(x, y):
 
 class Phi3SmallConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a
-    instantiate a
-    configuration with the defaults will yield a similar configuration to that of the
-    [
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (`int`, *optional*, defaults to
-            Vocabulary size of the
-            `inputs_ids` passed when calling
-
-            The maximum sequence length that this model might
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Relative Position Encoding) embedding.
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of all dense attention layers in the model
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon
+            The epsilon value for layer normalization.
         initializer_range (`float`, *optional*, defaults to 0.02):
-            The
-
-            Whether
-
-
-
-
-
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`bool`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`bool`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`bool`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.
 
     Example:
 

@@ -86,7 +126,8 @@ class Phi3SmallConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```
+    ```
+    """
 
     model_type = "phi3small"
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -113,7 +154,7 @@ class Phi3SmallConfig(PretrainedConfig):
         gegelu_pad_to_256: bool = True,
         ff_dim_multiplier: Optional[int] = None,
         ff_intermediate_size: Optional[int] = 14336,
-        # Block Sparse Attention
+        # Block Sparse Attention Parameters
         blocksparse_homo_head_pattern: bool = False,
         blocksparse_block_size: int = 64,
         blocksparse_num_local_blocks: int = 16,

@@ -161,7 +202,6 @@ class Phi3SmallConfig(PretrainedConfig):
         self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
         # Frequency of block sparsity
         self.dense_attention_every_n_layers = dense_attention_every_n_layers
-
         # Activation function
         self.hidden_act = hidden_act
         self.gegelu_limit = gegelu_limit

@@ -176,10 +216,8 @@ class Phi3SmallConfig(PretrainedConfig):
         self.embedding_dropout_prob = embedding_dropout_prob
         self.attention_dropout_prob = attention_dropout_prob
         self.ffn_dropout_prob = ffn_dropout_prob
-
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-
         # MuP parameters
         self.mup_use_scaling = mup_use_scaling
         self.mup_width_multiplier = mup_width_multiplier
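The expanded docstring also pins down the block-sparse attention geometry: the local window equals `blocksparse_num_local_blocks * blocksparse_block_size` tokens, and a dense attention layer appears every `dense_attention_every_n_layers` layers. A short sketch of that arithmetic using the documented defaults; the values are hard-coded here for illustration rather than read from an actual `Phi3SmallConfig` instance:

```python
# Defaults documented in the new docstring above.
blocksparse_block_size = 64
blocksparse_num_local_blocks = 16
dense_attention_every_n_layers = 2
num_hidden_layers = 32

# Local window attended to by each query in the block-sparse layers:
# 16 blocks * 64 tokens per block = 1024 tokens.
local_window_tokens = blocksparse_num_local_blocks * blocksparse_block_size
print(local_window_tokens)  # 1024

# Assuming "every n layers" means every 2nd layer is dense (as the field name
# suggests), 16 of the default 32 layers use dense attention and 16 use the
# block-sparse pattern.
num_dense_layers = num_hidden_layers // dense_attention_every_n_layers
print(num_dense_layers, num_hidden_layers - num_dense_layers)  # 16 16
```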
modeling_phi3_small.py
CHANGED

@@ -155,7 +155,7 @@ class Phi3SmallMLP(nn.Module):
     def __init__(self, config: Phi3SmallConfig):
         super().__init__()
         self.config = config
-        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the
+        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the Phi-3-small model .."
         self.hidden_size = config.hidden_size
         self.gegelu_limit = config.gegelu_limit
         self.intermediate_size = config.intermediate_size

@@ -415,7 +415,7 @@ class Phi3SmallSelfAttention(nn.Module):
 
         .. note::
             Right now, am assuming the expansion for the query key values is already done
-            outside. But ideally, since Flash attention handles the
+            outside. But ideally, since Flash attention handles the GQA correctly, we can
             avoid doing that.
 
         """

@@ -496,11 +496,11 @@ class Phi3SmallSelfAttention(nn.Module):
             torch.Tensor: The expanded key-value tensor of shape (bsz, nqp, 2, seq_len, hdim).
                 Where nqp = num_q_per_kv * nkp
 
-        .. note::
+        .. note(bapatra)::
             Right now, I am using a repeat_interleave to expand the kv to the size of q.
             This incurs a memory penalty, since the tensors are actually copied.
             TODO: If this does yield benefits, then potentially we can use the re-written
-            flash attention kernel that can handle
+            flash attention kernel that can handle GQA.
         """
 
         repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
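Both docstring completions concern grouped-query attention (GQA): the key/value heads are currently expanded to the query-head count with `repeat_interleave` before attention, which copies memory, whereas a Flash Attention kernel with native GQA support would make that expansion unnecessary. A self-contained sketch of the expansion the note describes, with made-up shapes; this is an illustrative standalone snippet, not the repository's actual method:

```python
import torch

# Hypothetical sizes for illustration: 2 KV heads, each shared by 4 query heads.
bsz, nkp, seq_len, hdim = 1, 2, 8, 16
num_q_per_kv = 4

# Keys and values packed together, shape (bsz, nkp, 2, seq_len, hdim).
kv = torch.randn(bsz, nkp, 2, seq_len, hdim)

# Repeat each KV head num_q_per_kv times along the head dimension, mirroring the
# repeat_interleave call in the diff. The copy is the memory penalty the note mentions.
repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
expanded_kv = torch.repeat_interleave(kv, repeats, dim=1)

# nqp = num_q_per_kv * nkp, matching the documented output shape.
assert expanded_kv.shape == (bsz, num_q_per_kv * nkp, 2, seq_len, hdim)
```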