Upload 3 files
Fixing the AutoTokenizer issue, changing docstrings
- config.json +1 -4
- configuration_phi3_small.py +78 -40
- modeling_phi3_small.py +4 -4
config.json
CHANGED

@@ -7,10 +7,7 @@
   "auto_map": {
     "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
     "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
-    "AutoTokenizer": [
-      "tokenization_phi3_small.Phi3SmallTokenizer",
-      "tokenization_phi3_small.Phi3SmallTokenizer"
-    ]
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
   },
   "blocksparse_block_size": 64,
   "blocksparse_homo_head_pattern": false,
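The `auto_map` block is what the `Auto*` classes consult when a repository is loaded with `trust_remote_code=True`; with `AutoTokenizer` now pointing at a single `tokenization_phi3_small.Phi3SmallTokenizer` reference, tokenizer loading can resolve the custom class shipped in the repo. A minimal loading sketch; the repository id below is a placeholder for illustration, not something named in this commit:

```python
from transformers import AutoConfig, AutoTokenizer

# Placeholder repo id; substitute the actual Phi-3-small checkpoint you are using.
repo_id = "microsoft/Phi-3-small-8k-instruct"

# trust_remote_code=True lets transformers import configuration_phi3_small.py and
# tokenization_phi3_small.py from the repository, following the "auto_map" entries
# in config.json shown above.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

print(type(tokenizer).__name__)  # expected: Phi3SmallTokenizer
```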
configuration_phi3_small.py
CHANGED

@@ -29,49 +29,89 @@ def next_mult(x, y):
 
 class Phi3SmallConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a
-    instantiate a
-    configuration with the defaults will yield a similar configuration to that of the
-    [
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (`int`, *optional*, defaults to
-            Vocabulary size of the
-            `inputs_ids` passed when calling
-
-            The maximum sequence length that this model might
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Relative Position Encoding) embedding.
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of all dense attention layers in the model
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon
+            The epsilon value for layer normalization.
         initializer_range (`float`, *optional*, defaults to 0.02):
-            The
-
-            Whether
-
-
-
-
-
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`bool`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`bool`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`bool`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.
 
     Example:
 

@@ -86,7 +126,8 @@ class Phi3SmallConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```
+    ```
+    """
 
     model_type = "phi3small"
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -113,7 +154,7 @@ class Phi3SmallConfig(PretrainedConfig):
         gegelu_pad_to_256: bool = True,
         ff_dim_multiplier: Optional[int] = None,
         ff_intermediate_size: Optional[int] = 14336,
-        # Block Sparse Attention
+        # Block Sparse Attention Parameters
         blocksparse_homo_head_pattern: bool = False,
         blocksparse_block_size: int = 64,
         blocksparse_num_local_blocks: int = 16,

@@ -161,7 +202,6 @@ class Phi3SmallConfig(PretrainedConfig):
         self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
         # Frequency of block sparsity
         self.dense_attention_every_n_layers = dense_attention_every_n_layers
-
         # Activation function
         self.hidden_act = hidden_act
         self.gegelu_limit = gegelu_limit

@@ -176,10 +216,8 @@ class Phi3SmallConfig(PretrainedConfig):
         self.embedding_dropout_prob = embedding_dropout_prob
         self.attention_dropout_prob = attention_dropout_prob
         self.ffn_dropout_prob = ffn_dropout_prob
-
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-
         # MuP parameters
         self.mup_use_scaling = mup_use_scaling
         self.mup_width_multiplier = mup_width_multiplier
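The expanded docstring also pins down the block-sparse attention geometry: the local window equals `blocksparse_num_local_blocks * blocksparse_block_size` tokens, and a dense attention layer appears every `dense_attention_every_n_layers` layers. A short sketch of that arithmetic using the documented defaults; the values are hard-coded here for illustration rather than read from an actual `Phi3SmallConfig` instance:

```python
# Defaults documented in the new docstring above.
blocksparse_block_size = 64
blocksparse_num_local_blocks = 16
dense_attention_every_n_layers = 2
num_hidden_layers = 32

# Local window attended to by each query in the block-sparse layers:
# 16 blocks * 64 tokens per block = 1024 tokens.
local_window_tokens = blocksparse_num_local_blocks * blocksparse_block_size
print(local_window_tokens)  # 1024

# Assuming "every n layers" means every 2nd layer is dense (as the field name
# suggests), 16 of the default 32 layers use dense attention and 16 use the
# block-sparse pattern.
num_dense_layers = num_hidden_layers // dense_attention_every_n_layers
print(num_dense_layers, num_hidden_layers - num_dense_layers)  # 16 16
```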
modeling_phi3_small.py
CHANGED

@@ -155,7 +155,7 @@ class Phi3SmallMLP(nn.Module):
     def __init__(self, config: Phi3SmallConfig):
         super().__init__()
         self.config = config
-        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the
+        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the Phi-3-small model .."
         self.hidden_size = config.hidden_size
         self.gegelu_limit = config.gegelu_limit
         self.intermediate_size = config.intermediate_size

@@ -415,7 +415,7 @@ class Phi3SmallSelfAttention(nn.Module):
 
         .. note::
             Right now, am assuming the expansion for the query key values is already done
-            outside. But ideally, since Flash attention handles the
+            outside. But ideally, since Flash attention handles the GQA correctly, we can
             avoid doing that.
 
         """

@@ -496,11 +496,11 @@ class Phi3SmallSelfAttention(nn.Module):
             torch.Tensor: The expanded key-value tensor of shape (bsz, nqp, 2, seq_len, hdim).
                 Where nqp = num_q_per_kv * nkp
 
-        .. note::
+        .. note(bapatra)::
             Right now, I am using a repeat_interleave to expand the kv to the size of q.
             This incurs a memory penalty, since the tensors are actually copied.
             TODO: If this does yield benefits, then potentially we can use the re-written
-            flash attention kernel that can handle
+            flash attention kernel that can handle GQA.
         """
 
         repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
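Both docstring completions concern grouped-query attention (GQA): the key/value heads are currently expanded to the query-head count with `repeat_interleave` before attention, which copies memory, whereas a Flash Attention kernel with native GQA support would make that expansion unnecessary. A self-contained sketch of the expansion the note describes, with made-up shapes; this is an illustrative standalone snippet, not the repository's actual method:

```python
import torch

# Hypothetical sizes for illustration: 2 KV heads, each shared by 4 query heads.
bsz, nkp, seq_len, hdim = 1, 2, 8, 16
num_q_per_kv = 4

# Keys and values packed together, shape (bsz, nkp, 2, seq_len, hdim).
kv = torch.randn(bsz, nkp, 2, seq_len, hdim)

# Repeat each KV head num_q_per_kv times along the head dimension, mirroring the
# repeat_interleave call in the diff. The copy is the memory penalty the note mentions.
repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
expanded_kv = torch.repeat_interleave(kv, repeats, dim=1)

# nqp = num_q_per_kv * nkp, matching the documented output shape.
assert expanded_kv.shape == (bsz, num_q_per_kv * nkp, 2, seq_len, hdim)
```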