MeowFET committed
Commit ac14fde · 1 Parent(s): 3227ceb

feat: update model names, make vocab_size divisible to 128

Files changed:
- config.json +7 -7
- configuration_minicpm.py → configuration_cauchy.py +16 -16
- modeling_minicpm.py → modeling_cauchy.py +69 -69
- pytorch_model.bin +2 -2
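
Rounding the vocabulary up to a multiple of 128 (here 151680 = 1185 × 128) is a common throughput tweak: the embedding and lm_head matmuls tile more evenly on GPU tensor cores. As a hedged sketch only (placeholder repo id, not necessarily how this checkpoint was produced), the stock `resize_token_embeddings` API can apply this kind of padding:

```python
# Sketch only: pad an existing causal LM's vocabulary to a multiple of 128.
# "your-org/cauchy-checkpoint" is a placeholder repo id, not the actual repository.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "your-org/cauchy-checkpoint"
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo)

# Round the embedding matrix (and the tied lm_head) up to the next multiple of 128;
# the extra rows are padding entries that the tokenizer never emits.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)
print(model.config.vocab_size)  # e.g. 151680 == 1185 * 128

model.save_pretrained("cauchy-vocab-padded")
```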
config.json
CHANGED
@@ -3,16 +3,16 @@
   "_ori_bos_token_id": 1,
   "_ori_eos_token_id": 2,
   "architectures": [
-    "MiniCPMForCausalLM"
+    "CauchyForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
-    "AutoModel": "modeling_minicpm.MiniCPMModel",
-    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
-    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
-    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
+    "AutoConfig": "configuration_cauchy.CauchyConfig",
+    "AutoModel": "modeling_cauchy.CauchyModel",
+    "AutoModelForCausalLM": "modeling_cauchy.CauchyForCausalLM",
+    "AutoModelForSeq2SeqLM": "modeling_cauchy.CauchyForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_cauchy.CauchyForSequenceClassification"
   },
   "bos_token_id": 151643,
   "dim_model_base": 256,
@@ -37,5 +37,5 @@
   "torch_dtype": "bfloat16",
   "transformers_version": "4.43.3",
   "use_cache": true,
-  "vocab_size":
+  "vocab_size": 151680
 }
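
With the `auto_map` entries above pointing at the renamed modules, the checkpoint should resolve to the Cauchy classes through the Auto API when remote code is trusted. A minimal sketch, assuming a placeholder repo id:

```python
# Minimal sketch: auto_map routes the Auto classes to the renamed Cauchy code
# when trust_remote_code=True. "your-org/cauchy-checkpoint" is a placeholder id.
from transformers import AutoConfig, AutoModelForCausalLM

repo = "your-org/cauchy-checkpoint"

config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)  # CauchyConfig, served from configuration_cauchy.py

model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True, torch_dtype="auto")
print(type(model).__name__)   # CauchyForCausalLM, served from modeling_cauchy.py
```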
configuration_minicpm.py → configuration_cauchy.py
RENAMED
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" MiniCPM model configuration"""
+""" Cauchy model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
@@ -25,14 +25,14 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+CAUCHY_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
 
-class MiniCPMConfig(PretrainedConfig):
+class CauchyConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`MiniCPMModel`]. It is used to instantiate an MiniCPM
+    This is the configuration class to store the configuration of a [`CauchyModel`]. It is used to instantiate an Cauchy
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the MiniCPM-7B.
+    defaults will yield a similar configuration to that of the Cauchy-7B.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -40,8 +40,8 @@ class MiniCPMConfig(PretrainedConfig):
 
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`MiniCPMModel`]
+            Vocabulary size of the Cauchy model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CauchyModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -61,8 +61,8 @@ class MiniCPMConfig(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
-            MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
+            The maximum sequence length that this model might ever be used with. Cauchy 1 supports up to 2048 tokens,
+            Cauchy 2 up to 4096, CodeCauchy up to 16384.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
@@ -91,7 +91,7 @@ class MiniCPMConfig(PretrainedConfig):
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
-            https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            https://www.reddit.com/r/LocalCauchy/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
@@ -99,19 +99,19 @@ class MiniCPMConfig(PretrainedConfig):
            The dropout ratio for the attention probabilities.
 
    ```python
-    >>> from transformers import MiniCPMModel, MiniCPMConfig
+    >>> from transformers import CauchyModel, CauchyConfig
 
-    >>> # Initializing a MiniCPM minicpm-7b style configuration
-    >>> configuration = MiniCPMConfig()
+    >>> # Initializing a Cauchy cauchy-7b style configuration
+    >>> configuration = CauchyConfig()
 
-    >>> # Initializing a model from the minicpm-7b style configuration
-    >>> model = MiniCPMModel(configuration)
+    >>> # Initializing a model from the cauchy-7b style configuration
+    >>> model = CauchyModel(configuration)
 
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
 
-    model_type = "minicpm"
+    model_type = "cauchy"
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
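
For reference, the docstring above specifies `rope_scaling` as a dict of the form `{"type": ..., "factor": ...}`, with the `linear` and `dynamic` strategies handled in `_init_rope`. A small illustrative sketch of building the renamed config directly (values are examples, not this checkpoint's settings; it assumes `configuration_cauchy.py` sits in the working directory):

```python
# Illustrative only: the values below are examples, not this checkpoint's settings,
# and configuration_cauchy.py is assumed to be importable from the current directory.
from configuration_cauchy import CauchyConfig

config = CauchyConfig(
    vocab_size=151680,                                # padded to a multiple of 128, as in config.json
    hidden_size=4096,
    intermediate_size=11008,
    rope_scaling={"type": "dynamic", "factor": 2.0},  # {"type": ..., "factor": ...} per the docstring
)
print(config.model_type)  # "cauchy"
```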
modeling_minicpm.py → modeling_cauchy.py
RENAMED
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch MiniCPM model."""
+""" PyTorch Cauchy model."""
 import math
 import warnings
 from typing import List, Optional, Tuple, Union, Dict
@@ -48,7 +48,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_minicpm import MiniCPMConfig
+from .configuration_cauchy import CauchyConfig
 import re
 
 try:
@@ -69,7 +69,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "MiniCPMConfig"
+_CONFIG_FOR_DOC = "CauchyConfig"
 
 
 def _get_unpad_data(attention_mask):
@@ -86,7 +86,7 @@ def _get_unpad_data(attention_mask):
 
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     warnings.warn(
-        "Calling `transformers.models.minicpm.modeling_minicpm._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
+        "Calling `transformers.models.cauchy.modeling_cauchy._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
     )
     return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
 
@@ -95,7 +95,7 @@ def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
     warnings.warn(
-        "Calling `transformers.models.minicpm.modeling_minicpm._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.minicpm.modeling_minicpm.AttentionMaskConverter._make_causal_mask"
+        "Calling `transformers.models.cauchy.modeling_cauchy._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.cauchy.modeling_cauchy.AttentionMaskConverter._make_causal_mask"
     )
     return AttentionMaskConverter._make_causal_mask(
         input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
@@ -110,10 +110,10 @@ def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
     return hidden * weight
 
 
-class MiniCPMRMSNorm(nn.Module):
+class CauchyRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        MiniCPMRMSNorm is equivalent to T5LayerNorm
+        CauchyRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -123,10 +123,10 @@ class MiniCPMRMSNorm(nn.Module):
         return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
 
 
-ALL_LAYERNORM_LAYERS.append(MiniCPMRMSNorm)
+ALL_LAYERNORM_LAYERS.append(CauchyRMSNorm)
 
 
-class MiniCPMRotaryEmbedding(nn.Module):
+class CauchyRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -163,8 +163,8 @@ class MiniCPMRotaryEmbedding(nn.Module):
         )
 
 
-class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
-    """MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+class CauchyLinearScalingRotaryEmbedding(CauchyRotaryEmbedding):
+    """CauchyRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         self.scaling_factor = scaling_factor
@@ -182,8 +182,8 @@ class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
-    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+class CauchyDynamicNTKScalingRotaryEmbedding(CauchyRotaryEmbedding):
+    """CauchyRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         self.scaling_factor = scaling_factor
@@ -250,7 +250,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
     return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
 
-class MiniCPMMLP(nn.Module):
+class CauchyMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -297,10 +297,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 
-class MiniCPMAttention(nn.Module):
+class CauchyAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: CauchyConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -335,12 +335,12 @@ class MiniCPMAttention(nn.Module):
         self._init_rope()
 
         if self.qk_norm:
-            self.q_norm = MiniCPMRMSNorm(self.head_dim, eps=config.rms_norm_eps)
-            self.k_norm = MiniCPMRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.q_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.k_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
-            self.rotary_emb = MiniCPMRotaryEmbedding(
+            self.rotary_emb = CauchyRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -349,14 +349,14 @@ class MiniCPMAttention(nn.Module):
             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             if scaling_type == "linear":
-                self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
+                self.rotary_emb = CauchyLinearScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
                     base=self.rope_theta,
                 )
             elif scaling_type == "dynamic":
-                self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding(
+                self.rotary_emb = CauchyDynamicNTKScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
@@ -477,9 +477,9 @@ class MiniCPMAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class MiniCPMFlashAttention2(MiniCPMAttention):
+class CauchyFlashAttention2(CauchyAttention):
     """
-    MiniCPM flash attention module. This module inherits from `MiniCPMAttention` as the weights of the module stays
+    Cauchy flash attention module. This module inherits from `CauchyAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -502,7 +502,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # MiniCPMFlashAttention2 attention does not support output_attentions
+        # CauchyFlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -552,7 +552,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in the correct dtype just to be sure everything works as expected.
         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (MiniCPMRMSNorm handles it correctly)
+        # in fp32. (CauchyRMSNorm handles it correctly)
 
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
@@ -609,7 +609,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in MiniCPMFlashAttention2 __init__.
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in CauchyFlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
@@ -680,14 +680,14 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         )
 
 
-class MiniCPMSdpaAttention(MiniCPMAttention):
+class CauchySdpaAttention(CauchyAttention):
     """
-    MiniCPM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `MiniCPMAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    Cauchy attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `CauchyAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
 
-    # Adapted from MiniCPMAttention.forward
+    # Adapted from CauchyAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -700,7 +700,7 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "MiniCPMModel is using MiniCPMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "CauchyModel is using CauchySdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -771,22 +771,22 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
         return attn_output, None, past_key_value
 
 
-MINICPM_ATTENTION_CLASSES = {
-    "eager": MiniCPMAttention,
-    "flash_attention_2": MiniCPMFlashAttention2,
-    "sdpa": MiniCPMSdpaAttention,
+CAUCHY_ATTENTION_CLASSES = {
+    "eager": CauchyAttention,
+    "flash_attention_2": CauchyFlashAttention2,
+    "sdpa": CauchySdpaAttention,
 }
 
 
-class MiniCPMDecoderLayer(nn.Module):
-    def __init__(self, config: MiniCPMConfig, layer_idx: int):
+class CauchyDecoderLayer(nn.Module):
+    def __init__(self, config: CauchyConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+        self.self_attn = CAUCHY_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
 
-        self.mlp = MiniCPMMLP(config)
-        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = CauchyMLP(config)
+        self.input_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.scale_depth = config.scale_depth
         self.num_hidden_layers = config.num_hidden_layers
@@ -853,7 +853,7 @@ class MiniCPMDecoderLayer(nn.Module):
         return outputs
 
 
-MINICPM_START_DOCSTRING = r"""
+CAUCHY_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -863,7 +863,7 @@ MINICPM_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`MiniCPMConfig`]):
+        config ([`CauchyConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -871,14 +871,14 @@ MINICPM_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
-    MINICPM_START_DOCSTRING,
+    "The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMPreTrainedModel(PreTrainedModel):
-    config_class = MiniCPMConfig
+class CauchyPreTrainedModel(PreTrainedModel):
+    config_class = CauchyConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["MiniCPMDecoderLayer"]
+    _no_split_modules = ["CauchyDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -896,7 +896,7 @@ class MiniCPMPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-MINICPM_INPUTS_DOCSTRING = r"""
+CAUCHY_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -967,30 +967,30 @@ MINICPM_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
-    MINICPM_START_DOCSTRING,
+    "The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMModel(MiniCPMPreTrainedModel):
+class CauchyModel(CauchyPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CauchyDecoderLayer`]
 
     Args:
-        config: MiniCPMConfig
+        config: CauchyConfig
     """
 
-    def __init__(self, config: MiniCPMConfig):
+    def __init__(self, config: CauchyConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [CauchyDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._use_sdpa = config._attn_implementation == "sdpa"
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
-        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1002,7 +1002,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1135,12 +1135,12 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         )
 
 
-class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
+class CauchyForCausalLM(CauchyPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = MiniCPMModel(config)
+        self.model = CauchyModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1165,7 +1165,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1192,9 +1192,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
+        >>> from transformers import AutoTokenizer, CauchyForCausalLM
 
-        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = CauchyForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1354,9 +1354,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
 
 @add_start_docstrings(
     """
-    The MiniCPM Model transformer with a sequence classification head on top (linear layer).
+    The Cauchy Model transformer with a sequence classification head on top (linear layer).
 
-    [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`CauchyForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1365,13 +1365,13 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    MINICPM_START_DOCSTRING,
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
+class CauchyForSequenceClassification(CauchyPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = MiniCPMModel(config)
+        self.model = CauchyModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1383,7 +1383,7 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
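
Since `CAUCHY_ATTENTION_CLASSES` keys the attention variant off `config._attn_implementation`, the backend can be chosen at load time. A hedged sketch with a placeholder repo id (`flash_attention_2` additionally requires the `flash-attn` package and a supported GPU):

```python
# Sketch: pick the attention backend that CauchyDecoderLayer instantiates via
# CAUCHY_ATTENTION_CLASSES. "your-org/cauchy-checkpoint" is a placeholder id.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "your-org/cauchy-checkpoint",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",  # one of "eager", "sdpa", "flash_attention_2"
)
print(model.config._attn_implementation)  # "sdpa"
```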
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c23ecc5e0665c45154097ff165e98e769e9be180e7bec074871838ebe2a415e0
+size 6220791502