MeowFET committed on
Commit ac14fde · 1 Parent(s): 3227ceb

feat: update model names, make vocab_size divisible by 128
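The new value follows from rounding the old vocabulary size up to the nearest multiple of 128 (151646 → 151680 = 1185 * 128). A minimal sketch of that rounding rule, assuming a simple stand-alone helper (the function name is illustrative, not part of this repo):

```python
def pad_vocab_size(vocab_size: int, multiple: int = 128) -> int:
    """Round vocab_size up to the nearest multiple of `multiple`."""
    return ((vocab_size + multiple - 1) // multiple) * multiple

# Values from this commit: the old size 151646 is padded to 151680 (1185 * 128).
assert pad_vocab_size(151646) == 151680
```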

config.json CHANGED
@@ -3,16 +3,16 @@
   "_ori_bos_token_id": 1,
   "_ori_eos_token_id": 2,
   "architectures": [
-    "MiniCPMForCausalLM"
+    "CauchyForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
-    "AutoModel": "modeling_minicpm.MiniCPMModel",
-    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
-    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
-    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
+    "AutoConfig": "configuration_cauchy.CauchyConfig",
+    "AutoModel": "modeling_cauchy.CauchyModel",
+    "AutoModelForCausalLM": "modeling_cauchy.CauchyForCausalLM",
+    "AutoModelForSeq2SeqLM": "modeling_cauchy.CauchyForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_cauchy.CauchyForSequenceClassification"
   },
   "bos_token_id": 151643,
   "dim_model_base": 256,
@@ -37,5 +37,5 @@
   "torch_dtype": "bfloat16",
   "transformers_version": "4.43.3",
   "use_cache": true,
-  "vocab_size": 151646
+  "vocab_size": 151680
 }
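With `architectures` and `auto_map` now pointing at the `Cauchy*` classes, the checkpoint still loads through the standard remote-code path. A hedged sketch of how the updated `auto_map` is resolved, assuming a placeholder repo path (not taken from this commit) and `trust_remote_code=True`:

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo = "path/to/this-checkpoint"  # placeholder: local clone or Hub id of this repo

# auto_map routes AutoConfig to configuration_cauchy.CauchyConfig and
# AutoModelForCausalLM to modeling_cauchy.CauchyForCausalLM; trust_remote_code
# is required because these classes live in the repo, not in transformers itself.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

assert model.config.vocab_size == 151680  # the padded size set in this commit
```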
configuration_minicpm.py → configuration_cauchy.py RENAMED
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" MiniCPM model configuration"""
+""" Cauchy model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
@@ -25,14 +25,14 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+CAUCHY_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
 
-class MiniCPMConfig(PretrainedConfig):
+class CauchyConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`MiniCPMModel`]. It is used to instantiate an MiniCPM
+    This is the configuration class to store the configuration of a [`CauchyModel`]. It is used to instantiate an Cauchy
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the MiniCPM-7B.
+    defaults will yield a similar configuration to that of the Cauchy-7B.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -40,8 +40,8 @@ class MiniCPMConfig(PretrainedConfig):
 
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`MiniCPMModel`]
+            Vocabulary size of the Cauchy model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CauchyModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -61,8 +61,8 @@ class MiniCPMConfig(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
-            MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
+            The maximum sequence length that this model might ever be used with. Cauchy 1 supports up to 2048 tokens,
+            Cauchy 2 up to 4096, CodeCauchy up to 16384.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
@@ -91,7 +91,7 @@ class MiniCPMConfig(PretrainedConfig):
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
-            https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            https://www.reddit.com/r/LocalCauchy/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
@@ -99,19 +99,19 @@ class MiniCPMConfig(PretrainedConfig):
            The dropout ratio for the attention probabilities.
 
    ```python
-    >>> from transformers import MiniCPMModel, MiniCPMConfig
+    >>> from transformers import CauchyModel, CauchyConfig
 
-    >>> # Initializing a MiniCPM minicpm-7b style configuration
-    >>> configuration = MiniCPMConfig()
+    >>> # Initializing a Cauchy cauchy-7b style configuration
+    >>> configuration = CauchyConfig()
 
-    >>> # Initializing a model from the minicpm-7b style configuration
-    >>> model = MiniCPMModel(configuration)
+    >>> # Initializing a model from the cauchy-7b style configuration
+    >>> model = CauchyModel(configuration)
 
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
 
-    model_type = "minicpm"
+    model_type = "cauchy"
    keys_to_ignore_at_inference = ["past_key_values"]
 
    def __init__(
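Since `model_type` becomes `"cauchy"`, code that imports the renamed modules directly (rather than relying on `auto_map`) can register them with the Auto classes once and then use the usual `AutoConfig`/`AutoModel` entry points. A minimal sketch, assuming `configuration_cauchy.py` and `modeling_cauchy.py` are importable from the working directory:

```python
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from configuration_cauchy import CauchyConfig
from modeling_cauchy import CauchyForCausalLM, CauchyModel

# Map the new model_type "cauchy" onto the renamed classes so that the Auto
# classes can resolve this checkpoint without trust_remote_code.
AutoConfig.register("cauchy", CauchyConfig)
AutoModel.register(CauchyConfig, CauchyModel)
AutoModelForCausalLM.register(CauchyConfig, CauchyForCausalLM)
```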
modeling_minicpm.py → modeling_cauchy.py RENAMED
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch MiniCPM model."""
+""" PyTorch Cauchy model."""
 import math
 import warnings
 from typing import List, Optional, Tuple, Union, Dict
@@ -48,7 +48,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_minicpm import MiniCPMConfig
+from .configuration_cauchy import CauchyConfig
 import re
 
 try:
@@ -69,7 +69,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "MiniCPMConfig"
+_CONFIG_FOR_DOC = "CauchyConfig"
 
 
 def _get_unpad_data(attention_mask):
@@ -86,7 +86,7 @@ def _get_unpad_data(attention_mask):
 
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     warnings.warn(
-        "Calling `transformers.models.minicpm.modeling_minicpm._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
+        "Calling `transformers.models.cauchy.modeling_cauchy._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
     )
     return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
 
@@ -95,7 +95,7 @@ def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
     warnings.warn(
-        "Calling `transformers.models.minicpm.modeling_minicpm._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.minicpm.modeling_minicpm.AttentionMaskConverter._make_causal_mask"
+        "Calling `transformers.models.cauchy.modeling_cauchy._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.cauchy.modeling_cauchy.AttentionMaskConverter._make_causal_mask"
     )
     return AttentionMaskConverter._make_causal_mask(
         input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
@@ -110,10 +110,10 @@ def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
     return hidden * weight
 
 
-class MiniCPMRMSNorm(nn.Module):
+class CauchyRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        MiniCPMRMSNorm is equivalent to T5LayerNorm
+        CauchyRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -123,10 +123,10 @@ class MiniCPMRMSNorm(nn.Module):
         return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
 
 
-ALL_LAYERNORM_LAYERS.append(MiniCPMRMSNorm)
+ALL_LAYERNORM_LAYERS.append(CauchyRMSNorm)
 
 
-class MiniCPMRotaryEmbedding(nn.Module):
+class CauchyRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -163,8 +163,8 @@ class MiniCPMRotaryEmbedding(nn.Module):
         )
 
 
-class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
-    """MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+class CauchyLinearScalingRotaryEmbedding(CauchyRotaryEmbedding):
+    """CauchyRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         self.scaling_factor = scaling_factor
@@ -182,8 +182,8 @@ class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
-    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+class CauchyDynamicNTKScalingRotaryEmbedding(CauchyRotaryEmbedding):
+    """CauchyRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         self.scaling_factor = scaling_factor
@@ -250,7 +250,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
     return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
 
-class MiniCPMMLP(nn.Module):
+class CauchyMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -297,10 +297,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 
-class MiniCPMAttention(nn.Module):
+class CauchyAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: CauchyConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -335,12 +335,12 @@ class MiniCPMAttention(nn.Module):
         self._init_rope()
 
         if self.qk_norm:
-            self.q_norm = MiniCPMRMSNorm(self.head_dim, eps=config.rms_norm_eps)
-            self.k_norm = MiniCPMRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.q_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.k_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
-            self.rotary_emb = MiniCPMRotaryEmbedding(
+            self.rotary_emb = CauchyRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -349,14 +349,14 @@ class MiniCPMAttention(nn.Module):
             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             if scaling_type == "linear":
-                self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
+                self.rotary_emb = CauchyLinearScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
                     base=self.rope_theta,
                 )
             elif scaling_type == "dynamic":
-                self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding(
+                self.rotary_emb = CauchyDynamicNTKScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
@@ -477,9 +477,9 @@ class MiniCPMAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class MiniCPMFlashAttention2(MiniCPMAttention):
+class CauchyFlashAttention2(CauchyAttention):
     """
-    MiniCPM flash attention module. This module inherits from `MiniCPMAttention` as the weights of the module stays
+    Cauchy flash attention module. This module inherits from `CauchyAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -502,7 +502,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # MiniCPMFlashAttention2 attention does not support output_attentions
+        # CauchyFlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -552,7 +552,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in the correct dtype just to be sure everything works as expected.
         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (MiniCPMRMSNorm handles it correctly)
+        # in fp32. (CauchyRMSNorm handles it correctly)
 
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
@@ -609,7 +609,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in MiniCPMFlashAttention2 __init__.
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in CauchyFlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
@@ -680,14 +680,14 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
             )
 
 
-class MiniCPMSdpaAttention(MiniCPMAttention):
+class CauchySdpaAttention(CauchyAttention):
     """
-    MiniCPM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `MiniCPMAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    Cauchy attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `CauchyAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
 
-    # Adapted from MiniCPMAttention.forward
+    # Adapted from CauchyAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -700,7 +700,7 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "MiniCPMModel is using MiniCPMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "CauchyModel is using CauchySdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -771,22 +771,22 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
         return attn_output, None, past_key_value
 
 
-MINICPM_ATTENTION_CLASSES = {
-    "eager": MiniCPMAttention,
-    "flash_attention_2": MiniCPMFlashAttention2,
-    "sdpa": MiniCPMSdpaAttention,
+CAUCHY_ATTENTION_CLASSES = {
+    "eager": CauchyAttention,
+    "flash_attention_2": CauchyFlashAttention2,
+    "sdpa": CauchySdpaAttention,
 }
 
 
-class MiniCPMDecoderLayer(nn.Module):
-    def __init__(self, config: MiniCPMConfig, layer_idx: int):
+class CauchyDecoderLayer(nn.Module):
+    def __init__(self, config: CauchyConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+        self.self_attn = CAUCHY_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
 
-        self.mlp = MiniCPMMLP(config)
-        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = CauchyMLP(config)
+        self.input_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.scale_depth = config.scale_depth
         self.num_hidden_layers = config.num_hidden_layers
@@ -853,7 +853,7 @@ class MiniCPMDecoderLayer(nn.Module):
         return outputs
 
 
-MINICPM_START_DOCSTRING = r"""
+CAUCHY_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -863,7 +863,7 @@ MINICPM_START_DOCSTRING = r"""
    and behavior.
 
    Parameters:
-        config ([`MiniCPMConfig`]):
+        config ([`CauchyConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -871,14 +871,14 @@ MINICPM_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
-    MINICPM_START_DOCSTRING,
+    "The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMPreTrainedModel(PreTrainedModel):
-    config_class = MiniCPMConfig
+class CauchyPreTrainedModel(PreTrainedModel):
+    config_class = CauchyConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["MiniCPMDecoderLayer"]
+    _no_split_modules = ["CauchyDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -896,7 +896,7 @@ class MiniCPMPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-MINICPM_INPUTS_DOCSTRING = r"""
+CAUCHY_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -967,30 +967,30 @@ MINICPM_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
-    MINICPM_START_DOCSTRING,
+    "The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMModel(MiniCPMPreTrainedModel):
+class CauchyModel(CauchyPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CauchyDecoderLayer`]
 
     Args:
-        config: MiniCPMConfig
+        config: CauchyConfig
     """
 
-    def __init__(self, config: MiniCPMConfig):
+    def __init__(self, config: CauchyConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [CauchyDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._use_sdpa = config._attn_implementation == "sdpa"
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
-        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
@@ -1002,7 +1002,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1135,12 +1135,12 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         )
 
 
-class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
+class CauchyForCausalLM(CauchyPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = MiniCPMModel(config)
+        self.model = CauchyModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1165,7 +1165,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1192,9 +1192,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
+        >>> from transformers import AutoTokenizer, CauchyForCausalLM
 
-        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = CauchyForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1354,9 +1354,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
 
 @add_start_docstrings(
     """
-    The MiniCPM Model transformer with a sequence classification head on top (linear layer).
+    The Cauchy Model transformer with a sequence classification head on top (linear layer).
 
-    [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`CauchyForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1365,13 +1365,13 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    MINICPM_START_DOCSTRING,
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
+class CauchyForSequenceClassification(CauchyPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = MiniCPMModel(config)
+        self.model = CauchyModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1383,7 +1383,7 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
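Both `CauchyModel.embed_tokens` and the `lm_head` of `CauchyForCausalLM` are sized from `config.vocab_size`, so existing weights have to be resized to match the padded vocabulary. A hedged sketch of one way to do that with the stock `transformers` API (the conversion script actually used for this commit is not part of the diff; the checkpoint path below is a placeholder):

```python
from transformers import AutoModelForCausalLM

# Placeholder path to the pre-rename weights with vocab_size 151646.
model = AutoModelForCausalLM.from_pretrained("path/to/old-checkpoint", trust_remote_code=True)

# resize_token_embeddings grows embed_tokens (and the lm_head resolved through
# get_output_embeddings) and rounds the new size up to a multiple of 128;
# existing rows are preserved, new rows are freshly initialized.
model.resize_token_embeddings(new_num_tokens=model.config.vocab_size, pad_to_multiple_of=128)

assert model.config.vocab_size % 128 == 0
```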
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c9af2c0dc4ba4583879030d052032cd413b387a39b26ece3f10abaea9554fba
-size 6919416998
+oid sha256:c23ecc5e0665c45154097ff165e98e769e9be180e7bec074871838ebe2a415e0
+size 6220791502