MeowFET committed on
Commit ac14fde · 1 Parent(s): 3227ceb

feat: update model names, make vocab_size divisible by 128
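The new value follows from rounding the old vocabulary size up to the nearest multiple of 128 (151646 → 151680 = 1185 * 128). A minimal sketch of that rounding rule, assuming a simple stand-alone helper (the function name is illustrative, not part of this repo):

```python
def pad_vocab_size(vocab_size: int, multiple: int = 128) -> int:
    """Round vocab_size up to the nearest multiple of `multiple`."""
    return ((vocab_size + multiple - 1) // multiple) * multiple

# Values from this commit: the old size 151646 is padded to 151680 (1185 * 128).
assert pad_vocab_size(151646) == 151680
```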

config.json CHANGED
@@ -3,16 +3,16 @@
   "_ori_bos_token_id": 1,
   "_ori_eos_token_id": 2,
   "architectures": [
-    "MiniCPMForCausalLM"
+    "CauchyForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
-    "AutoModel": "modeling_minicpm.MiniCPMModel",
-    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
-    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
-    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
+    "AutoConfig": "configuration_cauchy.CauchyConfig",
+    "AutoModel": "modeling_cauchy.CauchyModel",
+    "AutoModelForCausalLM": "modeling_cauchy.CauchyForCausalLM",
+    "AutoModelForSeq2SeqLM": "modeling_cauchy.CauchyForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_cauchy.CauchyForSequenceClassification"
   },
   "bos_token_id": 151643,
   "dim_model_base": 256,
@@ -37,5 +37,5 @@
   "torch_dtype": "bfloat16",
   "transformers_version": "4.43.3",
   "use_cache": true,
-  "vocab_size": 151646
+  "vocab_size": 151680
 }
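With `architectures` and `auto_map` now pointing at the `Cauchy*` classes, the checkpoint still loads through the standard remote-code path. A hedged sketch of how the updated `auto_map` is resolved, assuming a placeholder repo path (not taken from this commit) and `trust_remote_code=True`:

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo = "path/to/this-checkpoint"  # placeholder: local clone or Hub id of this repo

# auto_map routes AutoConfig to configuration_cauchy.CauchyConfig and
# AutoModelForCausalLM to modeling_cauchy.CauchyForCausalLM; trust_remote_code
# is required because these classes live in the repo, not in transformers itself.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

assert model.config.vocab_size == 151680  # the padded size set in this commit
```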
configuration_minicpm.py → configuration_cauchy.py RENAMED
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" MiniCPM model configuration"""
+""" Cauchy model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
@@ -25,14 +25,14 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+CAUCHY_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
 
-class MiniCPMConfig(PretrainedConfig):
+class CauchyConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`MiniCPMModel`]. It is used to instantiate an MiniCPM
+    This is the configuration class to store the configuration of a [`CauchyModel`]. It is used to instantiate an Cauchy
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the MiniCPM-7B.
+    defaults will yield a similar configuration to that of the Cauchy-7B.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -40,8 +40,8 @@ class MiniCPMConfig(PretrainedConfig):
 
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`MiniCPMModel`]
+            Vocabulary size of the Cauchy model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CauchyModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -61,8 +61,8 @@ class MiniCPMConfig(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
-            MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
+            The maximum sequence length that this model might ever be used with. Cauchy 1 supports up to 2048 tokens,
+            Cauchy 2 up to 4096, CodeCauchy up to 16384.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
@@ -91,7 +91,7 @@ class MiniCPMConfig(PretrainedConfig):
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
-            https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            https://www.reddit.com/r/LocalCauchy/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
@@ -99,19 +99,19 @@ class MiniCPMConfig(PretrainedConfig):
            The dropout ratio for the attention probabilities.
 
    ```python
-    >>> from transformers import MiniCPMModel, MiniCPMConfig
+    >>> from transformers import CauchyModel, CauchyConfig
 
-    >>> # Initializing a MiniCPM minicpm-7b style configuration
-    >>> configuration = MiniCPMConfig()
+    >>> # Initializing a Cauchy cauchy-7b style configuration
+    >>> configuration = CauchyConfig()
 
-    >>> # Initializing a model from the minicpm-7b style configuration
-    >>> model = MiniCPMModel(configuration)
+    >>> # Initializing a model from the cauchy-7b style configuration
+    >>> model = CauchyModel(configuration)
 
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
 
-    model_type = "minicpm"
+    model_type = "cauchy"
    keys_to_ignore_at_inference = ["past_key_values"]
 
    def __init__(
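Since `model_type` becomes `"cauchy"`, code that imports the renamed modules directly (rather than relying on `auto_map`) can register them with the Auto classes once and then use the usual `AutoConfig`/`AutoModel` entry points. A minimal sketch, assuming `configuration_cauchy.py` and `modeling_cauchy.py` are importable from the working directory:

```python
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from configuration_cauchy import CauchyConfig
from modeling_cauchy import CauchyForCausalLM, CauchyModel

# Map the new model_type "cauchy" onto the renamed classes so that the Auto
# classes can resolve this checkpoint without trust_remote_code.
AutoConfig.register("cauchy", CauchyConfig)
AutoModel.register(CauchyConfig, CauchyModel)
AutoModelForCausalLM.register(CauchyConfig, CauchyForCausalLM)
```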
modeling_minicpm.py → modeling_cauchy.py RENAMED
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch MiniCPM model."""
+""" PyTorch Cauchy model."""
 import math
 import warnings
 from typing import List, Optional, Tuple, Union, Dict
@@ -48,7 +48,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_minicpm import MiniCPMConfig
+from .configuration_cauchy import CauchyConfig
 import re
 
 try:
@@ -69,7 +69,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "MiniCPMConfig"
+_CONFIG_FOR_DOC = "CauchyConfig"
 
 
 def _get_unpad_data(attention_mask):
@@ -86,7 +86,7 @@ def _get_unpad_data(attention_mask):
 
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     warnings.warn(
-        "Calling `transformers.models.minicpm.modeling_minicpm._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
+        "Calling `transformers.models.cauchy.modeling_cauchy._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
     )
     return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
 
@@ -95,7 +95,7 @@ def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
     warnings.warn(
-        "Calling `transformers.models.minicpm.modeling_minicpm._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.minicpm.modeling_minicpm.AttentionMaskConverter._make_causal_mask"
+        "Calling `transformers.models.cauchy.modeling_cauchy._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.cauchy.modeling_cauchy.AttentionMaskConverter._make_causal_mask"
     )
     return AttentionMaskConverter._make_causal_mask(
         input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
@@ -110,10 +110,10 @@ def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
     return hidden * weight
 
 
-class MiniCPMRMSNorm(nn.Module):
+class CauchyRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        MiniCPMRMSNorm is equivalent to T5LayerNorm
+        CauchyRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -123,10 +123,10 @@ class MiniCPMRMSNorm(nn.Module):
         return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
 
 
-ALL_LAYERNORM_LAYERS.append(MiniCPMRMSNorm)
+ALL_LAYERNORM_LAYERS.append(CauchyRMSNorm)
 
 
-class MiniCPMRotaryEmbedding(nn.Module):
+class CauchyRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -163,8 +163,8 @@ class MiniCPMRotaryEmbedding(nn.Module):
         )
 
 
-class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
-    """MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+class CauchyLinearScalingRotaryEmbedding(CauchyRotaryEmbedding):
+    """CauchyRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         self.scaling_factor = scaling_factor
@@ -182,8 +182,8 @@ class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
-    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+class CauchyDynamicNTKScalingRotaryEmbedding(CauchyRotaryEmbedding):
+    """CauchyRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         self.scaling_factor = scaling_factor
@@ -250,7 +250,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
     return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
 
-class MiniCPMMLP(nn.Module):
+class CauchyMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -297,10 +297,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 
-class MiniCPMAttention(nn.Module):
+class CauchyAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: CauchyConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -335,12 +335,12 @@ class MiniCPMAttention(nn.Module):
         self._init_rope()
 
         if self.qk_norm:
-            self.q_norm = MiniCPMRMSNorm(self.head_dim, eps=config.rms_norm_eps)
-            self.k_norm = MiniCPMRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.q_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.k_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
-            self.rotary_emb = MiniCPMRotaryEmbedding(
+            self.rotary_emb = CauchyRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -349,14 +349,14 @@ class MiniCPMAttention(nn.Module):
             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             if scaling_type == "linear":
-                self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
+                self.rotary_emb = CauchyLinearScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
                     base=self.rope_theta,
                 )
             elif scaling_type == "dynamic":
-                self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding(
+                self.rotary_emb = CauchyDynamicNTKScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     scaling_factor=scaling_factor,
@@ -477,9 +477,9 @@ class MiniCPMAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class MiniCPMFlashAttention2(MiniCPMAttention):
+class CauchyFlashAttention2(CauchyAttention):
     """
-    MiniCPM flash attention module. This module inherits from `MiniCPMAttention` as the weights of the module stays
+    Cauchy flash attention module. This module inherits from `CauchyAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -502,7 +502,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # MiniCPMFlashAttention2 attention does not support output_attentions
+        # CauchyFlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -552,7 +552,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in the correct dtype just to be sure everything works as expected.
         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (MiniCPMRMSNorm handles it correctly)
+        # in fp32. (CauchyRMSNorm handles it correctly)
 
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
@@ -609,7 +609,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in MiniCPMFlashAttention2 __init__.
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in CauchyFlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
@@ -680,14 +680,14 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
             )
 
 
-class MiniCPMSdpaAttention(MiniCPMAttention):
+class CauchySdpaAttention(CauchyAttention):
     """
-    MiniCPM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `MiniCPMAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    Cauchy attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `CauchyAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
 
-    # Adapted from MiniCPMAttention.forward
+    # Adapted from CauchyAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -700,7 +700,7 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "MiniCPMModel is using MiniCPMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "CauchyModel is using CauchySdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -771,22 +771,22 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
         return attn_output, None, past_key_value
 
 
-MINICPM_ATTENTION_CLASSES = {
-    "eager": MiniCPMAttention,
-    "flash_attention_2": MiniCPMFlashAttention2,
-    "sdpa": MiniCPMSdpaAttention,
+CAUCHY_ATTENTION_CLASSES = {
+    "eager": CauchyAttention,
+    "flash_attention_2": CauchyFlashAttention2,
+    "sdpa": CauchySdpaAttention,
 }
 
 
-class MiniCPMDecoderLayer(nn.Module):
-    def __init__(self, config: MiniCPMConfig, layer_idx: int):
+class CauchyDecoderLayer(nn.Module):
+    def __init__(self, config: CauchyConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+        self.self_attn = CAUCHY_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
 
-        self.mlp = MiniCPMMLP(config)
-        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = CauchyMLP(config)
+        self.input_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.scale_depth = config.scale_depth
         self.num_hidden_layers = config.num_hidden_layers
@@ -853,7 +853,7 @@ class MiniCPMDecoderLayer(nn.Module):
         return outputs
 
 
-MINICPM_START_DOCSTRING = r"""
+CAUCHY_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -863,7 +863,7 @@ MINICPM_START_DOCSTRING = r"""
    and behavior.
 
    Parameters:
-        config ([`MiniCPMConfig`]):
+        config ([`CauchyConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -871,14 +871,14 @@ MINICPM_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
-    MINICPM_START_DOCSTRING,
+    "The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMPreTrainedModel(PreTrainedModel):
-    config_class = MiniCPMConfig
+class CauchyPreTrainedModel(PreTrainedModel):
+    config_class = CauchyConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["MiniCPMDecoderLayer"]
+    _no_split_modules = ["CauchyDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -896,7 +896,7 @@ class MiniCPMPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-MINICPM_INPUTS_DOCSTRING = r"""
+CAUCHY_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -967,30 +967,30 @@ MINICPM_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
-    MINICPM_START_DOCSTRING,
+    "The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMModel(MiniCPMPreTrainedModel):
+class CauchyModel(CauchyPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CauchyDecoderLayer`]
 
     Args:
-        config: MiniCPMConfig
+        config: CauchyConfig
     """
 
-    def __init__(self, config: MiniCPMConfig):
+    def __init__(self, config: CauchyConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [CauchyDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._use_sdpa = config._attn_implementation == "sdpa"
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
-        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
@@ -1002,7 +1002,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1135,12 +1135,12 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         )
 
 
-class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
+class CauchyForCausalLM(CauchyPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = MiniCPMModel(config)
+        self.model = CauchyModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1165,7 +1165,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1192,9 +1192,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
+        >>> from transformers import AutoTokenizer, CauchyForCausalLM
 
-        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = CauchyForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1354,9 +1354,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
 
 @add_start_docstrings(
     """
-    The MiniCPM Model transformer with a sequence classification head on top (linear layer).
+    The Cauchy Model transformer with a sequence classification head on top (linear layer).
 
-    [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`CauchyForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1365,13 +1365,13 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    MINICPM_START_DOCSTRING,
+    CAUCHY_START_DOCSTRING,
 )
-class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
+class CauchyForSequenceClassification(CauchyPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = MiniCPMModel(config)
+        self.model = CauchyModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1383,7 +1383,7 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
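Both `CauchyModel.embed_tokens` and the `lm_head` of `CauchyForCausalLM` are sized from `config.vocab_size`, so existing weights have to be resized to match the padded vocabulary. A hedged sketch of one way to do that with the stock `transformers` API (the conversion script actually used for this commit is not part of the diff; the checkpoint path below is a placeholder):

```python
from transformers import AutoModelForCausalLM

# Placeholder path to the pre-rename weights with vocab_size 151646.
model = AutoModelForCausalLM.from_pretrained("path/to/old-checkpoint", trust_remote_code=True)

# resize_token_embeddings grows embed_tokens (and the lm_head resolved through
# get_output_embeddings) and rounds the new size up to a multiple of 128;
# existing rows are preserved, new rows are freshly initialized.
model.resize_token_embeddings(new_num_tokens=model.config.vocab_size, pad_to_multiple_of=128)

assert model.config.vocab_size % 128 == 0
```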
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c9af2c0dc4ba4583879030d052032cd413b387a39b26ece3f10abaea9554fba
-size 6919416998
+oid sha256:c23ecc5e0665c45154097ff165e98e769e9be180e7bec074871838ebe2a415e0
+size 6220791502