"""Ernie model configuration"""

import copy
from typing import List, Optional, Tuple, Union

from transformers import PretrainedConfig
from transformers.utils import logging

# Module-level logger; Ernie4_5_MoEConfig below emits a warning through it.
logger = logging.get_logger(__name__)

__all__ = [
    "ERNIE_PRETRAINED_INIT_CONFIGURATION",
    "Ernie4_5_Config",
    "Ernie4_5_MoEConfig",
    "Ernie4_5_VLMoEConfig",
]


class DFNRopeVisionTransformerConfig(PretrainedConfig):
    """
    Configuration class for the DFNRopeVisionTransformer model.

    This class inherits from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    model_type = "DFNRope_vision_transformer"
    base_model_tp_plan = {}

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        attn_implementation="eager",
        pp_data_balance=False,
        recompute=False,
        attn_sep=False,
        vit_first_fwd_bsz=128,
        vit_num_recompute_layers=10000,
        **kwargs,
    ):
        """
        Initialize the DFNRopeVisionTransformer configuration with default or specified parameters.

        Args:
            depth (int): Number of transformer layers in the model. Defaults to 32.
            embed_dim (int): Dimensionality of the patch embeddings. Defaults to 1280.
            hidden_size (int): Dimensionality of the output hidden states of the vision model. Defaults to 3584.
            hidden_act (str): Activation function used in the feed-forward network. Defaults to "quick_gelu".
            mlp_ratio (float): Ratio of the feed-forward hidden dimension to the embedding dimension. Defaults to 4.
            num_heads (int): Number of attention heads in each attention layer. Defaults to 16.
            in_channels (int): Number of channels in the input image. Defaults to 3.
            patch_size (int): Size of the patches extracted from the input image. Defaults to 14.
            spatial_merge_size (int): Spatial merge size for the patch merging module. Defaults to 2.
            attn_implementation (str): Attention implementation type. Defaults to "eager".
            pp_data_balance (bool): Whether to balance data across pipeline stages. Defaults to False.
            recompute (bool): Whether to use recomputation (gradient checkpointing). Defaults to False.
            attn_sep (bool): Whether to separate attention computation into two stages. Defaults to False.
            vit_first_fwd_bsz (int): First forward batch size for the ViT. Defaults to 128.
            vit_num_recompute_layers (int): Number of recomputed layers for the ViT. Defaults to 10000.
        """
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.attn_implementation = attn_implementation
        self.pp_data_balance = pp_data_balance
        self.recompute = recompute
        self.attn_sep = attn_sep
        self.vit_first_fwd_bsz = vit_first_fwd_bsz
        self.vit_num_recompute_layers = vit_num_recompute_layers

    def get(self, key, default=None):
        """Get a config value by key, returning `default` if the attribute is missing."""
        if hasattr(self, key):
            return getattr(self, key)
        return default


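# The vision settings above are combined by a ViT-style tower at runtime. The helper below
# is a purely illustrative sketch, not part of the original API; the derived sizes assume
# standard ViT conventions (head_dim = embed_dim / num_heads, FFN width = embed_dim *
# mlp_ratio) and may differ from the actual DFNRopeVisionTransformer implementation.
def _describe_vision_config(cfg: DFNRopeVisionTransformerConfig) -> dict:
    """Return a few sizes derived from a vision config (illustrative only)."""
    return {
        # per-head width of the attention layers
        "head_dim": cfg.embed_dim // cfg.num_heads,
        # feed-forward width, assuming the usual `embed_dim * mlp_ratio` convention
        "ffn_hidden_size": int(cfg.embed_dim * cfg.mlp_ratio),
        # number of patch tokens merged into one visual token by spatial merging
        "tokens_per_merge_window": cfg.spatial_merge_size**2,
        # raw pixel values covered by a single patch
        "patch_pixels": cfg.in_channels * cfg.patch_size * cfg.patch_size,
    }

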
ERNIE_PRETRAINED_INIT_CONFIGURATION = {
    "ernie/tiny-random-ernie": {
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 2048,
        "model_type": "ernie",
        "num_attention_heads": 2,
        "num_hidden_layers": 2,
        "rms_norm_eps": 1e-06,
        "vocab_size": 32000,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "pad_token_id": 0,
        "use_cache": False,
        "recompute": False,
        "use_flash_attn": True,
        "use_pure_fp16": False,
    },
}


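# Illustrative helper (not part of the original API): copy one of the presets above so it
# can be unpacked into a config class defined later in this module, e.g.
# `Ernie4_5_Config(**_get_pretrained_init_kwargs())`.
def _get_pretrained_init_kwargs(name: str = "ernie/tiny-random-ernie") -> dict:
    """Return a copy of a preset init dict, dropping 'model_type' (a class attribute, not an __init__ argument)."""
    preset = dict(ERNIE_PRETRAINED_INIT_CONFIGURATION[name])
    preset.pop("model_type", None)
    return preset

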
class Ernie4_5_Config(PretrainedConfig):
    """
    Configuration class for the ERNIE model.

    This class stores the configuration of an ERNIE model, defining the model architecture.
    It inherits from PretrainedConfig and can be used to control model outputs.
    """

    model_type = "ernie"
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attention=True,
        use_sparse_flash_attn=True,
        use_var_len_flash_attn=False,
        recompute=False,
        recompute_granularity="core_attn",
        recompute_use_reentrant=False,
        use_rmsnorm=True,
        fuse_rms_norm=False,
        fuse_ln=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        fuse_swiglu=False,
        use_bias=False,
        rope_theta=10000,
        fuse_rope=False,
        fuse_softmax_mask=False,
        use_fast_ln=False,
        weight_share_add_bias=True,
        fuse_linear=False,
        max_sequence_length=1024,
        ignored_index=-100,
        add_tail_layers=False,
        use_recompute_lm_head=False,
        use_recompute_loss_fn=False,
        refined_recompute=None,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        num_key_value_heads=None,
        use_sparse_head_and_loss_fn=False,
        micro_batch_size=-1,
        use_ep_comm_overlap=False,
        use_fused_head_and_loss_fn=False,
        token_balance_loss=False,
        token_balance_seqlen=False,
        cachekv_quant: bool = False,
        pp_seg_method="layer:ErnieDecoderLayer|EmptyLayer",
        **kwargs,
    ):
        """
        Initialize the ERNIE model configuration with default or specified parameters.

        Args:
            vocab_size (int): Size of the vocabulary (number of unique tokens).
            hidden_size (int): Dimensionality of the encoder layers and the pooler layer.
            intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer.
            max_position_embeddings (int): Maximum sequence length the model can handle.
            num_hidden_layers (int): Number of hidden layers in the Transformer encoder.
            num_attention_heads (int): Number of attention heads for each attention layer.
            initializer_range (float): Standard deviation of the initializer for weight matrices.
            rms_norm_eps (float): The epsilon used by the RMS normalization layers.
            use_cache (bool): Whether to use key/value caching for faster generation (decoding).
            use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation.
            use_sparse_flash_attn (bool): Whether to use sparse FlashAttention.
            use_var_len_flash_attn (bool): Whether to use variable-length FlashAttention.
            recompute (bool): Whether to use gradient checkpointing to save memory.
            recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.).
            recompute_use_reentrant (bool): Whether to use reentrant checkpointing.
            use_rmsnorm (bool): Whether to use RMSNorm instead of LayerNorm.
            fuse_rms_norm (bool): Whether to fuse RMSNorm operations for optimization.
            fuse_ln (bool): Whether to fuse LayerNorm operations.
            pad_token_id (int): Token ID used for padding sequences.
            bos_token_id (int): Token ID used for beginning-of-sequence.
            eos_token_id (int): Token ID used for end-of-sequence.
            fuse_swiglu (bool): Whether to fuse SwiGLU operations.
            use_bias (bool): Whether to use bias terms in linear layers.
            rope_theta (float): The base period of the RoPE embeddings.
            fuse_rope (bool): Whether to fuse RoPE operations.
            fuse_softmax_mask (bool): Whether to fuse the softmax and attention-mask operations.
            use_fast_ln (bool): Whether to use an optimized LayerNorm implementation.
            weight_share_add_bias (bool): Whether to share bias weights in certain layers.
            fuse_linear (bool): Whether to fuse linear operations.
            max_sequence_length (int): Maximum sequence length for positional embeddings.
            ignored_index (int): Target value that is ignored during loss computation.
            add_tail_layers (bool): Whether to add additional layers at the end.
            use_recompute_lm_head (bool): Whether to recompute gradients for the language model head.
            use_recompute_loss_fn (bool): Whether to recompute gradients for the loss function.
            refined_recompute (dict): Dictionary specifying refined recomputation settings.
            attention_probs_dropout_prob (float): Dropout probability for attention weights.
            hidden_dropout_prob (float): Dropout probability for hidden layers.
            compression_ratio (float): Ratio for KV cache compression (1.0 = no compression).
            num_key_value_heads (int): Number of key/value heads (for grouped-query attention).
            use_sparse_head_and_loss_fn (bool): Whether to use a sparse attention head and loss function.
            micro_batch_size (int): Size of micro batches (-1 for automatic).
            use_ep_comm_overlap (bool): Whether to overlap expert-parallel communication with computation.
            use_fused_head_and_loss_fn (bool): Whether to use a fused head and loss function.
            token_balance_loss (bool): Whether to balance the loss by token count.
            token_balance_seqlen (bool): Whether to balance sequence lengths.
            cachekv_quant (bool): Whether to quantize the key/value cache.
            pp_seg_method (str): Method for pipeline-parallel segmentation.
            **kwargs: Additional keyword arguments passed to the parent class.
        """
        if "tie_word_embeddings" not in kwargs:
            kwargs["tie_word_embeddings"] = False
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.recompute = recompute
        self.recompute_granularity = recompute_granularity
        self.use_flash_attention = use_flash_attention
        self.use_sparse_flash_attn = use_sparse_flash_attn
        self.recompute_use_reentrant = recompute_use_reentrant
        self.use_var_len_flash_attn = use_var_len_flash_attn
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.fuse_swiglu = fuse_swiglu
        self.fuse_rms_norm = fuse_rms_norm
        self.fuse_ln = fuse_ln
        self.use_rmsnorm = use_rmsnorm
        self.micro_batch_size = micro_batch_size

        self.max_sequence_length = max_sequence_length
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_theta = rope_theta
        self.fuse_rope = fuse_rope
        self.fuse_softmax_mask = fuse_softmax_mask
        self.use_fast_ln = use_fast_ln

        self.fuse_linear = fuse_linear
        self.ignored_index = ignored_index
        self.add_tail_layers = add_tail_layers
        self.use_recompute_lm_head = use_recompute_lm_head
        self.use_recompute_loss_fn = use_recompute_loss_fn

        # Use a fresh dict per instance; a `dict()` default argument would be shared across instances.
        self.refined_recompute = refined_recompute if refined_recompute is not None else {}
        self.skip_recompute_ops = dict()
        """
        `refined_recompute` is a dictionary that specifies fine-grained gradient recomputation settings,
        which currently only takes effect in Pipeline Parallel (PP) mode.

        In PP mode, this dictionary populates `self.skip_recompute_ops` with the following structure:
        - Key (`op_name`): The operation name to configure, with possible values:
            * "mlp_row_ln" - MLP row-wise layer normalization
            * "flash_attn" - Flash attention operation
            * "attention_row_ln" - Attention row-wise layer normalization
            * "attention_column_ln" - Attention column-wise layer normalization
            * "mlp_column_ln" - MLP column-wise layer normalization

        - Value (`skip_num`): Controls how many times recomputation is skipped:
            * 0: Never skip recomputation (minimum memory usage)
            * -1: Always skip recomputation (maximum memory usage)
            * 1..12: Skip recomputation the specified number of times
            * >= 12: Equivalent to -1 (always skip recomputation)

        This allows precise control over the memory/computation tradeoff for individual operations.
        """
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.num_key_value_heads = num_key_value_heads
        self.use_sparse_head_and_loss_fn = use_sparse_head_and_loss_fn
        self.use_ep_comm_overlap = use_ep_comm_overlap
        self.use_fused_head_and_loss_fn = use_fused_head_and_loss_fn
        self.token_balance_loss = token_balance_loss
        self.token_balance_seqlen = token_balance_seqlen
        self.cachekv_quant = cachekv_quant
        self.pp_seg_method = pp_seg_method

    def get(self, key, default=None):
        """Get a config value by key, returning `default` if the attribute is missing."""
        if hasattr(self, key):
            return getattr(self, key)
        return default


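# Minimal usage sketch (illustrative, not part of the original API): build the tiny preset
# from above via the illustrative `_get_pretrained_init_kwargs` helper and override a couple
# of fields. The `refined_recompute` value follows the semantics documented in
# `Ernie4_5_Config.__init__`; the exact op names accepted at runtime are taken from that
# docstring and should be treated as an assumption here.
def _example_dense_config() -> Ernie4_5_Config:
    """Return a small dense Ernie4_5_Config for demonstration purposes."""
    kwargs = _get_pretrained_init_kwargs("ernie/tiny-random-ernie")
    kwargs.update(
        num_attention_heads=8,
        num_key_value_heads=2,  # grouped-query attention: 8 query heads share 2 KV heads
        refined_recompute={"flash_attn": -1},  # -1: always skip recomputation of flash_attn
    )
    return Ernie4_5_Config(**kwargs)

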
class Ernie4_5_MoEConfig(Ernie4_5_Config):
    r"""
    Configuration class for the ErnieMoE model architecture.

    This class stores the configuration for a [`~ErnieModel`] and is used to instantiate
    an ErnieMoE model according to the specified arguments. Inherits from [`PretrainedConfig`]
    and can control model outputs.

    Attributes:
        Inherits all attributes from Ernie4_5_Config and adds MoE-specific configurations.
    """

    model_type = "ernie"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        moe_num_experts: Union[int, list] = 0,
        use_recompute_moe=False,
        moe_capacity=(),
        moe_layer_interval=2,
        moe_layer_start_index=0,
        moe_layer_end_index=-1,
        moe_aux_loss_lambda=1e-2,
        moe_z_loss_lambda=1e-4,
        moe_orthogonal_loss_lambda=1e-2,
        sinkhorn_2gate=True,
        sinkhorn_temp=3e-2,
        global_aux_loss=False,
        moe_dropout_prob=0.0,
        moe_group="world",
        moe_gate="top2",
        moe_intermediate_size: Union[int, list] = 0,
        moe_num_shared_experts: int = 0,
        moe_reverse_token_drop: bool = False,
        moe_gate_act: str = "softmax",
        moe_norm_gate_logits=True,
        moe_all_to_all_dropout: float = 0.0,
        moe_k=2,
        moe_use_aux_free: bool = False,
        moe_group_experts: bool = False,
        moe_group_orthogonal_loss: bool = True,
        enable_delay_scale_loss: bool = True,
        num_acc_steps: int = 1,
        fuse_gate_detach_matmul: bool = False,
        dpo_config=None,
        moe_multimodal_dispatch_use_allgather: str = "",
        moe_use_hard_gate=False,
        moe_dense_experts_token_type_id=3,
        **kwargs,
    ):
        """
        Initialize the ErnieMoE configuration with MoE-specific parameters.

        Args:
            moe_num_experts: Number of experts in MoE layers (an int, or a list with one entry per modality).
            use_recompute_moe: Whether to use recomputation for MoE layers.
            moe_capacity: Capacity configuration for MoE layers.
            moe_layer_interval: Interval between MoE layers.
            moe_layer_start_index: Starting layer index for MoE.
            moe_layer_end_index: Ending layer index for MoE (-1 means the last layer).
            moe_aux_loss_lambda: Weight for the auxiliary loss.
            moe_z_loss_lambda: Weight for the z-loss.
            moe_orthogonal_loss_lambda: Weight for the orthogonal loss.
            sinkhorn_2gate: Whether to use Sinkhorn 2-gate routing.
            sinkhorn_temp: Temperature for Sinkhorn routing.
            global_aux_loss: Whether to use a global auxiliary loss.
            moe_dropout_prob: Dropout probability for MoE layers.
            moe_group: Communication group for MoE experts.
            moe_gate: Type of gating mechanism ("top2", etc.).
            moe_intermediate_size: Intermediate size for MoE layers (an int, or a list per modality).
            moe_num_shared_experts: Number of shared experts.
            moe_reverse_token_drop: Whether to use reverse token dropping.
            moe_gate_act: Activation function for gating.
            moe_norm_gate_logits: Whether to normalize gate logits.
            moe_all_to_all_dropout: Dropout for all-to-all communication.
            moe_k: Number of experts each token is routed to.
            moe_use_aux_free: Whether to use auxiliary-loss-free routing.
            moe_group_experts: Whether to group experts (requires hard gating).
            moe_group_orthogonal_loss: Whether to use a group orthogonal loss.
            enable_delay_scale_loss: Whether to enable delayed loss scaling.
            num_acc_steps: Number of gradient accumulation steps.
            fuse_gate_detach_matmul: Whether to fuse the gate's detached matmul.
            dpo_config: Optional DPO training configuration.
            moe_multimodal_dispatch_use_allgather: All-gather dispatch strategy for multimodal MoE.
            moe_use_hard_gate: Whether to use hard gating.
            moe_dense_experts_token_type_id: Token type id routed to the dense experts.
            **kwargs: Additional base model configuration parameters.

        Note:
            When use_recompute_moe is True, recompute_granularity will be changed from "full" to "full_attn".
        """
        if use_recompute_moe:
            logger.warning(
                "`use_recompute_moe=True` disables `recompute_granularity='full'`; "
                "falling back to 'full_attn'."
            )
            if kwargs.get("recompute") and kwargs.get("recompute_granularity") == "full":
                kwargs["recompute_granularity"] = "full_attn"
        super().__init__(**kwargs)

        self.moe_num_experts = moe_num_experts
        self.use_recompute_moe = use_recompute_moe
        self.moe_capacity = moe_capacity
        self.moe_aux_loss_lambda = moe_aux_loss_lambda
        self.moe_z_loss_lambda = moe_z_loss_lambda
        self.moe_orthogonal_loss_lambda = moe_orthogonal_loss_lambda
        self.global_aux_loss = global_aux_loss
        self.sinkhorn_2gate = sinkhorn_2gate
        self.sinkhorn_temp = sinkhorn_temp
        self.moe_layer_interval = moe_layer_interval
        self.moe_dropout_prob = moe_dropout_prob
        self.moe_group = moe_group
        self.moe_gate = moe_gate
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_shared_experts = moe_num_shared_experts
        self.moe_reverse_token_drop = moe_reverse_token_drop
        self.moe_k = moe_k
        self.moe_all_to_all_dropout = moe_all_to_all_dropout
        self.moe_group_experts = moe_group_experts
        self.moe_group_orthogonal_loss = moe_group_orthogonal_loss
        self.enable_delay_scale_loss = enable_delay_scale_loss
        self.num_acc_steps = num_acc_steps
        self.moe_layer_start_index = moe_layer_start_index
        self.moe_layer_end_index = (
            self.num_hidden_layers - 1
            if moe_layer_end_index == -1
            else moe_layer_end_index
        )
        self.moe_gate_act = moe_gate_act
        self.moe_norm_gate_logits = moe_norm_gate_logits
        self.moe_use_aux_free = moe_use_aux_free
        self.fuse_gate_detach_matmul = fuse_gate_detach_matmul
        self.dpo_config = dpo_config
        self.moe_multimodal_dispatch_use_allgather = (
            moe_multimodal_dispatch_use_allgather
        )
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id

    @property
    def multimodel_experts(self) -> bool:
        """Whether separate expert groups are configured per modality."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check whether the model uses an MoE architecture.

        Returns:
            bool: True if the total number of experts is greater than 0, False otherwise.
        """
        return (
            sum(self.moe_num_experts) > 0
            if self.multimodel_experts
            else self.moe_num_experts > 0
        )


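# Illustrative sketch (not part of the original API): how the MoE properties behave for a
# single expert pool versus per-modality expert groups. The expert counts below are made-up
# demonstration values.
def _example_moe_configs() -> tuple:
    """Return (text_only, multimodal) Ernie4_5_MoEConfig instances for demonstration."""
    text_only = Ernie4_5_MoEConfig(moe_num_experts=64, moe_k=2, moe_intermediate_size=1536)
    # text_only.use_moe -> True, text_only.multimodel_experts -> False
    multimodal = Ernie4_5_MoEConfig(moe_num_experts=[64, 16], moe_intermediate_size=[1536, 512])
    # multimodal.multimodel_experts -> True (one expert group per modality),
    # multimodal.use_moe -> True (total expert count is positive)
    return text_only, multimodal

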
class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
    """
    This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of Ernie-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
    """

    model_type = "ernie4_5_moe_vl"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    base_model_tp_plan = {
        "ernie.layers.*.self_attn.qkv_proj": "colwise",
        "ernie.layers.*.self_attn.o_proj": "rowwise",
        "ernie.layers.*.mlp_text.experts.*.up_gate_proj": "colwise",
        "ernie.layers.*.mlp_text.experts.*.down_proj": "rowwise",
        "ernie.layers.*.mlp_text.gate": "colwise_rep",
        "ernie.layers.*.mlp.experts.*.up_gate_proj": "colwise",
        "ernie.layers.*.mlp.experts.*.down_proj": "rowwise",
        "ernie.layers.*.mlp.gate": "colwise_rep",
        "ernie.layers.*.mlp.up_gate_proj": "colwise",
        "ernie.layers.*.mlp.down_proj": "rowwise",
        "lm_head": "colwise_rep",
    }

    def __init__(
        self,
        vision_config=None,
        im_patch_id=None,
        pixel_hidden_size=None,
        modality_detach=False,
        temporal_conv_size=2,
        spatial_conv_size=2,
        mm_vocab_size=0,
        max_text_id=None,
        use_temporal_conv=True,
        moe_use_size_all2all=False,
        moe_num_attn_experts=False,
        moe_dense_experts_token_type_id: int = 3,
        moe_use_hard_gate: bool = True,
        moe_fuse_experts: bool = False,
        moe_use_token_type_bias: bool = False,
        disable_ffn_model_parallel=False,
        fuse_attn_ffn=True,
        rope_3d=True,
        freq_allocation=20,
        using_precision_check=False,
        use_recompute_resampler=False,
        resampler_fuse_rms_norm=False,
        moe_layer_feed_fake_token=False,
        tensor_parallel_degree=1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Accept either a plain dict, an already-built vision config, or nothing (defaults).
        if isinstance(vision_config, dict):
            self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
        elif isinstance(vision_config, DFNRopeVisionTransformerConfig):
            self.vision_config = vision_config
        else:
            self.vision_config = DFNRopeVisionTransformerConfig()
        self.im_patch_id = im_patch_id
        self.pixel_hidden_size = pixel_hidden_size
        self.modality_detach = modality_detach
        self.temporal_conv_size = temporal_conv_size
        self.spatial_conv_size = spatial_conv_size
        self.mm_vocab_size = mm_vocab_size
        self.max_text_id = max_text_id
        self.use_temporal_conv = use_temporal_conv

        self.moe_use_size_all2all = moe_use_size_all2all
        self.moe_num_attn_experts = moe_num_attn_experts
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_fuse_experts = moe_fuse_experts
        self.moe_use_token_type_bias = moe_use_token_type_bias
        self.disable_ffn_model_parallel = disable_ffn_model_parallel

        self.fuse_attn_ffn = fuse_attn_ffn
        self.rope_3d = rope_3d
        self.freq_allocation = freq_allocation
        self.using_precision_check = using_precision_check
        self.use_recompute_resampler = use_recompute_resampler
        self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
        self.moe_layer_feed_fake_token = moe_layer_feed_fake_token

        self.tensor_parallel_degree = tensor_parallel_degree

    @property
    def multimodel_experts(self) -> bool:
        """Check whether more than one multimodal expert group is configured."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check whether the model uses an MoE architecture.

        Returns:
            bool: True if the total number of experts is greater than 0, False otherwise.
        """
        return (
            sum(self.moe_num_experts) > 0
            if self.multimodel_experts
            else self.moe_num_experts > 0
        )

    def to_dict(self, saving_file=False):
        """Serialize this config (including the nested vision config) to a plain dictionary."""
        output = copy.deepcopy(self.__dict__)
        if self.vision_config:
            output["vision_config"] = (
                self.vision_config.to_dict()
                if isinstance(self.vision_config, DFNRopeVisionTransformerConfig)
                else self.vision_config
            )

        output["model_type"] = self.__class__.model_type
        return output


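# Minimal end-to-end sketch (illustrative, not part of the original API): construct a small
# VL MoE config with a nested vision config passed as a plain dict, then serialize it. The
# field values, including `im_patch_id`, are hypothetical demonstration numbers.
def _example_vl_config_round_trip() -> dict:
    """Build a small Ernie4_5_VLMoEConfig and return its dict form."""
    cfg = Ernie4_5_VLMoEConfig(
        vision_config={"depth": 2, "embed_dim": 256, "num_heads": 4},
        im_patch_id=100000,  # hypothetical id of the image-patch placeholder token
        moe_num_experts=[8, 8],  # one expert group per modality (demonstration values)
    )
    as_dict = cfg.to_dict()
    # The nested vision config is expanded back into a plain dict by to_dict().
    assert isinstance(as_dict["vision_config"], dict)
    return as_dict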