"""Ernie model configuration"""

import copy
from typing import List, Optional, Tuple, Union

from transformers import PretrainedConfig
from transformers.utils import logging

# Module-level logger; Ernie4_5_MoEConfig below emits a warning through it.
logger = logging.get_logger(__name__)

__all__ = [
    "ERNIE_PRETRAINED_INIT_CONFIGURATION",
    "Ernie4_5_Config",
    "Ernie4_5_MoEConfig",
    "Ernie4_5_VLMoEConfig",
]


class DFNRopeVisionTransformerConfig(PretrainedConfig):
    """
    Configuration class for the DFNRopeVisionTransformer model.

    This class inherits from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    model_type = "DFNRope_vision_transformer"
    base_model_tp_plan = {}

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        attn_implementation="eager",
        pp_data_balance=False,
        recompute=False,
        attn_sep=False,
        vit_first_fwd_bsz=128,
        vit_num_recompute_layers=10000,
        **kwargs,
    ):
        """
        Initialize the DFNRopeVisionTransformer configuration with default or specified parameters.

        Args:
            depth (int): Number of transformer layers in the model. Defaults to 32.
            embed_dim (int): Dimensionality of the patch embeddings. Defaults to 1280.
            hidden_size (int): Dimensionality of the output hidden states of the vision model. Defaults to 3584.
            hidden_act (str): Activation function used in the feed-forward network. Defaults to "quick_gelu".
            mlp_ratio (float): Ratio of the feed-forward hidden dimension to the embedding dimension. Defaults to 4.
            num_heads (int): Number of attention heads in each attention layer. Defaults to 16.
            in_channels (int): Number of channels in the input image. Defaults to 3.
            patch_size (int): Size of the patches extracted from the input image. Defaults to 14.
            spatial_merge_size (int): Spatial merge size for the patch merging module. Defaults to 2.
            attn_implementation (str): Attention implementation type. Defaults to "eager".
            pp_data_balance (bool): Whether to balance data across pipeline stages. Defaults to False.
            recompute (bool): Whether to use recomputation (gradient checkpointing). Defaults to False.
            attn_sep (bool): Whether to separate attention computation into two stages. Defaults to False.
            vit_first_fwd_bsz (int): First forward batch size for the ViT. Defaults to 128.
            vit_num_recompute_layers (int): Number of recomputed layers for the ViT. Defaults to 10000.
        """
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.attn_implementation = attn_implementation
        self.pp_data_balance = pp_data_balance
        self.recompute = recompute
        self.attn_sep = attn_sep
        self.vit_first_fwd_bsz = vit_first_fwd_bsz
        self.vit_num_recompute_layers = vit_num_recompute_layers

    def get(self, key, default=None):
        """Get a config value by key, returning `default` if the attribute is missing."""
        if hasattr(self, key):
            return getattr(self, key)
        return default


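# The vision settings above are combined by a ViT-style tower at runtime. The helper below
# is a purely illustrative sketch, not part of the original API; the derived sizes assume
# standard ViT conventions (head_dim = embed_dim / num_heads, FFN width = embed_dim *
# mlp_ratio) and may differ from the actual DFNRopeVisionTransformer implementation.
def _describe_vision_config(cfg: DFNRopeVisionTransformerConfig) -> dict:
    """Return a few sizes derived from a vision config (illustrative only)."""
    return {
        # per-head width of the attention layers
        "head_dim": cfg.embed_dim // cfg.num_heads,
        # feed-forward width, assuming the usual `embed_dim * mlp_ratio` convention
        "ffn_hidden_size": int(cfg.embed_dim * cfg.mlp_ratio),
        # number of patch tokens merged into one visual token by spatial merging
        "tokens_per_merge_window": cfg.spatial_merge_size**2,
        # raw pixel values covered by a single patch
        "patch_pixels": cfg.in_channels * cfg.patch_size * cfg.patch_size,
    }

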
ERNIE_PRETRAINED_INIT_CONFIGURATION = {
    "ernie/tiny-random-ernie": {
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 2048,
        "model_type": "ernie",
        "num_attention_heads": 2,
        "num_hidden_layers": 2,
        "rms_norm_eps": 1e-06,
        "vocab_size": 32000,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "pad_token_id": 0,
        "use_cache": False,
        "recompute": False,
        "use_flash_attn": True,
        "use_pure_fp16": False,
    },
}


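# Illustrative helper (not part of the original API): copy one of the presets above so it
# can be unpacked into a config class defined later in this module, e.g.
# `Ernie4_5_Config(**_get_pretrained_init_kwargs())`.
def _get_pretrained_init_kwargs(name: str = "ernie/tiny-random-ernie") -> dict:
    """Return a copy of a preset init dict, dropping 'model_type' (a class attribute, not an __init__ argument)."""
    preset = dict(ERNIE_PRETRAINED_INIT_CONFIGURATION[name])
    preset.pop("model_type", None)
    return preset

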
class Ernie4_5_Config(PretrainedConfig):
    """
    Configuration class for the ERNIE model.

    This class stores the configuration of an ERNIE model, defining the model architecture.
    It inherits from PretrainedConfig and can be used to control model outputs.
    """

    model_type = "ernie"
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attention=True,
        use_sparse_flash_attn=True,
        use_var_len_flash_attn=False,
        recompute=False,
        recompute_granularity="core_attn",
        recompute_use_reentrant=False,
        use_rmsnorm=True,
        fuse_rms_norm=False,
        fuse_ln=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        fuse_swiglu=False,
        use_bias=False,
        rope_theta=10000,
        fuse_rope=False,
        fuse_softmax_mask=False,
        use_fast_ln=False,
        weight_share_add_bias=True,
        fuse_linear=False,
        max_sequence_length=1024,
        ignored_index=-100,
        add_tail_layers=False,
        use_recompute_lm_head=False,
        use_recompute_loss_fn=False,
        refined_recompute=None,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        num_key_value_heads=None,
        use_sparse_head_and_loss_fn=False,
        micro_batch_size=-1,
        use_ep_comm_overlap=False,
        use_fused_head_and_loss_fn=False,
        token_balance_loss=False,
        token_balance_seqlen=False,
        cachekv_quant: bool = False,
        pp_seg_method="layer:ErnieDecoderLayer|EmptyLayer",
        **kwargs,
    ):
        """
        Initialize the ERNIE model configuration with default or specified parameters.

        Args:
            vocab_size (int): Size of the vocabulary (number of unique tokens).
            hidden_size (int): Dimensionality of the encoder layers and the pooler layer.
            intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer.
            max_position_embeddings (int): Maximum sequence length the model can handle.
            num_hidden_layers (int): Number of hidden layers in the Transformer encoder.
            num_attention_heads (int): Number of attention heads for each attention layer.
            initializer_range (float): Standard deviation of the initializer for weight matrices.
            rms_norm_eps (float): The epsilon used by the RMS normalization layers.
            use_cache (bool): Whether to use key/value caching for faster generation (decoding).
            use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation.
            use_sparse_flash_attn (bool): Whether to use sparse FlashAttention.
            use_var_len_flash_attn (bool): Whether to use variable-length FlashAttention.
            recompute (bool): Whether to use gradient checkpointing to save memory.
            recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.).
            recompute_use_reentrant (bool): Whether to use reentrant checkpointing.
            use_rmsnorm (bool): Whether to use RMSNorm instead of LayerNorm.
            fuse_rms_norm (bool): Whether to fuse RMSNorm operations for optimization.
            fuse_ln (bool): Whether to fuse LayerNorm operations.
            pad_token_id (int): Token ID used for padding sequences.
            bos_token_id (int): Token ID used for beginning-of-sequence.
            eos_token_id (int): Token ID used for end-of-sequence.
            fuse_swiglu (bool): Whether to fuse SwiGLU operations.
            use_bias (bool): Whether to use bias terms in linear layers.
            rope_theta (float): The base period of the RoPE embeddings.
            fuse_rope (bool): Whether to fuse RoPE operations.
            fuse_softmax_mask (bool): Whether to fuse the softmax and attention-mask operations.
            use_fast_ln (bool): Whether to use an optimized LayerNorm implementation.
            weight_share_add_bias (bool): Whether to share bias weights in certain layers.
            fuse_linear (bool): Whether to fuse linear operations.
            max_sequence_length (int): Maximum sequence length for positional embeddings.
            ignored_index (int): Target value that is ignored during loss computation.
            add_tail_layers (bool): Whether to add additional layers at the end.
            use_recompute_lm_head (bool): Whether to recompute gradients for the language model head.
            use_recompute_loss_fn (bool): Whether to recompute gradients for the loss function.
            refined_recompute (dict): Dictionary specifying refined recomputation settings.
            attention_probs_dropout_prob (float): Dropout probability for attention weights.
            hidden_dropout_prob (float): Dropout probability for hidden layers.
            compression_ratio (float): Ratio for KV cache compression (1.0 = no compression).
            num_key_value_heads (int): Number of key/value heads (for grouped-query attention).
            use_sparse_head_and_loss_fn (bool): Whether to use a sparse attention head and loss function.
            micro_batch_size (int): Size of micro batches (-1 for automatic).
            use_ep_comm_overlap (bool): Whether to overlap expert-parallel communication with computation.
            use_fused_head_and_loss_fn (bool): Whether to use a fused head and loss function.
            token_balance_loss (bool): Whether to balance the loss by token count.
            token_balance_seqlen (bool): Whether to balance sequence lengths.
            cachekv_quant (bool): Whether to quantize the key/value cache.
            pp_seg_method (str): Method for pipeline-parallel segmentation.
            **kwargs: Additional keyword arguments passed to the parent class.
        """
        if "tie_word_embeddings" not in kwargs:
            kwargs["tie_word_embeddings"] = False
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.recompute = recompute
        self.recompute_granularity = recompute_granularity
        self.use_flash_attention = use_flash_attention
        self.use_sparse_flash_attn = use_sparse_flash_attn
        self.recompute_use_reentrant = recompute_use_reentrant
        self.use_var_len_flash_attn = use_var_len_flash_attn
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.fuse_swiglu = fuse_swiglu
        self.fuse_rms_norm = fuse_rms_norm
        self.fuse_ln = fuse_ln
        self.use_rmsnorm = use_rmsnorm
        self.micro_batch_size = micro_batch_size

        self.max_sequence_length = max_sequence_length
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_theta = rope_theta
        self.fuse_rope = fuse_rope
        self.fuse_softmax_mask = fuse_softmax_mask
        self.use_fast_ln = use_fast_ln

        self.fuse_linear = fuse_linear
        self.ignored_index = ignored_index
        self.add_tail_layers = add_tail_layers
        self.use_recompute_lm_head = use_recompute_lm_head
        self.use_recompute_loss_fn = use_recompute_loss_fn

        # Use a fresh dict per instance; a `dict()` default argument would be shared across instances.
        self.refined_recompute = refined_recompute if refined_recompute is not None else {}
        self.skip_recompute_ops = dict()
        """
        `refined_recompute` is a dictionary that specifies fine-grained gradient recomputation settings,
        which currently only takes effect in Pipeline Parallel (PP) mode.

        In PP mode, this dictionary populates `self.skip_recompute_ops` with the following structure:
        - Key (`op_name`): The operation name to configure, with possible values:
            * "mlp_row_ln" - MLP row-wise layer normalization
            * "flash_attn" - Flash attention operation
            * "attention_row_ln" - Attention row-wise layer normalization
            * "attention_column_ln" - Attention column-wise layer normalization
            * "mlp_column_ln" - MLP column-wise layer normalization

        - Value (`skip_num`): Controls how many times recomputation is skipped:
            * 0: Never skip recomputation (minimum memory usage)
            * -1: Always skip recomputation (maximum memory usage)
            * 1..12: Skip recomputation the specified number of times
            * >= 12: Equivalent to -1 (always skip recomputation)

        This allows precise control over the memory/computation tradeoff for individual operations.
        """
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.num_key_value_heads = num_key_value_heads
        self.use_sparse_head_and_loss_fn = use_sparse_head_and_loss_fn
        self.use_ep_comm_overlap = use_ep_comm_overlap
        self.use_fused_head_and_loss_fn = use_fused_head_and_loss_fn
        self.token_balance_loss = token_balance_loss
        self.token_balance_seqlen = token_balance_seqlen
        self.cachekv_quant = cachekv_quant
        self.pp_seg_method = pp_seg_method

    def get(self, key, default=None):
        """Get a config value by key, returning `default` if the attribute is missing."""
        if hasattr(self, key):
            return getattr(self, key)
        return default


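# Minimal usage sketch (illustrative, not part of the original API): build the tiny preset
# from above via the illustrative `_get_pretrained_init_kwargs` helper and override a couple
# of fields. The `refined_recompute` value follows the semantics documented in
# `Ernie4_5_Config.__init__`; the exact op names accepted at runtime are taken from that
# docstring and should be treated as an assumption here.
def _example_dense_config() -> Ernie4_5_Config:
    """Return a small dense Ernie4_5_Config for demonstration purposes."""
    kwargs = _get_pretrained_init_kwargs("ernie/tiny-random-ernie")
    kwargs.update(
        num_attention_heads=8,
        num_key_value_heads=2,  # grouped-query attention: 8 query heads share 2 KV heads
        refined_recompute={"flash_attn": -1},  # -1: always skip recomputation of flash_attn
    )
    return Ernie4_5_Config(**kwargs)

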
class Ernie4_5_MoEConfig(Ernie4_5_Config):
    r"""
    Configuration class for the ErnieMoE model architecture.

    This class stores the configuration for a [`~ErnieModel`] and is used to instantiate
    an ErnieMoE model according to the specified arguments. Inherits from [`PretrainedConfig`]
    and can control model outputs.

    Attributes:
        Inherits all attributes from Ernie4_5_Config and adds MoE-specific configurations.
    """

    model_type = "ernie"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        moe_num_experts: Union[int, list] = 0,
        use_recompute_moe=False,
        moe_capacity=(),
        moe_layer_interval=2,
        moe_layer_start_index=0,
        moe_layer_end_index=-1,
        moe_aux_loss_lambda=1e-2,
        moe_z_loss_lambda=1e-4,
        moe_orthogonal_loss_lambda=1e-2,
        sinkhorn_2gate=True,
        sinkhorn_temp=3e-2,
        global_aux_loss=False,
        moe_dropout_prob=0.0,
        moe_group="world",
        moe_gate="top2",
        moe_intermediate_size: Union[int, list] = 0,
        moe_num_shared_experts: int = 0,
        moe_reverse_token_drop: bool = False,
        moe_gate_act: str = "softmax",
        moe_norm_gate_logits=True,
        moe_all_to_all_dropout: float = 0.0,
        moe_k=2,
        moe_use_aux_free: bool = False,
        moe_group_experts: bool = False,
        moe_group_orthogonal_loss: bool = True,
        enable_delay_scale_loss: bool = True,
        num_acc_steps: int = 1,
        fuse_gate_detach_matmul: bool = False,
        dpo_config=None,
        moe_multimodal_dispatch_use_allgather: str = "",
        moe_use_hard_gate=False,
        moe_dense_experts_token_type_id=3,
        **kwargs,
    ):
        """
        Initialize the ErnieMoE configuration with MoE-specific parameters.

        Args:
            moe_num_experts: Number of experts in MoE layers (an int, or a list with one entry per modality).
            use_recompute_moe: Whether to use recomputation for MoE layers.
            moe_capacity: Capacity configuration for MoE layers.
            moe_layer_interval: Interval between MoE layers.
            moe_layer_start_index: Starting layer index for MoE.
            moe_layer_end_index: Ending layer index for MoE (-1 means the last layer).
            moe_aux_loss_lambda: Weight for the auxiliary loss.
            moe_z_loss_lambda: Weight for the z-loss.
            moe_orthogonal_loss_lambda: Weight for the orthogonal loss.
            sinkhorn_2gate: Whether to use Sinkhorn 2-gate routing.
            sinkhorn_temp: Temperature for Sinkhorn routing.
            global_aux_loss: Whether to use a global auxiliary loss.
            moe_dropout_prob: Dropout probability for MoE layers.
            moe_group: Communication group for MoE experts.
            moe_gate: Type of gating mechanism ("top2", etc.).
            moe_intermediate_size: Intermediate size for MoE layers (an int, or a list per modality).
            moe_num_shared_experts: Number of shared experts.
            moe_reverse_token_drop: Whether to use reverse token dropping.
            moe_gate_act: Activation function for gating.
            moe_norm_gate_logits: Whether to normalize gate logits.
            moe_all_to_all_dropout: Dropout for all-to-all communication.
            moe_k: Number of experts each token is routed to.
            moe_use_aux_free: Whether to use auxiliary-loss-free routing.
            moe_group_experts: Whether to group experts (requires hard gating).
            moe_group_orthogonal_loss: Whether to use a group orthogonal loss.
            enable_delay_scale_loss: Whether to enable delayed loss scaling.
            num_acc_steps: Number of gradient accumulation steps.
            fuse_gate_detach_matmul: Whether to fuse the gate's detached matmul.
            dpo_config: Optional DPO training configuration.
            moe_multimodal_dispatch_use_allgather: All-gather dispatch strategy for multimodal MoE.
            moe_use_hard_gate: Whether to use hard gating.
            moe_dense_experts_token_type_id: Token type id routed to the dense experts.
            **kwargs: Additional base model configuration parameters.

        Note:
            When use_recompute_moe is True, recompute_granularity will be changed from "full" to "full_attn".
        """
        if use_recompute_moe:
            logger.warning(
                "`use_recompute_moe=True` disables `recompute_granularity='full'`; "
                "falling back to 'full_attn'."
            )
            if kwargs.get("recompute") and kwargs.get("recompute_granularity") == "full":
                kwargs["recompute_granularity"] = "full_attn"
        super().__init__(**kwargs)

        self.moe_num_experts = moe_num_experts
        self.use_recompute_moe = use_recompute_moe
        self.moe_capacity = moe_capacity
        self.moe_aux_loss_lambda = moe_aux_loss_lambda
        self.moe_z_loss_lambda = moe_z_loss_lambda
        self.moe_orthogonal_loss_lambda = moe_orthogonal_loss_lambda
        self.global_aux_loss = global_aux_loss
        self.sinkhorn_2gate = sinkhorn_2gate
        self.sinkhorn_temp = sinkhorn_temp
        self.moe_layer_interval = moe_layer_interval
        self.moe_dropout_prob = moe_dropout_prob
        self.moe_group = moe_group
        self.moe_gate = moe_gate
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_shared_experts = moe_num_shared_experts
        self.moe_reverse_token_drop = moe_reverse_token_drop
        self.moe_k = moe_k
        self.moe_all_to_all_dropout = moe_all_to_all_dropout
        self.moe_group_experts = moe_group_experts
        self.moe_group_orthogonal_loss = moe_group_orthogonal_loss
        self.enable_delay_scale_loss = enable_delay_scale_loss
        self.num_acc_steps = num_acc_steps
        self.moe_layer_start_index = moe_layer_start_index
        self.moe_layer_end_index = (
            self.num_hidden_layers - 1
            if moe_layer_end_index == -1
            else moe_layer_end_index
        )
        self.moe_gate_act = moe_gate_act
        self.moe_norm_gate_logits = moe_norm_gate_logits
        self.moe_use_aux_free = moe_use_aux_free
        self.fuse_gate_detach_matmul = fuse_gate_detach_matmul
        self.dpo_config = dpo_config
        self.moe_multimodal_dispatch_use_allgather = (
            moe_multimodal_dispatch_use_allgather
        )
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id

    @property
    def multimodel_experts(self) -> bool:
        """Whether separate expert groups are configured per modality."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check whether the model uses an MoE architecture.

        Returns:
            bool: True if the total number of experts is greater than 0, False otherwise.
        """
        return (
            sum(self.moe_num_experts) > 0
            if self.multimodel_experts
            else self.moe_num_experts > 0
        )


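# Illustrative sketch (not part of the original API): how the MoE properties behave for a
# single expert pool versus per-modality expert groups. The expert counts below are made-up
# demonstration values.
def _example_moe_configs() -> tuple:
    """Return (text_only, multimodal) Ernie4_5_MoEConfig instances for demonstration."""
    text_only = Ernie4_5_MoEConfig(moe_num_experts=64, moe_k=2, moe_intermediate_size=1536)
    # text_only.use_moe -> True, text_only.multimodel_experts -> False
    multimodal = Ernie4_5_MoEConfig(moe_num_experts=[64, 16], moe_intermediate_size=[1536, 512])
    # multimodal.multimodel_experts -> True (one expert group per modality),
    # multimodal.use_moe -> True (total expert count is positive)
    return text_only, multimodal

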
class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
    """
    This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of Ernie-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
    """

    model_type = "ernie4_5_moe_vl"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    base_model_tp_plan = {
        "ernie.layers.*.self_attn.qkv_proj": "colwise",
        "ernie.layers.*.self_attn.o_proj": "rowwise",
        "ernie.layers.*.mlp_text.experts.*.up_gate_proj": "colwise",
        "ernie.layers.*.mlp_text.experts.*.down_proj": "rowwise",
        "ernie.layers.*.mlp_text.gate": "colwise_rep",
        "ernie.layers.*.mlp.experts.*.up_gate_proj": "colwise",
        "ernie.layers.*.mlp.experts.*.down_proj": "rowwise",
        "ernie.layers.*.mlp.gate": "colwise_rep",
        "ernie.layers.*.mlp.up_gate_proj": "colwise",
        "ernie.layers.*.mlp.down_proj": "rowwise",
        "lm_head": "colwise_rep",
    }

    def __init__(
        self,
        vision_config=None,
        im_patch_id=None,
        pixel_hidden_size=None,
        modality_detach=False,
        temporal_conv_size=2,
        spatial_conv_size=2,
        mm_vocab_size=0,
        max_text_id=None,
        use_temporal_conv=True,
        moe_use_size_all2all=False,
        moe_num_attn_experts=False,
        moe_dense_experts_token_type_id: int = 3,
        moe_use_hard_gate: bool = True,
        moe_fuse_experts: bool = False,
        moe_use_token_type_bias: bool = False,
        disable_ffn_model_parallel=False,
        fuse_attn_ffn=True,
        rope_3d=True,
        freq_allocation=20,
        using_precision_check=False,
        use_recompute_resampler=False,
        resampler_fuse_rms_norm=False,
        moe_layer_feed_fake_token=False,
        tensor_parallel_degree=1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Accept either a plain dict, an already-built vision config, or nothing (defaults).
        if isinstance(vision_config, dict):
            self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
        elif isinstance(vision_config, DFNRopeVisionTransformerConfig):
            self.vision_config = vision_config
        else:
            self.vision_config = DFNRopeVisionTransformerConfig()
        self.im_patch_id = im_patch_id
        self.pixel_hidden_size = pixel_hidden_size
        self.modality_detach = modality_detach
        self.temporal_conv_size = temporal_conv_size
        self.spatial_conv_size = spatial_conv_size
        self.mm_vocab_size = mm_vocab_size
        self.max_text_id = max_text_id
        self.use_temporal_conv = use_temporal_conv

        self.moe_use_size_all2all = moe_use_size_all2all
        self.moe_num_attn_experts = moe_num_attn_experts
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_fuse_experts = moe_fuse_experts
        self.moe_use_token_type_bias = moe_use_token_type_bias
        self.disable_ffn_model_parallel = disable_ffn_model_parallel

        self.fuse_attn_ffn = fuse_attn_ffn
        self.rope_3d = rope_3d
        self.freq_allocation = freq_allocation
        self.using_precision_check = using_precision_check
        self.use_recompute_resampler = use_recompute_resampler
        self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
        self.moe_layer_feed_fake_token = moe_layer_feed_fake_token

        self.tensor_parallel_degree = tensor_parallel_degree

    @property
    def multimodel_experts(self) -> bool:
        """Check whether more than one multimodal expert group is configured."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check whether the model uses an MoE architecture.

        Returns:
            bool: True if the total number of experts is greater than 0, False otherwise.
        """
        return (
            sum(self.moe_num_experts) > 0
            if self.multimodel_experts
            else self.moe_num_experts > 0
        )

    def to_dict(self, saving_file=False):
        """Serialize this config (including the nested vision config) to a plain dictionary."""
        output = copy.deepcopy(self.__dict__)
        if self.vision_config:
            output["vision_config"] = (
                self.vision_config.to_dict()
                if isinstance(self.vision_config, DFNRopeVisionTransformerConfig)
                else self.vision_config
            )

        output["model_type"] = self.__class__.model_type
        return output


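# Minimal end-to-end sketch (illustrative, not part of the original API): construct a small
# VL MoE config with a nested vision config passed as a plain dict, then serialize it. The
# field values, including `im_patch_id`, are hypothetical demonstration numbers.
def _example_vl_config_round_trip() -> dict:
    """Build a small Ernie4_5_VLMoEConfig and return its dict form."""
    cfg = Ernie4_5_VLMoEConfig(
        vision_config={"depth": 2, "embed_dim": 256, "num_heads": 4},
        im_patch_id=100000,  # hypothetical id of the image-patch placeholder token
        moe_num_experts=[8, 8],  # one expert group per modality (demonstration values)
    )
    as_dict = cfg.to_dict()
    # The nested vision config is expanded back into a plain dict by to_dict().
    assert isinstance(as_dict["vision_config"], dict)
    return as_dict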