Upload folder using huggingface_hub
- .gitattributes +1 -0
- act_ckpt.py +123 -0
- added_tokens.json +28 -0
- attention.py +450 -0
- blocks.py +137 -0
- config.json +84 -0
- config_defaults.py +5 -0
- config_moe_args.py +159 -0
- configuration_mpt.py +252 -0
- custom_embedding.py +10 -0
- dmoe.py +138 -0
- fc.py +8 -0
- ffn.py +272 -0
- generation_config.json +5 -0
- layer_builders.py +33 -0
- layers_registry.py +22 -0
- merges.txt +0 -0
- model-00001-of-00007.safetensors +3 -0
- model-00002-of-00007.safetensors +3 -0
- model-00003-of-00007.safetensors +3 -0
- model-00004-of-00007.safetensors +3 -0
- model-00005-of-00007.safetensors +3 -0
- model-00006-of-00007.safetensors +3 -0
- model-00007-of-00007.safetensors +3 -0
- model.safetensors.index.json +289 -0
- modeling_mpt.py +696 -0
- mpt_param_count.py +130 -0
- norm.py +79 -0
- param_init_fns.py +448 -0
- registry_utils.py +131 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +240 -0
- vocab.json +0 -0
- warnings.py +72 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
act_ckpt.py
ADDED
@@ -0,0 +1,123 @@
from typing import Any
import torch
from .layers_registry import attention_classes, ffns, ffns_with_megablocks, ffns_with_norm, norms
from .blocks import FusedNormAttentionNorm, MPTBlock

def pass_on_block_idx(parent: torch.nn.Module):
    if not hasattr(parent, 'block_idx') or not hasattr(parent, 'max_block_idx'):
        return
    for child in parent.children():
        child.block_idx = parent.block_idx
        child.max_block_idx = parent.max_block_idx
        if child.children():
            pass_on_block_idx(child)

def get_act_ckpt_module(mod_name: str) -> Any:
    """Get the module type from the module name."""
    if mod_name.lower() == 'mptblock':
        mod_type = MPTBlock
    elif mod_name in attention_classes:
        mod_type = attention_classes.get(mod_name)
    elif mod_name.lower() == 'norm_attn_norm':
        mod_type = FusedNormAttentionNorm
    elif mod_name in ffns:
        mod_type = ffns.get(mod_name)
    elif mod_name in ffns_with_norm:
        mod_type = ffns_with_norm.get(mod_name)
    elif mod_name in ffns_with_megablocks:
        mod_type = ffns_with_megablocks.get(mod_name)
    elif mod_name in norms:
        mod_type = norms.get(mod_name)
    else:
        msg = ', '.join(list(attention_classes.get_all()) + list(ffns.get_all()) + list(ffns_with_norm.get_all()) + list(ffns_with_megablocks.get_all()) + list(norms.get_all()) + ['MPTBlock'])
        raise ValueError(f'{mod_name} (specified in activation_checkpointing_target) is not a recognized option out of available options {msg}.')
    return mod_type

def parse_ele_str(ele: str, max_block_idx: int) -> list:
    """Parse a string in target_blocks and return a list of block ids to add.

    Supported formats are: first-n, middle-m, last-k, range-i-j which correspond
    to the first n, the middle m, the last k, and the range [i, j).
    """
    to_add = None
    if ele.startswith('first-'):
        assert ele[6:].isdigit(), f'Invalid target_blocks element {ele}'
        to_add = list(range(min(int(ele[6:]), max_block_idx + 1)))
    elif ele.startswith('last-'):
        assert ele[5:].isdigit(), f'Invalid target_blocks element {ele}'
        to_add = list(range(max(max_block_idx - int(ele[5:]) + 1, 0), max_block_idx + 1))
    elif ele.startswith('middle-'):
        assert ele[7:].isdigit(), f'Invalid target_blocks element {ele}'
        num = int(ele[7:])
        start = max(max_block_idx // 2 - num // 2, 0)
        end = min(start + num, max_block_idx + 1)
        to_add = list(range(start, end))
    elif ele.startswith('range-'):
        r = ele[6:].split('-')
        assert len(r) == 2, f'Invalid target_blocks element {ele}'
        start, end = (int(r[0]), int(r[1]))
        start = max(start, 0)
        end = min(end, max_block_idx + 1)
        to_add = list(range(start, end))
    else:
        raise ValueError(f'Invalid target_blocks element {ele}')
    return to_add

def get_target_block_list(target_blocks: Any, max_block_idx: int) -> list:
    """Parse the user input and return a list of block ids."""
    candidate_block_ids = []
    if isinstance(target_blocks, int):
        candidate_block_ids = list(range(target_blocks))
    elif isinstance(target_blocks, list):
        for ele in target_blocks:
            if isinstance(ele, int):
                candidate_block_ids.append(ele)
            elif isinstance(ele, str):
                to_add = parse_ele_str(ele, max_block_idx)
                candidate_block_ids.extend(to_add)
            else:
                raise ValueError(f'target_blocks must be a list of integers or "first-n", "middle-m", "last-k", or "range-i-j" where n, m, k, i, j are integers, but got {target_blocks}')
    elif isinstance(target_blocks, str):
        target_blocks = target_blocks.replace(' ', '')
        for ele in target_blocks.split(','):
            to_add = parse_ele_str(ele, max_block_idx)
            candidate_block_ids.extend(to_add)
    else:
        raise ValueError(f'target_blocks must be either a single integer, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}')
    candidate_block_ids = list(set(candidate_block_ids))
    return candidate_block_ids

def check_mapping_blocks_overlap(mapping: dict, max_block_idx: int) -> None:
    """Check if the block ids in the mapping overlap with each other."""
    all_blocks = [None] * (max_block_idx + 1)
    for k, v in mapping.items():
        if v == -1:
            v = list(range(max_block_idx + 1))
        for vv in v:
            if vv < 0 or vv > max_block_idx:
                continue
            elif all_blocks[vv] is not None:
                raise ValueError(f'Block {vv} is assigned to both {k} and {all_blocks[vv]}. Each block can only have one granularity of activation checkpointing. Make sure the target_blocks in activation_checkpointing_target do not overlap. For more details, refer to the docs of activation_checkpointing_fn.')
            else:
                all_blocks[vv] = k

def build_act_ckpt_mod_to_blocks(act_ckpt_target: Any, top_module: Any, max_block_idx: int) -> dict:
    act_ckpt_mod_to_blocks = {}
    if act_ckpt_target is None or act_ckpt_target == []:
        mod = top_module
        act_ckpt_mod_to_blocks[mod] = -1
    elif isinstance(act_ckpt_target, str):
        mod = get_act_ckpt_module(act_ckpt_target)
        act_ckpt_mod_to_blocks[mod] = -1
    elif isinstance(act_ckpt_target, list):
        for target in act_ckpt_target:
            mod = get_act_ckpt_module(target)
            act_ckpt_mod_to_blocks[mod] = -1
    elif isinstance(act_ckpt_target, dict):
        for k, v in act_ckpt_target.items():
            mod = get_act_ckpt_module(k)
            block_ids = get_target_block_list(v, max_block_idx)
            act_ckpt_mod_to_blocks[mod] = block_ids
    else:
        raise ValueError(f'activation_checkpointing_target must be either a single string or a list or a dict, but got {type(act_ckpt_target)}')
    return act_ckpt_mod_to_blocks
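For reference, the `target_blocks` selectors accepted by `parse_ele_str` above ('first-n', 'middle-m', 'last-k', 'range-i-j') resolve to concrete block ids relative to `max_block_idx`. A minimal, self-contained sketch (illustrative only, not part of this commit) of how they resolve for a hypothetical 28-block model (max_block_idx = 27):

# Illustrative sketch: reproduces the selector arithmetic of parse_ele_str
# for a hypothetical model with 28 blocks (max_block_idx = 27).
max_block_idx = 27

def first(n):
    # 'first-n' -> the first n blocks, clipped to the model depth
    return list(range(min(n, max_block_idx + 1)))

def last(k):
    # 'last-k' -> the last k blocks
    return list(range(max(max_block_idx - k + 1, 0), max_block_idx + 1))

def middle(m):
    # 'middle-m' -> m blocks centered on the middle of the stack
    start = max(max_block_idx // 2 - m // 2, 0)
    return list(range(start, min(start + m, max_block_idx + 1)))

print(first(3))    # 'first-3'  -> [0, 1, 2]
print(last(2))     # 'last-2'   -> [26, 27]
print(middle(4))   # 'middle-4' -> [11, 12, 13, 14]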
added_tokens.json
ADDED
@@ -0,0 +1,28 @@
{
  "</think>": 151668,
  "</tool_call>": 151658,
  "</tool_response>": 151666,
  "<think>": 151667,
  "<tool_call>": 151657,
  "<tool_response>": 151665,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
attention.py
ADDED
@@ -0,0 +1,450 @@
"""Attention layers."""
import copy
import math
import warnings
from typing import Any, Optional
import torch
import transformers
from einops import rearrange
from packaging import version
from torch import nn
from .layers_registry import attention_classes, attention_implementations
from .layer_builders import build_fc, build_norm
from .config_defaults import fc_type_defaults

def is_flash_v2_installed(v2_version: str='2.0.0'):
    assert version.parse(v2_version) >= version.parse('2.0.0')
    try:
        import flash_attn as flash_attn
    except:
        return False
    return version.parse(flash_attn.__version__) >= version.parse(v2_version)

def is_flash_v1_installed():
    try:
        import flash_attn as flash_attn
    except:
        return False
    return version.parse(flash_attn.__version__) < version.parse('2.0.0')

def is_transformers_version_gte(hf_version: str) -> bool:
    return version.parse(transformers.__version__) >= version.parse(hf_version)

def check_alibi_support(attention_impl: str) -> bool:
    return attention_impl != 'flash' or is_flash_v2_installed(v2_version='v2.4.2')
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb

def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool) -> bool:
    if original_is_causal and num_query_tokens != num_key_tokens:
        if num_query_tokens != 1:
            raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
        else:
            return False
    return original_is_causal

def repeat_kv_for_gqa(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Perform repeat of kv heads along a particular dimension.

    hidden.shape expected to be: (batch size, seq len, kv_n_heads, head_dim)
    n_rep: amount of repetitions of kv_n_heads
    Unlike torch.repeat_interleave, this function avoids allocating new memory.
    """
    if n_rep == 1:
        return hidden
    b, s, kv_n_heads, d = hidden.shape
    hidden = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, d)
    return hidden.reshape(b, s, kv_n_heads * n_rep, d)

def scaled_multihead_dot_product_attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False, attn_logit_softcapping: Optional[float]=None, sliding_window_size: int=-1) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
    k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
    v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
    if past_key_value is not None:
        if len(past_key_value) != 0:
            k = torch.cat([past_key_value[0], k], dim=3)
            v = torch.cat([past_key_value[1], v], dim=2)
        past_key_value = (k, v)
    b, _, s_q, d = q.shape
    s_k = k.size(-1)
    if kv_n_heads > 1 and kv_n_heads < n_heads:
        k = repeat_kv_for_gqa(k.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
        v = repeat_kv_for_gqa(v.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
    if softmax_scale is None:
        softmax_scale = 1 / math.sqrt(d)
    attn_weight = q.matmul(k) * softmax_scale
    if attn_logit_softcapping is not None:
        attn_weight = attn_logit_softcapping * torch.tanh(attn_weight / attn_logit_softcapping)
    if attn_bias is not None:
        _s_q = max(0, attn_bias.size(2) - s_q)
        _s_k = max(0, attn_bias.size(3) - s_k)
        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
        if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
            raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
        attn_weight = attn_weight + attn_bias
    min_val = torch.finfo(q.dtype).min
    if key_padding_mask is not None:
        if attn_bias is not None:
            warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
        attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
    if is_causal and (not s_q == 1):
        s = max(s_q, s_k)
        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
        causal_mask = causal_mask.tril()
        causal_mask = causal_mask.to(torch.bool)
        causal_mask = ~causal_mask
        causal_mask = causal_mask[-s_q:, -s_k:]
        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
    if sliding_window_size != -1:
        window_mask = torch.ones((s_q, s_k), dtype=torch.bool, device=attn_weight.device)
        if not s_q == 1:
            if s_q != s_k:
                raise ValueError('Number of queries should be equal to the number of keys.')
            window_mask = torch.tril(window_mask, diagonal=sliding_window_size)
            window_mask = torch.triu(window_mask, diagonal=-sliding_window_size)
        else:
            window_mask[:, :-(sliding_window_size + 1)] = False
        window_mask = ~window_mask
        attn_weight = attn_weight.masked_fill(window_mask.view(1, 1, s_q, s_k), min_val)
    attn_weight = torch.softmax(attn_weight, dim=-1)
    if dropout_p:
        attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
    out = attn_weight.to(v.dtype).matmul(v)
    out = rearrange(out, 'b h s d -> b s (h d)')
    if needs_weights:
        return (out, attn_weight, past_key_value)
    return (out, None, past_key_value)

def check_valid_inputs(*tensors: torch.Tensor, valid_dtypes: Optional[list[torch.dtype]]=None):
    if valid_dtypes is None:
        valid_dtypes = [torch.float32, torch.float16, torch.bfloat16]
    for tensor in tensors:
        if tensor.dtype not in valid_dtypes:
            raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.')
        if not tensor.is_cuda:
            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')

def flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False, should_repeat_kv_for_gqa: Optional[bool]=True, sliding_window_size: int=-1, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None, attn_logit_softcapping: Optional[float]=None) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
    if key_padding_mask is not None:
        raise ValueError('key_padding_mask should be None for flash attn.')
    del key_padding_mask
    if flash_attn_padding_info is None:
        raise ValueError('flash_attn_padding_info is required for flash attn.')
    try:
        from flash_attn import bert_padding, flash_attn_interface
    except:
        raise RuntimeError('Please install flash-attn==1.0.9 or flash-attn==2.3.6')
    check_valid_inputs(query, key, value)
    if past_key_value is not None:
        if len(past_key_value) != 0:
            key = torch.cat([past_key_value[0], key], dim=1)
            value = torch.cat([past_key_value[1], value], dim=1)
        past_key_value = (key, value)
    if attn_bias is not None:
        raise NotImplementedError(f'attn_bias not implemented for flash attn.')
    batch_size, seqlen = query.shape[:2]
    indices_q = flash_attn_padding_info['indices_q'].to(query.device)
    indices_k = flash_attn_padding_info['indices_k'].to(key.device)
    indices_v = flash_attn_padding_info['indices_v'].to(value.device)
    cu_seqlens_q = flash_attn_padding_info['cu_seqlens_q'].to(query.device)
    cu_seqlens_k = flash_attn_padding_info['cu_seqlens_k'].to(key.device)
    max_seqlen_q = flash_attn_padding_info['max_seqlen_q']
    max_seqlen_k = flash_attn_padding_info['max_seqlen_k']
    query_unpad = bert_padding.index_first_axis(rearrange(query, 'b s ... -> (b s) ...'), indices_q)
    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
    key_unpad = bert_padding.index_first_axis(rearrange(key, 'b s ... -> (b s) ...'), indices_k)
    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
    value_unpad = bert_padding.index_first_axis(rearrange(value, 'b s ... -> (b s) ...'), indices_v)
    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
    if kv_n_heads < n_heads and (not is_flash_v2_installed()) and (not should_repeat_kv_for_gqa):
        raise ValueError('For Grouped Query Attention or Multi Query Attention, should_repeat_kv_for_gqa should be set to True if not using Flash Attention v2.')
    if should_repeat_kv_for_gqa:
        if kv_n_heads == 1:
            key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
            value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
        elif kv_n_heads < n_heads:
            key_unpad = repeat_kv_for_gqa(key_unpad.view(1, key_unpad.size(0), kv_n_heads, -1), n_heads // kv_n_heads).view(key_unpad.size(0), n_heads, -1)
            value_unpad = repeat_kv_for_gqa(value_unpad.view(1, value_unpad.size(0), kv_n_heads, -1), n_heads // kv_n_heads).view(value_unpad.size(0), n_heads, -1)
    dropout_p = dropout_p if training else 0.0
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    if is_flash_v1_installed():
        output_unpad = flash_attn_interface.flash_attn_unpadded_func(q=query_unpad, k=key_unpad, v=value_unpad, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, dropout_p=dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
    elif is_flash_v2_installed():
        extra_attn_kwargs = {}
        if check_alibi_support('flash'):
            extra_attn_kwargs['alibi_slopes'] = alibi_slopes
        elif alibi_slopes is not None:
            raise ValueError('alibi_slopes is only supported for flash-attn>=2.4.2')
        if is_flash_v2_installed(v2_version='v2.6.2') and attn_logit_softcapping is not None:
            extra_attn_kwargs['softcap'] = attn_logit_softcapping
        output_unpad = flash_attn_interface.flash_attn_varlen_func(q=query_unpad, k=key_unpad, v=value_unpad, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, dropout_p=dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights, window_size=(sliding_window_size, sliding_window_size), **extra_attn_kwargs)
    else:
        raise RuntimeError('flash-attn==1.0.9 or flash-attn==2.4.2 is required.')
    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
    return (output, None, past_key_value)

@attention_classes.register_class('grouped_query_attention')
class GroupedQueryAttention(nn.Module):
    """Grouped Query Attention (GQA) is a generalization of Multi-head (MHA).

    and Multi-query attention (MQA).

    This allows the user to set a variable of number of kv_n_heads, rather than
    just n_heads or 1, as in MHA and MQA. Using torch attention implementation
    enables user to also use additive bias. This class also supports
    cross-attention with different `in_features` for key and value fc projections.
    """

    def __init__(self, d_model: int, n_heads: int, kv_n_heads: int, attn_impl: str='flash', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, fused_qkv: bool=True, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', norm_eps: float=1e-05, fc_type: Optional[dict[str, Any]]=None, device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1, reuse_kv_layer_idx: Optional[int]=None, attn_logit_softcapping: Optional[float]=None, kv_dim: Optional[int]=None):
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.qk_gn = qk_gn
        self.fused_qkv = fused_qkv
        self.d_model = d_model
        self.n_heads = n_heads
        self.kv_n_heads = kv_n_heads
        self.sliding_window_size = sliding_window_size
        self.reuse_kv_layer_idx = reuse_kv_layer_idx
        self.attn_logit_softcapping = attn_logit_softcapping
        self.kv_dim = kv_dim if kv_dim is not None else self.d_model
        self.head_dim = d_model // n_heads
        if fc_type is None:
            fc_type = copy.deepcopy(fc_type_defaults)
            fc_type['bias'] = bias
            fc_type['device'] = device
        fc_type_name = fc_type['name']
        if self.kv_n_heads <= 0:
            raise ValueError('kv_n_heads should be greater than zero.')
        if self.kv_n_heads > self.n_heads:
            raise ValueError('The number of KV heads should be less than or equal to Q heads.')
        if self.n_heads % self.kv_n_heads != 0:
            raise ValueError('Each Q head should get the same number of KV heads, so n_heads must be divisible by kv_n_heads.')
        if qk_ln and qk_gn:
            raise ValueError('Only one of qk_ln and qk_gn can be set to True.')
        self.softmax_scale = softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
        self.attn_dropout_p = attn_pdrop
        if self.reuse_kv_layer_idx is not None:
            self.Wq = build_fc(name=fc_type_name, in_features=self.d_model, out_features=self.d_model, fc_kwargs=fc_type)
            fuse_splits = [i * self.head_dim for i in range(1, self.n_heads)]
            self.Wq._fused = (0, fuse_splits)
        elif self.fused_qkv:
            self.Wqkv = build_fc(name=fc_type_name, in_features=self.d_model, out_features=self.d_model + 2 * self.kv_n_heads * self.head_dim, fc_kwargs=fc_type)
            fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)]
            self.Wqkv._fused = (0, fuse_splits)
        else:
            self.Wq = build_fc(name=fc_type_name, in_features=self.d_model, out_features=self.d_model, fc_kwargs=fc_type)
            self.Wk = build_fc(name=fc_type_name, in_features=self.kv_dim, out_features=self.kv_n_heads * self.head_dim, fc_kwargs=fc_type)
            self.Wv = build_fc(name=fc_type_name, in_features=self.kv_dim, out_features=self.kv_n_heads * self.head_dim, fc_kwargs=fc_type)
            q_fuse_splits = [i * self.head_dim for i in range(1, self.n_heads)]
            kv_fuse_splits = [i * self.head_dim for i in range(1, self.kv_n_heads)]
            self.Wq._fused = (0, q_fuse_splits)
            self.Wk._fused = (0, kv_fuse_splits)
            self.Wv._fused = (0, kv_fuse_splits)
        if self.qk_ln or self.qk_gn:
            norm_size = self.head_dim if qk_gn else d_model
            self.q_ln = build_norm(name=norm_type.lower(), normalized_shape=norm_size, eps=norm_eps, device=device)
            if self.reuse_kv_layer_idx is None:
                if qk_ln:
                    norm_size = self.head_dim * kv_n_heads
                self.k_ln = build_norm(name=norm_type.lower(), normalized_shape=norm_size, eps=norm_eps, device=device)
        self.attn_fn = attention_implementations.get(self.attn_impl)
        self.out_proj = build_fc(name=fc_type_name, in_features=self.d_model, out_features=self.d_model, fc_kwargs=fc_type)
        self.out_proj._is_residual = True

    def forward(self, x: torch.Tensor, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[dict]=None, is_causal: bool=True, needs_weights: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None, prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, key_value_states: Optional[torch.Tensor]=None) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
        extra_kwargs = {}
        if prev_layer_key_value is not None:
            extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
        query, key, value = self.get_qkv(x=x, key_value_states=key_value_states, **extra_kwargs)
        if rotary_emb_w_meta_info is not None:
            query, key, value = self._apply_rotary_embeddings(rotary_emb_w_meta_info, query, key, value)
        extra_attn_kwargs = self.get_implementation_specific_args(attention_mask, alibi_slopes, flash_attn_padding_info)
        context, attn_weights, past_key_value = self.attn_fn(query, key, value, n_heads=self.n_heads, kv_n_heads=self.kv_n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, attn_logit_softcapping=self.attn_logit_softcapping, sliding_window_size=self.sliding_window_size, **extra_attn_kwargs)
        return (self.out_proj(context), attn_weights, past_key_value)

    def get_qkv(self, x: torch.Tensor, prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, key_value_states: Optional[torch.Tensor]=None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Computes and returns the query, key, and value tensors.

        Args:
            x (torch.Tensor): The input query tensor.
            prev_layer_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): The key value of the previous layer.
            key_value_states (Optional[torch.Tensor]): The input tensor for keys and values.

        Returns:
            query (torch.Tensor): The query tensor.
            key (torch.Tensor): The key tensor.
            value (torch.Tensor): The value tensor.
        """
        if self.reuse_kv_layer_idx is not None:
            if prev_layer_key_value is None:
                raise ValueError('prev_layer_key_value is None, cannot reuse_prev_layer_kv.')
            key, value = prev_layer_key_value
            if self.attn_impl == 'torch':
                key = rearrange(key, 'b h d s -> b s (h d)')
                value = rearrange(value, 'b h s d -> b s (h d)')
            query = self.Wq(x)
            if self.clip_qkv:
                query = query.clamp(min=-self.clip_qkv, max=self.clip_qkv)
            if self.qk_ln or self.qk_gn:
                q_shape = query.shape
                if self.qk_gn:
                    b, s = query.shape[:2]
                    query = query.view(b, s, self.n_heads, -1)
                dtype = query.dtype
                query = self.q_ln(query).to(dtype).view(q_shape)
            return (query, key, value)
        if self.fused_qkv:
            if key_value_states is not None:
                raise ValueError('Cannot use separate hidden and key_value states when fused_qkv = True.')
            qkv = self.Wqkv(x)
            if self.clip_qkv:
                qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
            query, key, value = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2)
        else:
            query = self.Wq(x)
            if key_value_states is not None:
                key = self.Wk(key_value_states)
                value = self.Wv(key_value_states)
            else:
                key = self.Wk(x)
                value = self.Wv(x)
            if self.clip_qkv:
                query = query.clamp(min=-self.clip_qkv, max=self.clip_qkv)
                key = key.clamp(min=-self.clip_qkv, max=self.clip_qkv)
                value = value.clamp(min=-self.clip_qkv, max=self.clip_qkv)
        if self.qk_ln or self.qk_gn:
            q_shape, k_shape = (query.shape, key.shape)
            if self.qk_gn:
                b, s = query.shape[:2]
                query = query.view(b, s, self.n_heads, -1)
                key = key.view(b, s, self.kv_n_heads, -1)
            dtype = query.dtype
            query = self.q_ln(query).to(dtype).view(q_shape)
            key = self.k_ln(key).to(dtype).view(k_shape)
        return (query, key, value)

    def _apply_rotary_embeddings(self, rotary_emb_w_meta_info: dict[str, Any], query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        if self.reuse_kv_layer_idx is not None:
            orig_key, orig_value = (key, value)
            key, value = (torch.empty_like(key), torch.empty_like(value))
        rotary_emb = rotary_emb_w_meta_info['rotary_emb']
        seq_len = rotary_emb_w_meta_info['seq_len']
        offset_info = rotary_emb_w_meta_info['offset_info']
        bsz, seqlen = query.shape[:2]
        query = query.view(bsz, seqlen, -1, self.head_dim)
        key = key.view(bsz, seqlen, -1, self.head_dim)
        if rotary_emb_w_meta_info['impl'] == 'dail':
            value = value.view(bsz, seqlen, -1, self.head_dim)
            kv = torch.stack([key, value], dim=2)
            query, kv = rotary_emb(query, kv, seqlen_offset=offset_info, max_seqlen=seq_len)
            [key, value] = torch.unbind(kv, dim=2)
            value = value.view(bsz, seqlen, -1)
        elif rotary_emb_w_meta_info['impl'] == 'hf':
            if is_transformers_version_gte('4.38'):
                cos, sin = rotary_emb(x=value, position_ids=offset_info)
            else:
                cos, sin = rotary_emb(x=value, seq_len=seq_len)
            if is_transformers_version_gte('4.38'):
                cos = cos.to(query.device)
                sin = sin.to(query.device)
                query, key = apply_rotary_pos_emb(q=query, k=key, cos=cos, sin=sin, position_ids=None, unsqueeze_dim=2)
            elif is_transformers_version_gte('4.36'):
                query, key = apply_rotary_pos_emb(q=query, k=key, cos=cos, sin=sin, position_ids=offset_info, unsqueeze_dim=2)
            else:
                query = query.transpose(1, 2)
                key = key.transpose(1, 2)
                query, key = apply_rotary_pos_emb(q=query, k=key, cos=cos, sin=sin, position_ids=offset_info)
                query = query.transpose(1, 2)
                key = key.transpose(1, 2)
        query = query.view(bsz, seqlen, -1)
        key = key.view(bsz, seqlen, -1)
        if self.reuse_kv_layer_idx is not None:
            return (query, orig_key, orig_value)
        return (query, key, value)

    def get_implementation_specific_args(self, attention_mask: Optional[torch.Tensor]=None, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> dict[str, Any]:
        """Returns attention implementation specific args.

        Args:
            attention_mask (Optional[torch.Tensor]): The attention mask.
            alibi_slopes (Optional[torch.Tensor]): The alibi slopes.
            flash_attn_padding_info (Optional[dict[str, torch.Tensor]]): The padding information, only required for flash attention.

        Returns:
            extra_attn_kwargs (dict[str, Any]): Implementation specific args.
        """
        if self.attn_impl == 'flash':
            extra_attn_kwargs = {'should_repeat_kv_for_gqa': not is_flash_v2_installed(), 'alibi_slopes': alibi_slopes, 'flash_attn_padding_info': flash_attn_padding_info, 'key_padding_mask': None}
        else:
            extra_attn_kwargs = {'key_padding_mask': attention_mask}
        return extra_attn_kwargs

@attention_classes.register_class('multihead_attention')
class MultiheadAttention(GroupedQueryAttention):
    """Multi-head self attention.

    Using torch attention implementation enables user to also use additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='flash', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, fused_qkv: bool=True, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', norm_eps: float=1e-05, fc_type: Optional[dict[str, Any]]=None, device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1, reuse_kv_layer_idx: Optional[int]=None, attn_logit_softcapping: Optional[float]=None, kv_dim: Optional[int]=None):
        super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=n_heads, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, fused_qkv=fused_qkv, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, norm_eps=norm_eps, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size, reuse_kv_layer_idx=reuse_kv_layer_idx, attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim)

@attention_classes.register_class('multiquery_attention')
class MultiQueryAttention(GroupedQueryAttention):
    """Multi-Query self attention.

    Using torch attention implementation enables user to also use additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='flash', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, fused_qkv: bool=True, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', norm_eps: float=1e-05, fc_type: Optional[dict[str, Any]]=None, device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1, reuse_kv_layer_idx: Optional[int]=None, attn_logit_softcapping: Optional[float]=None, kv_dim: Optional[int]=None):
        super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=1, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, fused_qkv=fused_qkv, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, norm_eps=norm_eps, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size, reuse_kv_layer_idx=reuse_kv_layer_idx, attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim)

def attn_bias_shape(attn_impl: str, n_heads: int, seq_len: int, alibi: bool, causal: bool, use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
    if attn_impl == 'flash':
        return None
    elif attn_impl == 'torch':
        if alibi:
            if not causal or use_sequence_id:
                return (1, n_heads, seq_len, seq_len)
            return (1, n_heads, 1, seq_len)
        elif use_sequence_id:
            return (1, 1, seq_len, seq_len)
        return None
    else:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

def build_attn_bias(attn_impl: str, attn_bias: torch.Tensor, n_heads: int, seq_len: int, causal: bool=False, alibi: bool=False, alibi_bias_max: int=8) -> Optional[torch.Tensor]:
    if attn_impl == 'flash':
        return None
    elif attn_impl == 'torch':
        if alibi:
            device, dtype = (attn_bias.device, attn_bias.dtype)
            attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
        return attn_bias
    else:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

def gen_slopes(n_heads: int, alibi_bias_max: int=8, device: Optional[torch.device]=None, return_1d: bool=False) -> torch.Tensor:
    _n_heads = 2 ** math.ceil(math.log2(n_heads))
    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
    m = m.mul(alibi_bias_max / _n_heads)
    slopes = 1.0 / torch.pow(2, m)
    if _n_heads != n_heads:
        slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
    if return_1d:
        return slopes
    return slopes.view(1, n_heads, 1, 1)

def build_alibi_bias(n_heads: int, seq_len: int, full: bool=False, alibi_bias_max: int=8, device: Optional[torch.device]=None, dtype: Optional[torch.dtype]=None) -> torch.Tensor:
    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
    if full:
        alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
        alibi_bias = alibi_bias.abs().mul(-1)
    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
    alibi_bias = alibi_bias * slopes
    return alibi_bias.to(dtype=dtype)
attention_implementations.register('flash', func=flash_attn_fn)
attention_implementations.register('torch', func=scaled_multihead_dot_product_attention)
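For reference, `repeat_kv_for_gqa` above uses `expand` to broadcast each KV head `n_rep` times without copying, and only `reshape` materializes the repeated layout. A small standalone sketch (illustrative only, with hypothetical shapes) of that trick:

# Illustrative sketch: the expand + reshape trick from repeat_kv_for_gqa,
# shown standalone. Shapes follow the docstring:
# (batch, seq, kv_n_heads, head_dim), with n_rep = n_heads // kv_n_heads.
import torch

b, s, kv_n_heads, head_dim, n_rep = 2, 16, 8, 128, 3  # hypothetical sizes
hidden = torch.randn(b, s, kv_n_heads, head_dim)

# expand adds a broadcast dim without allocating; reshape then yields
# the (b, s, kv_n_heads * n_rep, head_dim) layout expected downstream
repeated = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, head_dim)
repeated = repeated.reshape(b, s, kv_n_heads * n_rep, head_dim)

assert repeated.shape == (2, 16, 24, 128)
# each kv head appears n_rep times in a row in the repeated head dimension
assert torch.equal(repeated[:, :, 0], hidden[:, :, 0])
assert torch.equal(repeated[:, :, 1], hidden[:, :, 0])
assert torch.equal(repeated[:, :, 3], hidden[:, :, 1])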
blocks.py
ADDED
@@ -0,0 +1,137 @@
"""GPT Blocks used for the GPT Model."""
import copy
from typing import Any, Optional
import torch
import torch.nn as nn
from .layers_registry import ffns_with_norm
from .layer_builders import build_attention_layer, build_ffn, build_norm
from .config_defaults import attn_config_defaults, fc_type_defaults
try:
    from flash_attn.bert_padding import unpad_input, pad_input
except:
    unpad_input, pad_input = (None, None)

class MPTBlock(nn.Module):

    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Optional[dict]=None, ffn_config: Optional[dict]=None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', norm_eps: float=1e-05, fc_type: Optional[dict[str, Any]]=None, device: Optional[str]=None, no_bias: bool=False, use_pad_tok_in_ffn: bool=True, **kwargs: Any):
        if attn_config is None:
            attn_config = attn_config_defaults
        if ffn_config is None:
            self.ffn_config: dict[str, Any] = {'ffn_type': 'mptmlp'}
        else:
            self.ffn_config = ffn_config
        if fc_type is None:
            fc_type = copy.deepcopy(fc_type_defaults)
            fc_type['bias'] = not no_bias
            fc_type['device'] = device
        self.ffn_config['fc_type'] = fc_type
        self.fuse_norm_attn_norm = kwargs.get('fuse_norm_attn_norm', False)
        del kwargs
        super().__init__()
        ffn_type = self.ffn_config['ffn_type']
        ffn_has_norm = ffn_type in ffns_with_norm
        if self.fuse_norm_attn_norm:
            self.norm_attn_norm = FusedNormAttentionNorm(d_model=d_model, n_heads=n_heads, args_to_exclude_in_attn_class=self.args_to_exclude_in_attn_class, attn_config=attn_config, ffn_has_norm=ffn_has_norm, fc_type=fc_type, resid_pdrop=resid_pdrop, norm_type=norm_type, norm_eps=norm_eps, device=device, no_bias=no_bias)
        else:
            assert isinstance(attn_config['attn_type'], str)
            attn_config_subset_for_attn_class = {k: v for k, v in attn_config.items() if k not in self.args_to_exclude_in_attn_class}
            self.norm_1 = build_norm(name=norm_type.lower(), normalized_shape=d_model, eps=norm_eps, device=device)
            self.attn = build_attention_layer(name=attn_config['attn_type'], attn_kwargs={'d_model': d_model, 'n_heads': n_heads, 'fc_type': fc_type, 'device': device, 'bias': not no_bias, **attn_config_subset_for_attn_class})
            self.norm_2 = None
            if not ffn_has_norm:
                self.norm_2 = build_norm(name=norm_type.lower(), normalized_shape=d_model, eps=norm_eps, device=device)
        self.ffn = build_ffn(name=ffn_type, d_model=d_model, expansion_ratio=expansion_ratio, device=device, bias=not no_bias, ffn_kwargs=self.ffn_config)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
        self.use_pad_tok_in_ffn = use_pad_tok_in_ffn

    @property
    def args_to_exclude_in_attn_class(self):
        return {'attn_type', 'alibi', 'attn_uses_sequence_id', 'alibi_bias_max', 'rope', 'rope_theta', 'rope_impl', 'rope_dail_config', 'rope_hf_config'}

    def forward(self, x: torch.Tensor, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[dict]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True, output_attentions: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None, prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, key_value_states: Optional[torch.Tensor]=None) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
        extra_kwargs = {}
        if prev_layer_key_value is not None:
            extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
        if key_value_states is not None:
            extra_kwargs['key_value_states'] = key_value_states
        if self.fuse_norm_attn_norm:
            x, m, attn_weights, past_key_value = self.norm_attn_norm(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, output_attentions=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info, **extra_kwargs)
        else:
            a = self.norm_1(x)
            b, attn_weights, past_key_value = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info, **extra_kwargs)
            x = x + self.resid_attn_dropout(b)
            m = x
            if self.norm_2 is not None:
                m = self.norm_2(x)
        n = self.apply_ffn(attention_mask, m)
        x = x.to(device=n.device) + self.resid_ffn_dropout(n).to(device=n.device)
        return (x, attn_weights, past_key_value)

    def apply_ffn(self, attention_mask: Optional[torch.ByteTensor], m: torch.Tensor) -> torch.Tensor:
        """Apply feed forward layers to the input.

        Args:
            attention_mask (Optional[torch.ByteTensor]): The attention mask.
            m (torch.Tensor): The input.

        Returns:
            n (torch.Tensor): The output.
        """
        batch_size, seq_len = m.size()[:2]
        indices = None
        if not self.use_pad_tok_in_ffn and attention_mask is not None:
            assert unpad_input is not None
            attention_mask = self.slice_attention_mask(attention_mask, seq_len)
            m, indices, *_ = unpad_input(m, attention_mask)
        n = self.ffn(m)
        if not self.use_pad_tok_in_ffn and attention_mask is not None:
            assert pad_input is not None
            n = pad_input(n, indices, batch_size, seq_len)
        return n

    def slice_attention_mask(self, attention_mask: torch.ByteTensor, seq_len: int) -> torch.ByteTensor:
        """Slice attention mask to the correct size.

        Can be overridden by subclasses to apply different slicing logic.

        Args:
            attention_mask (torch.ByteTensor): The attention mask.
            seq_len (int): The sequence length.

        Returns:
            torch.ByteTensor: The sliced attention mask.
        """
        return attention_mask

class FusedNormAttentionNorm(nn.Module):

    def __init__(self, d_model: int, n_heads: int, args_to_exclude_in_attn_class: set[str], attn_config: Optional[dict]=None, ffn_has_norm: bool=False, fc_type: Optional[dict[str, Any]]=None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', norm_eps: float=1e-05, device: Optional[str]=None, no_bias: bool=False, **kwargs: Any):
        super().__init__()
        assert attn_config is not None
        assert isinstance(attn_config['attn_type'], str)
        if fc_type is None:
            fc_type = copy.deepcopy(fc_type_defaults)
            fc_type['bias'] = not no_bias
            fc_type['device'] = device
        attn_config_subset_for_attn_class = {k: v for k, v in attn_config.items() if k not in args_to_exclude_in_attn_class}
        self.norm_1 = build_norm(name=norm_type.lower(), normalized_shape=d_model, eps=norm_eps, device=device)
        self.attn = build_attention_layer(name=attn_config['attn_type'], attn_kwargs={'d_model': d_model, 'n_heads': n_heads, 'fc_type': fc_type, 'device': device, 'bias': not no_bias, **attn_config_subset_for_attn_class})
        self.norm_2 = None
        if not ffn_has_norm:
            self.norm_2 = build_norm(name=norm_type.lower(), normalized_shape=d_model, eps=norm_eps, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[dict]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True, output_attentions: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None, prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, key_value_states: Optional[torch.Tensor]=None) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
        a = self.norm_1(x)
        extra_kwargs = {}
        if prev_layer_key_value is not None:
            extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
        if key_value_states is not None:
            extra_kwargs['key_value_states'] = key_value_states
        b, attn_weights, past_key_value = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info, **extra_kwargs)
        x = x + self.resid_attn_dropout(b)
        m = x
        if self.norm_2 is not None:
            m = self.norm_2(x)
        return (x, m, attn_weights, past_key_value)
config.json
ADDED
@@ -0,0 +1,84 @@
{
  "architectures": [
    "MPTForCausalLM"
  ],
  "attn_config": {
    "alibi": false,
    "alibi_bias_max": 8,
    "attn_impl": "torch",
    "attn_logit_softcapping": null,
    "attn_pdrop": 0.0,
    "attn_type": "grouped_query_attention",
    "attn_uses_sequence_id": true,
    "clip_qkv": null,
    "fused_qkv": true,
    "kv_dim": null,
    "kv_n_heads": 8,
    "qk_gn": false,
    "qk_ln": false,
    "rope": true,
    "rope_dail_config": {
      "pos_idx_in_fp32": true,
      "type": "original",
      "xpos_scale_base": 512
    },
    "rope_hf_config": {
      "factor": 1.0,
      "type": "no_scaling"
    },
    "rope_impl": "dail",
    "rope_theta": 50000,
    "sliding_window_size": -1,
    "softmax_scale": null
  },
  "auto_map": {
    "AutoConfig": "configuration_mpt.MPTConfig",
    "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
  },
  "block_overrides": null,
  "d_model": 3072,
  "emb_pdrop": 0.0,
  "embedding_fraction": 1.0,
  "expansion_ratio": 4,
  "fc_type": {
    "name": "torch"
  },
  "ffn_config": {
    "fc_type": {
      "name": "torch"
    },
    "ffn_act_fn": {
      "name": "silu"
    },
    "ffn_type": "mptmlp"
  },
  "ffn_hidden_size": 8192,
  "final_logit_softcapping": null,
  "init_config": {
    "emb_init_std": null,
    "emb_init_uniform_lim": null,
    "fan_mode": "fan_in",
    "init_div_is_residual": true,
    "init_gain": 0.0,
    "init_nonlinearity": "relu",
    "init_std": null,
    "name": "kaiming_normal_"
  },
  "init_device": "cpu",
  "layer_norm_epsilon": 1e-05,
  "learned_pos_emb": false,
  "logit_scale": null,
  "max_seq_len": 4096,
  "model_type": "mpt",
  "n_heads": 24,
  "n_layers": 28,
  "no_bias": false,
  "norm_eps": 1e-05,
  "norm_type": "rmsnorm",
  "resid_pdrop": 0.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": false,
  "use_pad_tok_in_ffn": true,
  "vocab_size": 152000
}
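For reference, `auto_map` in config.json points AutoConfig and AutoModelForCausalLM at the custom `configuration_mpt` / `modeling_mpt` modules added in this commit, so loading the checkpoint goes through `trust_remote_code`. A sketch (illustrative only; the repo path is a placeholder, not taken from this commit):

# Illustrative sketch: loading a checkpoint whose config.json uses auto_map
# to custom configuration/modeling modules. "path/to/this-repo" is a
# hypothetical repo id or local directory, not part of the commit.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo = "path/to/this-repo"  # placeholder
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(
    repo,
    config=config,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    trust_remote_code=True,      # required because of the auto_map entries
)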
config_defaults.py
ADDED
@@ -0,0 +1,5 @@
"""Defaults for MPT model component configs."""
ffn_config_defaults: dict = {'ffn_type': 'mptmlp'}
attn_config_defaults: dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'flash', 'qk_ln': False, 'qk_gn': False, 'fused_qkv': True, 'clip_qkv': None, 'softmax_scale': None, 'attn_uses_sequence_id': False, 'sliding_window_size': -1, 'attn_logit_softcapping': None, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, 'rope_theta': 10000, 'rope_impl': 'dail', 'rope_dail_config': {'type': 'original', 'pos_idx_in_fp32': True, 'xpos_scale_base': 512}, 'rope_hf_config': {'type': 'no_scaling', 'factor': 1.0}, 'kv_dim': None}
init_config_defaults: dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
fc_type_defaults: dict = {'name': 'torch'}
config_moe_args.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Helper function to configure MPT with MoEs."""
import inspect
from typing import Callable, Optional, Union
import torch
from packaging import version
from torch import distributed
from torch.distributed._tensor import DeviceMesh
from .layers_registry import ffns_with_megablocks
from .ffn import resolve_ffn_hidden_size

def create_process_group_ranks(ranks: tuple[int, ...]):
    """Creates a new distributed group.

    Used in create_set_process_group and create_mod_process_group methods below.

    This function is an alternative to `distributed.new_group(ranks)`.

    Args:
        ranks (tuple[int, ...]): Tuple of ranks of group members.

    Returns:
        A handle of distributed group that can be given to collective calls.
    """
    ranks_gather_list = [None for _ in range(distributed.get_world_size())]
    distributed.all_gather_object(ranks_gather_list, ranks)
    ranks_per_subgroup = list(set(ranks_gather_list))
    group, _ = distributed.distributed_c10d.new_subgroups_by_enumeration(ranks_per_subgroup)
    return group

def create_set_process_group(k: int):
    """Creates a new distributed group using sets of k GPUs.

    For example, if you have 16 GPUs and input k=4, the resulting process groups
    will have ranks:
        process group 0 ranks: [ 0, 1, 2, 3]
        process group 1 ranks: [ 4, 5, 6, 7]
        process group 2 ranks: [ 8, 9, 10, 11]
        process group 3 ranks: [12, 13, 14, 15]

    Args:
        k (int): Number of GPUs to use in set size.

    Returns:
        A handle of distributed group that can be given to collective calls.
    """
    world_size = distributed.get_world_size()
    if world_size % k != 0:
        raise RuntimeError(f'world_size={world_size!r} must be divisible by k={k!r}.')
    start = distributed.get_rank() // k * k
    ranks = tuple(range(start, start + k))
    return create_process_group_ranks(ranks)

def get_megablocks_device_mesh(device_mesh_cfg: Optional[tuple[int, ...]], moe_world_size: int, world_size: int) -> DeviceMesh:
    """Helper function to get the device mesh for MegaBlocks MoE.

    Args:
        device_mesh_cfg (Optional[tuple[int, ...]]): The device mesh configuration specification.
        moe_world_size (int): The MoE world size.
        world_size (int): The world size.

    Raises:
        ValueError: If the device mesh configuration is not valid.

    Returns:
        The device mesh for MegaBlocks MoE.
    """
    from torch.distributed._tensor.device_mesh import init_device_mesh
    if device_mesh_cfg is None or len(device_mesh_cfg) == 1:
        if device_mesh_cfg is not None:
            world_size = device_mesh_cfg[0]
        sharding_group_dim = world_size // moe_world_size
        device_mesh = init_device_mesh('cuda', (sharding_group_dim, moe_world_size), mesh_dim_names=('weight_parallel', 'expert_parallel'))
    else:
        raise ValueError(f'device_mesh_cfg={device_mesh_cfg!r} must be length 1')
    return device_mesh

def config_megablocks_moe_args(ffn_config: dict, d_model: int, expansion_ratio: Union[int, float], n_layers: int, get_device_mesh: Callable) -> dict:
    """Configures `ffn_config` for MegaBlocks MoE.

    We prepare all necessary arguments for `megablocks.layers.arguments.Arguments` so that process
    groups can be initialized and shared across all blocks in the network.

    Args:
        ffn_config (dict): FFN configuration before the MegaBlocks MoE is configured.
        d_model (int): Hidden size of the network.
        expansion_ratio (Union[int, float]): Expansion ratio in FFN.
        n_layers (int): Number of blocks used in the network.
        get_device_mesh (Callable): Function to get the device mesh. Takes in the device mesh config and the MoE world size.

    Returns:
        ffn_config (dict): FFN configuration with MegaBlocks MoE configured.
    """
    try:
        import megablocks
    except:
        raise RuntimeError('Requirements for MegaBlocks not installed; see install instructions in `README.md`.')
    ffn_config.setdefault('fp16', False)
    ffn_config.setdefault('bf16', False)
    ffn_config['num_layers'] = n_layers
    ffn_type = ffn_config.pop('ffn_type')
    fc_type = ffn_config.pop('fc_type')
    ffn_act_fn = ffn_config.pop('ffn_act_fn', None)
    world_size = 1
    moe_world_size = ffn_config.pop('moe_world_size')
    device_mesh = None
    device_mesh_cfg = ffn_config.pop('device_mesh', None)
    if moe_world_size > 1:
        if version.parse(torch.__version__.split('.dev')[0]) < version.parse('2.2.0'):
            raise RuntimeError(f'MoE world size > 1 is not supported in torch version {torch.__version__}<2.2.')
        world_size = distributed.get_world_size()
        if world_size < moe_world_size or world_size % moe_world_size:
            raise ValueError(f'Invalid world size configuration: world_size={world_size!r} and moe_world_size={moe_world_size!r}')
        device_mesh = get_device_mesh(device_mesh_cfg=device_mesh_cfg, moe_world_size=moe_world_size, world_size=world_size)
        ffn_config['moe_expert_model_parallelism'] = True
        ffn_config['expert_parallel_group'] = device_mesh['expert_parallel'].get_group(0)
    lbl_process_group = ffn_config.get('lbl_process_group', None)
    if lbl_process_group is not None:
        if lbl_process_group == 'expert_group':
            lbl_process_group = ffn_config['expert_parallel_group']
        elif lbl_process_group == 'global_group':
            lbl_process_group = distributed.group.WORLD
        elif isinstance(lbl_process_group, int):
            if lbl_process_group > 1:
                lbl_process_group = create_set_process_group(lbl_process_group)
            else:
                lbl_process_group = None
        elif not isinstance(lbl_process_group, distributed.ProcessGroup):
            raise ValueError(f'Unknown lbl_process_group={lbl_process_group!r}. Options are: none | a process group | ``expert_group`` | ``global_group`` | <GROUP_SIZE>.')
        ffn_config['lbl_process_group'] = lbl_process_group
    ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio)
    ffn_config.setdefault('ffn_hidden_size', ffn_hidden_size)
    args_to_keep_in_ffn_config = inspect.signature(megablocks.layers.arguments.Arguments).parameters
    ffn_config = {k: v for k, v in ffn_config.items() if k in args_to_keep_in_ffn_config}
    args = megablocks.layers.arguments.Arguments(hidden_size=d_model, **ffn_config)
    ffn_config['args'] = args
    ffn_config['device_mesh'] = device_mesh
    ffn_config['moe_world_size'] = moe_world_size
    ffn_config['ffn_type'] = ffn_type
    ffn_config['fc_type'] = fc_type
    ffn_config['ffn_act_fn'] = ffn_act_fn
    return ffn_config

def config_moe_args(ffn_config: dict, d_model: int, expansion_ratio: Union[int, float], n_layers: int) -> dict:
    """Configures `ffn_config` for MoE.

    Args:
        ffn_config (dict): FFN configuration before the MoE is configured.
        d_model (int): Hidden size of the network.
        expansion_ratio (int, float): Expansion ratio in FFN.
        n_layers (int): Number of blocks used in the network.

    Returns:
        ffn_config (dict): FFN configuration with MoE configured.
    """
    if ffn_config['ffn_type'] in ffns_with_megablocks:
        return config_megablocks_moe_args(ffn_config=ffn_config, d_model=d_model, expansion_ratio=expansion_ratio, n_layers=n_layers, get_device_mesh=get_megablocks_device_mesh)
    else:
        ffn_type = ffn_config['ffn_type']
        raise ValueError(f'Invalid ffn_type ({ffn_type}).')
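The set-based grouping in `create_set_process_group` can be illustrated without any distributed setup; the sketch below reproduces only the rank arithmetic (the helper name is made up for the example):

def set_group_ranks(rank: int, world_size: int, k: int) -> tuple[int, ...]:
    # Same arithmetic as create_set_process_group: each rank joins the
    # contiguous block of k ranks that contains it.
    if world_size % k != 0:
        raise RuntimeError(f'world_size={world_size!r} must be divisible by k={k!r}.')
    start = rank // k * k
    return tuple(range(start, start + k))

# 16 GPUs in sets of 4 -> ranks 0-3, 4-7, 8-11, 12-15, matching the docstring above.
assert set_group_ranks(5, 16, 4) == (4, 5, 6, 7)
assert set_group_ranks(14, 16, 4) == (12, 13, 14, 15)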
configuration_mpt.py
ADDED
@@ -0,0 +1,252 @@
"""A HuggingFace-style model configuration."""
import copy
from .dmoe import _UniformExpertAssignment
from .blocks import MPTBlock
from .fc import *
from .ffn import quickgelu_activation
from .config_moe_args import create_process_group_ranks
from .layer_builders import build_norm
from .mpt_param_count import module_n_params
from .norm import _cast_if_autocast_enabled
from .act_ckpt import pass_on_block_idx
from .custom_embedding import SharedEmbedding
from .registry_utils import TypedRegistry
from .param_init_fns import torch_default_param_init_fn_
import warnings
from typing import Any, Optional, Union
from transformers import PretrainedConfig
from .layers_registry import ffns_with_megablocks
from .attention import check_alibi_support, is_flash_v2_installed
from .config_defaults import attn_config_defaults, fc_type_defaults, ffn_config_defaults, init_config_defaults
from .warnings import ExperimentalWarning

class MPTConfig(PretrainedConfig):
    model_type = 'mpt'

    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: Union[int, float]=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Optional[dict]=None, ffn_config: Optional[dict]=None, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', norm_eps: float=1e-05, use_cache: bool=False, init_config: Optional[dict]=None, fc_type: Union[str, dict]='torch', tie_word_embeddings: bool=True, use_pad_tok_in_ffn: bool=True, block_overrides: Optional[dict[str, Any]]=None, final_logit_softcapping: Optional[float]=None, **kwargs: Any):
        """The MPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            expansion_ratio (Union[int, float]): The ratio of the up/down scale in the ffn.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            learned_pos_emb (bool): Whether to use learned positional embeddings.
            attn_config (Dict): A dictionary used to configure the model's attention module:
                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
                attn_pdrop (float): The dropout probability for the attention layers.
                attn_impl (str): The attention implementation to use. One of 'torch' or 'flash'.
                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer.
                fused_qkv (bool): Whether to fuse the Wq, Wk, and Wv weight matrices in the attention layer. If True, the weights are fused into a single
                    Wqkv matrix, which can be faster for matmuls. If False, the weights are kept separate. Defaults to True.
                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to this value.
                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                    use the default scale of ``1/sqrt(d_keys)``.
                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                    which sub-sequence each token belongs to.
                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                sliding_window_size (int): Window size for sliding window local attention. Defaults to -1, which means no sliding window. Query at position i will only attend to keys between [i + seqlen_k - seqlen_q - window_size, i + seqlen_k - seqlen_q + window_size] inclusive. Only works for flash attention v2.3.0 or higher.
                alibi (bool): Whether to use the alibi bias instead of position embeddings.
                alibi_bias_max (int): The maximum value of the alibi bias.
                rope (bool): Whether to use rotary positional embeddings.
                rope_theta (int): The base frequency for rope.
                rope_impl (str): The implementation of rope to use. One of 'hf' (to use the implementation from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py) or 'dail' (to use the implementation from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/layers/rotary.py).
                rope_dail_config (Dict): The configuration for the dail implementation of rope.
                    type (str): The type of rotary position embedding to use. Options: 'original' (for https://arxiv.org/pdf/2104.09864.pdf), 'xpos' (for https://arxiv.org/pdf/2212.10554.pdf).
                    pos_idx_in_fp32 (bool): If True, the position indices [0, ..., seqlen - 1] are in fp32, otherwise they might be in lower precision. A consequence could be, for example, that bf16 rounds position 1995 to 2000, which leads to them having the same positional embedding.
                    xpos_scale_base (float): The scale base for XPos (if using XPos).
                rope_hf_config (Dict): A dictionary used to configure rope's scaling behavior (when scaling beyond the training length).
                    type (str): Can be one of 'no_scaling', 'linear', or 'dynamic'. 'no_scaling' uses the default implementation for rotary embeddings, 'linear' uses linear scaling as proposed by the Reddit user /u/kaiokendev, and 'dynamic' uses Dynamic NTK scaling as proposed by the Reddit users /u/bloc97 and /u/emozilla.
                    factor (float): Scaling factor to use if using 'linear' or 'dynamic' as rope_scaling.type.
                kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
                kv_dim (Optional[int]): For cross-attention only, allow user to specify different input dimensions for kv projections.
            ffn_config (Dict): A dictionary used to configure the model's ffn module:
                ffn_type (str): type of ffn to use. Options: mptmlp, mptglu, te_ln_mlp
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
            no_bias (bool): Whether to use bias in all layers.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
            norm_type (str): choose type of norm to use
            norm_eps (float): epsilon value for norm layer
            use_cache (bool): Whether or not the model should return the last key/values attentions
            init_config (Dict): A dictionary used to configure the model initialization:
                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
                init_std (float): The standard deviation of the normal distribution used to initialize the model,
                    if using the baseline_ parameter initialization scheme.
                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                ---
                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
            fc_type (str | Dict): Choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. Can
                also be a dictionary that specifies the fc layer name and any kwargs for the fc layer.
            tie_word_embeddings (bool): Whether to tie the input embedding and output layers.
            use_pad_tok_in_ffn (bool): Whether to forward the pad token in the feedforward networks.
            block_overrides: This allows for overriding default block configs for certain layers. This must contain `overrides` and `order`. `order` is a nested list which describes the order of the layers. For each kind of layer, specify the `overrides` in the overrides config (default refers to a layer that does not apply any overrides).
                To specify this model (https://research.character.ai/optimizing-inference/), the following config will be needed:
                    block_overrides:
                        order:
                        - name: default
                        - repeat: 2
                          order:
                          - name: sliding_window_layer
                          - name: sliding_window_layer_reuse
                          - name: sliding_window_layer
                          - repeat: 2
                            name: sliding_window_layer_reuse
                          - name: reuse_kv_layer
                        overrides:
                            sliding_window_layer:
                                attn_config:
                                    sliding_window_size: 1024
                            sliding_window_layer_reuse:
                                attn_config:
                                    sliding_window_size: 1024
                                    reuse_kv_layer_idx: -1  # Relative index of the layer whose kv cache to reuse
                            reuse_kv_layer:
                                attn_config:
                                    reuse_kv_layer_idx: -6  # Relative index of the layer whose kv cache to reuse
            final_logit_softcapping (float | None): Softcapping threshold for final logit. Set to None to disable (default value None). Please see https://arxiv.org/pdf/2403.08295 for more details.
            kwargs (Any): Other relevant keyword arguments.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        if max_seq_len != int(max_seq_len):
            raise ValueError('max_seq_len must be an integer')
        self.max_seq_len = int(max_seq_len)
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config if attn_config is not None else copy.deepcopy(attn_config_defaults)
        self.ffn_config = ffn_config if ffn_config is not None else copy.deepcopy(ffn_config_defaults)
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.init_config = init_config if init_config is not None else copy.deepcopy(init_config_defaults)
        if block_overrides is not None:
            self._validate_block_overrides(block_overrides)
        self.block_overrides = block_overrides
        self.final_logit_softcapping = final_logit_softcapping
        if isinstance(fc_type, str):
            fc_type = {'name': fc_type}
        self.fc_type = fc_type
        self.use_pad_tok_in_ffn = use_pad_tok_in_ffn
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        if self.attn_config.get('alibi', False) or self.attn_config.get('rope', False):
            self.learned_pos_emb = False
            warnings.warn(f'alibi or rope is turned on, setting `learned_pos_emb` to `False.`')
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self._validate_config()

    def _validate_block_overrides(self, block_overrides: dict[str, Any]):
        warnings.warn(ExperimentalWarning('block_overrides'))
        if 'order' not in block_overrides:
            raise ValueError('`order` should be defined in block_overrides')
        if 'overrides' not in block_overrides:
            raise ValueError('`overrides` should be defined in block_overrides')
        if 'default' in block_overrides['overrides'].keys():
            raise ValueError('block overrides cannot be named "default".')

    def _set_config_defaults(self, config: dict[str, Any], config_defaults: dict[str, Any]) -> dict[str, Any]:
        for k, v in config_defaults.items():
            if k not in config:
                config[k] = v
            elif isinstance(v, dict):
                config[k] = self._set_config_defaults(config[k] if config[k] is not None else {}, v)
        return config

    def validate_attention_config(self) -> None:
        if 'seq_parallel_world_size' in self.attn_config and self.attn_config['seq_parallel_world_size'] is None:
            del self.attn_config['seq_parallel_world_size']
        if self.attn_config.get('seq_parallel_world_size', 1) > 1:
            raise NotImplementedError('Sequence Parallelism is not supported.')

    def _validate_config(self) -> None:
        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
        self.fc_type = self._set_config_defaults(self.fc_type, fc_type_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash']:
            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['alibi'] and (not check_alibi_support(self.attn_config['attn_impl'])):
            raise NotImplementedError('alibi only implemented with torch and flash (v2.4.2 or higher) attention.')
        if self.attn_config['attn_uses_sequence_id'] and (not (self.attn_config['attn_impl'] == 'torch' or (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.1.2')))):
            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and flash (v2.1.2 or higher) attention.')
        if self.attn_config['rope'] and self.attn_config['rope_impl'] not in ['dail', 'hf']:
            raise ValueError('If rope is being used then rope_impl should be either "dail", or "hf".')
        if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'hf' and (self.attn_config['rope_hf_config']['type'] not in ['no_scaling', 'linear', 'dynamic', 'llama3']):
            raise ValueError('If using hf implementation of rope, the type should be one of "no_scaling", "linear" or "dynamic".')
        if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'dail':
            if self.attn_config['rope_dail_config']['type'] not in ['original', 'xpos']:
                raise ValueError('If using the dail implementation of rope, the type should be one of "original" or "xpos".')
            if not is_flash_v2_installed(v2_version='2.0.1'):
                raise ImportError('If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support')
        if self.attn_config['sliding_window_size'] != -1 and self.attn_config['attn_impl'] == 'flash' and (not is_flash_v2_installed(v2_version='v2.3.0')):
            raise NotImplementedError('sliding window attention only implemented for torch attention and flash attention (v2.3.0 or higher).')
        if self.attn_config['attn_logit_softcapping'] is not None:
            if self.attn_config['attn_logit_softcapping'] <= 0:
                raise ValueError('Attention attn_logit_softcapping should be positive.')
            if self.attn_config['attn_impl'] == 'flash' and (not is_flash_v2_installed(v2_version='v2.6.2')):
                raise NotImplementedError('Attention attn_logit_softcapping is only implemented with torch attention or flash attention v2.6.2 (or higher).')
        if self.attn_config['kv_dim'] is not None and self.attn_config['fused_qkv']:
            raise ValueError('fused_qkv should be False when "kv_dim" is specified.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
        if not (self.learned_pos_emb or self.attn_config['alibi'] or self.attn_config['rope']):
            warnings.warn(f'Positional information not being provided to the model using either learned_pos_emb or alibi or rope.')
        if self.fc_type['name'] == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                import transformer_engine.pytorch as te
                del te
            except:
                raise ImportError('TransformerEngine import failed. `fc_type: te` requires TransformerEngine be installed, ', 'e.g. pip install transformer-engine[pytorch]')
        self.ffn_config['fc_type'] = self.fc_type
        if self.ffn_config['ffn_type'] == 'mptgeglu':
            raise ValueError('API CHANGE: `ffn_type=="mptgeglu"` changed to `ffn_type=="mptglu"`. ' + 'See [#829](https://github.com/mosaicml/llm-foundry/pull/829) for details.')
        elif self.ffn_config['ffn_type'] in ffns_with_megablocks:
            self.ffn_config['return_bias'] = False
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias
            if 'ffn_act_fn' in self.ffn_config.keys():
                raise ValueError(f'Transformer Engine block does not support custom activation functions.')
        if not self.use_pad_tok_in_ffn:
            try:
                from flash_attn.bert_padding import unpad_input, pad_input
            except:
                raise ImportError('In order to set `use_pad_tok_in_ffn=False`, please install flash-attn==1.0.9 or flash-attn==2.3.6')
        self.validate_attention_config()

    @property
    def allowed_block_overrides(self):
        return {'attn_config': {'sliding_window_size': None, 'reuse_kv_layer_idx': None}}
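The `block_overrides` example from the docstring above, rewritten as the equivalent Python dict that would be passed to `MPTConfig(..., block_overrides=...)`; the layer names and window sizes are the docstring's own example values:

block_overrides = {
    'order': [
        {'name': 'default'},
        {
            'repeat': 2,
            'order': [
                {'name': 'sliding_window_layer'},
                {'name': 'sliding_window_layer_reuse'},
                {'name': 'sliding_window_layer'},
                {'repeat': 2, 'name': 'sliding_window_layer_reuse'},
                {'name': 'reuse_kv_layer'},
            ],
        },
    ],
    'overrides': {
        'sliding_window_layer': {'attn_config': {'sliding_window_size': 1024}},
        'sliding_window_layer_reuse': {'attn_config': {'sliding_window_size': 1024, 'reuse_kv_layer_idx': -1}},
        'reuse_kv_layer': {'attn_config': {'reuse_kv_layer_idx': -6}},
    },
}
# _validate_block_overrides requires both 'order' and 'overrides', and no override may be named 'default'.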
custom_embedding.py
ADDED
@@ -0,0 +1,10 @@
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

class SharedEmbedding(nn.Embedding):

    def forward(self, input: Tensor, unembed: bool=False) -> Tensor:
        if unembed:
            return F.linear(input, self.weight)
        return super().forward(input)
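A small sketch of the tied embedding above, assuming `SharedEmbedding` as defined in this file is in scope: the same weight matrix embeds token ids on the way in and produces vocabulary logits on the way out (toy sizes):

import torch

emb = SharedEmbedding(num_embeddings=100, embedding_dim=16)
token_ids = torch.tensor([[1, 5, 7]])
hidden = emb(token_ids)             # (1, 3, 16): ordinary embedding lookup
logits = emb(hidden, unembed=True)  # (1, 3, 100): the same weight reused as the output projection
assert logits.shape == (1, 3, 100)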
dmoe.py
ADDED
@@ -0,0 +1,138 @@
from functools import partial
from typing import Callable, Optional, Union
import torch
import torch.nn.functional as F
DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')

class _UniformExpertAssignment(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x: torch.Tensor, num_experts: int):
        out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
        out = torch.remainder(out, num_experts)
        return out.view(x.shape)

class LearnedRouter(torch.nn.Module):

    def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int, moe_jitter_eps: Optional[float], moe_normalize_expert_weights: Optional[Union[int, float]], uniform_expert_assignment: bool, device: Optional[torch.device]) -> None:
        super().__init__()
        self.hidden_size: int = hidden_size
        self.moe_num_experts: int = moe_num_experts
        self.moe_top_k: int = moe_top_k
        self.moe_jitter_eps: Optional[float] = moe_jitter_eps
        self.moe_normalize_expert_weights: Optional[Union[int, float]] = moe_normalize_expert_weights
        self.uniform_expert_assignment: bool = uniform_expert_assignment
        self.layer: torch.nn.Module = torch.nn.Linear(hidden_size, moe_num_experts, bias=False, device=device)

    def jitter(self, x: torch.Tensor) -> torch.Tensor:
        assert self.moe_jitter_eps is not None
        low: float = 1.0 - self.moe_jitter_eps
        high: float = 1.0 + self.moe_jitter_eps
        noise: torch.Tensor = torch.rand(x.size(), dtype=x.dtype, device=x.device)
        return low + noise * (high - low)

    def _top_k(self, scores: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        if self.moe_top_k == 1:
            values, indices = scores.max(dim=-1)
            return (values.unsqueeze(-1), indices.unsqueeze(-1))
        return torch.topk(scores, self.moe_top_k, dim=-1)

    def forward(self, x: torch.Tensor):
        if self.training and self.moe_jitter_eps is not None:
            x = x * self.jitter(x)
        scores = self.layer(x.view(-1, x.shape[-1])).softmax(dim=-1)
        expert_weights, top_experts = self._top_k(scores)
        if self.moe_normalize_expert_weights:
            expert_weights = expert_weights / torch.norm(expert_weights, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True)
        top_experts = _UniformExpertAssignment.apply(top_experts, self.moe_num_experts) if self.uniform_expert_assignment else top_experts
        scores = scores.to(x.dtype)
        expert_weights = expert_weights.to(x.dtype)
        return (scores, expert_weights, top_experts)

class MLP(torch.nn.Module):

    def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, activation_fn: Callable, device: Optional[torch.device]) -> None:
        super().__init__()
        self.moe_num_experts: int = moe_num_experts
        self.ffn_hidden_size: int = ffn_hidden_size
        self.hidden_size: int = hidden_size
        self.activation_fn: Callable = activation_fn
        self.w1 = torch.nn.Parameter(torch.rand(moe_num_experts * ffn_hidden_size, hidden_size, device=device))
        self.w2 = torch.nn.Parameter(torch.rand(moe_num_experts * ffn_hidden_size, hidden_size, device=device))
        self.activation_fn = activation_fn

    def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor:
        expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx]
        expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx]
        before_activation = x @ expert_w1.t()
        layer_1_output = self.activation_fn(before_activation)
        output = layer_1_output @ expert_w2
        return output

class GLU(torch.nn.Module):

    def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, activation_fn: Callable, device: Optional[torch.device]):
        super().__init__()
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.w1 = torch.nn.Parameter(torch.rand(moe_num_experts * ffn_hidden_size, hidden_size, device=device))
        self.v1 = torch.nn.Parameter(torch.rand(moe_num_experts * ffn_hidden_size, hidden_size, device=device))
        self.w2 = torch.nn.Parameter(torch.rand(moe_num_experts * ffn_hidden_size, hidden_size, device=device))
        self.activation_fn = activation_fn

    def forward(self, x: torch.Tensor, expert_idx: torch.Tensor):
        expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx]
        expert_v1 = self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx]
        expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx]
        x1 = x.matmul(expert_w1.t())
        x2 = x.matmul(expert_v1.t())
        x1 = self.activation_fn(x1)
        x1 = x1 * x2
        x1 = x1.matmul(expert_w2)
        return x1

class DroplessMLP(torch.nn.Module):

    def __init__(self, hidden_size: int, ffn_hidden_size: int, mlp_type: str, moe_num_experts: int, activation_fn: Callable, bias: bool, device: Optional[torch.device]):
        super().__init__()
        self.moe_num_experts = moe_num_experts
        if mlp_type == 'mlp':
            self.mlp = MLP(hidden_size=hidden_size, ffn_hidden_size=ffn_hidden_size, moe_num_experts=moe_num_experts, activation_fn=activation_fn, device=device)
        elif mlp_type == 'glu':
            self.mlp = GLU(hidden_size=hidden_size, ffn_hidden_size=ffn_hidden_size, moe_num_experts=moe_num_experts, activation_fn=activation_fn, device=device)
        else:
            raise ValueError(f'Received unknown mlp_type={mlp_type!r}')

    def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
        in_shape = x.shape
        hidden_size = in_shape[-1]
        x = x.view(-1, hidden_size)
        out = torch.zeros_like(x)
        expert_mask = torch.nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0)
        for expert_idx in range(0, self.moe_num_experts):
            topk_idx, token_idx = torch.where(expert_mask[expert_idx])
            if token_idx.shape[0] == 0:
                continue
            token_list = token_idx.tolist()
            topk_list = topk_idx.tolist()
            expert_tokens = x[None, token_list].reshape(-1, hidden_size)
            mlp_output = self.mlp(expert_tokens, expert_idx)
            expert_weights = expert_weights.to(mlp_output.device)
            expert_out = mlp_output * expert_weights[token_list, topk_list, None]
            out = out.to(mlp_output.device)
            token_idx = token_idx.to(mlp_output.device)
            out.index_add_(0, token_idx, expert_out)
        out = out.view(in_shape)
        return out

class dMoE(torch.nn.Module):

    def __init__(self, device: Optional[torch.device], hidden_size: int=1024, ffn_hidden_size: int=4096, moe_num_experts: int=1, moe_top_k: int=1, mlp_type: str='mlp', activation_fn: Callable=DEFAULT_ACTIVATION_FN, moe_jitter_eps: Optional[float]=None, moe_normalize_expert_weights: Optional[Union[int, float]]=None, uniform_expert_assignment: bool=False, bias: bool=True):
        super().__init__()
        self.router = LearnedRouter(hidden_size, moe_num_experts=moe_num_experts, moe_top_k=moe_top_k, moe_jitter_eps=moe_jitter_eps, moe_normalize_expert_weights=moe_normalize_expert_weights, uniform_expert_assignment=uniform_expert_assignment, device=device)
        self.experts = DroplessMLP(hidden_size=hidden_size, ffn_hidden_size=ffn_hidden_size, mlp_type=mlp_type, moe_num_experts=moe_num_experts, activation_fn=activation_fn, bias=bias, device=device)

    def forward(self, x: torch.Tensor):
        scores, expert_weights, top_experts = self.router(x)
        return self.experts(x, scores, expert_weights, top_experts)
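A toy sketch of the pure-PyTorch dMoE above routing a small batch through 4 experts with top-2 routing, assuming `dMoE` from this file is in scope (CPU, arbitrary sizes):

import torch

moe = dMoE(device=None, hidden_size=32, ffn_hidden_size=64,
           moe_num_experts=4, moe_top_k=2, mlp_type='glu')
x = torch.randn(2, 5, 32)  # (batch, seq_len, hidden_size)
y = moe(x)                 # router scores each token, top-2 expert outputs are weight-combined
assert y.shape == x.shape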
fc.py
ADDED
@@ -0,0 +1,8 @@
from torch import nn
from .layers_registry import fcs
fcs.register('torch', func=nn.Linear)
try:
    import transformer_engine.pytorch as te
    fcs.register('te', func=te.Linear)
except:
    pass
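A brief sketch of consuming the `fcs` registry populated here, assuming these modules are importable as a package (`build_fc` is defined in layer_builders.py):

linear_cls = fcs.get('torch')  # the entry registered above, an alias for nn.Linear
proj = linear_cls(in_features=16, out_features=32, bias=True)

# Equivalent path through the builder used by the model code:
# proj = build_fc(name='torch', in_features=16, out_features=32, fc_kwargs={'bias': True})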
ffn.py
ADDED
@@ -0,0 +1,272 @@
"""MPT Blocks used for the MPT Model."""
import logging
from copy import deepcopy
from functools import partial
from typing import Any, Callable, Optional, Union
import torch
import torch.nn as nn
from torch.distributed import ProcessGroup
from torch.distributed._tensor import DeviceMesh, DTensor, Placement, Shard
from .layers_registry import ffns, ffns_with_megablocks, ffns_with_norm
from .dmoe import dMoE
from .layer_builders import build_fc
from .config_defaults import fc_type_defaults
try:
    import transformer_engine.pytorch as te
    is_te_imported = True
except ModuleNotFoundError:
    is_te_imported = False
try:
    import megablocks
    is_megablocks_imported = True
except ModuleNotFoundError:
    is_megablocks_imported = False
log = logging.getLogger(__name__)
_FFN_ACT_FN_DEFAULT = {'name': 'gelu', 'approximate': 'none'}

def quickgelu_activation(input: torch.Tensor) -> torch.Tensor:
    """Applies GELU approximation that is fast but somewhat inaccurate.

    Args:
        input (torch.Tensor): Input tensor of shape(*), where * means any
            number of dimensions

    Returns:
        torch.Tensor: Tensor with same shape as input tensor
    """
    return input * torch.sigmoid(1.702 * input)

def resolve_ffn_act_fn(config: Optional[dict]=None) -> Callable[[torch.Tensor], torch.Tensor]:
    """Resolve the activation function for the feed-forward network.

    Args:
        config (Optional[dict]): The configuration dictionary for the activation function.
            The dict config must specify the 'name' of a torch.nn.functional activation
            function. All of other key values pairs are bound to the function as a partial.

    Returns:
        Callable[[torch.Tensor], torch.Tensor]: The activation function.
    """
    if config is None:
        config = _FFN_ACT_FN_DEFAULT
    config = deepcopy(config)
    name = config.pop('name')
    if name == 'quick_gelu':
        return quickgelu_activation
    else:
        if not hasattr(torch.nn.functional, name):
            raise ValueError(f'Unrecognized activation function name ({name}).')
        act = getattr(torch.nn.functional, name)
        return partial(act, **config)
_DEFAULT_ACT_FN = resolve_ffn_act_fn(_FFN_ACT_FN_DEFAULT)

def resolve_ffn_hidden_size(d_model: int, expansion_ratio: Union[int, float], ffn_hidden_size: Optional[int]=None) -> int:
    """Resolve the hidden size of the feed-forward network.

    Args:
        d_model (int): The dimension of the input and output of the feed-forward network.
        expansion_ratio (Union[int, float]): The expansion ratio of the feed-forward network.
        ffn_hidden_size (Optional[int]): The hidden size of the feed-forward network.

    Returns:
        int: The hidden size of the feed-forward network.
    """
    if ffn_hidden_size is not None:
        log.info(f'`expansion_ratio` (={expansion_ratio}) ignored when `ffn_hidden_size` (={ffn_hidden_size}) is specified.')
    else:
        ffn_hidden_size = int(d_model * expansion_ratio)
        if ffn_hidden_size != d_model * expansion_ratio:
            raise ValueError(f'`d_model * expansion_ratio` must be an integer (d_model={d_model!r}; expansion_ratio={expansion_ratio!r}; d_model * expansion_ratio={d_model * expansion_ratio!r}).')
    return ffn_hidden_size

def dtensorify_param(param: nn.Parameter, mesh: DeviceMesh, placements: list[Placement]):
    """Construct a DTensor from an already sharded local parameter."""
    param_dtensor = DTensor.from_local(param.data, device_mesh=mesh, placements=placements, run_check=False)
    return nn.Parameter(param_dtensor)

class MPTMLP(nn.Module):

    def __init__(self, d_model: int, expansion_ratio: Union[int, float], fc_type: Optional[dict[str, Any]]=None, ffn_hidden_size: Optional[int]=None, act_fn: Callable[[torch.Tensor], torch.Tensor]=_DEFAULT_ACT_FN, device: Optional[str]=None, bias: bool=True):
        super().__init__()
        ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
        if fc_type is None:
            fc_type = fc_type_defaults
            fc_type['bias'] = bias
            fc_type['device'] = device
        self.fc_type = fc_type
        self.fc_type_name = self.fc_type['name']
        self.up_proj = build_fc(name=self.fc_type_name, in_features=d_model, out_features=ffn_hidden_size, fc_kwargs=self.fc_type)
        self.act = act_fn
        self.down_proj = build_fc(name=self.fc_type_name, in_features=ffn_hidden_size, out_features=d_model, fc_kwargs=self.fc_type)
        self.down_proj._is_residual = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act(self.up_proj(x)))

class MPTGLU(MPTMLP):

    def __init__(self, d_model: int, expansion_ratio: Union[int, float], fc_type: Optional[dict[str, Any]]=None, ffn_hidden_size: Optional[int]=None, act_fn: Callable[[torch.Tensor], torch.Tensor]=_DEFAULT_ACT_FN, device: Optional[str]=None, bias: bool=True):
        super().__init__(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, ffn_hidden_size=ffn_hidden_size, act_fn=act_fn, device=device, bias=bias)
        self.gate_proj = build_fc(name=self.fc_type_name, in_features=d_model, out_features=self.up_proj.out_features, fc_kwargs=self.fc_type)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act(self.gate_proj(x)).to(device=x.device) * self.up_proj(x))

def build_mptglu(d_model: int, expansion_ratio: Union[int, float], fc_type: Optional[dict[str, Any]]=None, ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True) -> nn.Module:
    return MPTGLU(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, act_fn=resolve_ffn_act_fn(ffn_act_fn), ffn_hidden_size=ffn_hidden_size, device=device, bias=bias)

def build_mptmlp(d_model: int, expansion_ratio: Union[int, float], fc_type: Optional[dict[str, Any]]=None, ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True) -> nn.Module:
    return MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, act_fn=resolve_ffn_act_fn(ffn_act_fn), ffn_hidden_size=ffn_hidden_size, device=device, bias=bias)

def build_te_ln_mlp(d_model: int, expansion_ratio: Union[int, float], fc_type: Optional[dict[str, Any]]=None, ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True, **kwargs: Any) -> nn.Module:
    assert te is not None
    ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
    if ffn_act_fn is not None:
        raise ValueError(f'Transformer Engine block does not support custom activation functions.')
    return te.LayerNormMLP(hidden_size=d_model, ffn_hidden_size=ffn_hidden_size, bias=bias, **kwargs)

def build_torch_dmoe(d_model: int, expansion_ratio: Union[int, float], ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True, **kwargs: Any) -> nn.Module:
    moe_num_experts = kwargs.pop('moe_num_experts')
    moe_top_k = kwargs.pop('moe_top_k')
    mlp_type = kwargs.pop('mlp_type')
    moe_jitter_eps = kwargs.pop('moe_jitter_eps')
    moe_normalize_expert_weights = kwargs.pop('moe_normalize_expert_weights')
    uniform_expert_assignment = kwargs.pop('uniform_expert_assignment')
    fc_type = kwargs.pop('fc_type', None)
    del fc_type
    if len(kwargs) > 0:
        raise ValueError(f'Invalid arguments to torch dmoe: {kwargs}.')
    return dMoE(hidden_size=d_model, ffn_hidden_size=resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size), moe_num_experts=moe_num_experts, moe_top_k=moe_top_k, mlp_type=mlp_type, bias=bias, moe_jitter_eps=moe_jitter_eps, activation_fn=resolve_ffn_act_fn(ffn_act_fn), moe_normalize_expert_weights=moe_normalize_expert_weights, uniform_expert_assignment=uniform_expert_assignment, device=torch.device(device) if device is not None else None)

def mb_setup_args(d_model: int, expansion_ratio: Union[int, float], ffn_hidden_size: Optional[int], ffn_act_fn: Optional[dict], device: Optional[str], bias: bool, kwargs: dict[str, Any]) -> tuple['megablocks.layers.arguments.Arguments', int, ProcessGroup]:
    """Setup the MegaBlocks args.

    Args:
        d_model (int): The dimension of the input and output of the FFN.
        expansion_ratio (Union[int, float]): The expansion ratio of the FFN.
        ffn_hidden_size (Optional[int]): The hidden size of the FFN.
        ffn_act_fn (Optional[dict]): The activation function of the FFN.
        device (Optional[str]): The device to run the FFN on.
        bias (bool): Whether to include bias in the FFN.
        kwargs (dict[str, Any]): Additional kwargs.

    Returns:
        tuple['megablocks.layers.arguments.Arguments', int, ProcessGroup]:
            The MegaBlocks args, the MoE world size, and the expert parallel group.
    """
    if megablocks is None:
        raise RuntimeError('Requirements for megablocks not installed; see install instructions in `README.md`.')
    args = kwargs['args']
    args.bias = bias
    args.hidden_size = d_model
    args.device = device
    ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
    args.ffn_hidden_size = ffn_hidden_size
    if ffn_act_fn is not None:
        args.activation_fn = resolve_ffn_act_fn(ffn_act_fn)
    moe_world_size = 1
    expert_parallel_group = args.expert_parallel_group
    if expert_parallel_group is not None:
        moe_world_size = expert_parallel_group.size()
    if kwargs.get('moe_world_size') != moe_world_size:
        raise RuntimeError(f'MoE expert_parallel_group configured with incorrect world size.')
    return (args, moe_world_size, expert_parallel_group)

def attach_ffn_mb_args(ffn: nn.Module, expert_parallel_group: ProcessGroup, args: 'megablocks.layers.arguments.Arguments'):
    """Attach arguments used in parameter initialization to the FFN.

    Args:
        ffn (nn.Module): The FFN module.
        expert_parallel_group (ProcessGroup): The expert parallel process group.
        args (megablocks.layers.arguments.Arguments): The arguments for MegaBlocks.
    """
    ffn.experts.mlp.hidden_size = args.ffn_hidden_size
    ffn.experts.mlp.expert_parallel_group = expert_parallel_group

def get_fsdp_submesh_2d(device_mesh: DeviceMesh):
    """Get the submesh for FSDP.

    Args:
        device_mesh (DeviceMesh): The full device mesh.

    Returns:
        DeviceMesh: The submesh for FSDP.
    """
    if device_mesh.mesh.ndim == 2:
        submesh = device_mesh['weight_parallel']
    elif device_mesh.mesh.ndim == 3:
        raise RuntimeError(f'HSDP + MoE is not supported.')
    else:
        raise ValueError(f'device_mesh.mesh.ndim={device_mesh.mesh.ndim!r} not supported for MoE.')
    return submesh

def set_ffn_device_mesh(ffn: nn.Module, moe_world_size: int, device_mesh: DeviceMesh, get_fsdp_submesh: Callable[[DeviceMesh], DeviceMesh]):
    """Sets the device mesh in FSDP kwargs.

    Args:
        ffn (nn.Module): The FFN module.
        moe_world_size (int): The MoE world size.
        device_mesh (DeviceMesh): The full device mesh.
        get_fsdp_submesh (Callable[[DeviceMesh], DeviceMesh]): A function to get the fsdp submesh.

    Raises:
        RuntimeError: If the device mesh is 3D.
        ValueError: If the device mesh is not 2D or 3D.
    """
    if moe_world_size > 1:
        expert_mesh = device_mesh['expert_parallel']
        expert_placements: list[Placement] = [Shard(0)]
        dtensorified_params = [(name, dtensorify_param(param=parameter, mesh=expert_mesh, placements=expert_placements)) for name, parameter in ffn.experts.mlp.named_parameters()]
        for name, dtensorified_param in dtensorified_params:
            ffn.experts.mlp.register_parameter(name, dtensorified_param)
        submesh = get_fsdp_submesh(device_mesh)
        ffn.experts._fsdp_kwargs_dict = {'device_mesh': submesh}

def moe_fused_init_setup(ffn: nn.Module):
    """Attach the _stack_dim attribute to the FFN.

    Args:
        ffn (nn.Module): The FFN module.
    """
    ffn.experts.mlp._stack_dim = 0

def build_mb_moe(d_model: int, expansion_ratio: Union[int, float], ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True, **kwargs: Any) -> nn.Module:
    if not is_megablocks_imported:
        raise RuntimeError('Requirements for megablocks not installed; see install instructions in `README.md`.')
    args, moe_world_size, expert_parallel_group = mb_setup_args(d_model=d_model, expansion_ratio=expansion_ratio, ffn_hidden_size=ffn_hidden_size, ffn_act_fn=ffn_act_fn, device=device, bias=bias, kwargs=kwargs)
    ffn = megablocks.layers.moe.MoE(args)
    moe_fused_init_setup(ffn=ffn)
    attach_ffn_mb_args(ffn=ffn, expert_parallel_group=expert_parallel_group, args=args)
    set_ffn_device_mesh(ffn=ffn, moe_world_size=moe_world_size, device_mesh=kwargs['device_mesh'], get_fsdp_submesh=get_fsdp_submesh_2d)
    return ffn

def dmoe_fused_init_setup(ffn: nn.Module, args: 'megablocks.layers.arguments.Arguments', moe_world_size: int):
    """Attach the _fused attribute to the dMoE model.

    This is used for parameter initialization.

    Args:
        ffn (nn.Module): The FFN module.
        args (megablocks.layers.arguments.Arguments): The arguments for MegaBlocks.
        moe_world_size (int): The MoE world size.
    """
    n_exp = min(1, args.moe_num_experts // moe_world_size)
    ffn.experts.mlp._fused = (0, [(n + 1) * args.ffn_hidden_size for n in range(n_exp - 1)])

def build_mb_dmoe(d_model: int, expansion_ratio: Union[int, float], ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True, **kwargs: Any) -> nn.Module:
    if not is_megablocks_imported:
        raise RuntimeError('Requirements for megablocks not installed; see install instructions in `README.md`.')
    args, moe_world_size, expert_parallel_group = mb_setup_args(d_model=d_model, expansion_ratio=expansion_ratio, ffn_hidden_size=ffn_hidden_size, ffn_act_fn=ffn_act_fn, device=device, bias=bias, kwargs=kwargs)
    ffn = megablocks.layers.dmoe.dMoE(args)
    dmoe_fused_init_setup(ffn=ffn, args=args, moe_world_size=moe_world_size)
    attach_ffn_mb_args(ffn=ffn, expert_parallel_group=expert_parallel_group, args=args)
    set_ffn_device_mesh(ffn=ffn, moe_world_size=moe_world_size, device_mesh=kwargs['device_mesh'], get_fsdp_submesh=get_fsdp_submesh_2d)
    return ffn
ffns.register('mptglu', func=build_mptglu)
ffns.register('mptmlp', func=build_mptmlp)
ffns.register('torch_dmoe', func=build_torch_dmoe)
if is_te_imported:
    ffns_with_norm.register('te_ln_mlp', func=build_te_ln_mlp)
if is_megablocks_imported:
    ffns_with_megablocks.register('mb_moe', func=build_mb_moe)
    ffns_with_megablocks.register('mb_dmoe', func=build_mb_dmoe)
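A compact sketch of the builders registered above, assuming this package is importable (CPU, toy sizes); `build_mptglu` is the function registered as 'mptglu':

import torch

# A gated FFN with 2x expansion and a SiLU activation resolved from torch.nn.functional.
ffn = build_mptglu(d_model=64, expansion_ratio=2, ffn_act_fn={'name': 'silu'}, bias=False)
x = torch.randn(2, 4, 64)
assert ffn(x).shape == x.shape

# resolve_ffn_hidden_size requires d_model * expansion_ratio to be an integer.
assert resolve_ffn_hidden_size(64, 2) == 128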
generation_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "_from_model_config": true,
  "transformers_version": "4.51.3",
  "use_cache": false
}
layer_builders.py
ADDED
@@ -0,0 +1,33 @@
1 |
+
from typing import Any, Optional, Union
|
2 |
+
import torch
|
3 |
+
from .layers_registry import attention_classes, fcs, ffns, ffns_with_megablocks, ffns_with_norm, norms
|
4 |
+
from .registry_utils import construct_from_registry
|
5 |
+
|
6 |
+
def build_norm(name: str, normalized_shape: Union[int, list[int], torch.Size], eps: Optional[float]=1e-05, device: Optional[str]=None):
|
7 |
+
kwargs = {'normalized_shape': normalized_shape, 'eps': eps, 'device': device}
|
8 |
+
return construct_from_registry(name=name, registry=norms, pre_validation_function=torch.nn.Module, kwargs=kwargs)
|
9 |
+
|
10 |
+
def build_ffn(name: str, d_model: int, expansion_ratio: float, device: Optional[str], bias: bool, ffn_kwargs: dict[str, Any]):
|
11 |
+
registry_to_use = ffns
|
12 |
+
if name in ffns_with_norm:
|
13 |
+
registry_to_use = ffns_with_norm
|
14 |
+
if name in ffns_with_megablocks:
|
15 |
+
registry_to_use = ffns_with_megablocks
|
16 |
+
kwargs = {'d_model': d_model, 'expansion_ratio': expansion_ratio, 'device': device, 'bias': bias, **{k: v for k, v in ffn_kwargs.items() if k != 'ffn_type'}}
|
17 |
+
|
18 |
+
def _validation_function(maybe_module: Any):
|
19 |
+
if not isinstance(maybe_module, torch.nn.Module):
|
20 |
+
raise ValueError(f'Function {name} must return a torch.nn.Module.')
|
21 |
+
result = construct_from_registry(name=name, registry=registry_to_use, post_validation_function=_validation_function, partial_function=False, kwargs=kwargs)
|
22 |
+
if name in ffns_with_norm:
|
23 |
+
result._has_norm = True
|
24 |
+
if name in ffns_with_megablocks:
|
25 |
+
result._uses_megablocks = True
|
26 |
+
return result
|
27 |
+
|
28 |
+
def build_attention_layer(name: str, attn_kwargs: dict[str, Any]):
|
29 |
+
return construct_from_registry(name=name, registry=attention_classes, pre_validation_function=torch.nn.Module, kwargs=attn_kwargs)
|
30 |
+
|
31 |
+
def build_fc(name: str, in_features: int, out_features: int, fc_kwargs: dict[str, Any]):
|
32 |
+
kwargs = {'in_features': in_features, 'out_features': out_features, **{k: v for k, v in fc_kwargs.items() if k != 'name'}}
|
33 |
+
return construct_from_registry(name=name, registry=fcs, pre_validation_function=torch.nn.Module, kwargs=kwargs)
|
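A hedged usage sketch of the builders above (not part of the uploaded file). The `'layernorm'` and `'mptmlp'` registry keys and the exact builder kwargs are assumptions based on the norms/ffns registries this repo references; shapes are illustrative.

import torch

from .layer_builders import build_ffn, build_norm

norm = build_norm(name='layernorm', normalized_shape=512, eps=1e-5, device='cpu')
ffn = build_ffn(
    name='mptmlp',                      # registered in ffn.py above
    d_model=512,
    expansion_ratio=4.0,
    device='cpu',
    bias=True,
    ffn_kwargs={'ffn_type': 'mptmlp'},  # 'ffn_type' is stripped before the builder call
)
x = torch.randn(2, 16, 512)
print(norm(x).shape, ffn(x).shape)      # both expected to be torch.Size([2, 16, 512])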
layers_registry.py
ADDED
@@ -0,0 +1,22 @@
|
1 |
+
from typing import Callable
|
2 |
+
import torch
|
3 |
+
from .registry_utils import create_registry
|
4 |
+
_norms_description = 'The norms registry is used to register classes that implement normalization layers.\n\n One example of this is torch.nn.LayerNorm. See norm.py for examples.\n\n Args:\n normalized_shape Union[int, List[int], torch.Size]: The shape of the input tensor.\n device: Optional[torch.device]: The device to use for the normalization layer.\n\n Returns:\n torch.nn.Module: The normalization layer.\n '
|
5 |
+
norms = create_registry('llmfoundry', 'norms', generic_type=type[torch.nn.Module], entry_points=True, description=_norms_description)
|
6 |
+
_fcs_description = 'The fcs registry is used to register classes that implement fully connected layers (i.e. torch.nn.Linear).\n\n See fc.py for examples.\n\n Args:\n in_features: int: The number of input features.\n out_features: int: The number of output features.\n kwargs: Dict[str, Any]: Additional keyword arguments to pass to the layer.\n\n Returns:\n torch.nn.Module: The fully connected layer.\n '
|
7 |
+
fcs = create_registry('llmfoundry', 'fcs', generic_type=type[torch.nn.Module], entry_points=True, description=_fcs_description)
|
8 |
+
_ffns_description = 'The ffns registry is used to register functions that build FFN layers.\n\n These layers are generally composed of fc layers and activation functions.\n One example is MPTMLP. See ffn.py for examples.\n\n Args:\n d_model: int: The size of the input and output tensors.\n expansion_ratio: float: The expansion ratio for the hidden layer.\n device: Optional[str]: The device to use for the layer.\n bias: bool: Whether or not to include a bias term.\n kwargs: Dict[str, Any]: Additional keyword arguments to pass to the layer.\n\n Returns:\n torch.nn.Module: The FFN layer.\n '
|
9 |
+
ffns = create_registry('llmfoundry', 'ffns', generic_type=Callable, entry_points=True, description=_ffns_description)
|
10 |
+
_ffns_with_norm_description = 'The ffns_with_norm registry is used to register functions that build FFN layers with normalization.\n\n The resulting layer will have ._has_norm set on it.\n One example is te.LayerNormMLP. See ffn.py for examples.\n\n Args:\n d_model: int: The size of the input and output tensors.\n expansion_ratio: float: The expansion ratio for the hidden layer.\n device: Optional[str]: The device to use for the layer.\n bias: bool: Whether or not to include a bias term.\n kwargs: Dict[str, Any]: Additional keyword arguments to pass to the layer.\n\n Returns:\n torch.nn.Module: The FFN layer.\n '
|
11 |
+
ffns_with_norm = create_registry('llmfoundry', 'ffns_with_norm', generic_type=Callable, entry_points=True, description=_ffns_with_norm_description)
|
12 |
+
_ffns_with_megablocks_description = 'The ffns_with_megablocks registry is used to register functions that build ffn layers using MegaBlocks.' + 'See ffn.py for examples.'
|
13 |
+
_ffns_with_megablocks_description = 'The ffns_with_megablocks registry is used to register functions that build FFN layers using MegaBlocks.\n\n The resulting layer will have ._uses_megablocks set on it.\n One example is megablocks.layers.dmoe.dMoE. See ffn.py for examples.\n\n Returns:\n torch.nn.Module: The FFN layer.\n '
|
14 |
+
ffns_with_megablocks = create_registry('llmfoundry', 'ffns_with_megablocks', generic_type=Callable, entry_points=True, description=_ffns_with_megablocks_description)
|
15 |
+
_attention_classes_description = 'The attention_classes registry is used to register classes that implement attention layers.\n\n The kwargs are passed directly to the constructor of the class.\n One example is GroupedQueryAttention. See attention.py for examples.\n\n Args:\n kwargs: Dict[str, Any]: Additional keyword arguments to pass to the layer.\n\n Returns:\n torch.nn.Module: The attention layer.\n '
|
16 |
+
attention_classes = create_registry('llmfoundry', 'attention_classes', generic_type=type[torch.nn.Module], entry_points=True, description=_attention_classes_description)
|
17 |
+
_attention_implementations_description = "The attention_implementations registry is used to register functions that implement the attention operation.\n\n One example is 'flash'. See attention.py for examples.\n\n Args:\n query (torch.Tensor): The query tensor.\n key (torch.Tensor): The key tensor.\n value (torch.Tensor): The value tensor.\n n_heads (int): The number of attention heads.\n kv_n_heads (int): The number of attention heads for the key and value tensors.\n past_key_value (Optional[tuple[torch.Tensor, torch.Tensor]]): The past key and value tensors.\n softmax_scale (Optional[float]) = None\n attn_bias (Optional[torch.Tensor]) = None\n is_causal (bool) = False\n dropout_p (float) = 0.0\n training (bool) = True\n needs_weights (bool) = False\n kwargs: Dict[str, Any]: Additional keyword arguments the implementation accepts.\n\n Returns:\n tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:\n The output tensor, the attention weights, and the past key and value tensors.\n "
|
18 |
+
attention_implementations = create_registry('llmfoundry', 'attention_implementations', generic_type=Callable, entry_points=True, description=_attention_implementations_description)
|
19 |
+
_param_init_fns_description = "The param_init_fns registry is used to register functions that initialize parameters.\n\n These functions should take in a torch.nn.Module, additional kwargs, and initialize the parameters of the module.\n Generally they can call generic_param_init_fn_ with an appropriate partial function. See param_init_fns.py for examples.\n\n Note: These functions should take in arbitrary kwargs, and discard any they don't need.\n\n Args:\n module: torch.nn.Module: The module to initialize.\n kwargs: Dict[str, Any]: Additional keyword arguments to use for initialization.\n "
|
20 |
+
param_init_fns = create_registry('llmfoundry', 'param_init_fns', generic_type=Callable[..., None], entry_points=True, description=_param_init_fns_description)
|
21 |
+
_module_init_fns_description = 'The module_init_fns registry is used to register functions that initialize specific modules.\n\n These functions should return True if they initialize the module, and False otherwise.\n This allows them to be called without knowing their contents. They should take in the module and additional kwargs.\n If multiple functions can initialize the module, the one that is registered first will be used, so it is recommended to\n override an existing function if you want to change existing initialization behavior, and add new functions if you have new\n layer types. See param_init_fns.py for details.\n\n Args:\n module: torch.nn.Module: The module to initialize.\n kwargs: Dict[str, Any]: Additional keyword arguments to use for initialization.\n\n Returns:\n bool: Whether or not the module was initialized.\n '
|
22 |
+
module_init_fns = create_registry('llmfoundry', 'module_init_fns', generic_type=Callable[..., bool], entry_points=True, description=_module_init_fns_description)
|
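A sketch of extending one of these registries (not part of the uploaded file), following the module_init_fns description above: the function receives a module plus arbitrary kwargs and returns True only when it handled the module. The `register(name, func=...)` call pattern is assumed to match the one used for the ffns registry in ffn.py; `MyCustomLayer` is hypothetical.

import torch

from .layers_registry import module_init_fns  # when extending from another module

def my_custom_layer_init(module: torch.nn.Module, **kwargs) -> bool:
    # Only claim modules of the (hypothetical) custom layer type.
    if module.__class__.__name__ != 'MyCustomLayer':
        return False
    for param in module.parameters():
        if param.dim() > 1:
            torch.nn.init.xavier_uniform_(param)
        else:
            torch.nn.init.zeros_(param)
    return True

module_init_fns.register('my_custom_layer_init', func=my_custom_layer_init)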
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model-00001-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81285ba85106fae1bd4888299ae53c8b0d9d7931fd4fe55b63586217f4f17686
|
3 |
+
size 984249104
|
model-00002-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:01e48f4e42af9c0ca90ce8126815ff3b908cd88550c651bc7ce6d9fe8d66257c
|
3 |
+
size 988048544
|
model-00003-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f8bc0d3fcaca8bdfccf6550339ccb9c0409b15a5396261ca181f76a02eb43947
|
3 |
+
size 975467784
|
model-00004-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3d4f3ef10def6e5d48bb5adf4d016574116c410cbfe90568abd417e57c63abde
|
3 |
+
size 931425488
|
model-00005-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2352c5ee8b3fa13b3ec3d3b07e6e2caa9b2a9990bcccaf57e02d6b89b667eafe
|
3 |
+
size 931413312
|
model-00006-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6cef8dff3c993f5a28c5e3ea92430a77f5a2ade4ed7547307ffb7b2063f9d977
|
3 |
+
size 988048592
|
model-00007-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d24632b50ec49b2253c989a627771dfabe2e9ecb42e455deffbe198244fbaa1e
|
3 |
+
size 774080720
|
model.safetensors.index.json
ADDED
@@ -0,0 +1,289 @@
|
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"total_size": 6572701696
|
4 |
+
},
|
5 |
+
"weight_map": {
|
6 |
+
"transformer.blocks.0.attn.Wqkv.bias": "model-00001-of-00007.safetensors",
|
7 |
+
"transformer.blocks.0.attn.Wqkv.weight": "model-00001-of-00007.safetensors",
|
8 |
+
"transformer.blocks.0.attn.out_proj.bias": "model-00001-of-00007.safetensors",
|
9 |
+
"transformer.blocks.0.attn.out_proj.weight": "model-00001-of-00007.safetensors",
|
10 |
+
"transformer.blocks.0.ffn.down_proj.bias": "model-00002-of-00007.safetensors",
|
11 |
+
"transformer.blocks.0.ffn.down_proj.weight": "model-00002-of-00007.safetensors",
|
12 |
+
"transformer.blocks.0.ffn.up_proj.bias": "model-00002-of-00007.safetensors",
|
13 |
+
"transformer.blocks.0.ffn.up_proj.weight": "model-00002-of-00007.safetensors",
|
14 |
+
"transformer.blocks.0.norm_1.weight": "model-00001-of-00007.safetensors",
|
15 |
+
"transformer.blocks.0.norm_2.weight": "model-00001-of-00007.safetensors",
|
16 |
+
"transformer.blocks.1.attn.Wqkv.bias": "model-00002-of-00007.safetensors",
|
17 |
+
"transformer.blocks.1.attn.Wqkv.weight": "model-00002-of-00007.safetensors",
|
18 |
+
"transformer.blocks.1.attn.out_proj.bias": "model-00002-of-00007.safetensors",
|
19 |
+
"transformer.blocks.1.attn.out_proj.weight": "model-00002-of-00007.safetensors",
|
20 |
+
"transformer.blocks.1.ffn.down_proj.bias": "model-00002-of-00007.safetensors",
|
21 |
+
"transformer.blocks.1.ffn.down_proj.weight": "model-00002-of-00007.safetensors",
|
22 |
+
"transformer.blocks.1.ffn.up_proj.bias": "model-00002-of-00007.safetensors",
|
23 |
+
"transformer.blocks.1.ffn.up_proj.weight": "model-00002-of-00007.safetensors",
|
24 |
+
"transformer.blocks.1.norm_1.weight": "model-00002-of-00007.safetensors",
|
25 |
+
"transformer.blocks.1.norm_2.weight": "model-00002-of-00007.safetensors",
|
26 |
+
"transformer.blocks.10.attn.Wqkv.bias": "model-00004-of-00007.safetensors",
|
27 |
+
"transformer.blocks.10.attn.Wqkv.weight": "model-00004-of-00007.safetensors",
|
28 |
+
"transformer.blocks.10.attn.out_proj.bias": "model-00004-of-00007.safetensors",
|
29 |
+
"transformer.blocks.10.attn.out_proj.weight": "model-00004-of-00007.safetensors",
|
30 |
+
"transformer.blocks.10.ffn.down_proj.bias": "model-00004-of-00007.safetensors",
|
31 |
+
"transformer.blocks.10.ffn.down_proj.weight": "model-00004-of-00007.safetensors",
|
32 |
+
"transformer.blocks.10.ffn.up_proj.bias": "model-00004-of-00007.safetensors",
|
33 |
+
"transformer.blocks.10.ffn.up_proj.weight": "model-00004-of-00007.safetensors",
|
34 |
+
"transformer.blocks.10.norm_1.weight": "model-00003-of-00007.safetensors",
|
35 |
+
"transformer.blocks.10.norm_2.weight": "model-00004-of-00007.safetensors",
|
36 |
+
"transformer.blocks.11.attn.Wqkv.bias": "model-00004-of-00007.safetensors",
|
37 |
+
"transformer.blocks.11.attn.Wqkv.weight": "model-00004-of-00007.safetensors",
|
38 |
+
"transformer.blocks.11.attn.out_proj.bias": "model-00004-of-00007.safetensors",
|
39 |
+
"transformer.blocks.11.attn.out_proj.weight": "model-00004-of-00007.safetensors",
|
40 |
+
"transformer.blocks.11.ffn.down_proj.bias": "model-00004-of-00007.safetensors",
|
41 |
+
"transformer.blocks.11.ffn.down_proj.weight": "model-00004-of-00007.safetensors",
|
42 |
+
"transformer.blocks.11.ffn.up_proj.bias": "model-00004-of-00007.safetensors",
|
43 |
+
"transformer.blocks.11.ffn.up_proj.weight": "model-00004-of-00007.safetensors",
|
44 |
+
"transformer.blocks.11.norm_1.weight": "model-00004-of-00007.safetensors",
|
45 |
+
"transformer.blocks.11.norm_2.weight": "model-00004-of-00007.safetensors",
|
46 |
+
"transformer.blocks.12.attn.Wqkv.bias": "model-00004-of-00007.safetensors",
|
47 |
+
"transformer.blocks.12.attn.Wqkv.weight": "model-00004-of-00007.safetensors",
|
48 |
+
"transformer.blocks.12.attn.out_proj.bias": "model-00004-of-00007.safetensors",
|
49 |
+
"transformer.blocks.12.attn.out_proj.weight": "model-00004-of-00007.safetensors",
|
50 |
+
"transformer.blocks.12.ffn.down_proj.bias": "model-00004-of-00007.safetensors",
|
51 |
+
"transformer.blocks.12.ffn.down_proj.weight": "model-00004-of-00007.safetensors",
|
52 |
+
"transformer.blocks.12.ffn.up_proj.bias": "model-00004-of-00007.safetensors",
|
53 |
+
"transformer.blocks.12.ffn.up_proj.weight": "model-00004-of-00007.safetensors",
|
54 |
+
"transformer.blocks.12.norm_1.weight": "model-00004-of-00007.safetensors",
|
55 |
+
"transformer.blocks.12.norm_2.weight": "model-00004-of-00007.safetensors",
|
56 |
+
"transformer.blocks.13.attn.Wqkv.bias": "model-00004-of-00007.safetensors",
|
57 |
+
"transformer.blocks.13.attn.Wqkv.weight": "model-00004-of-00007.safetensors",
|
58 |
+
"transformer.blocks.13.attn.out_proj.bias": "model-00004-of-00007.safetensors",
|
59 |
+
"transformer.blocks.13.attn.out_proj.weight": "model-00004-of-00007.safetensors",
|
60 |
+
"transformer.blocks.13.ffn.down_proj.bias": "model-00004-of-00007.safetensors",
|
61 |
+
"transformer.blocks.13.ffn.down_proj.weight": "model-00004-of-00007.safetensors",
|
62 |
+
"transformer.blocks.13.ffn.up_proj.bias": "model-00004-of-00007.safetensors",
|
63 |
+
"transformer.blocks.13.ffn.up_proj.weight": "model-00004-of-00007.safetensors",
|
64 |
+
"transformer.blocks.13.norm_1.weight": "model-00004-of-00007.safetensors",
|
65 |
+
"transformer.blocks.13.norm_2.weight": "model-00004-of-00007.safetensors",
|
66 |
+
"transformer.blocks.14.attn.Wqkv.bias": "model-00004-of-00007.safetensors",
|
67 |
+
"transformer.blocks.14.attn.Wqkv.weight": "model-00004-of-00007.safetensors",
|
68 |
+
"transformer.blocks.14.attn.out_proj.bias": "model-00004-of-00007.safetensors",
|
69 |
+
"transformer.blocks.14.attn.out_proj.weight": "model-00004-of-00007.safetensors",
|
70 |
+
"transformer.blocks.14.ffn.down_proj.bias": "model-00005-of-00007.safetensors",
|
71 |
+
"transformer.blocks.14.ffn.down_proj.weight": "model-00005-of-00007.safetensors",
|
72 |
+
"transformer.blocks.14.ffn.up_proj.bias": "model-00004-of-00007.safetensors",
|
73 |
+
"transformer.blocks.14.ffn.up_proj.weight": "model-00004-of-00007.safetensors",
|
74 |
+
"transformer.blocks.14.norm_1.weight": "model-00004-of-00007.safetensors",
|
75 |
+
"transformer.blocks.14.norm_2.weight": "model-00004-of-00007.safetensors",
|
76 |
+
"transformer.blocks.15.attn.Wqkv.bias": "model-00005-of-00007.safetensors",
|
77 |
+
"transformer.blocks.15.attn.Wqkv.weight": "model-00005-of-00007.safetensors",
|
78 |
+
"transformer.blocks.15.attn.out_proj.bias": "model-00005-of-00007.safetensors",
|
79 |
+
"transformer.blocks.15.attn.out_proj.weight": "model-00005-of-00007.safetensors",
|
80 |
+
"transformer.blocks.15.ffn.down_proj.bias": "model-00005-of-00007.safetensors",
|
81 |
+
"transformer.blocks.15.ffn.down_proj.weight": "model-00005-of-00007.safetensors",
|
82 |
+
"transformer.blocks.15.ffn.up_proj.bias": "model-00005-of-00007.safetensors",
|
83 |
+
"transformer.blocks.15.ffn.up_proj.weight": "model-00005-of-00007.safetensors",
|
84 |
+
"transformer.blocks.15.norm_1.weight": "model-00005-of-00007.safetensors",
|
85 |
+
"transformer.blocks.15.norm_2.weight": "model-00005-of-00007.safetensors",
|
86 |
+
"transformer.blocks.16.attn.Wqkv.bias": "model-00005-of-00007.safetensors",
|
87 |
+
"transformer.blocks.16.attn.Wqkv.weight": "model-00005-of-00007.safetensors",
|
88 |
+
"transformer.blocks.16.attn.out_proj.bias": "model-00005-of-00007.safetensors",
|
89 |
+
"transformer.blocks.16.attn.out_proj.weight": "model-00005-of-00007.safetensors",
|
90 |
+
"transformer.blocks.16.ffn.down_proj.bias": "model-00005-of-00007.safetensors",
|
91 |
+
"transformer.blocks.16.ffn.down_proj.weight": "model-00005-of-00007.safetensors",
|
92 |
+
"transformer.blocks.16.ffn.up_proj.bias": "model-00005-of-00007.safetensors",
|
93 |
+
"transformer.blocks.16.ffn.up_proj.weight": "model-00005-of-00007.safetensors",
|
94 |
+
"transformer.blocks.16.norm_1.weight": "model-00005-of-00007.safetensors",
|
95 |
+
"transformer.blocks.16.norm_2.weight": "model-00005-of-00007.safetensors",
|
96 |
+
"transformer.blocks.17.attn.Wqkv.bias": "model-00005-of-00007.safetensors",
|
97 |
+
"transformer.blocks.17.attn.Wqkv.weight": "model-00005-of-00007.safetensors",
|
98 |
+
"transformer.blocks.17.attn.out_proj.bias": "model-00005-of-00007.safetensors",
|
99 |
+
"transformer.blocks.17.attn.out_proj.weight": "model-00005-of-00007.safetensors",
|
100 |
+
"transformer.blocks.17.ffn.down_proj.bias": "model-00005-of-00007.safetensors",
|
101 |
+
"transformer.blocks.17.ffn.down_proj.weight": "model-00005-of-00007.safetensors",
|
102 |
+
"transformer.blocks.17.ffn.up_proj.bias": "model-00005-of-00007.safetensors",
|
103 |
+
"transformer.blocks.17.ffn.up_proj.weight": "model-00005-of-00007.safetensors",
|
104 |
+
"transformer.blocks.17.norm_1.weight": "model-00005-of-00007.safetensors",
|
105 |
+
"transformer.blocks.17.norm_2.weight": "model-00005-of-00007.safetensors",
|
106 |
+
"transformer.blocks.18.attn.Wqkv.bias": "model-00005-of-00007.safetensors",
|
107 |
+
"transformer.blocks.18.attn.Wqkv.weight": "model-00005-of-00007.safetensors",
|
108 |
+
"transformer.blocks.18.attn.out_proj.bias": "model-00005-of-00007.safetensors",
|
109 |
+
"transformer.blocks.18.attn.out_proj.weight": "model-00005-of-00007.safetensors",
|
110 |
+
"transformer.blocks.18.ffn.down_proj.bias": "model-00005-of-00007.safetensors",
|
111 |
+
"transformer.blocks.18.ffn.down_proj.weight": "model-00005-of-00007.safetensors",
|
112 |
+
"transformer.blocks.18.ffn.up_proj.bias": "model-00005-of-00007.safetensors",
|
113 |
+
"transformer.blocks.18.ffn.up_proj.weight": "model-00005-of-00007.safetensors",
|
114 |
+
"transformer.blocks.18.norm_1.weight": "model-00005-of-00007.safetensors",
|
115 |
+
"transformer.blocks.18.norm_2.weight": "model-00005-of-00007.safetensors",
|
116 |
+
"transformer.blocks.19.attn.Wqkv.bias": "model-00005-of-00007.safetensors",
|
117 |
+
"transformer.blocks.19.attn.Wqkv.weight": "model-00005-of-00007.safetensors",
|
118 |
+
"transformer.blocks.19.attn.out_proj.bias": "model-00005-of-00007.safetensors",
|
119 |
+
"transformer.blocks.19.attn.out_proj.weight": "model-00005-of-00007.safetensors",
|
120 |
+
"transformer.blocks.19.ffn.down_proj.bias": "model-00006-of-00007.safetensors",
|
121 |
+
"transformer.blocks.19.ffn.down_proj.weight": "model-00006-of-00007.safetensors",
|
122 |
+
"transformer.blocks.19.ffn.up_proj.bias": "model-00006-of-00007.safetensors",
|
123 |
+
"transformer.blocks.19.ffn.up_proj.weight": "model-00006-of-00007.safetensors",
|
124 |
+
"transformer.blocks.19.norm_1.weight": "model-00005-of-00007.safetensors",
|
125 |
+
"transformer.blocks.19.norm_2.weight": "model-00005-of-00007.safetensors",
|
126 |
+
"transformer.blocks.2.attn.Wqkv.bias": "model-00002-of-00007.safetensors",
|
127 |
+
"transformer.blocks.2.attn.Wqkv.weight": "model-00002-of-00007.safetensors",
|
128 |
+
"transformer.blocks.2.attn.out_proj.bias": "model-00002-of-00007.safetensors",
|
129 |
+
"transformer.blocks.2.attn.out_proj.weight": "model-00002-of-00007.safetensors",
|
130 |
+
"transformer.blocks.2.ffn.down_proj.bias": "model-00002-of-00007.safetensors",
|
131 |
+
"transformer.blocks.2.ffn.down_proj.weight": "model-00002-of-00007.safetensors",
|
132 |
+
"transformer.blocks.2.ffn.up_proj.bias": "model-00002-of-00007.safetensors",
|
133 |
+
"transformer.blocks.2.ffn.up_proj.weight": "model-00002-of-00007.safetensors",
|
134 |
+
"transformer.blocks.2.norm_1.weight": "model-00002-of-00007.safetensors",
|
135 |
+
"transformer.blocks.2.norm_2.weight": "model-00002-of-00007.safetensors",
|
136 |
+
"transformer.blocks.20.attn.Wqkv.bias": "model-00006-of-00007.safetensors",
|
137 |
+
"transformer.blocks.20.attn.Wqkv.weight": "model-00006-of-00007.safetensors",
|
138 |
+
"transformer.blocks.20.attn.out_proj.bias": "model-00006-of-00007.safetensors",
|
139 |
+
"transformer.blocks.20.attn.out_proj.weight": "model-00006-of-00007.safetensors",
|
140 |
+
"transformer.blocks.20.ffn.down_proj.bias": "model-00006-of-00007.safetensors",
|
141 |
+
"transformer.blocks.20.ffn.down_proj.weight": "model-00006-of-00007.safetensors",
|
142 |
+
"transformer.blocks.20.ffn.up_proj.bias": "model-00006-of-00007.safetensors",
|
143 |
+
"transformer.blocks.20.ffn.up_proj.weight": "model-00006-of-00007.safetensors",
|
144 |
+
"transformer.blocks.20.norm_1.weight": "model-00006-of-00007.safetensors",
|
145 |
+
"transformer.blocks.20.norm_2.weight": "model-00006-of-00007.safetensors",
|
146 |
+
"transformer.blocks.21.attn.Wqkv.bias": "model-00006-of-00007.safetensors",
|
147 |
+
"transformer.blocks.21.attn.Wqkv.weight": "model-00006-of-00007.safetensors",
|
148 |
+
"transformer.blocks.21.attn.out_proj.bias": "model-00006-of-00007.safetensors",
|
149 |
+
"transformer.blocks.21.attn.out_proj.weight": "model-00006-of-00007.safetensors",
|
150 |
+
"transformer.blocks.21.ffn.down_proj.bias": "model-00006-of-00007.safetensors",
|
151 |
+
"transformer.blocks.21.ffn.down_proj.weight": "model-00006-of-00007.safetensors",
|
152 |
+
"transformer.blocks.21.ffn.up_proj.bias": "model-00006-of-00007.safetensors",
|
153 |
+
"transformer.blocks.21.ffn.up_proj.weight": "model-00006-of-00007.safetensors",
|
154 |
+
"transformer.blocks.21.norm_1.weight": "model-00006-of-00007.safetensors",
|
155 |
+
"transformer.blocks.21.norm_2.weight": "model-00006-of-00007.safetensors",
|
156 |
+
"transformer.blocks.22.attn.Wqkv.bias": "model-00006-of-00007.safetensors",
|
157 |
+
"transformer.blocks.22.attn.Wqkv.weight": "model-00006-of-00007.safetensors",
|
158 |
+
"transformer.blocks.22.attn.out_proj.bias": "model-00006-of-00007.safetensors",
|
159 |
+
"transformer.blocks.22.attn.out_proj.weight": "model-00006-of-00007.safetensors",
|
160 |
+
"transformer.blocks.22.ffn.down_proj.bias": "model-00006-of-00007.safetensors",
|
161 |
+
"transformer.blocks.22.ffn.down_proj.weight": "model-00006-of-00007.safetensors",
|
162 |
+
"transformer.blocks.22.ffn.up_proj.bias": "model-00006-of-00007.safetensors",
|
163 |
+
"transformer.blocks.22.ffn.up_proj.weight": "model-00006-of-00007.safetensors",
|
164 |
+
"transformer.blocks.22.norm_1.weight": "model-00006-of-00007.safetensors",
|
165 |
+
"transformer.blocks.22.norm_2.weight": "model-00006-of-00007.safetensors",
|
166 |
+
"transformer.blocks.23.attn.Wqkv.bias": "model-00006-of-00007.safetensors",
|
167 |
+
"transformer.blocks.23.attn.Wqkv.weight": "model-00006-of-00007.safetensors",
|
168 |
+
"transformer.blocks.23.attn.out_proj.bias": "model-00006-of-00007.safetensors",
|
169 |
+
"transformer.blocks.23.attn.out_proj.weight": "model-00006-of-00007.safetensors",
|
170 |
+
"transformer.blocks.23.ffn.down_proj.bias": "model-00006-of-00007.safetensors",
|
171 |
+
"transformer.blocks.23.ffn.down_proj.weight": "model-00006-of-00007.safetensors",
|
172 |
+
"transformer.blocks.23.ffn.up_proj.bias": "model-00006-of-00007.safetensors",
|
173 |
+
"transformer.blocks.23.ffn.up_proj.weight": "model-00006-of-00007.safetensors",
|
174 |
+
"transformer.blocks.23.norm_1.weight": "model-00006-of-00007.safetensors",
|
175 |
+
"transformer.blocks.23.norm_2.weight": "model-00006-of-00007.safetensors",
|
176 |
+
"transformer.blocks.24.attn.Wqkv.bias": "model-00006-of-00007.safetensors",
|
177 |
+
"transformer.blocks.24.attn.Wqkv.weight": "model-00006-of-00007.safetensors",
|
178 |
+
"transformer.blocks.24.attn.out_proj.bias": "model-00007-of-00007.safetensors",
|
179 |
+
"transformer.blocks.24.attn.out_proj.weight": "model-00007-of-00007.safetensors",
|
180 |
+
"transformer.blocks.24.ffn.down_proj.bias": "model-00007-of-00007.safetensors",
|
181 |
+
"transformer.blocks.24.ffn.down_proj.weight": "model-00007-of-00007.safetensors",
|
182 |
+
"transformer.blocks.24.ffn.up_proj.bias": "model-00007-of-00007.safetensors",
|
183 |
+
"transformer.blocks.24.ffn.up_proj.weight": "model-00007-of-00007.safetensors",
|
184 |
+
"transformer.blocks.24.norm_1.weight": "model-00006-of-00007.safetensors",
|
185 |
+
"transformer.blocks.24.norm_2.weight": "model-00007-of-00007.safetensors",
|
186 |
+
"transformer.blocks.25.attn.Wqkv.bias": "model-00007-of-00007.safetensors",
|
187 |
+
"transformer.blocks.25.attn.Wqkv.weight": "model-00007-of-00007.safetensors",
|
188 |
+
"transformer.blocks.25.attn.out_proj.bias": "model-00007-of-00007.safetensors",
|
189 |
+
"transformer.blocks.25.attn.out_proj.weight": "model-00007-of-00007.safetensors",
|
190 |
+
"transformer.blocks.25.ffn.down_proj.bias": "model-00007-of-00007.safetensors",
|
191 |
+
"transformer.blocks.25.ffn.down_proj.weight": "model-00007-of-00007.safetensors",
|
192 |
+
"transformer.blocks.25.ffn.up_proj.bias": "model-00007-of-00007.safetensors",
|
193 |
+
"transformer.blocks.25.ffn.up_proj.weight": "model-00007-of-00007.safetensors",
|
194 |
+
"transformer.blocks.25.norm_1.weight": "model-00007-of-00007.safetensors",
|
195 |
+
"transformer.blocks.25.norm_2.weight": "model-00007-of-00007.safetensors",
|
196 |
+
"transformer.blocks.26.attn.Wqkv.bias": "model-00007-of-00007.safetensors",
|
197 |
+
"transformer.blocks.26.attn.Wqkv.weight": "model-00007-of-00007.safetensors",
|
198 |
+
"transformer.blocks.26.attn.out_proj.bias": "model-00007-of-00007.safetensors",
|
199 |
+
"transformer.blocks.26.attn.out_proj.weight": "model-00007-of-00007.safetensors",
|
200 |
+
"transformer.blocks.26.ffn.down_proj.bias": "model-00007-of-00007.safetensors",
|
201 |
+
"transformer.blocks.26.ffn.down_proj.weight": "model-00007-of-00007.safetensors",
|
202 |
+
"transformer.blocks.26.ffn.up_proj.bias": "model-00007-of-00007.safetensors",
|
203 |
+
"transformer.blocks.26.ffn.up_proj.weight": "model-00007-of-00007.safetensors",
|
204 |
+
"transformer.blocks.26.norm_1.weight": "model-00007-of-00007.safetensors",
|
205 |
+
"transformer.blocks.26.norm_2.weight": "model-00007-of-00007.safetensors",
|
206 |
+
"transformer.blocks.27.attn.Wqkv.bias": "model-00007-of-00007.safetensors",
|
207 |
+
"transformer.blocks.27.attn.Wqkv.weight": "model-00007-of-00007.safetensors",
|
208 |
+
"transformer.blocks.27.attn.out_proj.bias": "model-00007-of-00007.safetensors",
|
209 |
+
"transformer.blocks.27.attn.out_proj.weight": "model-00007-of-00007.safetensors",
|
210 |
+
"transformer.blocks.27.ffn.down_proj.bias": "model-00007-of-00007.safetensors",
|
211 |
+
"transformer.blocks.27.ffn.down_proj.weight": "model-00007-of-00007.safetensors",
|
212 |
+
"transformer.blocks.27.ffn.up_proj.bias": "model-00007-of-00007.safetensors",
|
213 |
+
"transformer.blocks.27.ffn.up_proj.weight": "model-00007-of-00007.safetensors",
|
214 |
+
"transformer.blocks.27.norm_1.weight": "model-00007-of-00007.safetensors",
|
215 |
+
"transformer.blocks.27.norm_2.weight": "model-00007-of-00007.safetensors",
|
216 |
+
"transformer.blocks.3.attn.Wqkv.bias": "model-00002-of-00007.safetensors",
|
217 |
+
"transformer.blocks.3.attn.Wqkv.weight": "model-00002-of-00007.safetensors",
|
218 |
+
"transformer.blocks.3.attn.out_proj.bias": "model-00002-of-00007.safetensors",
|
219 |
+
"transformer.blocks.3.attn.out_proj.weight": "model-00002-of-00007.safetensors",
|
220 |
+
"transformer.blocks.3.ffn.down_proj.bias": "model-00002-of-00007.safetensors",
|
221 |
+
"transformer.blocks.3.ffn.down_proj.weight": "model-00002-of-00007.safetensors",
|
222 |
+
"transformer.blocks.3.ffn.up_proj.bias": "model-00002-of-00007.safetensors",
|
223 |
+
"transformer.blocks.3.ffn.up_proj.weight": "model-00002-of-00007.safetensors",
|
224 |
+
"transformer.blocks.3.norm_1.weight": "model-00002-of-00007.safetensors",
|
225 |
+
"transformer.blocks.3.norm_2.weight": "model-00002-of-00007.safetensors",
|
226 |
+
"transformer.blocks.4.attn.Wqkv.bias": "model-00002-of-00007.safetensors",
|
227 |
+
"transformer.blocks.4.attn.Wqkv.weight": "model-00002-of-00007.safetensors",
|
228 |
+
"transformer.blocks.4.attn.out_proj.bias": "model-00002-of-00007.safetensors",
|
229 |
+
"transformer.blocks.4.attn.out_proj.weight": "model-00002-of-00007.safetensors",
|
230 |
+
"transformer.blocks.4.ffn.down_proj.bias": "model-00002-of-00007.safetensors",
|
231 |
+
"transformer.blocks.4.ffn.down_proj.weight": "model-00002-of-00007.safetensors",
|
232 |
+
"transformer.blocks.4.ffn.up_proj.bias": "model-00002-of-00007.safetensors",
|
233 |
+
"transformer.blocks.4.ffn.up_proj.weight": "model-00002-of-00007.safetensors",
|
234 |
+
"transformer.blocks.4.norm_1.weight": "model-00002-of-00007.safetensors",
|
235 |
+
"transformer.blocks.4.norm_2.weight": "model-00002-of-00007.safetensors",
|
236 |
+
"transformer.blocks.5.attn.Wqkv.bias": "model-00002-of-00007.safetensors",
|
237 |
+
"transformer.blocks.5.attn.Wqkv.weight": "model-00002-of-00007.safetensors",
|
238 |
+
"transformer.blocks.5.attn.out_proj.bias": "model-00003-of-00007.safetensors",
|
239 |
+
"transformer.blocks.5.attn.out_proj.weight": "model-00003-of-00007.safetensors",
|
240 |
+
"transformer.blocks.5.ffn.down_proj.bias": "model-00003-of-00007.safetensors",
|
241 |
+
"transformer.blocks.5.ffn.down_proj.weight": "model-00003-of-00007.safetensors",
|
242 |
+
"transformer.blocks.5.ffn.up_proj.bias": "model-00003-of-00007.safetensors",
|
243 |
+
"transformer.blocks.5.ffn.up_proj.weight": "model-00003-of-00007.safetensors",
|
244 |
+
"transformer.blocks.5.norm_1.weight": "model-00002-of-00007.safetensors",
|
245 |
+
"transformer.blocks.5.norm_2.weight": "model-00003-of-00007.safetensors",
|
246 |
+
"transformer.blocks.6.attn.Wqkv.bias": "model-00003-of-00007.safetensors",
|
247 |
+
"transformer.blocks.6.attn.Wqkv.weight": "model-00003-of-00007.safetensors",
|
248 |
+
"transformer.blocks.6.attn.out_proj.bias": "model-00003-of-00007.safetensors",
|
249 |
+
"transformer.blocks.6.attn.out_proj.weight": "model-00003-of-00007.safetensors",
|
250 |
+
"transformer.blocks.6.ffn.down_proj.bias": "model-00003-of-00007.safetensors",
|
251 |
+
"transformer.blocks.6.ffn.down_proj.weight": "model-00003-of-00007.safetensors",
|
252 |
+
"transformer.blocks.6.ffn.up_proj.bias": "model-00003-of-00007.safetensors",
|
253 |
+
"transformer.blocks.6.ffn.up_proj.weight": "model-00003-of-00007.safetensors",
|
254 |
+
"transformer.blocks.6.norm_1.weight": "model-00003-of-00007.safetensors",
|
255 |
+
"transformer.blocks.6.norm_2.weight": "model-00003-of-00007.safetensors",
|
256 |
+
"transformer.blocks.7.attn.Wqkv.bias": "model-00003-of-00007.safetensors",
|
257 |
+
"transformer.blocks.7.attn.Wqkv.weight": "model-00003-of-00007.safetensors",
|
258 |
+
"transformer.blocks.7.attn.out_proj.bias": "model-00003-of-00007.safetensors",
|
259 |
+
"transformer.blocks.7.attn.out_proj.weight": "model-00003-of-00007.safetensors",
|
260 |
+
"transformer.blocks.7.ffn.down_proj.bias": "model-00003-of-00007.safetensors",
|
261 |
+
"transformer.blocks.7.ffn.down_proj.weight": "model-00003-of-00007.safetensors",
|
262 |
+
"transformer.blocks.7.ffn.up_proj.bias": "model-00003-of-00007.safetensors",
|
263 |
+
"transformer.blocks.7.ffn.up_proj.weight": "model-00003-of-00007.safetensors",
|
264 |
+
"transformer.blocks.7.norm_1.weight": "model-00003-of-00007.safetensors",
|
265 |
+
"transformer.blocks.7.norm_2.weight": "model-00003-of-00007.safetensors",
|
266 |
+
"transformer.blocks.8.attn.Wqkv.bias": "model-00003-of-00007.safetensors",
|
267 |
+
"transformer.blocks.8.attn.Wqkv.weight": "model-00003-of-00007.safetensors",
|
268 |
+
"transformer.blocks.8.attn.out_proj.bias": "model-00003-of-00007.safetensors",
|
269 |
+
"transformer.blocks.8.attn.out_proj.weight": "model-00003-of-00007.safetensors",
|
270 |
+
"transformer.blocks.8.ffn.down_proj.bias": "model-00003-of-00007.safetensors",
|
271 |
+
"transformer.blocks.8.ffn.down_proj.weight": "model-00003-of-00007.safetensors",
|
272 |
+
"transformer.blocks.8.ffn.up_proj.bias": "model-00003-of-00007.safetensors",
|
273 |
+
"transformer.blocks.8.ffn.up_proj.weight": "model-00003-of-00007.safetensors",
|
274 |
+
"transformer.blocks.8.norm_1.weight": "model-00003-of-00007.safetensors",
|
275 |
+
"transformer.blocks.8.norm_2.weight": "model-00003-of-00007.safetensors",
|
276 |
+
"transformer.blocks.9.attn.Wqkv.bias": "model-00003-of-00007.safetensors",
|
277 |
+
"transformer.blocks.9.attn.Wqkv.weight": "model-00003-of-00007.safetensors",
|
278 |
+
"transformer.blocks.9.attn.out_proj.bias": "model-00003-of-00007.safetensors",
|
279 |
+
"transformer.blocks.9.attn.out_proj.weight": "model-00003-of-00007.safetensors",
|
280 |
+
"transformer.blocks.9.ffn.down_proj.bias": "model-00003-of-00007.safetensors",
|
281 |
+
"transformer.blocks.9.ffn.down_proj.weight": "model-00003-of-00007.safetensors",
|
282 |
+
"transformer.blocks.9.ffn.up_proj.bias": "model-00003-of-00007.safetensors",
|
283 |
+
"transformer.blocks.9.ffn.up_proj.weight": "model-00003-of-00007.safetensors",
|
284 |
+
"transformer.blocks.9.norm_1.weight": "model-00003-of-00007.safetensors",
|
285 |
+
"transformer.blocks.9.norm_2.weight": "model-00003-of-00007.safetensors",
|
286 |
+
"transformer.norm_f.weight": "model-00007-of-00007.safetensors",
|
287 |
+
"transformer.wte.weight": "model-00001-of-00007.safetensors"
|
288 |
+
}
|
289 |
+
}
|
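The weight map above pins every parameter to one of the seven shards. A minimal sketch of reading it directly (not part of the upload); in practice `transformers`' `from_pretrained` resolves the shards automatically, and the local paths here are illustrative.

import json

from safetensors import safe_open

with open('model.safetensors.index.json') as f:
    index = json.load(f)

name = 'transformer.wte.weight'
shard = index['weight_map'][name]  # 'model-00001-of-00007.safetensors' per the map above
with safe_open(shard, framework='pt', device='cpu') as shard_file:
    tensor = shard_file.get_tensor(name)
print(name, tuple(tensor.shape))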
modeling_mpt.py
ADDED
@@ -0,0 +1,696 @@
|
1 |
+
"""A simple, flexible implementation of a GPT model.
|
2 |
+
|
3 |
+
Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
|
4 |
+
"""
|
5 |
+
from __future__ import annotations
|
6 |
+
from .dmoe import _UniformExpertAssignment
|
7 |
+
from .ffn import quickgelu_activation
|
8 |
+
from .config_defaults import *
|
9 |
+
from .registry_utils import TypedRegistry
|
10 |
+
from .warnings import VersionedDeprecationWarning
|
11 |
+
import copy
|
12 |
+
import math
|
13 |
+
import warnings
|
14 |
+
from functools import cached_property
|
15 |
+
from typing import Any, Mapping, MutableMapping, Optional, Union
|
16 |
+
import torch
|
17 |
+
import torch.nn as nn
|
18 |
+
import torch.nn.functional as F
|
19 |
+
from tabulate import tabulate
|
20 |
+
from .layers_registry import ffns_with_megablocks
|
21 |
+
from .attention import is_flash_v2_installed
|
22 |
+
if is_flash_v2_installed():
|
23 |
+
try:
|
24 |
+
from flash_attn import bert_padding
|
25 |
+
from flash_attn.layers.rotary import RotaryEmbedding as DAILRotaryEmbedding
|
26 |
+
except Exception as e:
|
27 |
+
raise e
|
28 |
+
import logging
|
29 |
+
from transformers import PreTrainedModel, PreTrainedTokenizerBase
|
30 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
31 |
+
from transformers.models.llama.modeling_llama import LlamaConfig, LlamaRotaryEmbedding
|
32 |
+
from .layers_registry import norms, param_init_fns
|
33 |
+
from .attention import attn_bias_shape, build_attn_bias, gen_slopes
|
34 |
+
from .blocks import MPTBlock
|
35 |
+
from .custom_embedding import SharedEmbedding
|
36 |
+
from .layer_builders import build_norm
|
37 |
+
from .configuration_mpt import MPTConfig
|
38 |
+
from .act_ckpt import build_act_ckpt_mod_to_blocks, check_mapping_blocks_overlap, pass_on_block_idx
|
39 |
+
from .config_moe_args import config_moe_args
|
40 |
+
from .mpt_param_count import mpt_get_active_params, mpt_get_total_params
|
41 |
+
from .fc import fcs
|
42 |
+
from .param_init_fns import generic_param_init_fn_
|
43 |
+
from .norm import LPLayerNorm
|
44 |
+
log = logging.getLogger(__name__)
|
45 |
+
CROSS_ENTROPY_IGNORE_INDEX = -100
|
46 |
+
|
47 |
+
class InvalidConfigAccessError(KeyError):
|
48 |
+
pass
|
49 |
+
_ALLOWED_LLAMA_CONFIG_KEYS = {'rope_scaling', 'rope_theta', 'max_position_embeddings', 'hidden_size', 'num_attention_heads', 'partial_rotary_factor', 'head_dim', '_get_generation_defaults', 'label2id', 'id2label', 'torch_dtype', 'problem_type', '__class__', '_get_global_generation_defaults'}
|
50 |
+
|
51 |
+
class PartialLlamaConfig(LlamaConfig):
|
52 |
+
"""Holds the rope config for Llama models and throws.
|
53 |
+
|
54 |
+
Throws an `InvalidConfigAccessError` if any other config elements are read. This
|
55 |
+
class is necessary because the `LlamaRotaryEmbedding` class takes a full
|
56 |
+
`LlamaConfig` now instead of the old keyword arguments.
|
57 |
+
"""
|
58 |
+
|
59 |
+
def __getattribute__(self, key: str):
|
60 |
+
if key not in _ALLOWED_LLAMA_CONFIG_KEYS:
|
61 |
+
raise InvalidConfigAccessError(key)
|
62 |
+
return super().__getattribute__(key)
|
63 |
+
|
64 |
+
def __getitem__(self, key: str):
|
65 |
+
if key not in _ALLOWED_LLAMA_CONFIG_KEYS:
|
66 |
+
raise InvalidConfigAccessError(key)
|
67 |
+
return super().__getitem__(key)
|
68 |
+
|
69 |
+
def gen_rotary_embedding(rope_impl: str, rope_theta: int, rope_dail_config: dict, rope_hf_config: dict, max_seq_len: int, d_model: int, n_heads: int):
|
70 |
+
rope_head_dim = d_model // n_heads
|
71 |
+
if rope_impl == 'dail':
|
72 |
+
return DAILRotaryEmbedding(dim=rope_head_dim, base=rope_theta, interleaved=False, scale_base=rope_dail_config['xpos_scale_base'] if rope_dail_config['type'] == 'xpos' else None, pos_idx_in_fp32=rope_dail_config['pos_idx_in_fp32'], device='cpu')
|
73 |
+
elif rope_impl == 'hf':
|
74 |
+
llama_rope_config = {**rope_hf_config}
|
75 |
+
llama_rope_config['rope_type'] = llama_rope_config.pop('type')
|
76 |
+
if llama_rope_config['rope_type'] == 'no_scaling':
|
77 |
+
llama_rope_config['rope_type'] = 'default'
|
78 |
+
partial_llama_config = PartialLlamaConfig(rope_scaling=llama_rope_config, rope_theta=rope_theta, max_position_embeddings=max_seq_len, hidden_size=d_model, num_attention_heads=n_heads)
|
79 |
+
return LlamaRotaryEmbeddingFoundry(config=partial_llama_config)
|
80 |
+
raise ValueError('rope_impl needs to be either dail or hf')
|
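# Added comment: for the 'hf' branch, the head dimension is d_model // n_heads,
# the 'no_scaling' rope type is remapped to HF's 'default', and the resulting
# PartialLlamaConfig is handed to LlamaRotaryEmbeddingFoundry (defined below).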
81 |
+
|
82 |
+
def gen_attention_mask_in_length(sequence_id: Union[None, torch.Tensor], S: int, attn_uses_sequence_id: bool, attn_impl: str, attention_mask: Union[torch.Tensor, None]):
|
83 |
+
"""Generates the attention mask used for sequence masking in FA v2.
|
84 |
+
|
85 |
+
Only supports sequence id based sparse attention for no attention masking or attention masking with right padding.
|
86 |
+
In case of left padding:
|
87 |
+
1. Training with left padding is not supported in MPT (see https://github.com/mosaicml/llm-foundry/blob/1eecd4cb8e734499f77f6a35f657b8b20c0adfcb/llmfoundry/models/mpt/modeling_mpt.py#L407).
|
88 |
+
2. For generation with left padding, we only have a single sequence id per sample, so we don't need sequence id based sparse attention.
|
89 |
+
|
90 |
+
Args:
|
91 |
+
sequence_id (Union[None, torch.Tensor]): Tensor containing the sequence id for each token. Shape (batch_size, seq_len).
|
92 |
+
S (int): Sequence length
|
93 |
+
attn_uses_sequence_id (bool): Whether the attention uses sequence id based masking.
|
94 |
+
attn_impl (str): Attention implementation. This function only creates attention_mask_in_length for flash attention.
|
95 |
+
attention_mask (Union[torch.Tensor, None]): Attention mask tensor of shape (batch_size, seq_len)
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none. For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
|
99 |
+
```
|
100 |
+
[
|
101 |
+
[2, 3, 0, 0, 0, 0],
|
102 |
+
[3, 2, 0, 0, 0, 0],
|
103 |
+
[6, 0, 0, 0, 0, 0]
|
104 |
+
]
|
105 |
+
```
|
106 |
+
, which refers to the 3D-attention mask:
|
107 |
+
```
|
108 |
+
[
|
109 |
+
[
|
110 |
+
[1, 0, 0, 0, 0, 0],
|
111 |
+
[1, 1, 0, 0, 0, 0],
|
112 |
+
[0, 0, 1, 0, 0, 0],
|
113 |
+
[0, 0, 1, 1, 0, 0],
|
114 |
+
[0, 0, 1, 1, 1, 0],
|
115 |
+
[0, 0, 0, 0, 0, 1]
|
116 |
+
],
|
117 |
+
[
|
118 |
+
[1, 0, 0, 0, 0, 0],
|
119 |
+
[1, 1, 0, 0, 0, 0],
|
120 |
+
[1, 1, 1, 0, 0, 0],
|
121 |
+
[0, 0, 0, 1, 0, 0],
|
122 |
+
[0, 0, 0, 1, 1, 0],
|
123 |
+
[0, 0, 0, 0, 0, 1]
|
124 |
+
],
|
125 |
+
[
|
126 |
+
[1, 0, 0, 0, 0, 0],
|
127 |
+
[1, 1, 0, 0, 0, 0],
|
128 |
+
[1, 1, 1, 0, 0, 0],
|
129 |
+
[1, 1, 1, 1, 0, 0],
|
130 |
+
[1, 1, 1, 1, 1, 0],
|
131 |
+
[1, 1, 1, 1, 1, 1]
|
132 |
+
]
|
133 |
+
]
|
134 |
+
```.
|
135 |
+
(The description above is taken verbatim from https://github.com/Dao-AILab/flash-attention/blob/9356a1c0389660d7e231ff3163c1ac17d9e3824a/flash_attn/bert_padding.py#L125 .)
|
136 |
+
"""
|
137 |
+
attention_mask_in_length = None
|
138 |
+
if sequence_id is not None and attn_uses_sequence_id and (attn_impl == 'flash'):
|
139 |
+
if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0]:
|
140 |
+
raise NotImplementedError('Left padding is not supported with flash attention when attn_uses_sequence_id is set to True.')
|
141 |
+
if S != sequence_id.shape[-1]:
|
142 |
+
raise ValueError(f'Sequence length ({S}) does not match length of sequences in sequence_id ({sequence_id.shape[-1]}).')
|
143 |
+
if attention_mask is not None:
|
144 |
+
sequence_id = sequence_id.masked_fill(~attention_mask, 0)
|
145 |
+
attention_mask_in_length = torch.nn.functional.one_hot(sequence_id)
|
146 |
+
if attention_mask is not None:
|
147 |
+
attention_mask_in_length = attention_mask_in_length.masked_fill(~attention_mask.unsqueeze(-1), 0)
|
148 |
+
attention_mask_in_length = attention_mask_in_length.sum(dim=1)
|
149 |
+
attention_mask_in_length = torch.nn.functional.pad(attention_mask_in_length, (0, S - attention_mask_in_length.shape[-1]), mode='constant', value=0)
|
150 |
+
return attention_mask_in_length
|
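# Added comment, worked example: with sequence_id = [[0, 0, 1, 1, 1, 2]], S=6,
# attn_impl='flash' and no padding, one_hot(...).sum(dim=1) gives [2, 3, 1] and
# the final pad extends it to [[2, 3, 1, 0, 0, 0]], i.e. three packed sequences
# of lengths 2, 3 and 1 in that row.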
151 |
+
|
152 |
+
def gen_flash_attn_padding_info(bsz: int, S: int, past_key_len: int, device: torch.device, attention_mask_in_length: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None):
|
153 |
+
flash_attn_padding_info = {}
|
154 |
+
if attention_mask_in_length is None:
|
155 |
+
key_padding_mask = attention_mask
|
156 |
+
if key_padding_mask is None:
|
157 |
+
key_padding_mask = torch.ones((bsz, past_key_len + S), dtype=torch.bool, device=device)
|
158 |
+
query_padding_mask = key_padding_mask[:, -S:]
|
159 |
+
unpadding_function = bert_padding.unpad_input
|
160 |
+
else:
|
161 |
+
key_padding_mask = attention_mask_in_length
|
162 |
+
query_padding_mask = attention_mask_in_length
|
163 |
+
unpadding_function = bert_padding.unpad_input_for_concatenated_sequences
|
164 |
+
_, indices_q, cu_seqlens_q, max_seqlen_q, *_ = unpadding_function(torch.empty(bsz, S, 1, device=device), query_padding_mask)
|
165 |
+
_, indices_k, cu_seqlens_k, max_seqlen_k, *_ = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
|
166 |
+
_, indices_v, *_ = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
|
167 |
+
flash_attn_padding_info['indices_q'] = indices_q
|
168 |
+
flash_attn_padding_info['indices_k'] = indices_k
|
169 |
+
flash_attn_padding_info['indices_v'] = indices_v
|
170 |
+
flash_attn_padding_info['cu_seqlens_q'] = cu_seqlens_q
|
171 |
+
flash_attn_padding_info['cu_seqlens_k'] = cu_seqlens_k
|
172 |
+
flash_attn_padding_info['max_seqlen_q'] = max_seqlen_q
|
173 |
+
flash_attn_padding_info['max_seqlen_k'] = max_seqlen_k
|
174 |
+
return flash_attn_padding_info
|
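# Added comment: for a fully unpadded batch with bsz=2, S=4 and past_key_len=0,
# the unpadding above yields cu_seqlens_q == tensor([0, 4, 8]) and
# max_seqlen_q == 4, i.e. cumulative sequence boundaries plus the longest
# per-sample length, which is what the flash-attn varlen kernels expect.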
175 |
+
|
176 |
+
def apply_sequence_id(attn_bias: torch.Tensor, sequence_id: torch.LongTensor, max_seq_len: int) -> torch.Tensor:
|
177 |
+
seq_len = sequence_id.shape[-1]
|
178 |
+
if seq_len > max_seq_len:
|
179 |
+
raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={max_seq_len}')
|
180 |
+
attn_bias = attn_bias[..., :seq_len, :seq_len]
|
181 |
+
cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
|
182 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
183 |
+
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
|
184 |
+
return attn_bias
|
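# Added comment: positions whose sequence ids differ receive the dtype minimum
# as an additive bias, so after softmax their attention weight is effectively
# zero and tokens only attend within their own packed sequence.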
185 |
+
|
186 |
+
class LlamaRotaryEmbeddingFoundry(LlamaRotaryEmbedding):
|
187 |
+
|
188 |
+
@torch.no_grad()
|
189 |
+
def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
190 |
+
self.inv_freq = self.inv_freq.to(position_ids.device)
|
191 |
+
return super().forward(x=x, position_ids=position_ids)
|
192 |
+
|
193 |
+
class MPTPreTrainedModel(PreTrainedModel):
|
194 |
+
config_class = MPTConfig
|
195 |
+
base_model_prefix = 'model'
|
196 |
+
_no_split_modules = ['MPTBlock']
|
197 |
+
|
198 |
+
def _fsdp_wrap_fn(self: Union[MPTModel, MPTForCausalLM], module: nn.Module) -> bool:
|
199 |
+
if hasattr(module, '_fsdp_kwargs_dict'):
|
200 |
+
return module._fsdp_kwargs_dict
|
201 |
+
return isinstance(module, MPTBlock)
|
202 |
+
|
203 |
+
class MPTModel(MPTPreTrainedModel):
|
204 |
+
|
205 |
+
def __init__(self, config: MPTConfig):
|
206 |
+
config._validate_config()
|
207 |
+
super().__init__(config)
|
208 |
+
self.attn_impl = config.attn_config['attn_impl']
|
209 |
+
self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
|
210 |
+
self.alibi = config.attn_config['alibi']
|
211 |
+
self.alibi_bias_max = config.attn_config['alibi_bias_max']
|
212 |
+
self.learned_pos_emb = config.learned_pos_emb
|
213 |
+
if config.init_device == 'mixed':
|
214 |
+
if dist.get_local_rank() == 0:
|
215 |
+
config.init_device = 'cpu'
|
216 |
+
else:
|
217 |
+
config.init_device = 'meta'
|
218 |
+
if config.norm_type.lower() not in norms.get_all():
|
219 |
+
norm_options = ' | '.join(norms.get_all())
|
220 |
+
raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
|
221 |
+
self.embedding_fraction = config.embedding_fraction
|
222 |
+
self.wte = SharedEmbedding(config.vocab_size, config.d_model, padding_idx=config.pad_token_id, device=config.init_device)
|
223 |
+
if self.learned_pos_emb:
|
224 |
+
self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
|
225 |
+
self.emb_drop = nn.Dropout(config.emb_pdrop)
|
226 |
+
self.mb_args = None
|
227 |
+
self.shift_labels = True
|
228 |
+
self.blocks = self.construct_blocks(config=config)
|
229 |
+
for i, block in enumerate(self.blocks):
|
230 |
+
block.block_idx = i
|
231 |
+
block.max_block_idx = config.n_layers - 1
|
232 |
+
pass_on_block_idx(block)
|
233 |
+
self.norm_f = build_norm(name=config.norm_type.lower(), normalized_shape=config.d_model, eps=config.norm_eps, device=config.init_device)
|
234 |
+
self.rope = config.attn_config['rope']
|
235 |
+
self.rope_impl = None
|
236 |
+
if self.rope:
|
237 |
+
self.rope_impl = config.attn_config['rope_impl']
|
238 |
+
self.rotary_embedding = gen_rotary_embedding(rope_impl=self.rope_impl, rope_theta=config.attn_config['rope_theta'], rope_dail_config=config.attn_config['rope_dail_config'], rope_hf_config=config.attn_config['rope_hf_config'], max_seq_len=self.config.max_seq_len, d_model=config.d_model, n_heads=config.n_heads)
|
239 |
+
if config.init_device != 'meta':
|
240 |
+
log.info(f'We recommend using config.init_device="meta" with Composer + FSDP for faster initialization.')
|
241 |
+
self.apply(self.param_init_fn)
|
242 |
+
self.is_causal = True
|
243 |
+
self._attn_bias_initialized = False
|
244 |
+
self.attn_bias = None
|
245 |
+
self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
|
246 |
+
if config.no_bias:
|
247 |
+
for module in self.modules():
|
248 |
+
if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
|
249 |
+
log.debug(f'Removing bias from module={module!r}.')
|
250 |
+
module.register_parameter('bias', None)
|
251 |
+
if hasattr(module, 'use_bias') and module.use_bias is True:
|
252 |
+
log.debug(f'Setting use_bias=False for module={module!r}.')
|
253 |
+
module.use_bias = False
|
254 |
+
log.debug(self)
|
255 |
+
init_config_name = self.config.init_config['name']
|
256 |
+
log.debug(f'Using {init_config_name} initialization.')
|
257 |
+
|
258 |
+
@property
|
259 |
+
def block_class(self) -> type[MPTBlock]:
|
260 |
+
return MPTBlock
|
261 |
+
|
262 |
+
def construct_blocks(self, config: MPTConfig) -> nn.ModuleList:
|
263 |
+
"""Construct the nn.ModuleList with the Transformer blocks.
|
264 |
+
|
265 |
+
Args:
|
266 |
+
config (MPTConfig): The configuration object.
|
267 |
+
|
268 |
+
Returns:
|
269 |
+
nn.ModuleList: The list of Transformer blocks.
|
270 |
+
"""
|
271 |
+
block_args = self.extract_block_args(config.to_dict())
|
272 |
+
self.kv_cache_layers = set()
|
273 |
+
self.blocks_fuse_norm_attn_norm = block_args.get('fuse_norm_attn_norm', False)
|
274 |
+
if config.block_overrides is not None:
|
275 |
+
block_args_list = self._get_override_block_args_list(config, block_args)
|
276 |
+
else:
|
277 |
+
block_args_list = [block_args for _ in range(config.n_layers)]
|
278 |
+
return nn.ModuleList([self.block_class(device=config.init_device, **block_args_i) for block_args_i in block_args_list])
|
279 |
+
|
280 |
+
def _get_override_block_args_list(self, config: MPTConfig, block_args: dict[str, Any]) -> list[dict[str, Any]]:
|
281 |
+
if config.block_overrides is None:
|
282 |
+
raise ValueError('config.block_overrides should not be None when calling _get_override_block_args_list.')
|
283 |
+
repeat = config.block_overrides.get('repeat', 1)
|
284 |
+
model_modules_order_expanded = MPTModel._get_modules_order_expanded(config.block_overrides['order']) * repeat
|
285 |
+
if len(model_modules_order_expanded) != config.n_layers:
|
286 |
+
raise ValueError(f'The specified block overrides do not match the number of layers: {len(model_modules_order_expanded)} vs {config.n_layers}.')
|
287 |
+
new_block_args_list = []
|
288 |
+
layer_description_list = []
|
289 |
+
reuse_kv_layer_idx_dict = {}
|
290 |
+
for b_idx in range(config.n_layers):
|
291 |
+
module_name = model_modules_order_expanded[b_idx]
|
292 |
+
override_config = {}
|
293 |
+
if module_name != 'default':
|
294 |
+
override_config = copy.deepcopy(config.block_overrides['overrides'][module_name])
|
295 |
+
if 'reuse_kv_layer_idx' in override_config.get('attn_config', {}):
|
296 |
+
reuse_kv_layer_idx = MPTModel._resolve_reuse_kv_layer_idx(overrides_definition=config.block_overrides['overrides'], model_modules_order_expanded=model_modules_order_expanded, b_idx=b_idx, override_config=override_config, reuse_kv_layer_idx_dict=reuse_kv_layer_idx_dict)
|
297 |
+
override_config['attn_config']['reuse_kv_layer_idx'] = reuse_kv_layer_idx
|
298 |
+
self.kv_cache_layers.add(reuse_kv_layer_idx)
|
299 |
+
layer_description_list.append([b_idx, module_name, override_config])
|
300 |
+
new_block_args_list.append(MPTModel._override_block_args(block_args, override_config, config.allowed_block_overrides))
|
301 |
+
log.info('The following is a summary of overrides per layer.\n' + tabulate(layer_description_list, headers=['idx', 'name', 'overrides']))
|
302 |
+
return new_block_args_list
|
303 |
+
|
304 |
+
@staticmethod
|
305 |
+
def _resolve_reuse_kv_layer_idx(overrides_definition: dict[str, Any], model_modules_order_expanded: list[str], b_idx: int, override_config: dict[str, Any], reuse_kv_layer_idx_dict: dict[int, int]) -> int:
|
306 |
+
override_attn_config = override_config['attn_config']
|
307 |
+
if override_attn_config['reuse_kv_layer_idx'] >= 0:
|
308 |
+
reuse_kv_layer_idx = override_attn_config['reuse_kv_layer_idx']
|
309 |
+
raise ValueError(f"The relative index of kv layer to reuse, override_attn_config['reuse_kv_layer_idx']={reuse_kv_layer_idx}, should be negative.")
|
310 |
+
reuse_kv_layer_idx = b_idx + override_attn_config['reuse_kv_layer_idx']
|
311 |
+
if reuse_kv_layer_idx < 0:
|
312 |
+
raise ValueError(f'The absolute index of kv layer to reuse, {reuse_kv_layer_idx} should be non-negative.')
|
313 |
+
if reuse_kv_layer_idx in reuse_kv_layer_idx_dict:
|
314 |
+
reuse_kv_layer_idx = reuse_kv_layer_idx_dict[reuse_kv_layer_idx]
|
315 |
+
reuse_kv_layer_idx_dict[b_idx] = reuse_kv_layer_idx
|
316 |
+
parent_layer_name = model_modules_order_expanded[reuse_kv_layer_idx]
|
317 |
+
parent_config = {} if parent_layer_name == 'default' else copy.deepcopy(overrides_definition[parent_layer_name])
|
318 |
+
if 'attn_config' not in parent_config:
|
319 |
+
parent_config['attn_config'] = {}
|
320 |
+
parent_config['attn_config']['reuse_kv_layer_idx'] = override_config['attn_config']['reuse_kv_layer_idx']
|
321 |
+
if override_config != parent_config and (not ('allow_mismatch' in override_config and override_config['allow_mismatch'])):
|
322 |
+
raise ValueError('For reusing the kv cache of a previous layer, the previous layer should match the block config as the current layer.')
|
323 |
+
return reuse_kv_layer_idx
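# Editor's note (illustrative, not part of the upload): with a block override whose
# attn_config sets reuse_kv_layer_idx=-1 at b_idx=5, the absolute index resolves to
# 5 + (-1) = 4; if layer 4 itself reuses layer 2's cache, reuse_kv_layer_idx_dict
# remaps it to 2, so layer 5 also attends with layer 2's keys and values.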
|
324 |
+
|
325 |
+
@staticmethod
|
326 |
+
def _get_modules_order_expanded(order: list[dict[str, Any]]) -> list[str]:
|
327 |
+
model_modules_order_expanded = []
|
328 |
+
for item in order:
|
329 |
+
repeat = item['repeat'] if 'repeat' in item else 1
|
330 |
+
if ('name' in item) == ('order' in item):
|
331 |
+
raise ValueError('Exactly one of `order` or `name` must be specified for each block override.')
|
332 |
+
if 'name' in item:
|
333 |
+
model_modules_order_expanded.extend([item['name']] * repeat)
|
334 |
+
else:
|
335 |
+
model_modules_order_expanded.extend(MPTModel._get_modules_order_expanded(item['order']) * repeat)
|
336 |
+
return model_modules_order_expanded
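# Editor's sketch (not part of the upload; the override name below is hypothetical):
# how a nested `block_overrides['order']` spec expands via the recursion above.
#   order = [
#       {'name': 'default'},
#       {'order': [{'name': 'attn_variant'}, {'name': 'default'}], 'repeat': 2},
#   ]
#   MPTModel._get_modules_order_expanded(order)
#   == ['default', 'attn_variant', 'default', 'attn_variant', 'default']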
|
337 |
+
|
338 |
+
@staticmethod
|
339 |
+
def _override_block_args(block_args: dict[str, Any], override_config: dict[str, Any], allowed_block_overrides: dict[str, Any]) -> dict[str, Any]:
|
340 |
+
unpermitted_keys = override_config.keys() - allowed_block_overrides.keys()
|
341 |
+
if len(unpermitted_keys):
|
342 |
+
raise KeyError(f'Overriding {unpermitted_keys} is not supported.')
|
343 |
+
new_block_args = override_config | block_args
|
344 |
+
common_keys = override_config.keys() & block_args.keys()
|
345 |
+
for k in common_keys:
|
346 |
+
if type(override_config[k]) != type(block_args[k]):
|
347 |
+
raise ValueError(f'Override config should have same value types as the original config. Found override_config[{k}]={override_config[k]} vs block_args[{k}]={block_args[k]}.')
|
348 |
+
if isinstance(override_config[k], dict):
|
349 |
+
new_block_args[k] = MPTModel._override_block_args(block_args[k], override_config[k], allowed_block_overrides[k])
|
350 |
+
else:
|
351 |
+
new_block_args[k] = override_config[k]
|
352 |
+
return new_block_args
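# Editor's sketch of the merge semantics above (hypothetical keys, assumed to be listed
# in `allowed_block_overrides`): override values win for leaf keys they name, while
# untouched keys keep their values from `block_args`.
#   block_args      = {'attn_config': {'attn_impl': 'flash', 'sliding_window_size': -1}}
#   override_config = {'attn_config': {'sliding_window_size': 1024}}
#   _override_block_args(block_args, override_config, allowed)
#   == {'attn_config': {'attn_impl': 'flash', 'sliding_window_size': 1024}}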
|
353 |
+
|
354 |
+
def extract_block_args(self, block_args: dict[str, Any]) -> dict[str, Any]:
|
355 |
+
"""Sets the block args."""
|
356 |
+
if block_args['ffn_config']['ffn_type'] in ffns_with_megablocks:
|
357 |
+
block_args['ffn_config'] = config_moe_args(block_args['ffn_config'], block_args['d_model'], block_args['expansion_ratio'], block_args['n_layers'])
|
358 |
+
self.mb_args = block_args['ffn_config'].get('args')
|
359 |
+
return block_args
|
360 |
+
|
361 |
+
def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]:
|
362 |
+
return self.wte
|
363 |
+
|
364 |
+
def set_input_embeddings(self, value: Union[SharedEmbedding, nn.Embedding]) -> None:
|
365 |
+
self.wte = value
|
366 |
+
|
367 |
+
@torch.no_grad()
|
368 |
+
def _attn_bias(self, device: torch.device, dtype: torch.dtype, attention_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None) -> tuple[Optional[torch.Tensor], Optional[torch.ByteTensor]]:
|
369 |
+
if not self._attn_bias_initialized:
|
370 |
+
if self.attn_bias_shape:
|
371 |
+
self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
|
372 |
+
self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
|
373 |
+
self._attn_bias_initialized = True
|
374 |
+
if self.attn_impl == 'flash':
|
375 |
+
return (self.attn_bias, attention_mask)
|
376 |
+
if self.attn_bias is not None:
|
377 |
+
self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
|
378 |
+
attn_bias = self.attn_bias
|
379 |
+
if self.attn_uses_sequence_id and sequence_id is not None:
|
380 |
+
assert isinstance(attn_bias, torch.Tensor)
|
381 |
+
attn_bias = apply_sequence_id(attn_bias, sequence_id, self.config.max_seq_len)
|
382 |
+
if attention_mask is not None:
|
383 |
+
s_k = attention_mask.shape[-1]
|
384 |
+
if attn_bias is None:
|
385 |
+
attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
|
386 |
+
else:
|
387 |
+
_s_k = max(0, attn_bias.size(-1) - s_k)
|
388 |
+
attn_bias = attn_bias[:, :, :, _s_k:]
|
389 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
390 |
+
attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
|
391 |
+
return (attn_bias, attention_mask)
|
392 |
+
|
393 |
+
def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[list[tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.Tensor]=None, position_ids: Optional[torch.LongTensor]=None) -> BaseModelOutputWithPast:
|
394 |
+
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
395 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
396 |
+
if attention_mask is not None:
|
397 |
+
attention_mask = attention_mask.bool()
|
398 |
+
if not return_dict:
|
399 |
+
raise NotImplementedError('return_dict False is not implemented yet for MPT')
|
400 |
+
if output_attentions:
|
401 |
+
if self.attn_impl != 'torch':
|
402 |
+
raise NotImplementedError('output_attentions is not implemented for MPT when using attn_impl `flash`.')
|
403 |
+
if self.training and attention_mask is not None and (attention_mask[:, 0].sum() != attention_mask.shape[0]):
|
404 |
+
raise NotImplementedError('MPT does not support training with left padding.')
|
405 |
+
if self.training:
|
406 |
+
if self.attn_uses_sequence_id and sequence_id is None:
|
407 |
+
raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
|
408 |
+
elif self.attn_uses_sequence_id is False and sequence_id is not None:
|
409 |
+
warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
|
410 |
+
if input_ids is not None and inputs_embeds is not None:
|
411 |
+
raise ValueError('You cannot specify both input_ids and inputs_embeds.')
|
412 |
+
elif input_ids is not None:
|
413 |
+
bsz = input_ids.size(0)
|
414 |
+
x = self.wte(input_ids)
|
415 |
+
input_device = input_ids.device
|
416 |
+
elif inputs_embeds is not None:
|
417 |
+
bsz = inputs_embeds.size(0)
|
418 |
+
x = inputs_embeds
|
419 |
+
input_device = inputs_embeds.device
|
420 |
+
else:
|
421 |
+
raise ValueError('You must specify input_ids or inputs_embeds')
|
422 |
+
S = self.get_sequence_length(x)
|
423 |
+
assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
|
424 |
+
rotary_emb_w_meta_info = None
|
425 |
+
past_position = 0
|
426 |
+
if past_key_values is not None:
|
427 |
+
if len(past_key_values) != self.config.n_layers:
|
428 |
+
raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
|
429 |
+
past_position = past_key_values[0][0].size(1)
|
430 |
+
if self.attn_impl == 'torch':
|
431 |
+
past_position = past_key_values[0][0].size(3)
|
432 |
+
if self.learned_pos_emb or self.rope:
|
433 |
+
if self.learned_pos_emb and S + past_position > self.config.max_seq_len:
|
434 |
+
raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length ' + f'{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
|
435 |
+
if self.learned_pos_emb or (self.rope and self.rope_impl == 'hf'):
|
436 |
+
if position_ids is None:
|
437 |
+
pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_device).unsqueeze(0)
|
438 |
+
else:
|
439 |
+
pos = position_ids
|
440 |
+
if attention_mask is not None:
|
441 |
+
pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
|
442 |
+
if self.learned_pos_emb:
|
443 |
+
x = x + self.wpe(pos)
|
444 |
+
elif self.rope and self.rope_impl == 'hf':
|
445 |
+
rotary_emb_w_meta_info = {'impl': self.rope_impl, 'rotary_emb': self.rotary_embedding, 'offset_info': pos, 'seq_len': S + past_position}
|
446 |
+
elif self.rope and self.rope_impl == 'dail':
|
447 |
+
rotary_emb_w_meta_info = {'impl': self.rope_impl, 'rotary_emb': self.rotary_embedding, 'offset_info': past_position, 'seq_len': S + past_position}
|
448 |
+
if self.embedding_fraction == 1:
|
449 |
+
x = self.emb_drop(x)
|
450 |
+
else:
|
451 |
+
x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
|
452 |
+
assert isinstance(self.emb_drop, nn.Module)
|
453 |
+
x = self.emb_drop(x_shrunk)
|
454 |
+
attn_bias, attention_mask = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, sequence_id=sequence_id)
|
455 |
+
attention_mask_in_length = gen_attention_mask_in_length(sequence_id=sequence_id, S=S, attn_uses_sequence_id=self.attn_uses_sequence_id, attn_impl=self.attn_impl, attention_mask=attention_mask)
|
456 |
+
alibi_slopes = None
|
457 |
+
if self.alibi and self.attn_impl == 'flash':
|
458 |
+
alibi_slopes = gen_slopes(n_heads=self.config.n_heads, alibi_bias_max=self.alibi_bias_max, device=x.device, return_1d=True)
|
459 |
+
presents = () if use_cache else None
|
460 |
+
if (use_cache or len(self.kv_cache_layers) > 0) and past_key_values is None:
|
461 |
+
past_key_values = [() for _ in range(self.config.n_layers)]
|
462 |
+
all_hidden_states = () if output_hidden_states else None
|
463 |
+
all_self_attns = () if output_attentions else None
|
464 |
+
flash_attn_padding_info = {}
|
465 |
+
if self.attn_impl == 'flash':
|
466 |
+
flash_attn_padding_info = gen_flash_attn_padding_info(bsz, S, past_position, x.device, attention_mask_in_length, attention_mask)
|
467 |
+
layer_kv_cache_dict = {}
|
468 |
+
for b_idx, block in enumerate(self.blocks):
|
469 |
+
attn_block = block.norm_attn_norm.attn if self.blocks_fuse_norm_attn_norm else block.attn
|
470 |
+
if attn_block.reuse_kv_layer_idx is not None:
|
471 |
+
if attn_block.reuse_kv_layer_idx not in layer_kv_cache_dict:
|
472 |
+
raise KeyError(f'kv cache for layer {attn_block.reuse_kv_layer_idx} not found in layer_kv_cache_dict={layer_kv_cache_dict!r}.')
|
473 |
+
prev_layer_key_value = layer_kv_cache_dict[attn_block.reuse_kv_layer_idx]
|
474 |
+
else:
|
475 |
+
prev_layer_key_value = None
|
476 |
+
if output_hidden_states:
|
477 |
+
assert all_hidden_states is not None
|
478 |
+
all_hidden_states = all_hidden_states + (x,)
|
479 |
+
past_key_value = past_key_values[b_idx] if past_key_values is not None else None
|
480 |
+
extra_kwargs = {}
|
481 |
+
if prev_layer_key_value is not None:
|
482 |
+
extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
|
483 |
+
x, attn_weights, present = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info, **extra_kwargs)
|
484 |
+
if presents is not None:
|
485 |
+
presents += (present,)
|
486 |
+
if b_idx in self.kv_cache_layers:
|
487 |
+
layer_kv_cache_dict[b_idx] = [present[0][:, past_position:], present[1][:, past_position:]]
|
488 |
+
if output_attentions:
|
489 |
+
assert all_self_attns is not None
|
490 |
+
all_self_attns = all_self_attns + (attn_weights,)
|
491 |
+
x = self.norm_f(x)
|
492 |
+
if output_hidden_states:
|
493 |
+
assert all_hidden_states is not None
|
494 |
+
all_hidden_states = all_hidden_states + (x,)
|
495 |
+
return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attns)
|
496 |
+
|
497 |
+
def get_sequence_length(self, x: torch.Tensor) -> int:
|
498 |
+
"""Returns the sequence length.
|
499 |
+
|
500 |
+
Args:
|
501 |
+
x (torch.Tensor): The input Tensor.
|
502 |
+
|
503 |
+
Returns:
|
504 |
+
S (int): The sequence length.
|
505 |
+
"""
|
506 |
+
return x.size(1)
|
507 |
+
|
508 |
+
def param_init_fn(self, module: nn.Module) -> None:
|
509 |
+
init_fn_name = self.config.init_config['name']
|
510 |
+
param_init_fns.get(init_fn_name)(module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
|
511 |
+
|
512 |
+
def fsdp_wrap_fn(self, module: nn.Module) -> bool:
|
513 |
+
return _fsdp_wrap_fn(self, module)
|
514 |
+
|
515 |
+
def activation_checkpointing_fn(self, module: nn.Module) -> bool:
|
516 |
+
return isinstance(module, MPTBlock)
|
517 |
+
|
518 |
+
class MPTForCausalLM(MPTPreTrainedModel):
|
519 |
+
_tied_weights_keys = ['lm_head.weight']
|
520 |
+
_tp_plan = {'lm_head': 'colwise_rep'}
|
521 |
+
_pp_plan = {'lm_head': (['hidden_states'], ['logits'])}
|
522 |
+
|
523 |
+
def __init__(self, config: MPTConfig):
|
524 |
+
super().__init__(config)
|
525 |
+
log.info(f'Instantiating an MPTForCausalLM model from {__file__}')
|
526 |
+
self.transformer: MPTModel = self.backbone_model_class(config)
|
527 |
+
self.lm_head = None
|
528 |
+
if not config.tie_word_embeddings:
|
529 |
+
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False, device=config.init_device)
|
530 |
+
self.lm_head._fsdp_wrap = True
|
531 |
+
for child in self.transformer.children():
|
532 |
+
if isinstance(child, torch.nn.ModuleList):
|
533 |
+
continue
|
534 |
+
if isinstance(child, torch.nn.Module):
|
535 |
+
child._fsdp_wrap = True
|
536 |
+
self.logit_scale = None
|
537 |
+
if config.logit_scale is not None:
|
538 |
+
logit_scale = config.logit_scale
|
539 |
+
if isinstance(logit_scale, str):
|
540 |
+
if logit_scale == 'inv_sqrt_d_model':
|
541 |
+
logit_scale = 1 / math.sqrt(config.d_model)
|
542 |
+
else:
|
543 |
+
raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
|
544 |
+
self.logit_scale = logit_scale
|
545 |
+
self.final_logit_softcapping = config.final_logit_softcapping
|
546 |
+
|
547 |
+
@property
|
548 |
+
def backbone_model_class(self) -> type[MPTModel]:
|
549 |
+
return MPTModel
|
550 |
+
|
551 |
+
def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]:
|
552 |
+
return self.transformer.get_input_embeddings()
|
553 |
+
|
554 |
+
def set_input_embeddings(self, value: Union[SharedEmbedding, nn.Embedding]) -> None:
|
555 |
+
self.transformer.set_input_embeddings(value)
|
556 |
+
|
557 |
+
def get_output_embeddings(self) -> Union[SharedEmbedding, nn.Embedding, nn.Linear]:
|
558 |
+
if self.lm_head is not None:
|
559 |
+
return self.lm_head
|
560 |
+
return self.transformer.get_input_embeddings()
|
561 |
+
|
562 |
+
def set_output_embeddings(self, new_embeddings: Union[SharedEmbedding, nn.Embedding, nn.Linear]) -> None:
|
563 |
+
if self.lm_head is not None:
|
564 |
+
self.lm_head = new_embeddings
|
565 |
+
else:
|
566 |
+
if not isinstance(new_embeddings, (SharedEmbedding, nn.Embedding)):
|
567 |
+
raise ValueError('new_embeddings must be an instance of SharedEmbedding ' + f'or nn.Embedding, but got {type(new_embeddings)}.')
|
568 |
+
warnings.warn('Using `set_output_embeddings` to set the embedding layer of ' + 'MPTForCausalLM with tied weights. Given weights are tied, ' + 'using `set_input_embeddings` is recommended over using ' + '`set_output_embeddings`.')
|
569 |
+
self.transformer.set_input_embeddings(new_embeddings)
|
570 |
+
|
571 |
+
def tie_weights(self) -> None:
|
572 |
+
if getattr(self.config, 'tie_word_embeddings', True):
|
573 |
+
self.lm_head = None
|
574 |
+
|
575 |
+
def set_decoder(self, decoder: MPTModel) -> None:
|
576 |
+
self.transformer = decoder
|
577 |
+
|
578 |
+
def get_decoder(self) -> MPTModel:
|
579 |
+
return self.transformer
|
580 |
+
|
581 |
+
def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[list[tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor]=None, position_ids: Optional[torch.LongTensor]=None) -> CausalLMOutputWithPast:
|
582 |
+
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
583 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
584 |
+
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds, position_ids=position_ids)
|
585 |
+
if self.lm_head is not None:
|
586 |
+
logits = self.lm_head(outputs.last_hidden_state)
|
587 |
+
else:
|
588 |
+
out = outputs.last_hidden_state
|
589 |
+
out = out.to(self.transformer.wte.weight.device)
|
590 |
+
logits = self.transformer.wte(out, True)
|
591 |
+
if self.logit_scale is not None:
|
592 |
+
if self.logit_scale == 0:
|
593 |
+
warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
|
594 |
+
logits *= self.logit_scale
|
595 |
+
if self.final_logit_softcapping is not None:
|
596 |
+
logits = self.final_logit_softcapping * torch.tanh(logits / self.final_logit_softcapping)
|
597 |
+
loss = None
|
598 |
+
if labels is not None:
|
599 |
+
_labels = torch.roll(labels, shifts=-1)
|
600 |
+
_labels[:, -1] = CROSS_ENTROPY_IGNORE_INDEX
|
601 |
+
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.to(logits.device).view(-1))
|
602 |
+
return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
|
603 |
+
|
604 |
+
def param_init_fn(self, module: nn.Module) -> None:
|
605 |
+
init_fn_name = self.config.init_config['name']
|
606 |
+
param_init_fns.get(init_fn_name)(module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
|
607 |
+
|
608 |
+
def fsdp_wrap_fn(self, module: nn.Module) -> bool:
|
609 |
+
return _fsdp_wrap_fn(self, module)
|
610 |
+
|
611 |
+
def activation_checkpointing_fn(self, module: nn.Module) -> bool:
|
612 |
+
"""The MPT activation checkpointing (act ckpt) function.
|
613 |
+
|
614 |
+
When `activation_checkpointing` in fsdp_config is set to true, this function will be called on all the modules in the FSDP wrapped model and determine whether a given module should be activation checkpointed. It checks the checkpointing target (`activation_checkpointing_target` in `model`) which can be specified as below:
|
615 |
+
1. null (or no such field): The whole MPTBlock will be activation checkpointed on all layers
|
616 |
+
2. a list of modules to act ckpt on all layers, e.g.,
|
617 |
+
activation_checkpointing_target:
|
618 |
+
- grouped_query_attention
|
619 |
+
- mptmlp
|
620 |
+
3. a dictionary of module name with target_blocks, e.g.,
|
621 |
+
activation_checkpointing_target:
|
622 |
+
{
|
623 |
+
"mptblock": target_blocks_1,
|
624 |
+
"grouped_query_attention": target_blocks_2
|
625 |
+
}
|
626 |
+
target_blocks (target_blocks_1, target_blocks_2 above) can be:
|
627 |
+
- a single integer n: the first n transformer blocks will be activation checkpointed
|
628 |
+
- a string of first-n, middle-m, last-k, range-i-j: the first n, the middle m, the last k, or the range [i, j) layers will be activation checkpointed. E.g., 'first-2, last-2' means the first 2 and last 2 transformer blocks will be activation checkpointed
|
629 |
+
middle-m is range [start, end) where ``start = max(max_block_idx // 2 - m // 2, 0), end = min(start + m, max_block_idx + 1)``
|
630 |
+
- a list of integers corresponding to transformer block ids, e.g., [2] means the second transformer block will be activation checkpointed. [2, 3] means the second and third transformer blocks will be activation checkpointed
|
631 |
+
- a list of mixed integers and strings of first-n, middle-m, last-k, range-i-j
|
632 |
+
|
633 |
+
An example in yaml config file:
|
634 |
+
fsdp_config:
|
635 |
+
activation_checkpointing: true
|
636 |
+
model:
|
637 |
+
activation_checkpointing_target:
|
638 |
+
{
|
639 |
+
"mptblock": 'first-5',
|
640 |
+
"grouped_query_attention": 'last-35'
|
641 |
+
}
|
642 |
+
"""
|
643 |
+
if not hasattr(module, 'block_idx'):
|
644 |
+
log.debug(f'{module.__class__.__name__} cannot be activation checkpointed. Only transformer block or its submodules are eligible for activation checkpointing.')
|
645 |
+
return False
|
646 |
+
act_ckpt_target = getattr(self.config, 'activation_checkpointing_target', None)
|
647 |
+
act_ckpt_mod_to_blocks = build_act_ckpt_mod_to_blocks(act_ckpt_target, MPTBlock, module.max_block_idx)
|
648 |
+
check_mapping_blocks_overlap(act_ckpt_mod_to_blocks, module.max_block_idx)
|
649 |
+
for k in act_ckpt_mod_to_blocks.keys():
|
650 |
+
if isinstance(module, k):
|
651 |
+
blocks = act_ckpt_mod_to_blocks[k]
|
652 |
+
return True if blocks == -1 else module.block_idx in blocks
|
653 |
+
return False
|
654 |
+
|
655 |
+
def prepare_inputs_for_generation(self, input_ids: torch.Tensor, past_key_values: Optional[list[tuple[torch.Tensor, torch.Tensor]]]=None, inputs_embeds: Optional[torch.Tensor]=None, **kwargs: Any) -> dict[str, Any]:
|
656 |
+
attention_mask = kwargs['attention_mask'].bool()
|
657 |
+
if attention_mask[:, -1].sum() != attention_mask.shape[0]:
|
658 |
+
raise NotImplementedError('MPT does not support generation with right padding.')
|
659 |
+
if self.transformer.attn_uses_sequence_id and self.training:
|
660 |
+
sequence_id = torch.zeros_like(input_ids[:1])
|
661 |
+
else:
|
662 |
+
sequence_id = None
|
663 |
+
if past_key_values is not None:
|
664 |
+
input_ids = input_ids[:, -1].unsqueeze(-1)
|
665 |
+
if inputs_embeds is not None and past_key_values is None:
|
666 |
+
model_inputs = {'inputs_embeds': inputs_embeds}
|
667 |
+
else:
|
668 |
+
model_inputs = {'input_ids': input_ids}
|
669 |
+
model_inputs.update({'attention_mask': attention_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)})
|
670 |
+
return model_inputs
|
671 |
+
|
672 |
+
@staticmethod
|
673 |
+
def _reorder_cache(past_key_values: list[tuple[torch.Tensor, torch.Tensor]], beam_idx: torch.LongTensor) -> list[tuple[torch.Tensor, ...]]:
|
674 |
+
"""Used by HuggingFace generate when using beam search with kv-caching.
|
675 |
+
|
676 |
+
See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
|
677 |
+
for an example in transformers.
|
678 |
+
"""
|
679 |
+
reordered_past = []
|
680 |
+
for layer_past in past_key_values:
|
681 |
+
reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))]
|
682 |
+
return reordered_past
|
683 |
+
|
684 |
+
def get_targets(labels: torch.Tensor) -> torch.Tensor:
|
685 |
+
targets = torch.roll(labels, shifts=-1)
|
686 |
+
targets[:, -1] = CROSS_ENTROPY_IGNORE_INDEX
|
687 |
+
return targets
|
688 |
+
|
689 |
+
def compute_loss_from_logits(outputs: CausalLMOutputWithPast, shift_labels: bool, labels: torch.Tensor, loss_fn: nn.Module) -> torch.Tensor:
|
690 |
+
targets = get_targets(labels) if shift_labels else labels
|
691 |
+
losses = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), targets.view(-1))
|
692 |
+
if torch.all(targets == loss_fn.ignore_index):
|
693 |
+
loss = losses.sum()
|
694 |
+
else:
|
695 |
+
loss = losses.sum() / (targets != loss_fn.ignore_index).sum()
|
696 |
+
return loss
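# Editor's usage sketch (assumed variables `outputs` and `labels`; not part of the
# upload): `loss_fn` must be a per-token criterion, e.g. reduction='none', so the
# masked mean above is well defined.
#   loss_fn = nn.CrossEntropyLoss(ignore_index=CROSS_ENTROPY_IGNORE_INDEX, reduction='none')
#   loss = compute_loss_from_logits(outputs, shift_labels=True, labels=labels, loss_fn=loss_fn)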
|
mpt_param_count.py
ADDED
@@ -0,0 +1,130 @@
1 |
+
"""Helper functions for computing parameter counts for MPT model.
|
2 |
+
|
3 |
+
Use if generic `sum(p.numel() for p in self.parameters())`
|
4 |
+
style computation does not account for MoE parameter sharding.
|
5 |
+
The helper functions in this file account for MoE parameter
|
6 |
+
sharding in the parameter count calculation. The functions below
|
7 |
+
calculate the total parameter count and the active parameter count.
|
8 |
+
Note: MPT has both n_total_params and n_active_params methods.
|
9 |
+
"""
|
10 |
+
from typing import Union
|
11 |
+
from torch import Tensor, nn
|
12 |
+
from torch.distributed._tensor import DTensor
|
13 |
+
from .layers_registry import ffns_with_megablocks
|
14 |
+
|
15 |
+
def module_n_params(module: nn.Module) -> int:
|
16 |
+
"""Gets the number of parameters in this module excluding child modules.
|
17 |
+
|
18 |
+
Args:
|
19 |
+
module (nn.Module): Module of which we get the number of parameters.
|
20 |
+
|
21 |
+
Returns:
|
22 |
+
An int for the number of parameters in this module.
|
23 |
+
"""
|
24 |
+
n_params = 0
|
25 |
+
for p in module.parameters(recurse=False):
|
26 |
+
n_params += p.numel()
|
27 |
+
return n_params
|
28 |
+
|
29 |
+
def _dtensor_safe_check_numel(tensor: Union[Tensor, DTensor]) -> int:
|
30 |
+
if isinstance(tensor, DTensor):
|
31 |
+
tensor = tensor._local_tensor
|
32 |
+
return tensor.numel()
|
33 |
+
|
34 |
+
def megablocks_n_total_params(mpt_model) -> int:
|
35 |
+
"""Calculates the number of parameters in a MegaBlocks enabled MPT model.
|
36 |
+
|
37 |
+
MoE experts are sharded across workers. This function scans for MegaBlocks
|
38 |
+
modules, then multiplies the expert parameter count by the MoE world size.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
mpt_model (ComposerMPTCausalLM): MPT model of which the number of
|
42 |
+
parameters is calculated.
|
43 |
+
|
44 |
+
Returns:
|
45 |
+
An int for the total number of parameters in this MPT model.
|
46 |
+
"""
|
47 |
+
import megablocks
|
48 |
+
moe_world_size = mpt_model.config.ffn_config.get('moe_world_size')
|
49 |
+
n_total_params = 0
|
50 |
+
for module in mpt_model.modules():
|
51 |
+
if isinstance(module, (megablocks.layers.mlp.SparseMLP, megablocks.layers.mlp.MLP)):
|
52 |
+
n_w1 = _dtensor_safe_check_numel(module.w1)
|
53 |
+
n_total_params += n_w1 * moe_world_size
|
54 |
+
n_w2 = _dtensor_safe_check_numel(module.w2)
|
55 |
+
n_total_params += n_w2 * moe_world_size
|
56 |
+
if hasattr(module, 'v1'):
|
57 |
+
n_v1 = _dtensor_safe_check_numel(module.v1)
|
58 |
+
n_total_params += n_v1 * moe_world_size
|
59 |
+
else:
|
60 |
+
n_total_params += module_n_params(module)
|
61 |
+
return n_total_params
|
62 |
+
|
63 |
+
def megablocks_n_active_params(mpt_model) -> int:
|
64 |
+
"""Calculates the number of active parameters in a MegaBlocks enabled MPT.
|
65 |
+
|
66 |
+
This requires that we calculate the number of elements per expert and
|
67 |
+
multiply this by top k.
|
68 |
+
|
69 |
+
Args:
|
70 |
+
mpt_model (ComposerMPTCausalLM): MPT model of which the number of
|
71 |
+
active parameters is calculated.
|
72 |
+
|
73 |
+
Returns:
|
74 |
+
An int for the active number of parameters in this MPT model.
|
75 |
+
"""
|
76 |
+
import megablocks
|
77 |
+
moe_num_experts = mpt_model.config.ffn_config.get('moe_num_experts', 1)
|
78 |
+
moe_world_size = mpt_model.config.ffn_config.get('moe_world_size')
|
79 |
+
local_experts = moe_num_experts / moe_world_size
|
80 |
+
moe_top_k = mpt_model.config.ffn_config.get('moe_top_k', 1)
|
81 |
+
n_active_params = 0
|
82 |
+
for module in mpt_model.modules():
|
83 |
+
if isinstance(module, (megablocks.layers.mlp.SparseMLP, megablocks.layers.mlp.MLP)):
|
84 |
+
n_w1 = _dtensor_safe_check_numel(module.w1)
|
85 |
+
n_active_params += int(n_w1 / local_experts * moe_top_k)
|
86 |
+
n_w2 = _dtensor_safe_check_numel(module.w2)
|
87 |
+
n_active_params += int(n_w2 / local_experts * moe_top_k)
|
88 |
+
if hasattr(module, 'v1'):
|
89 |
+
n_v1 = _dtensor_safe_check_numel(module.v1)
|
90 |
+
n_active_params += int(n_v1 / local_experts * moe_top_k)
|
91 |
+
else:
|
92 |
+
n_active_params += module_n_params(module)
|
93 |
+
return n_active_params
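# Editor's worked example (hypothetical config values): with moe_num_experts=8,
# moe_world_size=2 and moe_top_k=2, local_experts = 8 / 2 = 4, so a local w1 holding
# 4 experts of 1_000_000 elements each (numel = 4_000_000) contributes
# int(4_000_000 / 4 * 2) = 2_000_000 active parameters, i.e. the top-2 experts' share.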
|
94 |
+
|
95 |
+
def mpt_get_total_params(mpt_model) -> int:
|
96 |
+
"""Calculates the total parameter count of an MPT model.
|
97 |
+
|
98 |
+
Note: Must be called before model parameters are sharded by FSDP.
|
99 |
+
|
100 |
+
Args:
|
101 |
+
mpt_model (ComposerMPTCausalLM): MPT model of which the number of
|
102 |
+
active parameters is calculated.
|
103 |
+
|
104 |
+
Returns:
|
105 |
+
An int for the total number of parameters in this MPT model.
|
106 |
+
"""
|
107 |
+
if mpt_model.config.ffn_config['ffn_type'] in ffns_with_megablocks:
|
108 |
+
return megablocks_n_total_params(mpt_model)
|
109 |
+
else:
|
110 |
+
return sum((p.numel() for p in mpt_model.parameters()))
|
111 |
+
|
112 |
+
def mpt_get_active_params(mpt_model) -> int:
|
113 |
+
"""Calculates the active parameter count of an MPT model.
|
114 |
+
|
115 |
+
Note: Must be called before model parameters are sharded by FSDP.
|
116 |
+
|
117 |
+
Args:
|
118 |
+
mpt_model (ComposerMPTCausalLM): MPT model of which the number of
|
119 |
+
active parameters is calculated.
|
120 |
+
|
121 |
+
Returns:
|
122 |
+
An int for the active number of parameters in this MPT model.
|
123 |
+
"""
|
124 |
+
if mpt_model.config.ffn_config['ffn_type'] in ffns_with_megablocks:
|
125 |
+
params = megablocks_n_active_params(mpt_model)
|
126 |
+
else:
|
127 |
+
params = sum((p.numel() for p in mpt_model.parameters()))
|
128 |
+
if not mpt_model.model.transformer.config.tie_word_embeddings:
|
129 |
+
params -= _dtensor_safe_check_numel(mpt_model.model.transformer.wte.weight)
|
130 |
+
return params
|
norm.py
ADDED
@@ -0,0 +1,79 @@
1 |
+
from typing import Optional, Union
|
2 |
+
import torch
|
3 |
+
from .layers_registry import norms
|
4 |
+
norms.register(name='layernorm', func=torch.nn.LayerNorm)
|
5 |
+
|
6 |
+
def _cast_if_autocast_enabled(tensor: torch.Tensor) -> torch.Tensor:
|
7 |
+
if torch.is_autocast_enabled():
|
8 |
+
if tensor.device.type == 'cuda':
|
9 |
+
dtype = torch.get_autocast_gpu_dtype()
|
10 |
+
elif tensor.device.type == 'cpu':
|
11 |
+
dtype = torch.get_autocast_cpu_dtype()
|
12 |
+
else:
|
13 |
+
raise NotImplementedError()
|
14 |
+
return tensor.to(dtype=dtype)
|
15 |
+
return tensor
|
16 |
+
|
17 |
+
@norms.register_class('low_precision_layernorm')
|
18 |
+
class LPLayerNorm(torch.nn.LayerNorm):
|
19 |
+
|
20 |
+
def __init__(self, normalized_shape: Union[int, list[int], torch.Size], eps: float=1e-05, elementwise_affine: bool=True, device: Optional[torch.device]=None, dtype: Optional[torch.dtype]=None):
|
21 |
+
super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
|
22 |
+
|
23 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
24 |
+
module_device = x.device
|
25 |
+
downcast_x = _cast_if_autocast_enabled(x)
|
26 |
+
downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
|
27 |
+
downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
|
28 |
+
with torch.autocast(enabled=False, device_type=module_device.type):
|
29 |
+
return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
|
30 |
+
|
31 |
+
def rms_norm(x: torch.Tensor, weight: Optional[torch.Tensor]=None, eps: float=1e-05) -> torch.Tensor:
|
32 |
+
output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
|
33 |
+
if weight is not None:
|
34 |
+
return output * weight
|
35 |
+
return output
|
36 |
+
|
37 |
+
@norms.register_class('rmsnorm')
|
38 |
+
class RMSNorm(torch.nn.Module):
|
39 |
+
|
40 |
+
def __init__(self, normalized_shape: Union[int, list[int], torch.Size], eps: float=1e-05, weight: bool=True, dtype: Optional[torch.dtype]=None, device: Optional[torch.device]=None):
|
41 |
+
super().__init__()
|
42 |
+
self.eps = eps
|
43 |
+
if weight:
|
44 |
+
self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))
|
45 |
+
else:
|
46 |
+
self.register_parameter('weight', None)
|
47 |
+
|
48 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
49 |
+
return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)
|
50 |
+
|
51 |
+
@norms.register_class('low_precision_rmsnorm')
|
52 |
+
class LPRMSNorm(RMSNorm):
|
53 |
+
|
54 |
+
def __init__(self, normalized_shape: Union[int, list[int], torch.Size], eps: float=1e-05, weight: bool=True, dtype: Optional[torch.dtype]=None, device: Optional[torch.device]=None):
|
55 |
+
super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)
|
56 |
+
|
57 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
58 |
+
downcast_x = _cast_if_autocast_enabled(x)
|
59 |
+
downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
|
60 |
+
with torch.autocast(enabled=False, device_type=x.device.type):
|
61 |
+
return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
|
62 |
+
|
63 |
+
@norms.register_class('triton_rmsnorm')
|
64 |
+
class TritonRMSNorm(torch.nn.Module):
|
65 |
+
|
66 |
+
def __init__(self, normalized_shape: Union[int, list[int], torch.Size], eps: float=1e-05, device: Optional[torch.device]=None, dtype: Optional[torch.dtype]=None):
|
67 |
+
super().__init__()
|
68 |
+
self.eps = eps
|
69 |
+
try:
|
70 |
+
from flash_attn.ops.triton.layer_norm import rms_norm_fn
|
71 |
+
except ImportError:
|
72 |
+
raise ImportError('triton_rms_norm requires Flash Attention to be installed. ' + 'Please pip install flash-attn.')
|
73 |
+
if not isinstance(normalized_shape, int):
|
74 |
+
raise ValueError('TritonRMSNorm only supports 1D tensors')
|
75 |
+
self.rms_norm_fn = rms_norm_fn
|
76 |
+
self.weight = torch.nn.Parameter(torch.ones(normalized_shape, device=device, dtype=dtype))
|
77 |
+
|
78 |
+
def forward(self, x: torch.Tensor):
|
79 |
+
return self.rms_norm_fn(x, self.weight, None, residual=None, eps=self.eps, dropout_p=0.0, prenorm=False, residual_in_fp32=False)
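# Editor's note (illustrative, assuming the registry lookup API used elsewhere in this
# repo, e.g. act_ckpt.py and param_init_fns.py): the names registered above can be
# resolved back into classes, e.g.
#   norm_cls = norms.get('low_precision_rmsnorm')
#   norm_f = norm_cls(normalized_shape=768)
# The 'triton_rmsnorm' entry additionally requires flash-attn to be installed.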
|
param_init_fns.py
ADDED
@@ -0,0 +1,448 @@
1 |
+
import math
|
2 |
+
import warnings
|
3 |
+
from collections.abc import Sequence
|
4 |
+
from copy import deepcopy
|
5 |
+
from functools import partial
|
6 |
+
from typing import Any, Callable, Optional, Union
|
7 |
+
import torch
|
8 |
+
from torch import nn
|
9 |
+
from torch.distributed._tensor import DTensor
|
10 |
+
from .layers_registry import fcs, module_init_fns, norms, param_init_fns
|
11 |
+
from .dmoe import GLU, MLP
|
12 |
+
try:
|
13 |
+
import transformer_engine.pytorch as te
|
14 |
+
except:
|
15 |
+
te = None
|
16 |
+
try:
|
17 |
+
import megablocks
|
18 |
+
except:
|
19 |
+
megablocks = None
|
20 |
+
|
21 |
+
def torch_default_param_init_fn_(module: nn.Module, **kwargs: Any) -> None:
|
22 |
+
del kwargs
|
23 |
+
if hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable):
|
24 |
+
module.reset_parameters()
|
25 |
+
|
26 |
+
def fused_init_helper_(module: nn.Module, init_fn_: Callable, name_param: str='weight'):
|
27 |
+
"""Initializes parameters which have been fused for efficiency purposes.
|
28 |
+
|
29 |
+
Parameter initialization is often based on the parameter's shape. If a layer is fused,
|
30 |
+
initialization should be based on the shapes of the original tensor instead of the
|
31 |
+
shape of the fused tensor. Layers which are fused should have the _fused
|
32 |
+
attribute. First element of _fused is the dimension along which the tensor is fused.
|
33 |
+
Second element is an iterable of split indices.
|
34 |
+
|
35 |
+
Args:
|
36 |
+
module (nn.Module): The module to initialize.
|
37 |
+
init_fn_ (Callable): Initialization method.
|
38 |
+
name_param (str): Name of parameter to initialize within the module.
|
39 |
+
"""
|
40 |
+
_fused = getattr(module, '_fused', None)
|
41 |
+
if _fused is None:
|
42 |
+
raise RuntimeError(f'Internal logic error')
|
43 |
+
fused_param_init_helper(getattr(module, name_param), init_fn_, _fused)
|
44 |
+
|
45 |
+
def fused_param_init_helper(param: torch.Tensor, init_fn_: Callable, fused_parameters: tuple[int, list[int]]):
|
46 |
+
"""Initializes parameters that are fused together.
|
47 |
+
|
48 |
+
Args:
|
49 |
+
param (torch.Tensor): Tensor to initialize.
|
50 |
+
init_fn_ (Callable): Initialization method.
|
51 |
+
fused_parameters (tuple[int, list[int]]): First element of _fused is the dimension
|
52 |
+
along which the tensor is fused. Second element is an iterable of split indices.
|
53 |
+
"""
|
54 |
+
p_ndims = param.ndim
|
55 |
+
dim, splits = fused_parameters
|
56 |
+
splits = (0, *splits, param.size(dim))
|
57 |
+
for s, e in zip(splits[:-1], splits[1:]):
|
58 |
+
slice_indices = [slice(None)] * p_ndims
|
59 |
+
slice_indices[dim] = slice(s, e)
|
60 |
+
init_fn_(param[slice_indices])
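# Editor's sketch of the `_fused` convention above (sizes are illustrative, not from
# the upload): a fused QKV projection whose dim-0 slices get initialized separately.
d_model = 512
qkv = nn.Linear(d_model, 3 * d_model, bias=False)
qkv._fused = (0, [d_model, 2 * d_model])  # fused along dim 0, split at d_model and 2*d_model
fused_param_init_helper(qkv.weight, partial(torch.nn.init.normal_, mean=0.0, std=0.02), qkv._fused)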
|
61 |
+
|
62 |
+
def stacked_init_helper_(module: nn.Module, init_fn_: Callable, name_param: str='weight'):
|
63 |
+
"""Initializes parameters stacked along a new dimension.
|
64 |
+
|
65 |
+
Parameter initialization is often based on the parameter's shape. If a layer is stacked,
|
66 |
+
initialization should be based on the shapes of the original tensor instead of the
|
67 |
+
shape of the stacked tensor. Layers which are stacked should have the _stack_dim
|
68 |
+
attribute defining the new dimension along which they are stacked.
|
69 |
+
|
70 |
+
Args:
|
71 |
+
module (nn.Module): The module to initialize.
|
72 |
+
init_fn_ (Callable): Initialization method.
|
73 |
+
name_param (str): Name of parameter to initialize within the module.
|
74 |
+
"""
|
75 |
+
stack_dim = getattr(module, '_stack_dim', None)
|
76 |
+
if stack_dim is None:
|
77 |
+
raise RuntimeError(f'Internal logic error')
|
78 |
+
stacked_param_init_helper(getattr(module, name_param), init_fn_, stack_dim)
|
79 |
+
|
80 |
+
def stacked_param_init_helper(param: torch.Tensor, init_fn_: Callable, stack_dim: int):
|
81 |
+
"""Initialize parameters stacked along a new dimension.
|
82 |
+
|
83 |
+
Args:
|
84 |
+
param (torch.Tensor): Tensor to initialize.
|
85 |
+
init_fn_ (Callable): Initialization method.
|
86 |
+
stack_dim (int): Dimension along which parameters are stacked.
|
87 |
+
"""
|
88 |
+
p_ndims = param.ndim
|
89 |
+
for idx in range(param.size(stack_dim)):
|
90 |
+
slice_indices = [slice(None)] * p_ndims
|
91 |
+
slice_indices[stack_dim] = idx
|
92 |
+
init_fn_(param[slice_indices])
|
93 |
+
|
94 |
+
def _flip_fan_mode(init_fn_: Callable):
|
95 |
+
"""Changes the mode of an init_fn_.
|
96 |
+
|
97 |
+
init_fn_'s "mode" is set to operate on standard torch modules, e.g. torch.nn.Linear.
|
98 |
+
If a custom layer transposes its weights before they are applied, such that it is
|
99 |
+
opposite to PyTorch's conventions, we must flip the fan mode from fan_in to fan_out.
|
100 |
+
|
101 |
+
Args:
|
102 |
+
init_fn_ (Callable): Initialization method.
|
103 |
+
"""
|
104 |
+
_init_fn_ = deepcopy(init_fn_)
|
105 |
+
if 'mode' in _init_fn_.keywords:
|
106 |
+
if _init_fn_.keywords['mode'] == 'fan_in':
|
107 |
+
_init_fn_.keywords['mode'] = 'fan_out'
|
108 |
+
elif _init_fn_.keywords['mode'] == 'fan_out':
|
109 |
+
_init_fn_.keywords['mode'] = 'fan_in'
|
110 |
+
return _init_fn_
|
111 |
+
|
112 |
+
def fc_init(module: nn.Module, init_fn_: Callable, init_div_is_residual: Union[int, float, str, bool], div_is_residual: Optional[float], **kwargs: Any) -> bool:
|
113 |
+
del kwargs
|
114 |
+
if isinstance(module, tuple({fcs.get(n) for n in fcs.get_all()})):
|
115 |
+
if hasattr(module, '_fused'):
|
116 |
+
fused_init_helper_(module, init_fn_)
|
117 |
+
else:
|
118 |
+
init_fn_(module.weight)
|
119 |
+
if module.bias is not None:
|
120 |
+
assert isinstance(module.bias, torch.Tensor)
|
121 |
+
torch.nn.init.zeros_(module.bias)
|
122 |
+
if init_div_is_residual is not False and getattr(module, '_is_residual', False):
|
123 |
+
with torch.no_grad():
|
124 |
+
module.weight.div_(div_is_residual)
|
125 |
+
return True
|
126 |
+
return False
|
127 |
+
|
128 |
+
def embedding_init(module: nn.Module, init_fn_: Callable, emb_init_std: Optional[float], emb_init_uniform_lim: Optional[Union[tuple[float, float], float]], **kwargs: Any) -> bool:
|
129 |
+
del kwargs
|
130 |
+
if isinstance(module, nn.Embedding):
|
131 |
+
if emb_init_std is not None:
|
132 |
+
std = emb_init_std
|
133 |
+
if std == 0:
|
134 |
+
warnings.warn(f'Embedding layer initialized to 0.')
|
135 |
+
emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
|
136 |
+
elif emb_init_uniform_lim is not None:
|
137 |
+
lim = emb_init_uniform_lim
|
138 |
+
if isinstance(lim, Sequence):
|
139 |
+
if len(lim) > 2:
|
140 |
+
raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
|
141 |
+
if lim[0] == lim[1]:
|
142 |
+
warnings.warn(f'Embedding layer initialized to {lim[0]}.')
|
143 |
+
else:
|
144 |
+
if lim == 0:
|
145 |
+
warnings.warn(f'Embedding layer initialized to 0.')
|
146 |
+
lim = [-lim, lim]
|
147 |
+
a, b = lim
|
148 |
+
emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
|
149 |
+
else:
|
150 |
+
emb_init_fn_ = init_fn_
|
151 |
+
emb_init_fn_(module.weight)
|
152 |
+
if module.padding_idx is not None:
|
153 |
+
with torch.no_grad():
|
154 |
+
module.weight[module.padding_idx].fill_(0)
|
155 |
+
return True
|
156 |
+
return False
|
157 |
+
|
158 |
+
def norm_init(module: nn.Module, **kwargs: Any) -> bool:
|
159 |
+
del kwargs
|
160 |
+
if isinstance(module, tuple({norms.get(name) for name in norms.get_all()})):
|
161 |
+
if hasattr(module, 'weight') and isinstance(module.weight, torch.Tensor):
|
162 |
+
torch.nn.init.ones_(module.weight)
|
163 |
+
if hasattr(module, 'bias') and isinstance(module.bias, torch.Tensor):
|
164 |
+
torch.nn.init.zeros_(module.bias)
|
165 |
+
return True
|
166 |
+
return False
|
167 |
+
|
168 |
+
def multihead_attention_init(module: nn.Module, init_fn_: Callable, d_model: Optional[int], init_div_is_residual: Union[int, float, str, bool], div_is_residual: float, **kwargs: Any) -> bool:
|
169 |
+
del kwargs
|
170 |
+
if isinstance(module, nn.MultiheadAttention):
|
171 |
+
if module._qkv_same_embed_dim:
|
172 |
+
assert module.in_proj_weight is not None
|
173 |
+
assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
|
174 |
+
assert d_model is not None
|
175 |
+
_d = d_model
|
176 |
+
splits = (0, _d, 2 * _d, 3 * _d)
|
177 |
+
for s, e in zip(splits[:-1], splits[1:]):
|
178 |
+
init_fn_(module.in_proj_weight[s:e])
|
179 |
+
else:
|
180 |
+
assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
|
181 |
+
assert module.in_proj_weight is None
|
182 |
+
init_fn_(module.q_proj_weight)
|
183 |
+
init_fn_(module.k_proj_weight)
|
184 |
+
init_fn_(module.v_proj_weight)
|
185 |
+
if module.in_proj_bias is not None:
|
186 |
+
torch.nn.init.zeros_(module.in_proj_bias)
|
187 |
+
if module.bias_k is not None:
|
188 |
+
torch.nn.init.zeros_(module.bias_k)
|
189 |
+
if module.bias_v is not None:
|
190 |
+
torch.nn.init.zeros_(module.bias_v)
|
191 |
+
init_fn_(module.out_proj.weight)
|
192 |
+
if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
|
193 |
+
with torch.no_grad():
|
194 |
+
module.out_proj.weight.div_(div_is_residual)
|
195 |
+
if module.out_proj.bias is not None:
|
196 |
+
torch.nn.init.zeros_(module.out_proj.bias)
|
197 |
+
return True
|
198 |
+
return False
|
199 |
+
|
200 |
+
def te_layernorm_mlp_init(module: nn.Module, init_fn_: Callable, div_is_residual: float=1.0, **kwargs: Any) -> bool:
|
201 |
+
del kwargs
|
202 |
+
if te is not None and isinstance(module, te.LayerNormMLP):
|
203 |
+
if isinstance(module.layer_norm_weight, torch.Tensor):
|
204 |
+
torch.nn.init.ones_(module.layer_norm_weight)
|
205 |
+
if isinstance(module.layer_norm_bias, torch.Tensor):
|
206 |
+
torch.nn.init.zeros_(module.layer_norm_bias)
|
207 |
+
init_fn_(module.fc1_weight)
|
208 |
+
if module.fc1_bias is not None:
|
209 |
+
assert isinstance(module.fc1_bias, torch.Tensor)
|
210 |
+
torch.nn.init.zeros_(module.fc1_bias)
|
211 |
+
init_fn_(module.fc2_weight)
|
212 |
+
if module.fc2_bias is not None:
|
213 |
+
assert isinstance(module.fc2_bias, torch.Tensor)
|
214 |
+
torch.nn.init.zeros_(module.fc2_bias)
|
215 |
+
with torch.no_grad():
|
216 |
+
module.fc2_weight.div_(div_is_residual)
|
217 |
+
return True
|
218 |
+
return False
|
219 |
+
|
220 |
+
def moe_init(module: nn.Module, init_fn_: Callable, init_div_is_residual: Union[int, float, str, bool], div_is_residual: float, **kwargs: Any) -> bool:
|
221 |
+
if megablocks is not None and isinstance(module, (megablocks.layers.moe.MoE, megablocks.layers.dmoe.dMoE, megablocks.layers.moe.ParallelMLP, megablocks.layers.dmoe.ParallelDroplessMLP)):
|
222 |
+
if hasattr(module, 'bias') and module.bias is not None:
|
223 |
+
torch.nn.init.zeros_(module.bias)
|
224 |
+
return True
|
225 |
+
elif megablocks is not None and isinstance(module, megablocks.layers.glu.SparseGLU):
|
226 |
+
_megablocks_sparse_glu_generic_param_init_fn_(module, init_fn_, bool(init_div_is_residual), div_is_residual)
|
227 |
+
return True
|
228 |
+
elif megablocks is not None and isinstance(module, megablocks.layers.mlp.SparseMLP):
|
229 |
+
_megablocks_sparse_mlp_generic_param_init_fn_(module, init_fn_, bool(init_div_is_residual), div_is_residual)
|
230 |
+
return True
|
231 |
+
elif megablocks is not None and isinstance(module, megablocks.layers.mlp.MLP):
|
232 |
+
_megablocks_mlp_generic_param_init_fn_(module, init_fn_, bool(init_div_is_residual), div_is_residual)
|
233 |
+
return True
|
234 |
+
elif isinstance(module, GLU):
|
235 |
+
init_fn_(module.w1)
|
236 |
+
init_fn_(module.v1)
|
237 |
+
init_fn_(module.w2)
|
238 |
+
return True
|
239 |
+
elif isinstance(module, MLP):
|
240 |
+
init_fn_(module.w1)
|
241 |
+
init_fn_(module.w2)
|
242 |
+
return True
|
243 |
+
return False
|
244 |
+
|
245 |
+
def generic_param_init_fn_(module: nn.Module, init_fn_: Callable, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, **kwargs: Any) -> None:
|
246 |
+
del kwargs
|
247 |
+
init_div_is_residual = init_div_is_residual
|
248 |
+
if init_div_is_residual is False:
|
249 |
+
div_is_residual = 1.0
|
250 |
+
elif init_div_is_residual is True:
|
251 |
+
div_is_residual = math.sqrt(2 * n_layers)
|
252 |
+
elif isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int):
|
253 |
+
div_is_residual = init_div_is_residual
|
254 |
+
elif init_div_is_residual.isnumeric():
|
255 |
+
div_is_residual = float(init_div_is_residual)
|
256 |
+
else:
|
257 |
+
div_is_residual = 1.0
|
258 |
+
raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
|
259 |
+
all_module_init_fns = [module_init_fns.get(name) for name in module_init_fns.get_all()]
|
260 |
+
did_init = False
|
261 |
+
for module_init_fn in all_module_init_fns:
|
262 |
+
did_init = module_init_fn(module=module, init_fn_=init_fn_, d_model=d_model, init_div_is_residual=init_div_is_residual, div_is_residual=div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)
|
263 |
+
if did_init:
|
264 |
+
break
|
265 |
+
if not did_init:
|
266 |
+
for _ in module.parameters(recurse=False):
|
267 |
+
raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by any of the registered module_init_fns. ' + 'Please add an appropriate module_init_fn to the registry. Currently registered module_init_fns are: ' + ', '.join(module_init_fns.get_all()))
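# Editor's note (worked numbers, illustrative): with the default init_div_is_residual=True
# and n_layers=32, div_is_residual = math.sqrt(2 * 32) = 8.0, so parameters tagged
# _is_residual are initialized 8x smaller (the usual 1/sqrt(2*n_layers) residual scaling).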
|
268 |
+
|
269 |
+
def _megablocks_sparse_mlp_generic_param_init_fn_(module: nn.Module, init_fn_: Callable, init_div_is_residual: bool=False, div_is_residual: float=1.0):
|
270 |
+
"""Initializes MegaBlocks MLP.
|
271 |
+
|
272 |
+
To enable elastic deterministic initialization, this method creates the entire
|
273 |
+
weight matrix, then slices into the weight tensors so that the sampled weights
|
274 |
+
do not vary with the MoE world size for the same random seed.
|
275 |
+
|
276 |
+
Args:
|
277 |
+
module (nn.Module): The module to initialize.
|
278 |
+
init_fn_ (Callable): Initialization method.
|
279 |
+
init_div_is_residual (bool): Flag enabling parameters tagged with _is_residual
|
280 |
+
flag to be divided by div_is_residual.
|
281 |
+
div_is_residual (float): The value by which parameter initialization is divided
|
282 |
+
if init_div_is_residual flag is enabled.
|
283 |
+
"""
|
284 |
+
expert_process_group_size, rank = (1, 0)
|
285 |
+
if module.expert_parallel_group is not None:
|
286 |
+
expert_process_group_size = int(module.expert_parallel_group.size())
|
287 |
+
rank = int(module.expert_parallel_group.rank())
|
288 |
+
hidden_size = int(module.hidden_size)
|
289 |
+
w1 = module.w1
|
290 |
+
if isinstance(w1, DTensor):
|
291 |
+
w1 = w1._local_tensor
|
292 |
+
w1_size = list(w1.shape)
|
293 |
+
w1_size[0] = w1_size[0] * expert_process_group_size
|
294 |
+
n_exp = w1_size[0] // hidden_size
|
295 |
+
_fused = (0, [(n + 1) * hidden_size for n in range(n_exp - 1)])
|
296 |
+
_w1 = w1.new_empty(w1_size)
|
297 |
+
fused_param_init_helper(_w1, init_fn_, _fused)
|
298 |
+
_w1_local = _w1.chunk(expert_process_group_size, dim=0)[rank]
|
299 |
+
with torch.no_grad():
|
300 |
+
w1.copy_(_w1_local)
|
301 |
+
w2 = module.w2
|
302 |
+
if isinstance(w2, DTensor):
|
303 |
+
w2 = w2._local_tensor
|
304 |
+
w2_size = list(w2.shape)
|
305 |
+
w2_size[0] = w2_size[0] * expert_process_group_size
|
306 |
+
_w2 = w2.new_empty(w2_size)
|
307 |
+
fused_param_init_helper(_w2, _flip_fan_mode(init_fn_), _fused)
|
308 |
+
_w2_local = _w2.chunk(expert_process_group_size, dim=0)[rank]
|
309 |
+
with torch.no_grad():
|
310 |
+
w2.copy_(_w2_local)
|
311 |
+
if init_div_is_residual is not False:
|
312 |
+
with torch.no_grad():
|
313 |
+
w2.div_(div_is_residual)
|
314 |
+
|
315 |
+
def _megablocks_sparse_glu_generic_param_init_fn_(module: nn.Module, init_fn_: Callable, init_div_is_residual: bool=False, div_is_residual: float=1.0):
|
316 |
+
"""Initializes MegaBlocks Sparse GLU.
|
317 |
+
|
318 |
+
Extends the Megablocks Sparse MLP case to an additional weight v1 for GLUs.
|
319 |
+
This additional weight v1 has the same initialization procedure as w1 for MLPs.
|
320 |
+
|
321 |
+
Args:
|
322 |
+
module (nn.Module): The module to initialize.
|
323 |
+
init_fn_ (Callable): Initialization method.
|
324 |
+
init_div_is_residual (bool): Flag enabling parameters tagged with _is_residual
|
325 |
+
flag to be divided by div_is_residual.
|
326 |
+
div_is_residual (float): The value by which parameter initialization is divided
|
327 |
+
if init_div_is_residual flag is enabled.
|
328 |
+
"""
|
329 |
+
_megablocks_sparse_mlp_generic_param_init_fn_(module=module, init_fn_=init_fn_, init_div_is_residual=init_div_is_residual, div_is_residual=div_is_residual)
|
330 |
+
expert_process_group_size, rank = (1, 0)
|
331 |
+
if module.expert_parallel_group is not None:
|
332 |
+
expert_process_group_size = int(module.expert_parallel_group.size())
|
333 |
+
rank = int(module.expert_parallel_group.rank())
|
334 |
+
hidden_size = int(module.hidden_size)
|
335 |
+
v1 = module.v1
|
336 |
+
if isinstance(v1, DTensor):
|
337 |
+
v1 = v1._local_tensor
|
338 |
+
v1_size = list(v1.shape)
|
339 |
+
v1_size[0] = v1_size[0] * expert_process_group_size
|
340 |
+
n_exp = v1_size[0] // hidden_size
|
341 |
+
_fused = (0, [(n + 1) * hidden_size for n in range(n_exp - 1)])
|
342 |
+
_v1 = v1.new_empty(v1_size)
|
343 |
+
fused_param_init_helper(_v1, init_fn_, _fused)
|
344 |
+
_v1_local = _v1.chunk(expert_process_group_size, dim=0)[rank]
|
345 |
+
with torch.no_grad():
|
346 |
+
v1.copy_(_v1_local)
|
347 |
+
|
348 |
+
def _megablocks_mlp_generic_param_init_fn_(module: nn.Module, init_fn_: Callable, init_div_is_residual: bool=False, div_is_residual: float=1.0):
|
349 |
+
"""Initializes MegaBlocks' MLP.
|
350 |
+
|
351 |
+
To enable elastic deterministic initialization, this method creates the entire
|
352 |
+
weight matrix, then slices into the weight tensors so that the sampled weights
|
353 |
+
do not vary with the MoE world size for the same random seed.
|
354 |
+
|
355 |
+
Args:
|
356 |
+
module (nn.Module): The module to initialize.
|
357 |
+
init_fn_ (Callable): Initialization method.
|
358 |
+
init_div_is_residual (bool): Flag enabling parameters tagged with _is_residual
|
359 |
+
flag to be divided by div_is_residual.
|
360 |
+
div_is_residual (float): The value by which parameter initialization is divided
|
361 |
+
if init_div_is_residual flag is enabled.
|
362 |
+
"""
|
363 |
+
expert_process_group_size, rank = (1, 0)
|
364 |
+
if module.expert_parallel_group is not None:
|
365 |
+
expert_process_group_size = int(module.expert_parallel_group.size())
|
366 |
+
rank = int(module.expert_parallel_group.rank())
|
367 |
+
_init_fn_ = _flip_fan_mode(init_fn_)
|
368 |
+
w1_size = list(module.w1.shape)
|
369 |
+
w1_size[0] = w1_size[0] * expert_process_group_size
|
370 |
+
_w1 = module.w1.new_empty(w1_size)
|
371 |
+
stacked_param_init_helper(_w1, _init_fn_, module._stack_dim)
|
372 |
+
_w1_local = _w1.chunk(expert_process_group_size, dim=0)[rank]
|
373 |
+
with torch.no_grad():
|
374 |
+
module.w1.copy_(_w1_local)
|
375 |
+
w2_size = list(module.w2.shape)
|
376 |
+
w2_size[0] = w2_size[0] * expert_process_group_size
|
377 |
+
_w2 = module.w2.new_empty(w2_size)
|
378 |
+
stacked_param_init_helper(_w2, _init_fn_, module._stack_dim)
|
379 |
+
_w2_local = _w2.chunk(expert_process_group_size, dim=0)[rank]
|
380 |
+
with torch.no_grad():
|
381 |
+
module.w2.copy_(_w2_local)
|
382 |
+
if init_div_is_residual is not False:
|
383 |
+
with torch.no_grad():
|
384 |
+
module.w2.div_(div_is_residual)
|
385 |
+
|
386 |
+
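# --- Editor's note: the sketch below is not part of param_init_fns.py. It is a
# minimal, self-contained illustration (made-up sizes and seed) of the "elastic,
# deterministic initialization" property described in the docstring above: every
# rank materializes and initializes the full stacked weight, then keeps only its
# own chunk, so the assembled weights are identical for any MoE world size. ---
import torch

def _demo_full_init(total_rows: int = 8, d: int = 4, seed: int = 17) -> torch.Tensor:
    torch.manual_seed(seed)
    full = torch.empty(total_rows, d)
    torch.nn.init.normal_(full, std=0.02)
    return full

# world size 1: the single rank keeps the whole matrix
whole = _demo_full_init().chunk(1, dim=0)[0]
# world size 4: each rank re-creates the full matrix and keeps a quarter of it
shards = [_demo_full_init().chunk(4, dim=0)[r] for r in range(4)]
# concatenating the four shards reproduces exactly the world-size-1 weights
assert torch.equal(torch.cat(shards, dim=0), whole)
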
def _normal_init_(std: float, mean: float=0.0):
    return partial(torch.nn.init.normal_, mean=mean, std=std)

def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, **kwargs: Any) -> None:
    del kwargs
    init_fn_ = _normal_init_(std=std)
    generic_param_init_fn_(module=module, init_fn_=init_fn_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

def baseline_param_init_fn_(module: nn.Module, init_std: Optional[float], n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, **kwargs: Any) -> None:
    del kwargs
    if init_std is None:
        raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
    _normal_param_init_fn_(module=module, std=init_std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, **kwargs: Any) -> None:
    del kwargs
    std = math.sqrt(2 / (5 * d_model))
    _normal_param_init_fn_(module=module, std=std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

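# --- Editor's note: not part of param_init_fns.py. small_param_init_fn_ above uses
# the "small init" standard deviation std = sqrt(2 / (5 * d_model)); a quick worked
# example with two illustrative model widths: ---
import math
for _d_model in (2048, 4096):
    print(_d_model, round(math.sqrt(2 / (5 * _d_model)), 4))  # 2048 -> 0.014, 4096 -> 0.0099
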
def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, **kwargs: Any) -> None:
    """From section 2.3.1 of GPT-NeoX-20B:

    An Open-Source Autoregressive Language Model — Black et al. (2022)
    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
    """
    del kwargs
    residual_div = n_layers / math.sqrt(10)
    small_param_init_fn_(module=module, d_model=d_model, n_layers=n_layers, init_div_is_residual=residual_div, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', **kwargs: Any) -> None:
    del kwargs
    kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
    generic_param_init_fn_(module=module, init_fn_=kaiming_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', **kwargs: Any) -> None:
    del kwargs
    kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
    generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, init_gain: float=0, **kwargs: Any) -> None:
    del kwargs
    xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
    generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[tuple[float, float], float]]=None, init_gain: float=0, **kwargs: Any) -> None:
    del kwargs
    xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
    generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim)

param_init_fns.register('default_', func=torch_default_param_init_fn_)
param_init_fns.register('baseline_', func=baseline_param_init_fn_)
param_init_fns.register('kaiming_uniform_', func=kaiming_uniform_param_init_fn_)
param_init_fns.register('kaiming_normal_', func=kaiming_normal_param_init_fn_)
param_init_fns.register('neox_init_', func=neox_param_init_fn_)
param_init_fns.register('small_init_', func=small_param_init_fn_)
param_init_fns.register('xavier_uniform_', func=xavier_uniform_param_init_fn_)
param_init_fns.register('xavier_normal_', func=xavier_normal_param_init_fn_)
module_init_fns.register('fc', func=fc_init)
module_init_fns.register('embedding', func=embedding_init)
module_init_fns.register('norm', func=norm_init)
module_init_fns.register('multihead_attention', func=multihead_attention_init)
module_init_fns.register('te_layernorm_mlp', func=te_layernorm_mlp_init)
module_init_fns.register('moe', func=moe_init)
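The registrations above populate the param_init_fns and module_init_fns registries that the modeling code looks up by name. As a hedged illustration (the toy model, dimensions, and chosen scheme below are placeholders, not values taken from this checkpoint's config), a registered initializer can be fetched and applied module-by-module roughly like this:

from functools import partial
import torch.nn as nn

toy = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16))  # stand-in for a real model
init_fn_ = param_init_fns.get('kaiming_normal_')           # -> kaiming_normal_param_init_fn_
toy.apply(partial(init_fn_, n_layers=2, d_model=16))       # mirrors how an init_config 'name' is consumed
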
registry_utils.py
ADDED
@@ -0,0 +1,131 @@
import copy
import functools
import importlib.util
import os
from contextlib import contextmanager
from pathlib import Path
from types import ModuleType
from typing import Any, Callable, Generic, Optional, Sequence, TypeVar, Union
import catalogue

T = TypeVar('T')
TypeBoundT = TypeVar('TypeBoundT', bound=type)
CallableBoundT = TypeVar('CallableBoundT', bound=Callable[..., Any])

class TypedRegistry(catalogue.Registry, Generic[T]):
    """A thin wrapper around catalogue.Registry to add static typing and descriptions."""

    def __init__(self, namespace: Sequence[str], entry_points: bool=False, description: str='') -> None:
        super().__init__(namespace, entry_points=entry_points)
        self.description = description

    def __call__(self, name: str, func: Optional[T]=None) -> Callable[[T], T]:
        return super().__call__(name, func)

    def register(self, name: str, *, func: Optional[T]=None) -> T:
        return super().register(name, func=func)

    def register_class(self, name: str, *, func: Optional[TypeBoundT]=None) -> TypeBoundT:
        return super().register(name, func=func)

    def get(self, name: str) -> T:
        return super().get(name)

    def get_all(self) -> dict[str, T]:
        return super().get_all()

    def get_entry_point(self, name: str, default: Optional[T]=None) -> T:
        return super().get_entry_point(name, default=default)

    def get_entry_points(self) -> dict[str, T]:
        return super().get_entry_points()

S = TypeVar('S')

def create_registry(*namespace: str, generic_type: type[S], entry_points: bool=False, description: str='') -> 'TypedRegistry[S]':
    """Create a new registry.

    Args:
        namespace (str): The namespace, e.g. "llmfoundry.loggers"
        generic_type (Type[S]): The type of the registry.
        entry_points (bool): Accept registered functions from entry points.
        description (str): A description of the registry.

    Returns:
        The TypedRegistry object.
    """
    if catalogue.check_exists(*namespace):
        raise catalogue.RegistryError(f'Namespace already exists: {namespace}')
    return TypedRegistry[generic_type](namespace, entry_points=entry_points, description=description)

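# --- Editor's note: not part of registry_utils.py. A hypothetical usage sketch of
# create_registry and TypedRegistry.register; the namespace, type, and entries are made up. ---
from typing import Any, Callable

_demo_registry = create_registry(
    'demo_project', 'builders',
    generic_type=Callable[..., Any],
    description='Example registry of builder functions.',
)

@_demo_registry.register('greeting')
def _build_greeting() -> str:
    return 'hello'

assert _demo_registry.get('greeting')() == 'hello'
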
def construct_from_registry(name: str, registry: TypedRegistry, partial_function: bool=True, pre_validation_function: Optional[Union[Callable[[Any], None], type]]=None, post_validation_function: Optional[Callable[[Any], None]]=None, kwargs: Optional[dict[str, Any]]=None) -> Any:
    """Helper function to build an item from the registry.

    Args:
        name (str): The name of the registered item
        registry (catalogue.Registry): The registry to fetch the item from
        partial_function (bool, optional): Whether to return a partial function for registered callables. Defaults to True.
        pre_validation_function (Optional[Union[Callable[[Any], None], type]], optional): An optional validation function called
            before constructing the item to return. This should throw an exception if validation fails. Defaults to None.
        post_validation_function (Optional[Callable[[Any], None]], optional): An optional validation function called after
            constructing the item to return. This should throw an exception if validation fails. Defaults to None.
        kwargs (Optional[Dict[str, Any]]): Other relevant keyword arguments.

    Raises:
        ValueError: If the validation functions failed or the registered item is invalid

    Returns:
        Any: The constructed item from the registry
    """
    if kwargs is None:
        kwargs = {}
    registered_constructor = registry.get(name)
    if pre_validation_function is not None:
        if isinstance(pre_validation_function, type):
            if not issubclass(registered_constructor, pre_validation_function):
                raise ValueError(f'Expected {name} to be of type {pre_validation_function}, but got {type(registered_constructor)}')
        elif isinstance(pre_validation_function, Callable):
            pre_validation_function(registered_constructor)
        else:
            raise ValueError(f'Expected pre_validation_function to be a callable or a type, but got {type(pre_validation_function)}')
    if isinstance(registered_constructor, type) or (callable(registered_constructor) and (not partial_function)):
        constructed_item = registered_constructor(**kwargs)
    elif callable(registered_constructor):
        constructed_item = functools.partial(registered_constructor, **kwargs)
    else:
        raise ValueError(f'Expected {name} to be a class or function, but got {type(registered_constructor)}')
    if post_validation_function is not None:
        post_validation_function(constructed_item)
    return constructed_item

def import_file(loc: Union[str, Path]) -> ModuleType:
    """Import module from a file.

    Used to run arbitrary python code.

    Args:
        loc (str / Path): Path to the file.

    Returns:
        ModuleType: The module object.
    """
    if not os.path.exists(loc):
        raise FileNotFoundError(f'File {loc} does not exist.')
    spec = importlib.util.spec_from_file_location('python_code', str(loc))
    assert spec is not None
    assert spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(module)
    except Exception as e:
        raise RuntimeError(f'Error executing {loc}') from e
    return module

@contextmanager
def save_registry():
    """Save the registry state and restore after the context manager exits."""
    saved_registry_state = copy.deepcopy(catalogue.REGISTRY)
    yield
    catalogue.REGISTRY = saved_registry_state
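For completeness, a hedged sketch of how construct_from_registry and save_registry compose with the hypothetical registry from the earlier note (every name here is illustrative):

with save_registry():                    # registrations made inside the block are rolled back on exit
    built = construct_from_registry(
        name='greeting',
        registry=_demo_registry,
        partial_function=False,          # call the registered function instead of returning a partial
    )
assert built == 'hello'
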
special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
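A quick way to confirm that these special-token settings round-trip through the tokenizer (the repository path below is a placeholder, and loading assumes the transformers library):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('path/to/this/checkpoint')
print(tok.eos_token, tok.pad_token)       # <|im_end|> <|endoftext|>
print(tok.additional_special_tokens[:3])  # ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>']
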
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
size 11422654
tokenizer_config.json
ADDED
@@ -0,0 +1,240 @@
{
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151644": {"content": "<|im_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151645": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151646": {"content": "<|object_ref_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151647": {"content": "<|object_ref_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151648": {"content": "<|box_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151649": {"content": "<|box_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151650": {"content": "<|quad_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151651": {"content": "<|quad_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151652": {"content": "<|vision_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151653": {"content": "<|vision_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151654": {"content": "<|vision_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151655": {"content": "<|image_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151656": {"content": "<|video_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "151657": {"content": "<tool_call>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151658": {"content": "</tool_call>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151659": {"content": "<|fim_prefix|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151660": {"content": "<|fim_middle|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151661": {"content": "<|fim_suffix|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151662": {"content": "<|fim_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151663": {"content": "<|repo_name|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151664": {"content": "<|file_sep|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151665": {"content": "<tool_response>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151666": {"content": "</tool_response>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151667": {"content": "<think>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "151668": {"content": "</think>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false}
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if 
enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 4096,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
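The chat_template above is a ChatML-style template with optional tool calling and a <think> reasoning block. A hedged usage sketch (placeholder repo path; passing enable_thinking as an extra keyword argument assumes a transformers version that forwards extra kwargs to the template):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('path/to/this/checkpoint')
messages = [{'role': 'user', 'content': 'What is 2 + 2?'}]
prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,   # ends the string with '<|im_start|>assistant\n'
    enable_thinking=False,        # the template then emits an empty <think>...</think> block
)
print(prompt)
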
vocab.json
ADDED
The diff for this file is too large to render.
warnings.py
ADDED
@@ -0,0 +1,72 @@
import functools
import warnings
from typing import Any, Callable, Type, TypeVar, cast

class VersionedDeprecationWarning(UserWarning):
    """A custom deprecation warning class that includes version information.

    Attributes:
        message (str): The deprecation message describing why the feature is deprecated.
        remove_version (str): The version in which the feature will be removed.

    Example:
        >>> def deprecated_function():
        ...     warnings.warn(
        ...         VersionedDeprecationWarning(
        ...             "Function XYZ is deprecated.",
        ...             remove_version="2.0.0"
        ...         )
        ...     )
        ...
        >>> deprecated_function()
        VersionedDeprecationWarning: Function XYZ is deprecated. It will be removed in version 2.0.0.
    """

    def __init__(self, message: str, remove_version: str) -> None:
        super().__init__(message + f' It will be removed in version {remove_version}.')

class ExperimentalWarning(Warning):
    """A warning for experimental features.

    Attributes:
        feature_name (str): The name of the experimental feature.
    """

    def __init__(self, feature_name: str) -> None:
        super().__init__(f'{feature_name} is experimental and may change with future versions.')

F = TypeVar('F', bound=Callable[..., Any])

def experimental_function(feature_name: str) -> Callable[[F], F]:
    """Decorator to mark a function as experimental.

    The message displayed will be {feature_name} is experimental and may change with future versions.

    Args:
        feature_name (str): The name of the experimental feature.

    Returns:
        The decorated function.
    """

    def decorator(func: Callable):

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any):
            warnings.warn(ExperimentalWarning(feature_name))
            return func(*args, **kwargs)
        return cast(F, wrapper)
    return decorator

def experimental_class(feature_name: str) -> Callable[[Type], Type]:
    """Class decorator to mark a class as experimental."""

    def class_decorator(cls: Type):
        original_init = cls.__init__
        cls.is_experimental = True

        def new_init(self: Any, *args: Any, **kwargs: Any):
            warnings.warn(ExperimentalWarning(feature_name))
            original_init(self, *args, **kwargs)
        cls.__init__ = new_init
        return cls
    return class_decorator
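To round out the file, an illustrative (made-up) use of the two warning helpers defined above:

import warnings

@experimental_function('shiny_feature')
def shiny_feature() -> int:
    return 42

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    shiny_feature()
    warnings.warn(VersionedDeprecationWarning('old_helper is deprecated.', remove_version='2.0.0'))

print([str(w.message) for w in caught])
# ['shiny_feature is experimental and may change with future versions.',
#  'old_helper is deprecated. It will be removed in version 2.0.0.']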