SmerkyG committed
Commit 3f1e910 · verified · 1 Parent(s): 47905f4

Add files using upload-large-folder tool

config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "architectures": [
+     "RWKV6Qwen2ForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_rwkv6qwen2.RWKV6Qwen2Config",
+     "AutoModelForCausalLM": "modeling_rwkv6qwen2.RWKV6Qwen2ForCausalLM"
+   },
+   "attention_bias": true,
+   "attention_dropout": 0.0,
+   "attention_output_bias": false,
+   "balance_state": true,
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "gate_rank_type": 1,
+   "groupnorm_att": false,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "lora_rank_decay": 96,
+   "lora_rank_tokenshift": 96,
+   "lora_rank_gate": 0,
+   "max_position_embeddings": 131072,
+   "max_window_layers": 28,
+   "model_type": "rwkv6qwen2",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "rms_norm_eps": 1e-06,
+   "rope_theta": 1000000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.43.1",
+   "use_cache": true,
+   "use_rope": false,
+   "use_tokenshift": true,
+   "use_sliding_window": false,
+   "vocab_size": 152064
+ }
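
Because `auto_map` routes `AutoConfig` and `AutoModelForCausalLM` to the `configuration_rwkv6qwen2.py` and `modeling_rwkv6qwen2.py` modules shipped with the weights, the checkpoint loads through the standard `transformers` auto classes with `trust_remote_code=True`, as `generate.py` later in this commit does. A minimal sketch, assuming the repository has been downloaded locally (the path below is a placeholder):

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo = "./rwkv6qwen2-7b"  # placeholder: local clone or hub id of this repository

# auto_map dispatches these calls to RWKV6Qwen2Config / RWKV6Qwen2ForCausalLM,
# so trust_remote_code=True is required.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo, config=config, torch_dtype=torch.bfloat16, trust_remote_code=True
)
print(config.model_type, config.num_hidden_layers, config.hidden_size)  # rwkv6qwen2 28 3584
```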
configuration_rwkv6qwen2.py ADDED
@@ -0,0 +1,206 @@
+ # coding=utf-8
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """RWKV6Qwen2 model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class RWKV6Qwen2Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`RWKV6Qwen2Model`]. It is used to instantiate a
+     RWKV6Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of
+     Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 151936):
+             Vocabulary size of the RWKV6Qwen2 model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`RWKV6Qwen2Model`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 22016):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 32):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details checkout [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 32768):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+             accordingly.
+             Expected contents:
+                 `rope_type` (`str`):
+                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                     'llama3'], with 'default' being the original RoPE implementation.
+                 `factor` (`float`, *optional*):
+                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                     original maximum pre-trained length.
+                 `original_max_position_embeddings` (`int`, *optional*):
+                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                     pretraining.
+                 `attention_factor` (`float`, *optional*):
+                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                     computation. If unspecified, it defaults to value recommended by the implementation, using the
+                     `factor` field to infer the suggested value.
+                 `beta_fast` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 32.
+                 `beta_slow` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 1.
+                 `short_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `long_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `low_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                 `high_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+         use_sliding_window (`bool`, *optional*, defaults to `False`):
+             Whether to use sliding window attention.
+         sliding_window (`int`, *optional*, defaults to 4096):
+             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+         max_window_layers (`int`, *optional*, defaults to 28):
+             The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import RWKV6Qwen2Model, RWKV6Qwen2Config
+
+     >>> # Initializing a RWKV6Qwen2 style configuration
+     >>> configuration = RWKV6Qwen2Config()
+
+     >>> # Initializing a model from the RWKV6Qwen2-7B style configuration
+     >>> model = RWKV6Qwen2Model(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "rwkv6qwen2"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=151936,
+         hidden_size=4096,
+         intermediate_size=22016,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=32,
+         lora_rank_tokenshift=None,
+         lora_rank_decay=None,
+         hidden_act="silu",
+         max_position_embeddings=32768,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         tie_word_embeddings=False,
+         use_rope=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         use_sliding_window=False,
+         sliding_window=4096,
+         max_window_layers=28,
+         attention_dropout=0.0,
+         attention_bias=True,
+         attention_output_bias=False,
+         gate_rank_type=1,
+         lora_rank_gate=None,
+         balance_state=True,
+         groupnorm_att=False,
+         use_tokenshift=True,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.use_sliding_window = use_sliding_window
+         self.sliding_window = sliding_window if use_sliding_window else None
+         self.max_window_layers = max_window_layers
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.lora_rank_tokenshift = lora_rank_tokenshift
+         self.lora_rank_decay = lora_rank_decay
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.use_rope = use_rope
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_dropout = attention_dropout
+         # Validate the correctness of rotary position embeddings parameters
+         # BC: if there is a 'type' field, move it to 'rope_type'.
+         if self.rope_scaling is not None and "type" in self.rope_scaling:
+             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         rope_config_validation(self)
+
+         self.attention_bias = attention_bias
+         self.attention_output_bias = attention_output_bias
+         self.gate_rank_type = gate_rank_type
+         self.lora_rank_gate = lora_rank_gate
+         self.balance_state = balance_state
+         self.groupnorm_att = groupnorm_att
+         self.use_tokenshift = use_tokenshift
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
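
For reference, config.json above overrides a number of these defaults. A minimal sketch of constructing the same configuration programmatically, with values copied from config.json in this commit (anything omitted keeps the defaults defined in `__init__`):

```python
from configuration_rwkv6qwen2 import RWKV6Qwen2Config

config = RWKV6Qwen2Config(
    vocab_size=152064,
    hidden_size=3584,
    intermediate_size=18944,
    num_hidden_layers=28,
    num_attention_heads=28,
    num_key_value_heads=4,
    lora_rank_tokenshift=96,
    lora_rank_decay=96,
    lora_rank_gate=0,
    gate_rank_type=1,
    max_position_embeddings=131072,
    rope_theta=1000000.0,
    use_rope=False,
    tie_word_embeddings=False,
)

# use_sliding_window defaults to False, so __init__ normalizes sliding_window to None.
assert config.sliding_window is None
print(config.model_type)  # "rwkv6qwen2"
```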
examine_ckpt.py ADDED
@@ -0,0 +1,25 @@
+ import sys
+ import math
+ import torch
+ from collections import OrderedDict
+ import re
+ from safetensors.torch import load_file
+
+ if len(sys.argv) != 2:
+     print(f"Examines checkpoint keys")
+     print("Usage: python examine_ckpt.py in_file")
+     exit()
+
+ model_path = sys.argv[1]
+
+ print("Loading file...")
+ if model_path.lower().endswith('.safetensors'):
+     state_dict = load_file(model_path)
+ else:
+     state_dict = torch.load(model_path, map_location='cpu', weights_only=True)
+
+ for name, p in state_dict.items():
+     if p.numel() == 0:
+         print(name, p.dtype, p.shape)
+     else:
+         print(name, p.dtype, p.shape, float(p.min()), float(p.max()))
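
As a hedged variation on the script above, the same inspection can be run inline against one of the shards uploaded in this commit, filtering to the RWKV-6-specific tensors (the `time_maa_*` / `time_decay*` keys listed in model.safetensors.index.json below); the filter is purely illustrative:

```python
from safetensors.torch import load_file

# One of the four shards added in this commit.
state_dict = load_file("model-00001-of-00004.safetensors")

# Show only the RWKV-6 parameters grafted onto the Qwen2 attention layout.
for name, p in state_dict.items():
    if "time_maa" in name or "time_decay" in name:
        print(name, tuple(p.shape), p.dtype)
```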
generate.py ADDED
@@ -0,0 +1,119 @@
+ import sys, os
+
+ import torch
+ torch.backends.cudnn.benchmark = True
+ torch.backends.cudnn.allow_tf32 = True
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+ from configs import parse_cmdline_configs
+ from pydoc import locate
+
+ from dataclasses import dataclass
+ from typing import Any, Callable
+
+ moby_dick = """Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people’s hats off—then, I account it high time to get to sea as soon as I can. This is my substitute for pistol and ball. With a philosophical flourish Cato throws himself upon his sword; I quietly take to the ship. There is nothing surprising in this. If they but knew it, almost all men in their degree, some time or other, cherish very nearly the same feelings towards the ocean with me.
+
+ There now is your insular city of the Manhattoes, belted round by wharves as Indian isles by coral reefs—commerce surrounds it with her surf. Right and left, the streets take you waterward. Its extreme downtown is the battery, where that noble mole is washed by waves, and cooled by breezes, which a few hours previous were out of sight of land. Look at the crowds of water-gazers there.
+
+ Circumambulate the city of a dreamy Sabbath afternoon. Go from Corlears Hook to Coenties Slip, and from thence, by Whitehall, northward. What do you see?—Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries. Some leaning against the spiles; some seated upon the pier-heads; some looking over the bulwarks of ships from China; some high aloft in the rigging, as if striving to get a still better seaward peep. But these are all landsmen; of week days pent up in lath and plaster—tied to counters, nailed to benches, clinched to desks. How then is this? Are the green fields gone? What do they here?
+
+ But look! here come more crowds, pacing straight for the water, and seemingly bound for a dive. Strange! Nothing will content them but the extremest limit of the land; loitering under the shady lee of yonder warehouses will not suffice. No. They must get just as nigh the water as they possibly can without falling in. And there they stand—miles of them—leagues. Inlanders all, they come from lanes and alleys, streets and avenues—north, east, south, and west. Yet here they all unite. Tell me, does the magnetic virtue of the needles of the compasses of all those ships attract them thither?
+
+ Once more. Say you are in the country; in some high land of lakes. Take almost any path you please, and ten to one it carries you down in a dale, and leaves you there by a pool in the stream. There is magic in it. Let the most absent-minded of men be plunged in his deepest reveries—stand that man on his legs, set his feet a-going, and he will infallibly lead you to water, if water there be in all that region. Should you ever be athirst in the great American desert, try this experiment, if your caravan happen to be supplied with a metaphysical professor. Yes, as every one knows, meditation and water are wedded for ever.
+
+ But here is an artist. He desires to paint you the dreamiest, shadiest, quietest, most enchanting bit of romantic landscape in all the valley of the Saco. What is the chief element he employs? There stand his trees, each with a hollow trunk, as if a hermit and a crucifix were within; and here sleeps his meadow, and there sleep his cattle; and up from yonder cottage goes a sleepy smoke. Deep into distant woodlands winds a mazy way, reaching to overlapping spurs of mountains bathed in their hill-side blue. But though the picture lies thus tranced, and though this pine-tree shakes down its sighs like leaves upon this shepherd’s head, yet all were vain, unless the shepherd’s eye were fixed upon the magic stream before him. Go visit the Prairies in June, when for scores on scores of miles you wade knee-deep among Tiger-lilies—what is the one charm wanting?—Water—there is not a drop of water there! Were Niagara but a cataract of sand, would you travel your thousand miles to see it? Why did the poor poet of Tennessee, upon suddenly receiving two handfuls of silver, deliberate whether to buy him a coat, which he sadly needed, or invest his money in a pedestrian trip to Rockaway Beach? Why is almost every robust healthy boy with a robust healthy soul in him, at some time or other crazy to go to sea? Why upon your first voyage as a passenger, did you yourself feel such a mystical vibration, when first told that you and your ship were now out of sight of land? Why did the old Persians hold the sea holy? Why did the Greeks give it a separate deity, and own brother of Jove? Surely all this is not without meaning. And still deeper the meaning of that story of Narcissus, who because he could not grasp the tormenting, mild image he saw in the fountain, plunged into it and was drowned. But that same image, we ourselves see in all rivers and oceans. It is the image of the ungraspable phantom of life; and this is the key to it all.
+
+ Now, when I say that I am in the habit of going to sea whenever I begin to grow hazy about the eyes, and begin to be over conscious of my lungs, I do not mean to have it inferred that I ever go to sea as a passenger. For to go as a passenger you must needs have a purse, and a purse is but a rag unless you have something in it. Besides, passengers get sea-sick—grow quarrelsome—don’t sleep of nights—do not enjoy themselves much, as a general thing;—no, I never go as a passenger; nor, though I am something of a salt, do I ever go to sea as a Commodore, or a Captain, or a Cook. I abandon the glory and distinction of such offices to those who like them. For my part, I abominate all honorable respectable toils, trials, and tribulations of every kind whatsoever. It is quite as much as I can do to take care of myself, without taking care of ships, barques, brigs, schooners, and what not. And as for going as cook,—though I confess there is considerable glory in that, a cook being a sort of officer on ship-board—yet, somehow, I never fancied broiling fowls;—though once broiled, judiciously buttered, and judgmatically salted and peppered, there is no one who will speak more respectfully, not to say reverentially, of a broiled fowl than I will. It is out of the idolatrous dotings of the old Egyptians upon broiled ibis and roasted river horse, that you see the mummies of those creatures in their huge bake-houses the pyramids.
+
+ No, when I go to sea, I go as a simple sailor, right before the mast, plumb down into the forecastle, aloft there to the royal mast-head. True, they rather order me about some, and make me jump from spar to spar, like a grasshopper in a May meadow. And at first, this sort of thing is unpleasant enough. It touches one’s sense of honor, particularly if you come of an old established family in the land, the Van Rensselaers, or Randolphs, or Hardicanutes. And more than all, if just previous to putting your hand into the tar-pot, you have been lording it as a country schoolmaster, making the tallest boys stand in awe of you. The transition is a keen one, I assure you, from a schoolmaster to a sailor, and requires a strong decoction of Seneca and the Stoics to enable you to grin and bear it. But even this wears off in time.
+
+ What of it, if some old hunks of a sea-captain orders me to get a broom and sweep down the decks? What does that indignity amount to, weighed, I mean, in the scales of the New Testament? Do you think the archangel Gabriel thinks anything the less of me, because I promptly and respectfully obey that old hunks in that particular instance? Who ain’t a slave? Tell me that. Well, then, however the old sea-captains may order me about—however they may thump and punch me about, I have the satisfaction of knowing that it is all right; that everybody else is one way or other served in much the same way—either in a physical or metaphysical point of view, that is; and so the universal thump is passed round, and all hands should rub each other’s shoulder-blades, and be content.
+
+ Again, I always go to sea as a sailor, because they make a point of paying me for my trouble, whereas they never pay passengers a single penny that I ever heard of. On the contrary, passengers themselves must pay. And there is all the difference in the world between paying and being paid. The act of paying is perhaps the most uncomfortable infliction that the two orchard thieves entailed upon us. But being paid,—what will compare with it? The urbane activity with which a man receives money is really marvellous, considering that we so earnestly believe money to be the root of all earthly ills, and that on no account can a monied man enter heaven. Ah! how cheerfully we consign ourselves to perdition!
+
+ Finally, I always go to sea as a sailor, because """
+
+
+ @dataclass
+ class CLI_Config:
+     tokenizer_path: str
+     model_path: str
+     attn_path: str = 'rwkv6attn.RWKV6Attention'
+     prompt:str = 'How many quarts are in a gallon?'
+     max_len:int = 30
+     attempts:int = 1
+     precision: int | str = 'bf16'
+     attn_classes_path: str = 'transformers.models.qwen2.modeling_qwen2.QWEN2_ATTENTION_CLASSES' # 'transformers.models.llama.modeling_llama.LLAMA_ATTENTION_CLASSES'
+     seed: int | None = None
+     train:Any = None
+
+ config, errors = parse_cmdline_configs(sys.argv[1:], CLI_Config)
+ if errors != '':
+     print(errors)
+     exit()
+
+ match config.precision:
+     case 32:
+         dtype = torch.float32
+     case '32':
+         dtype = torch.float32
+     case 16:
+         dtype = torch.float16
+     case '16':
+         dtype = torch.float16
+     case 'bf16':
+         dtype = torch.bfloat16
+     case _:
+         print("Bad precision type specified")
+         exit()
+
+ # avoid 1000 huggingface warnings "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...""
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ print(f'Loading model - {config.model_path}')
+
+ model_config = AutoConfig.from_pretrained(config.model_path, trust_remote_code=True)
+
+ # if config.model_path.startswith('.'):
+ #     # replace attention classes
+ #     ReplacementSelfAttentionType = locate(config.attn_path)
+ #     assert isinstance(ReplacementSelfAttentionType, Callable)
+ #     attn_classes_dict = locate(config.attn_classes_path)
+ #     assert isinstance(attn_classes_dict, dict), 'could not find attention classes dict at path provided'
+ #     for key in list(attn_classes_dict.keys()):
+ #         attn_classes_dict[key] = ReplacementSelfAttentionType
+
+ model = AutoModelForCausalLM.from_pretrained(config.model_path, config=model_config, torch_dtype=dtype, device_map='cuda', trust_remote_code=True)
+
+ tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path, trust_remote_code=True)
+
+ #device = 'cuda'
+ #model = model.to(device=device, dtype=dtype)
+ model.eval()
+
+ if config.seed is None:
+     config.seed = 1234
+
+ from transformers import AutoTokenizer, Qwen2ForCausalLM, set_seed
+
+ set_seed(config.seed)
+
+ text = config.prompt
+
+ messages = [
+     {"role": "system", "content": "You are a helpful assistant."},
+     {"role": "user", "content": config.prompt}
+ ]
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True
+ )
+ inputs = tokenizer(text, return_tensors="pt").to('cuda')
+
+ # Generate
+ for i in range(config.attempts):
+     print(f"Attempt {i+1}:")
+     generate_ids = model.generate(inputs.input_ids, max_new_tokens=config.max_len, use_cache=True, do_sample=True, temperature=1.0, top_p=1.0)#, typical_p=0.95)#top_p=0.7, repetition_penalty=0.25)
+     print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False, use_cache=False)[0])
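
Note that generate.py relies on a local `configs.parse_cmdline_configs` helper that is not part of this upload, so its exact command-line syntax is not shown here. Below is a hedged, dependency-free sketch of the same flow (chat template, then sampled generation); the repository path is a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

repo = "./rwkv6qwen2-7b"  # placeholder: local clone or hub id of this repository

tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo, torch_dtype=torch.bfloat16, device_map="cuda", trust_remote_code=True
).eval()

set_seed(1234)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How many quarts are in a gallon?"},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to("cuda")

out = model.generate(inputs.input_ids, max_new_tokens=30, do_sample=True, temperature=1.0, top_p=1.0)
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```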
generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "pad_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "top_p": 0.8,
+   "top_k": 20,
+   "transformers_version": "4.37.0"
+ }
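
These are the sampling defaults that `model.generate()` falls back to when no explicit arguments are passed; generate.py above overrides temperature and top_p per call. A minimal sketch of reading them, assuming the file sits alongside the weights (placeholder path):

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("./rwkv6qwen2-7b")  # placeholder path
print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k, gen_cfg.repetition_penalty)
# Explicit kwargs such as generate(..., temperature=1.0, top_p=1.0) override these defaults.
```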
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:662ccdc789fe510fdf88cda5727f77061d133d9d31f0591d7a008b8e8add6b04
+ size 4955133312
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9cdf366e3cca4c985365bb2571668ce7d2d3649071bf270e1166a18208d55c4
+ size 4865370824
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:367ae7e5f35819bf5d72cfb4159fd7d560a7cf943380d53a0e64d8e50b38d61c
+ size 4865370920
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfe5f39f1ff90b14ab41d7b12f852b3e506c40b2230a40735bf181360de839bc
+ size 1497374264
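
The four .safetensors entries above are Git LFS pointer files; the actual shards are tied together by name in model.safetensors.index.json below. A hedged sketch of rebuilding the full state dict from that index, assuming the real shard files have been pulled into the working directory:

```python
import json
from safetensors.torch import load_file

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# weight_map maps each parameter name to the shard file that stores it.
state_dict = {}
for shard in sorted(set(index["weight_map"].values())):
    state_dict.update(load_file(shard))

print(len(state_dict), "tensors /", index["metadata"]["total_size"], "bytes of parameters")
```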
model.safetensors.index.json ADDED
@@ -0,0 +1,682 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16183172096
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.time_decay": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
22
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
24
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.0.self_attn.gate.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.1.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
34
+ "model.layers.1.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
35
+ "model.layers.1.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
36
+ "model.layers.1.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
37
+ "model.layers.1.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
38
+ "model.layers.1.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
39
+ "model.layers.1.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
40
+ "model.layers.1.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
41
+ "model.layers.1.self_attn.time_decay": "model-00001-of-00004.safetensors",
42
+ "model.layers.1.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
43
+ "model.layers.1.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
44
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
45
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
46
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
47
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
48
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
49
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
50
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
51
+ "model.layers.1.self_attn.gate.weight": "model-00001-of-00004.safetensors",
52
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
53
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
54
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
55
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
56
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
57
+ "model.layers.2.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
58
+ "model.layers.2.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
59
+ "model.layers.2.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
60
+ "model.layers.2.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
61
+ "model.layers.2.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
62
+ "model.layers.2.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
63
+ "model.layers.2.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
64
+ "model.layers.2.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
65
+ "model.layers.2.self_attn.time_decay": "model-00001-of-00004.safetensors",
66
+ "model.layers.2.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
67
+ "model.layers.2.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
68
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
69
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
70
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
71
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
72
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
73
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
74
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
75
+ "model.layers.2.self_attn.gate.weight": "model-00001-of-00004.safetensors",
76
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
77
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
78
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
79
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
80
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
81
+ "model.layers.3.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
82
+ "model.layers.3.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
83
+ "model.layers.3.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
84
+ "model.layers.3.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
85
+ "model.layers.3.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
86
+ "model.layers.3.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
87
+ "model.layers.3.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
88
+ "model.layers.3.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
89
+ "model.layers.3.self_attn.time_decay": "model-00001-of-00004.safetensors",
90
+ "model.layers.3.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
91
+ "model.layers.3.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
92
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
93
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
94
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
95
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
96
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
97
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
98
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
99
+ "model.layers.3.self_attn.gate.weight": "model-00001-of-00004.safetensors",
100
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
101
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
102
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
103
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
104
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
105
+ "model.layers.4.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
106
+ "model.layers.4.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
107
+ "model.layers.4.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
108
+ "model.layers.4.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
109
+ "model.layers.4.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
110
+ "model.layers.4.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
111
+ "model.layers.4.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
112
+ "model.layers.4.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
113
+ "model.layers.4.self_attn.time_decay": "model-00001-of-00004.safetensors",
114
+ "model.layers.4.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
115
+ "model.layers.4.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
116
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
117
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
118
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
119
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
120
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
122
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.4.self_attn.gate.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
125
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
126
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
127
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
128
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
129
+ "model.layers.5.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
130
+ "model.layers.5.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
131
+ "model.layers.5.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
132
+ "model.layers.5.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
133
+ "model.layers.5.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
134
+ "model.layers.5.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
135
+ "model.layers.5.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
136
+ "model.layers.5.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
137
+ "model.layers.5.self_attn.time_decay": "model-00001-of-00004.safetensors",
138
+ "model.layers.5.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
139
+ "model.layers.5.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
140
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
141
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
142
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
143
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
144
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
145
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
146
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
147
+ "model.layers.5.self_attn.gate.weight": "model-00001-of-00004.safetensors",
148
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
149
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
150
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
151
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
152
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.6.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
154
+ "model.layers.6.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
155
+ "model.layers.6.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
156
+ "model.layers.6.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
157
+ "model.layers.6.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
158
+ "model.layers.6.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
159
+ "model.layers.6.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
160
+ "model.layers.6.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
161
+ "model.layers.6.self_attn.time_decay": "model-00001-of-00004.safetensors",
162
+ "model.layers.6.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
163
+ "model.layers.6.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
164
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
166
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
167
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
168
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
169
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
170
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
171
+ "model.layers.6.self_attn.gate.weight": "model-00001-of-00004.safetensors",
172
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
173
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
174
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
175
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
176
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
177
+ "model.layers.7.self_attn.time_maa_x": "model-00001-of-00004.safetensors",
178
+ "model.layers.7.self_attn.time_maa_r": "model-00001-of-00004.safetensors",
179
+ "model.layers.7.self_attn.time_maa_k": "model-00001-of-00004.safetensors",
180
+ "model.layers.7.self_attn.time_maa_v": "model-00001-of-00004.safetensors",
181
+ "model.layers.7.self_attn.time_maa_w": "model-00001-of-00004.safetensors",
182
+ "model.layers.7.self_attn.time_maa_g": "model-00001-of-00004.safetensors",
183
+ "model.layers.7.self_attn.time_maa_w2": "model-00001-of-00004.safetensors",
184
+ "model.layers.7.self_attn.time_maa_w1": "model-00001-of-00004.safetensors",
185
+ "model.layers.7.self_attn.time_decay": "model-00001-of-00004.safetensors",
186
+ "model.layers.7.self_attn.time_decay_w1": "model-00001-of-00004.safetensors",
187
+ "model.layers.7.self_attn.time_decay_w2": "model-00001-of-00004.safetensors",
188
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
189
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
190
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
191
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
192
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
193
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
194
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
195
+ "model.layers.7.self_attn.gate.weight": "model-00001-of-00004.safetensors",
196
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
197
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
198
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
199
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
200
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
201
+ "model.layers.8.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
202
+ "model.layers.8.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
203
+ "model.layers.8.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
204
+ "model.layers.8.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
205
+ "model.layers.8.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
206
+ "model.layers.8.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
207
+ "model.layers.8.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
208
+ "model.layers.8.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
209
+ "model.layers.8.self_attn.time_decay": "model-00002-of-00004.safetensors",
210
+ "model.layers.8.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
211
+ "model.layers.8.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
212
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
213
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
214
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
215
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
216
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
217
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
218
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
219
+ "model.layers.8.self_attn.gate.weight": "model-00002-of-00004.safetensors",
220
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
221
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
222
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
223
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
224
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
225
+ "model.layers.9.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
226
+ "model.layers.9.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
227
+ "model.layers.9.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
228
+ "model.layers.9.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
229
+ "model.layers.9.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
230
+ "model.layers.9.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
231
+ "model.layers.9.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
232
+ "model.layers.9.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
233
+ "model.layers.9.self_attn.time_decay": "model-00002-of-00004.safetensors",
234
+ "model.layers.9.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
235
+ "model.layers.9.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
236
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
237
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
238
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
239
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
240
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
241
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
242
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.layers.9.self_attn.gate.weight": "model-00002-of-00004.safetensors",
244
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
245
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
246
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
247
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
248
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
249
+ "model.layers.10.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
250
+ "model.layers.10.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
251
+ "model.layers.10.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
252
+ "model.layers.10.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
253
+ "model.layers.10.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
254
+ "model.layers.10.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
255
+ "model.layers.10.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
256
+ "model.layers.10.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
257
+ "model.layers.10.self_attn.time_decay": "model-00002-of-00004.safetensors",
258
+ "model.layers.10.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
259
+ "model.layers.10.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
260
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
261
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
262
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
263
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
264
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
265
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
266
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
267
+ "model.layers.10.self_attn.gate.weight": "model-00002-of-00004.safetensors",
268
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
269
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
270
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
271
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
272
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
273
+ "model.layers.11.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
274
+ "model.layers.11.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
275
+ "model.layers.11.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
276
+ "model.layers.11.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
277
+ "model.layers.11.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
278
+ "model.layers.11.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
279
+ "model.layers.11.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
280
+ "model.layers.11.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
281
+ "model.layers.11.self_attn.time_decay": "model-00002-of-00004.safetensors",
282
+ "model.layers.11.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
283
+ "model.layers.11.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
284
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
285
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
286
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
287
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
288
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
290
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
291
+ "model.layers.11.self_attn.gate.weight": "model-00002-of-00004.safetensors",
292
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
293
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
294
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
295
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
296
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
297
+ "model.layers.12.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
298
+ "model.layers.12.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
299
+ "model.layers.12.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
300
+ "model.layers.12.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
301
+ "model.layers.12.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
302
+ "model.layers.12.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
303
+ "model.layers.12.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
304
+ "model.layers.12.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
305
+ "model.layers.12.self_attn.time_decay": "model-00002-of-00004.safetensors",
306
+ "model.layers.12.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
307
+ "model.layers.12.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
308
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
310
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
311
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
312
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
314
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
315
+ "model.layers.12.self_attn.gate.weight": "model-00002-of-00004.safetensors",
316
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
317
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
318
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
319
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
320
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.13.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
322
+ "model.layers.13.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
323
+ "model.layers.13.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
324
+ "model.layers.13.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
325
+ "model.layers.13.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
326
+ "model.layers.13.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
327
+ "model.layers.13.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
328
+ "model.layers.13.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
329
+ "model.layers.13.self_attn.time_decay": "model-00002-of-00004.safetensors",
330
+ "model.layers.13.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
331
+ "model.layers.13.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
332
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
334
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
336
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.13.self_attn.gate.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
344
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
345
+ "model.layers.14.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
346
+ "model.layers.14.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
347
+ "model.layers.14.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
348
+ "model.layers.14.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
349
+ "model.layers.14.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
350
+ "model.layers.14.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
351
+ "model.layers.14.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
352
+ "model.layers.14.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
353
+ "model.layers.14.self_attn.time_decay": "model-00002-of-00004.safetensors",
354
+ "model.layers.14.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
355
+ "model.layers.14.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
356
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
357
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
358
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
359
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
360
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
361
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
362
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
363
+ "model.layers.14.self_attn.gate.weight": "model-00002-of-00004.safetensors",
364
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
365
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
366
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
367
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
368
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
369
+ "model.layers.15.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
370
+ "model.layers.15.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
371
+ "model.layers.15.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
372
+ "model.layers.15.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
373
+ "model.layers.15.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
374
+ "model.layers.15.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
375
+ "model.layers.15.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
376
+ "model.layers.15.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
377
+ "model.layers.15.self_attn.time_decay": "model-00002-of-00004.safetensors",
378
+ "model.layers.15.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
379
+ "model.layers.15.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
380
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
381
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
382
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
383
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
384
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
385
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
386
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
387
+ "model.layers.15.self_attn.gate.weight": "model-00002-of-00004.safetensors",
388
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
389
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
390
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
391
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
392
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
393
+ "model.layers.16.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
394
+ "model.layers.16.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
395
+ "model.layers.16.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
396
+ "model.layers.16.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
397
+ "model.layers.16.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
398
+ "model.layers.16.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
399
+ "model.layers.16.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
400
+ "model.layers.16.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
401
+ "model.layers.16.self_attn.time_decay": "model-00002-of-00004.safetensors",
402
+ "model.layers.16.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
403
+ "model.layers.16.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
404
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
405
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
406
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
407
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
408
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
409
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
410
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
411
+ "model.layers.16.self_attn.gate.weight": "model-00002-of-00004.safetensors",
412
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
413
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
414
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
415
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
416
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
417
+ "model.layers.17.self_attn.time_maa_x": "model-00002-of-00004.safetensors",
418
+ "model.layers.17.self_attn.time_maa_r": "model-00002-of-00004.safetensors",
419
+ "model.layers.17.self_attn.time_maa_k": "model-00002-of-00004.safetensors",
420
+ "model.layers.17.self_attn.time_maa_v": "model-00002-of-00004.safetensors",
421
+ "model.layers.17.self_attn.time_maa_w": "model-00002-of-00004.safetensors",
422
+ "model.layers.17.self_attn.time_maa_g": "model-00002-of-00004.safetensors",
423
+ "model.layers.17.self_attn.time_maa_w2": "model-00002-of-00004.safetensors",
424
+ "model.layers.17.self_attn.time_maa_w1": "model-00002-of-00004.safetensors",
425
+ "model.layers.17.self_attn.time_decay": "model-00002-of-00004.safetensors",
426
+ "model.layers.17.self_attn.time_decay_w1": "model-00002-of-00004.safetensors",
427
+ "model.layers.17.self_attn.time_decay_w2": "model-00002-of-00004.safetensors",
428
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
429
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
430
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
431
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
432
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
433
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
434
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
435
+ "model.layers.17.self_attn.gate.weight": "model-00002-of-00004.safetensors",
436
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
437
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
438
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
439
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
440
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
441
+ "model.layers.18.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
442
+ "model.layers.18.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
443
+ "model.layers.18.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
444
+ "model.layers.18.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
445
+ "model.layers.18.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
446
+ "model.layers.18.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
447
+ "model.layers.18.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
448
+ "model.layers.18.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
449
+ "model.layers.18.self_attn.time_decay": "model-00003-of-00004.safetensors",
450
+ "model.layers.18.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
451
+ "model.layers.18.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
452
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
453
+ "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
454
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
455
+ "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
456
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
457
+ "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
458
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
459
+ "model.layers.18.self_attn.gate.weight": "model-00003-of-00004.safetensors",
460
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
461
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
462
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
463
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
464
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
465
+ "model.layers.19.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
466
+ "model.layers.19.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
467
+ "model.layers.19.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
468
+ "model.layers.19.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
469
+ "model.layers.19.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
470
+ "model.layers.19.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
471
+ "model.layers.19.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
472
+ "model.layers.19.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
473
+ "model.layers.19.self_attn.time_decay": "model-00003-of-00004.safetensors",
474
+ "model.layers.19.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
475
+ "model.layers.19.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
476
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
477
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
478
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
479
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
480
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
481
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
482
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
483
+ "model.layers.19.self_attn.gate.weight": "model-00003-of-00004.safetensors",
484
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
485
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
486
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
487
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
488
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
489
+ "model.layers.20.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
490
+ "model.layers.20.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
491
+ "model.layers.20.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
492
+ "model.layers.20.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
493
+ "model.layers.20.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
494
+ "model.layers.20.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
495
+ "model.layers.20.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
496
+ "model.layers.20.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
497
+ "model.layers.20.self_attn.time_decay": "model-00003-of-00004.safetensors",
498
+ "model.layers.20.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
499
+ "model.layers.20.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
500
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
501
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
502
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
503
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
504
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
505
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
506
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
507
+ "model.layers.20.self_attn.gate.weight": "model-00003-of-00004.safetensors",
508
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
509
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
510
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
511
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
512
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
513
+ "model.layers.21.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
514
+ "model.layers.21.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
515
+ "model.layers.21.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
516
+ "model.layers.21.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
517
+ "model.layers.21.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
518
+ "model.layers.21.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
519
+ "model.layers.21.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
520
+ "model.layers.21.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
521
+ "model.layers.21.self_attn.time_decay": "model-00003-of-00004.safetensors",
522
+ "model.layers.21.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
523
+ "model.layers.21.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
524
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
525
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
526
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
527
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
528
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
529
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
530
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
531
+ "model.layers.21.self_attn.gate.weight": "model-00003-of-00004.safetensors",
532
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
533
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
534
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
535
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
536
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
537
+ "model.layers.22.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
538
+ "model.layers.22.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
539
+ "model.layers.22.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
540
+ "model.layers.22.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
541
+ "model.layers.22.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
542
+ "model.layers.22.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
543
+ "model.layers.22.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
544
+ "model.layers.22.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
545
+ "model.layers.22.self_attn.time_decay": "model-00003-of-00004.safetensors",
546
+ "model.layers.22.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
547
+ "model.layers.22.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
548
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
549
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
550
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
551
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
552
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
553
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
554
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
555
+ "model.layers.22.self_attn.gate.weight": "model-00003-of-00004.safetensors",
556
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
557
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
558
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
559
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
560
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
561
+ "model.layers.23.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
562
+ "model.layers.23.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
563
+ "model.layers.23.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
564
+ "model.layers.23.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
565
+ "model.layers.23.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
566
+ "model.layers.23.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
567
+ "model.layers.23.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
568
+ "model.layers.23.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
569
+ "model.layers.23.self_attn.time_decay": "model-00003-of-00004.safetensors",
570
+ "model.layers.23.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
571
+ "model.layers.23.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
572
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
573
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
574
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
575
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
576
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
577
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
578
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
579
+ "model.layers.23.self_attn.gate.weight": "model-00003-of-00004.safetensors",
580
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
581
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
582
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
583
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
584
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
585
+ "model.layers.24.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
586
+ "model.layers.24.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
587
+ "model.layers.24.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
588
+ "model.layers.24.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
589
+ "model.layers.24.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
590
+ "model.layers.24.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
591
+ "model.layers.24.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
592
+ "model.layers.24.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
593
+ "model.layers.24.self_attn.time_decay": "model-00003-of-00004.safetensors",
594
+ "model.layers.24.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
595
+ "model.layers.24.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
596
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
597
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
598
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
599
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
600
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
601
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
602
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
603
+ "model.layers.24.self_attn.gate.weight": "model-00003-of-00004.safetensors",
604
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
605
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
606
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
607
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
608
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
609
+ "model.layers.25.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
610
+ "model.layers.25.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
611
+ "model.layers.25.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
612
+ "model.layers.25.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
613
+ "model.layers.25.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
614
+ "model.layers.25.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
615
+ "model.layers.25.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
616
+ "model.layers.25.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
617
+ "model.layers.25.self_attn.time_decay": "model-00003-of-00004.safetensors",
618
+ "model.layers.25.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
619
+ "model.layers.25.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
620
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
621
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
622
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
623
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
624
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
625
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
626
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
627
+ "model.layers.25.self_attn.gate.weight": "model-00003-of-00004.safetensors",
628
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
629
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
630
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
631
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
632
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
633
+ "model.layers.26.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
634
+ "model.layers.26.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
635
+ "model.layers.26.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
636
+ "model.layers.26.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
637
+ "model.layers.26.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
638
+ "model.layers.26.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
639
+ "model.layers.26.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
640
+ "model.layers.26.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
641
+ "model.layers.26.self_attn.time_decay": "model-00003-of-00004.safetensors",
642
+ "model.layers.26.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
643
+ "model.layers.26.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
644
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
645
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
646
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
647
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
648
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
649
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
650
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
651
+ "model.layers.26.self_attn.gate.weight": "model-00003-of-00004.safetensors",
652
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
653
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
654
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
655
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
656
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
657
+ "model.layers.27.self_attn.time_maa_x": "model-00003-of-00004.safetensors",
658
+ "model.layers.27.self_attn.time_maa_r": "model-00003-of-00004.safetensors",
659
+ "model.layers.27.self_attn.time_maa_k": "model-00003-of-00004.safetensors",
660
+ "model.layers.27.self_attn.time_maa_v": "model-00003-of-00004.safetensors",
661
+ "model.layers.27.self_attn.time_maa_w": "model-00003-of-00004.safetensors",
662
+ "model.layers.27.self_attn.time_maa_g": "model-00003-of-00004.safetensors",
663
+ "model.layers.27.self_attn.time_maa_w2": "model-00003-of-00004.safetensors",
664
+ "model.layers.27.self_attn.time_maa_w1": "model-00003-of-00004.safetensors",
665
+ "model.layers.27.self_attn.time_decay": "model-00003-of-00004.safetensors",
666
+ "model.layers.27.self_attn.time_decay_w1": "model-00003-of-00004.safetensors",
667
+ "model.layers.27.self_attn.time_decay_w2": "model-00003-of-00004.safetensors",
668
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
669
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
670
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
671
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
672
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
673
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
674
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
675
+ "model.layers.27.self_attn.gate.weight": "model-00003-of-00004.safetensors",
676
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
677
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
678
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
679
+ "model.norm.weight": "model-00004-of-00004.safetensors",
680
+ "lm_head.weight": "model-00004-of-00004.safetensors"
681
+ }
682
+ }
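The weight map above routes each layer's RWKV-6 parameters (time_maa_*, time_decay*, gate) together with the usual Qwen2 projections across four safetensors shards. Because the model classes are shipped as custom code in this repo (modeling_rwkv6qwen2.py, added below), loading the checkpoint goes through trust_remote_code. A minimal loading sketch, assuming the files live under the RWKV/RWKV6Qwen2-7B repo id named in _CHECKPOINT_FOR_DOC below; adjust the repo id to wherever these files actually sit:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo = "RWKV/RWKV6Qwen2-7B"  # assumed location of these files
    tok = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        trust_remote_code=True,   # resolves RWKV6Qwen2ForCausalLM through the repo's custom code
        torch_dtype="auto",       # keep the dtype stored with the checkpoint
    )
    out = model.generate(**tok("Hello", return_tensors="pt"), max_new_tokens=16)
    print(tok.decode(out[0], skip_special_tokens=True))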
modeling_rwkv6qwen2.py ADDED
@@ -0,0 +1,1336 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch RWKV6Qwen2 model."""
21
+
22
+ import math
23
+ import inspect
24
+ from typing import List, Optional, Tuple, Union, Dict, Any
25
+
26
+ import torch
27
+ import torch.utils.checkpoint
28
+ from torch import nn
29
+ import torch.nn.functional as F
30
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
31
+
32
+ from transformers.cache_utils import Cache, StaticCache, DynamicCache
33
+ from transformers.generation import GenerationMixin
34
+ from transformers.modeling_outputs import (
35
+ BaseModelOutputWithPast,
36
+ CausalLMOutputWithPast,
37
+ QuestionAnsweringModelOutput,
38
+ SequenceClassifierOutputWithPast,
39
+ TokenClassifierOutput,
40
+ )
41
+ from transformers.modeling_utils import PreTrainedModel
42
+ from transformers.utils import (
43
+ add_code_sample_docstrings,
44
+ add_start_docstrings,
45
+ add_start_docstrings_to_model_forward,
46
+ is_flash_attn_2_available,
47
+ is_flash_attn_greater_or_equal_2_10,
48
+ logging,
49
+ replace_return_docstrings,
50
+ )
51
+ from .configuration_rwkv6qwen2 import RWKV6Qwen2Config
52
+
53
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer, Qwen2MLP, Qwen2RMSNorm, repeat_kv
54
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
55
+
56
+ logger = logging.get_logger(__name__)
57
+
58
+
59
+ _CHECKPOINT_FOR_DOC = "RWKV/RWKV6Qwen2-7B"
60
+ _CONFIG_FOR_DOC = "RWKV6Qwen2Config"
61
+
62
+ class RWKV6State(Cache):
63
+ def __init__(self) -> None:
64
+ super().__init__()
65
+ self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
66
+ self.layer_kv_states: List[torch.Tensor] = []
67
+ self.layer_shift_states: List[torch.Tensor] = []
68
+
69
+ def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
70
+ """
71
+ Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
72
+ sequence length.
73
+ """
74
+ if layer_idx < len(self):
75
+ return (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])
76
+ else:
77
+ raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
78
+
79
+ def __iter__(self):
80
+ """
81
+ Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
82
+ keys and values
83
+ """
84
+ for layer_idx in range(len(self)):
85
+ yield (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])
86
+
87
+ def __len__(self):
88
+ """
89
+ Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
90
+ to the number of layers in the model.
91
+ """
92
+ return len(self.layer_kv_states)
93
+
94
+ def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
95
+ """Given the sequence length of the new inputs, returns the usable length of the cache."""
96
+ # Linear Attention variants do not have a maximum length
97
+ return new_seq_length
98
+
99
+ def reorder_cache(self, beam_idx: torch.LongTensor):
100
+ """Reorders the cache for beam search, given the selected beam indices."""
101
+ raise NotImplementedError('Cannot reorder Linear Attention state')
102
+
103
+ def get_seq_length(self, layer_idx: int = 0) -> int:
104
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
105
+ return self._seen_tokens
106
+
107
+ def get_max_cache_shape(self) -> Optional[int]:
108
+ """Returns the maximum sequence length of the cache object. RWKV6State does not have a maximum length."""
109
+ return None
110
+
111
+ def get_max_length(self) -> Optional[int]:
112
+ """
113
+ Returns the maximum sequence length of the cached states. RWKV6State does not have a maximum length.
114
+ """
115
+ return None
116
+
117
+ # def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
118
+ # """Converts the `DynamicCache` instance into its equivalent in the legacy cache format. Used for
119
+ # backward compatibility."""
120
+ # legacy_cache = ()
121
+ # for layer_idx in range(len(self)):
122
+ # legacy_cache += ((self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx]),)
123
+ # return legacy_cache
124
+
125
+ # @classmethod
126
+ # #@deprecate_kwarg("num_hidden_layers", version="4.47.0")
127
+ # def from_legacy_cache(
128
+ # cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor, torch.FloatTensor]]] = None, num_hidden_layers: int | None = None
129
+ # ) -> "RWKV6State":
130
+ # """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
131
+ # backward compatibility."""
132
+ # cache = cls()
133
+ # if past_key_values is not None:
134
+ # for layer_idx in range(len(past_key_values)):
135
+ # layer_kv_state, layer_shift_state = past_key_values[layer_idx]
136
+ # cache.update(layer_kv_state, layer_shift_state, layer_idx)
137
+ # return cache
138
+
139
+ def crop(self, max_length: int):
140
+ # can't implement this for linear attention variants
141
+ return
142
+
143
+ @torch.no_grad
144
+ def update(
145
+ self,
146
+ kv_state: torch.Tensor,
147
+ shift_state: torch.Tensor,
148
+ token_count: int,
149
+ layer_idx: int,
150
+ cache_kwargs: Optional[Dict[str, Any]] = None,
151
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
152
+ # Update the number of seen tokens
153
+ if layer_idx == 0:
154
+ self._seen_tokens += token_count
155
+
156
+ # Update the cache
157
+ # There may be skipped layers, fill them with empty lists
158
+ for _ in range(len(self.layer_kv_states), layer_idx + 1):
159
+ self.layer_kv_states.append(torch.zeros_like(kv_state).requires_grad_(False))
160
+ self.layer_shift_states.append(torch.zeros_like(shift_state).requires_grad_(False))
161
+ self.layer_kv_states[layer_idx].copy_(kv_state)
162
+ self.layer_shift_states[layer_idx].copy_(shift_state)
163
+
164
+ return self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx]
165
+
166
+ # @deprecate_kwarg("num_hidden_layers", version="4.47.0")
167
+ # def batch_split(
168
+ # self, full_batch_size: int, split_size: int, num_hidden_layers: int = None
169
+ # ) -> List["DynamicCache"]:
170
+ # """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
171
+ # `_split_model_inputs()` in `generation.utils`"""
172
+ # out = []
173
+ # for i in range(0, full_batch_size, split_size):
174
+ # current_split = DynamicCache()
175
+ # current_split._seen_tokens = self._seen_tokens
176
+ # current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache]
177
+ # current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache]
178
+ # out.append(current_split)
179
+ # return out
180
+
181
+ # @classmethod
182
+ # @deprecate_kwarg("num_hidden_layers", version="4.47.0")
183
+ # def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int = None) -> "DynamicCache":
184
+ # """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
185
+ # `generation.utils`"""
186
+ # cache = cls()
187
+ # for idx in range(len(splits[0])):
188
+ # key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
189
+ # value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
190
+ # if key_cache != []:
191
+ # layer_keys = torch.cat(key_cache, dim=0)
192
+ # layer_values = torch.cat(value_cache, dim=0)
193
+ # cache.update(layer_keys, layer_values, idx)
194
+ # return cache
195
+
196
+ # def batch_repeat_interleave(self, repeats: int):
197
+ # """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search."""
198
+ # for layer_idx in range(len(self)):
199
+ # self.key_cache[layer_idx] = self.key_cache[layer_idx].repeat_interleave(repeats, dim=0)
200
+ # self.value_cache[layer_idx] = self.value_cache[layer_idx].repeat_interleave(repeats, dim=0)
201
+
202
+ # def batch_select_indices(self, indices: torch.Tensor):
203
+ # """Only keep the `indices` in the batch dimension of the cache. Used in contrastive search."""
204
+ # for layer_idx in range(len(self)):
205
+ # self.key_cache[layer_idx] = self.key_cache[layer_idx][indices, ...]
206
+ # self.value_cache[layer_idx] = self.value_cache[layer_idx][indices, ...]
207
+
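Note that RWKV6State is a fixed-size recurrent state rather than a growing key/value cache: per layer it keeps one kv_state matrix and a one-token shift_state, so memory stays constant no matter how many tokens have been processed (which is also why crop() is a no-op and reorder_cache() is unsupported). A hedged usage sketch, assuming the usual transformers calling convention; model, prompt_ids and next_ids are placeholders:

    state = RWKV6State()                                   # empty recurrent state
    out = model(input_ids=prompt_ids, past_key_values=state, use_cache=True)
    # subsequent chunks reuse the same state object; only the new tokens are fed in
    out = model(input_ids=next_ids, past_key_values=state, use_cache=True)
    print(state.get_seq_length())                          # tokens absorbed so far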
208
+ try:
209
+ #from fla.ops.gla.chunk import chunk_gla
210
+ from fla.ops.gla.fused_recurrent import fused_recurrent_gla
211
+ except ImportError:
212
+ print("Required module is not installed. Please install it using the following commands:")
213
+ print("pip install -U git+https://github.com/fla-org/flash-linear-attention")
214
+ print("Additionally, ensure you have at least version 2.2.0 of Triton installed:")
215
+ print("pip install triton>=2.2.0")
216
+
217
+ class Qwen2RotaryEmbedding(nn.Module):
218
+ def __init__(self, config: RWKV6Qwen2Config, device=None):
219
+ super().__init__()
220
+ # BC: "rope_type" was originally "type"
221
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
222
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
223
+ else:
224
+ self.rope_type = "default"
225
+ self.max_seq_len_cached = config.max_position_embeddings
226
+ self.original_max_seq_len = config.max_position_embeddings
227
+
228
+ self.config = config
229
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
230
+
231
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
232
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
233
+ self.original_inv_freq = self.inv_freq
234
+
235
+ def _dynamic_frequency_update(self, position_ids, device):
236
+ """
237
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
238
+ 1 - growing beyond the cached sequence length (allow scaling)
239
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
240
+ """
241
+ seq_len = torch.max(position_ids) + 1
242
+ if seq_len > self.max_seq_len_cached: # growth
243
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
244
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
245
+ self.max_seq_len_cached = seq_len
246
+
247
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
248
+ # This .to() is needed if the model has been moved to a device after being initialized (because
249
+ # the buffer is automatically moved, but not the original copy)
250
+ self.original_inv_freq = self.original_inv_freq.to(device)
251
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
252
+ self.max_seq_len_cached = self.original_max_seq_len
253
+
254
+ @torch.no_grad()
255
+ def forward(self, x, position_ids):
256
+ if "dynamic" in self.rope_type:
257
+ self._dynamic_frequency_update(position_ids, device=x.device)
258
+
259
+ # Core RoPE block
260
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
261
+ position_ids_expanded = position_ids[:, None, :].float()
262
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
263
+ device_type = x.device.type
264
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
265
+ with torch.autocast(device_type=device_type, enabled=False):
266
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
267
+ emb = torch.cat((freqs, freqs), dim=-1)
268
+ cos = emb.cos()
269
+ sin = emb.sin()
270
+
271
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
272
+ cos = cos * self.attention_scaling
273
+ sin = sin * self.attention_scaling
274
+
275
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
276
+
277
+ def generate_rotary_embedding(max_seqlen:int, dim:int, theta:float = 10000.0, scale:float = 1):
278
+ #inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float).to(device) / dim))
279
+
280
+ angular_velocity = theta ** -(torch.arange(0, dim, 2, dtype=torch.float) / dim) / scale # frequencies from 1.0 ... 1/theta
281
+ angles = torch.outer(torch.arange(max_seqlen), angular_velocity)
282
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
283
+ emb = torch.cat((angles, angles), dim=-1)
284
+ return torch.stack([emb.cos(), emb.sin()], dim=0)
285
+ #return torch.polar(torch.ones_like(angles), angles)
286
+
287
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
288
+ def rotate_half(x):
289
+ """Rotates half the hidden dims of the input."""
290
+ x1 = x[..., : x.shape[-1] // 2]
291
+ x2 = x[..., x.shape[-1] // 2 :]
292
+ return torch.cat((-x2, x1), dim=-1)
293
+
294
+ # # Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
295
+ # def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim:int=1):
296
+ # B, L = q.size(0), q.size(-2)
297
+ # cos = cos[:L].unsqueeze(0).expand(B,L,-1).unsqueeze(unsqueeze_dim)
298
+ # sin = sin[:L].unsqueeze(0).expand(B,L,-1).unsqueeze(unsqueeze_dim)
299
+ # q_embed = (q * cos) + (rotate_half(q) * sin)
300
+ # k_embed = (k * cos) + (rotate_half(k) * sin)
301
+ # return q_embed, k_embed
302
+
303
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
304
+ """Applies Rotary Position Embedding to the query and key tensors.
305
+
306
+ Args:
307
+ q (`torch.Tensor`): The query tensor.
308
+ k (`torch.Tensor`): The key tensor.
309
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
310
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
311
+ position_ids (`torch.Tensor`, *optional*):
312
+ Deprecated and unused.
313
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
314
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
315
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
316
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
317
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
318
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
319
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
320
+ Returns:
321
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
322
+ """
323
+ cos = cos.unsqueeze(unsqueeze_dim)
324
+ sin = sin.unsqueeze(unsqueeze_dim)
325
+ q_embed = (q * cos) + (rotate_half(q) * sin)
326
+ k_embed = (k * cos) + (rotate_half(k) * sin)
327
+ return q_embed, k_embed
328
+
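For reference, with the [batch, heads, seq_len, head_dim] layout used later in RWKV6Attention, the helper is applied with unsqueeze_dim=1 so that cos/sin of shape [batch, seq_len, head_dim] broadcast over the head axis (an illustrative sketch; q, k, x, position_ids and rotary_emb are placeholders):

    # q, k: [batch, n_heads, seq_len, head_dim]; cos, sin: [batch, seq_len, head_dim]
    cos, sin = rotary_emb(x, position_ids)     # e.g. the Qwen2RotaryEmbedding above
    q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1)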
329
+ def ortho_init(x, scale):
330
+ with torch.no_grad():
331
+ shape = x.shape
332
+ if len(shape) == 2:
333
+ gain = math.sqrt(shape[0] / shape[1]) if shape[0] > shape[1] else 1
334
+ #nn.init.orthogonal_(x, gain=gain * scale)
335
+ x.copy_(nn.init.orthogonal_(torch.empty_like(x, dtype=torch.float32), gain=gain * scale))
336
+ elif len(shape) == 3:
337
+ gain = math.sqrt(shape[1] / shape[2]) if shape[1] > shape[2] else 1
338
+ for i in range(shape[0]):
339
+ #nn.init.orthogonal_(x[i], gain=gain * scale)
340
+ x[i].copy_(nn.init.orthogonal_(torch.empty_like(x[i], dtype=torch.float32), gain=gain * scale))
341
+ else:
342
+ assert False
343
+ return x
344
+
345
+ class RWKV6Attention(nn.Module):
346
+ def __init__(self, config, layer_idx: Optional[int] = None):
347
+ super().__init__()
348
+ self.config = config
349
+ self.layer_idx = layer_idx
350
+
351
+ if layer_idx is None:
352
+ logger.warning_once(
353
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
354
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
355
+ "when creating this class."
356
+ )
357
+
358
+ self.hidden_size = config.hidden_size
359
+ self.num_heads = config.num_attention_heads
360
+ self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads)
361
+ self.num_key_value_heads = config.num_key_value_heads
362
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
363
+ self.attention_dropout = config.attention_dropout
364
+
365
+ n_layer = self.config.num_hidden_layers
366
+ n_embd = self.hidden_size
367
+ dim_att = self.num_heads * self.head_dim
368
+ layer_id = self.layer_idx
369
+
370
+ if self.hidden_size % self.num_heads != 0:
371
+ raise ValueError(
372
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
373
+ f" and `num_heads`: {self.num_heads})."
374
+ )
375
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
376
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
377
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
378
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=getattr(config, 'attention_output_bias', config.attention_bias))
379
+
380
+ calc_lora_rank = lambda exponent, multiplier: max(1, round(self.hidden_size ** exponent * multiplier / 32)) * 32
381
+
382
+ if config.gate_rank_type == 1:
383
+ self.gate = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
384
+ elif config.gate_rank_type == 2:
385
+ lora_rank_gate = config.lora_rank_gate or calc_lora_rank(0.8, 0.6)
386
+ self.g1 = nn.Parameter(torch.empty(n_embd, lora_rank_gate))
387
+ self.g2 = nn.Parameter(torch.empty(lora_rank_gate, n_embd))
388
+
389
+ if config.groupnorm_att:
390
+ self.ln_x = nn.GroupNorm(self.num_heads, dim_att, eps=self.head_dim * 1e-5)
391
+
392
+ with torch.no_grad():
393
+ if config.gate_rank_type == 1:
394
+ self.gate.weight.zero_()
395
+ elif config.gate_rank_type == 2:
396
+ self.g1.zero_()
397
+ ortho_init(self.g2, 0.1)
398
+
399
+ ratio_0_to_1 = layer_id / (n_layer - 1) # 0 to 1
400
+ ratio_1_to_almost0 = 1.0 - (layer_id / n_layer) # 1 to ~0
401
+
402
+ if self.config.use_tokenshift:
403
+ ddd = torch.ones(1, 1, n_embd)
404
+ for i in range(n_embd):
405
+ ddd[0, 0, i] = i / n_embd
406
+
407
+ ddd = torch.zeros(1, 1, n_embd)
408
+ self.time_maa_x = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0))
409
+ self.time_maa_r = nn.Parameter(torch.zeros_like(ddd))
410
+ self.time_maa_k = nn.Parameter(torch.zeros_like(ddd))
411
+ self.time_maa_v = nn.Parameter(torch.zeros_like(ddd))
412
+ self.time_maa_w = nn.Parameter(torch.zeros_like(ddd))
413
+ self.time_maa_g = nn.Parameter(torch.zeros_like(ddd))
414
+
415
+ lora_rank_tokenshift = config.lora_rank_tokenshift or (32 if n_embd < 4096 else 64)
416
+
417
+ self.time_maa_w2 = nn.Parameter(torch.zeros(5, lora_rank_tokenshift, n_embd).uniform_(-0.01, 0.01))
418
+ self.time_maa_w1 = nn.Parameter(torch.zeros(n_embd, lora_rank_tokenshift*self.time_maa_w2.size(0)))
419
+
420
+ lora_rank_decay = config.lora_rank_decay or (64 if n_embd < 4096 else 128)
421
+
422
+ # RWKV-6
423
+ decay_speed = torch.ones(dim_att)
424
+ for n in range(dim_att):
425
+ decay_speed[n] = -6 + 5 * (n / (dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
426
+ self.time_decay = nn.Parameter(decay_speed.reshape(1,1,dim_att))
427
+ self.time_decay_w1 = nn.Parameter(torch.zeros(n_embd, lora_rank_decay))
428
+ self.time_decay_w2 = nn.Parameter(torch.zeros(lora_rank_decay, dim_att).uniform_(-0.01, 0.01))
429
+
430
+ def forward(
431
+ self,
432
+ hidden_states: torch.Tensor,
433
+ attention_mask: Optional[torch.Tensor] = None,
434
+ position_ids: Optional[torch.LongTensor] = None,
435
+ past_key_values: Optional[RWKV6State] = None,
436
+ output_attentions: bool = False,
437
+ use_cache: bool = False,
438
+ cache_position: Optional[torch.LongTensor] = None,
439
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
440
+ ):
441
+ output_shift_state = hidden_states[:, -1:].detach().clone()
442
+
443
+ bsz, q_len, hidden_dim = hidden_states.size()
444
+ H = self.num_heads
445
+
446
+ x = hidden_states
447
+
448
+ if use_cache and past_key_values is not None and len(past_key_values) > self.layer_idx:
449
+ input_kv_state, input_shift_state = past_key_values[self.layer_idx]
450
+ xprev = torch.cat([input_shift_state, x[:, :-1]], dim=1)
451
+ else:
452
+ input_kv_state = None
453
+ xprev = F.pad(x, (0, 0, 1, -1))
454
+
455
+ if self.config.use_tokenshift:
456
+ dxprev = xprev - x
457
+
458
+ xxx = x + dxprev * self.time_maa_x
459
+ xxx = torch.tanh(xxx @ self.time_maa_w1).view(bsz*q_len, self.time_maa_w2.size(0), -1).transpose(0, 1)
460
+ xxx = torch.bmm(xxx, self.time_maa_w2).view(self.time_maa_w2.size(0), bsz, q_len, hidden_dim)
461
+
462
+ mr, mk, mv, mw, mg = xxx.unbind(dim=0)
463
+ xr = x + dxprev * (self.time_maa_r + mr)
464
+ xk = x + dxprev * (self.time_maa_k + mk)
465
+ xv = x + dxprev * (self.time_maa_v + mv)
466
+ xw = x + dxprev * (self.time_maa_w + mw)
467
+ xg = x + dxprev * (self.time_maa_g + mg)
468
+ else:
469
+ xr = xk = xv = xw = xg = x
470
+
471
+ query_states = self.q_proj(xr)
472
+ key_states = self.k_proj(xk)
473
+ value_states = self.v_proj(xv)
474
+ decay_states = (self.time_decay + torch.tanh(xw @ self.time_decay_w1) @ self.time_decay_w2).to(query_states.dtype)
475
+ if self.config.gate_rank_type == 1:
476
+ gate_states = torch.sigmoid(self.gate(xg))
477
+ elif self.config.gate_rank_type == 2:
478
+ gate_states = torch.sigmoid(xg @ self.g1) @ self.g2
479
+
480
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
481
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
482
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
483
+ decay_states = decay_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
484
+
485
+ if position_embeddings is not None:
486
+ cos, sin = position_embeddings
487
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=1)
488
+
489
+ # repeat k/v heads if n_kv_heads < n_heads
490
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
491
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
492
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
493
+
494
+ decay_states_log = -decay_states.float().exp()
495
+ decay_states_log = decay_states_log.clamp(-5) # FIXME - is this necessary?
496
+ if self.config.balance_state:
497
+ key_states = (key_states * (1 - decay_states_log.exp())).to(key_states.dtype)
498
+
499
+ # dealing with left-padding
500
+ if attention_mask is not None:
501
+ value_states = value_states * attention_mask[:, None, -value_states.shape[-2]:, None]
502
+
503
+ query_states = query_states.to(value_states.dtype)
504
+ key_states = key_states.to(value_states.dtype)
505
+
506
+ # In PEFT, the layer norms are usually cast to float32 for training stability,
508
+ # so the input hidden states may get silently cast to float32. Hence, we need to
509
+ # cast them back to the expected lower-precision dtype so everything works as intended.
509
+ input_dtype = query_states.dtype
510
+ if input_dtype == torch.float32:
511
+ if torch.is_autocast_enabled():
512
+ target_dtype = torch.get_autocast_gpu_dtype()
513
+ # Handle the case where the model is quantized
514
+ elif hasattr(self.config, "_pre_quantization_dtype"):
515
+ target_dtype = self.config._pre_quantization_dtype
516
+ else:
517
+ target_dtype = self.q_proj.weight.dtype
518
+
519
+ logger.warning_once(
520
+ f"The input hidden states seem to have been silently cast to float32; this might be because you have"
521
+ f" upcast embedding or layer norm layers to float32. We will cast the input back to"
522
+ f" {target_dtype}."
523
+ )
524
+
525
+ query_states = query_states.to(target_dtype)
526
+ key_states = key_states.to(target_dtype)
527
+ value_states = value_states.to(target_dtype)
528
+
529
+ attn_weights = torch.empty(0, device=x.device)
530
+
531
+ scale = query_states.shape[-1] ** -0.5
532
+ output_final_state = not self.training and use_cache and past_key_values is not None
533
+ #attn_output, output_kv_state = ChunkGLAFunction.apply(query_states, key_states, value_states, decay_states_log.float(), scale, input_kv_state, output_final_state)
534
+ #attn_output, output_kv_state = chunk_gla(query_states, key_states, value_states, decay_states_log, scale, input_kv_state, output_final_state)
535
+ attn_output, output_kv_state = fused_recurrent_gla(query_states, key_states, value_states, decay_states_log, None, scale, input_kv_state, output_final_state)
536
+
537
+ if output_final_state:
538
+ past_key_values.update(output_kv_state, output_shift_state, q_len, self.layer_idx)
539
+
540
+ attn_output = attn_output.transpose(1, 2).contiguous()
541
+ attn_output = attn_output.view(bsz, q_len, -1)
542
+ if self.config.groupnorm_att:
543
+ attn_output = self.ln_x(attn_output.view(bsz * q_len, -1)).view(bsz, q_len, -1)
544
+ if self.config.gate_rank_type != 0:
545
+ attn_output = attn_output * gate_states
546
+ attn_output = self.o_proj(attn_output)
547
+
548
+ return attn_output, attn_weights
549
+
550
+ class RWKV6Qwen2DecoderLayer(Qwen2DecoderLayer):
551
+ def __init__(self, config: RWKV6Qwen2Config, layer_idx: int):
552
+ nn.Module.__init__(self)
553
+ self.hidden_size = config.hidden_size
554
+
555
+ self.self_attn = RWKV6Attention(config, layer_idx) #QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
556
+
557
+ self.mlp = Qwen2MLP(config)
558
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
559
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
560
+
561
+ def forward(
562
+ self,
563
+ hidden_states: torch.Tensor,
564
+ attention_mask: Optional[torch.Tensor] = None,
565
+ position_ids: Optional[torch.LongTensor] = None,
566
+ past_key_values: Optional[Cache] = None,
567
+ output_attentions: Optional[bool] = False,
568
+ use_cache: Optional[bool] = False,
569
+ cache_position: Optional[torch.LongTensor] = None,
570
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
571
+ **kwargs,
572
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
573
+ residual = hidden_states
574
+
575
+ hidden_states = self.input_layernorm(hidden_states)
576
+
577
+ # Self Attention
578
+ hidden_states, self_attn_weights = self.self_attn(
579
+ hidden_states=hidden_states,
580
+ attention_mask=attention_mask,
581
+ position_ids=position_ids,
582
+ past_key_values=past_key_values,
583
+ output_attentions=output_attentions,
584
+ use_cache=use_cache,
585
+ cache_position=cache_position,
586
+ position_embeddings=position_embeddings,
587
+ **kwargs,
588
+ )
589
+ hidden_states = residual + hidden_states
590
+
591
+ # Fully Connected
592
+ residual = hidden_states
593
+ hidden_states = self.post_attention_layernorm(hidden_states)
594
+ hidden_states = self.mlp(hidden_states)
595
+ hidden_states = residual + hidden_states
596
+
597
+ outputs = (hidden_states,)
598
+ if output_attentions:
599
+ outputs += (self_attn_weights,)
600
+
601
+ return outputs
602
+
603
+ RWKV6QWEN2_START_DOCSTRING = r"""
604
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
605
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
606
+ etc.)
607
+
608
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
609
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
610
+ and behavior.
611
+
612
+ Parameters:
613
+ config ([`RWKV6Qwen2Config`]):
614
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
615
+ load the weights associated with the model, only the configuration. Check out the
616
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
617
+ """
618
+
619
+
620
+ @add_start_docstrings(
621
+ "The bare RWKV6Qwen2 Model outputting raw hidden-states without any specific head on top.",
622
+ RWKV6QWEN2_START_DOCSTRING,
623
+ )
624
+ class RWKV6Qwen2PreTrainedModel(PreTrainedModel):
625
+ config_class = RWKV6Qwen2Config
626
+ base_model_prefix = "model"
627
+ supports_gradient_checkpointing = True
628
+ _no_split_modules = ["RWKV6Qwen2DecoderLayer"]
629
+ _skip_keys_device_placement = "past_key_values"
630
+ _supports_flash_attn_2 = True
631
+ _supports_sdpa = True
632
+ _supports_cache_class = True
633
+ _supports_quantized_cache = True
634
+ _supports_static_cache = True
635
+
636
+ def _init_weights(self, module):
637
+ std = self.config.initializer_range
638
+ if isinstance(module, nn.Linear):
639
+ module.weight.data.normal_(mean=0.0, std=std)
640
+ if module.bias is not None:
641
+ module.bias.data.zero_()
642
+ elif isinstance(module, nn.Embedding):
643
+ module.weight.data.normal_(mean=0.0, std=std)
644
+ if module.padding_idx is not None:
645
+ module.weight.data[module.padding_idx].zero_()
646
+
647
+
648
+ RWKV6QWEN2_INPUTS_DOCSTRING = r"""
649
+ Args:
650
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
651
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
652
+ it.
653
+
654
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
655
+ [`PreTrainedTokenizer.__call__`] for details.
656
+
657
+ [What are input IDs?](../glossary#input-ids)
658
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
659
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
660
+
661
+ - 1 for tokens that are **not masked**,
662
+ - 0 for tokens that are **masked**.
663
+
664
+ [What are attention masks?](../glossary#attention-mask)
665
+
666
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
667
+ [`PreTrainedTokenizer.__call__`] for details.
668
+
669
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
670
+ `past_key_values`).
671
+
672
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
673
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
674
+ information on the default strategy.
675
+
676
+ - 1 indicates the head is **not masked**,
677
+ - 0 indicates the head is **masked**.
678
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
679
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
680
+ config.n_positions - 1]`.
681
+
682
+ [What are position IDs?](../glossary#position-ids)
683
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
684
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
685
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
686
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
687
+
688
+ Two formats are allowed:
689
+ - a [`~cache_utils.Cache`] instance, see our
690
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
691
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
692
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
693
+ cache format.
694
+
695
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
696
+ legacy cache format will be returned.
697
+
698
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
699
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
700
+ of shape `(batch_size, sequence_length)`.
701
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
702
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
703
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
704
+ model's internal embedding lookup matrix.
705
+ use_cache (`bool`, *optional*):
706
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
707
+ `past_key_values`).
708
+ output_attentions (`bool`, *optional*):
709
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
710
+ tensors for more detail.
711
+ output_hidden_states (`bool`, *optional*):
712
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
713
+ more detail.
714
+ return_dict (`bool`, *optional*):
715
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
716
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
717
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
718
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
719
+ the complete sequence length.
720
+ """
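+ # Minimal usage sketch for the inputs documented above (illustrative only): `model` is assumed
+ # to be an already loaded `RWKV6Qwen2ForCausalLM` in eval mode and `tokenizer` its tokenizer.
+ def _example_incremental_decoding(model, tokenizer, prompt: str):
+     batch = tokenizer(prompt, return_tensors="pt")
+     # prefill: run the whole prompt once; the returned cache holds the recurrent state
+     out = model(input_ids=batch.input_ids, attention_mask=batch.attention_mask, use_cache=True)
+     next_token = out.logits[:, -1:].argmax(dim=-1)
+     # decode: feed only the newly chosen token together with the returned cache
+     out = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True)
+     return out.logits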
721
+
722
+ @add_start_docstrings(
723
+ "The bare RWKV6Qwen2 Model outputting raw hidden-states without any specific head on top.",
724
+ RWKV6QWEN2_START_DOCSTRING,
725
+ )
726
+ class RWKV6Qwen2Model(RWKV6Qwen2PreTrainedModel):
727
+ """
728
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`RWKV6Qwen2DecoderLayer`]
729
+
730
+ Args:
731
+ config: RWKV6Qwen2Config
732
+ """
733
+
734
+ def __init__(self, config: RWKV6Qwen2Config):
735
+ super().__init__(config)
736
+ self.padding_idx = config.pad_token_id
737
+ self.vocab_size = config.vocab_size
738
+
739
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
740
+ self.layers = nn.ModuleList(
741
+ [RWKV6Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
742
+ )
743
+ self._attn_implementation = config._attn_implementation
744
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
745
+ self.rotary_emb = Qwen2RotaryEmbedding(config=config)
746
+
747
+ self.gradient_checkpointing = False
748
+ # Initialize weights and apply final processing
749
+ self.post_init()
750
+
751
+ def get_input_embeddings(self):
752
+ return self.embed_tokens
753
+
754
+ def set_input_embeddings(self, value):
755
+ self.embed_tokens = value
756
+
757
+ @add_start_docstrings_to_model_forward(RWKV6QWEN2_INPUTS_DOCSTRING)
758
+ def forward(
759
+ self,
760
+ input_ids: torch.LongTensor = None,
761
+ attention_mask: Optional[torch.Tensor] = None,
762
+ position_ids: Optional[torch.LongTensor] = None,
763
+ past_key_values: Optional[Cache] = None,
764
+ inputs_embeds: Optional[torch.FloatTensor] = None,
765
+ use_cache: Optional[bool] = None,
766
+ output_attentions: Optional[bool] = None,
767
+ output_hidden_states: Optional[bool] = None,
768
+ return_dict: Optional[bool] = None,
769
+ cache_position: Optional[torch.LongTensor] = None,
770
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
771
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
772
+ output_hidden_states = (
773
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
774
+ )
775
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
776
+
777
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
778
+
779
+ if (input_ids is None) ^ (inputs_embeds is not None):
780
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
781
+
782
+ if self.gradient_checkpointing and self.training:
783
+ if use_cache:
784
+ logger.warning_once(
785
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
786
+ )
787
+ use_cache = False
788
+
789
+ # kept for BC (non `Cache` `past_key_values` inputs)
790
+ #return_legacy_cache = False
791
+ if use_cache and not isinstance(past_key_values, RWKV6State):
792
+ #return_legacy_cache = True
793
+ past_key_values = RWKV6State()
794
+ # if past_key_values is None:
795
+ # past_key_values = DynamicCache()
796
+ # else:
797
+ # past_key_values = DynamicCache.from_legacy_cache(past_key_values)
798
+ # logger.warning_once(
799
+ # "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
800
+ # "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
801
+ # "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
802
+ # )
803
+
804
+ if inputs_embeds is None:
805
+ inputs_embeds = self.embed_tokens(input_ids)
806
+
807
+ if cache_position is None:
808
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
809
+ cache_position = torch.arange(
810
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
811
+ )
812
+
813
+ if position_ids is None:
814
+ position_ids = cache_position.unsqueeze(0)
815
+
816
+ # causal_mask = self._update_causal_mask(
817
+ # attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
818
+ # )
819
+
820
+ causal_mask = None
821
+
822
+ hidden_states = inputs_embeds
823
+
824
+ # create position embeddings to be shared across the decoder layers
825
+ position_embeddings = None
826
+ if self.config.use_rope:
827
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
828
+
829
+ # decoder layers
830
+ all_hidden_states = () if output_hidden_states else None
831
+ all_self_attns = () if output_attentions else None
832
+ next_decoder_cache = None
833
+
834
+ for decoder_layer in self.layers:
835
+ if output_hidden_states:
836
+ all_hidden_states += (hidden_states,)
837
+
838
+ if self.gradient_checkpointing and self.training:
839
+ layer_outputs = self._gradient_checkpointing_func(
840
+ decoder_layer.__call__,
841
+ hidden_states,
842
+ causal_mask,
843
+ position_ids,
844
+ past_key_values,
845
+ output_attentions,
846
+ use_cache,
847
+ cache_position,
848
+ position_embeddings,
849
+ )
850
+ else:
851
+ layer_outputs = decoder_layer(
852
+ hidden_states,
853
+ attention_mask=attention_mask,
854
+ position_ids=position_ids,
855
+ past_key_values=past_key_values,
856
+ output_attentions=output_attentions,
857
+ use_cache=use_cache,
858
+ cache_position=cache_position,
859
+ position_embeddings=position_embeddings,
860
+ )
861
+
862
+ hidden_states = layer_outputs[0]
863
+
864
+ if output_attentions:
865
+ all_self_attns += (layer_outputs[1],)
866
+
867
+ hidden_states = self.norm(hidden_states)
868
+
869
+ # add hidden states from the last decoder layer
870
+ if output_hidden_states:
871
+ all_hidden_states += (hidden_states,)
872
+
873
+ #if return_legacy_cache:
874
+ # next_cache = next_cache.to_legacy_cache()
875
+
876
+ if not return_dict:
877
+ return tuple(v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns] if v is not None)
878
+ return BaseModelOutputWithPast(
879
+ last_hidden_state=hidden_states,
880
+ past_key_values=past_key_values,
881
+ hidden_states=all_hidden_states,
882
+ attentions=all_self_attns,
883
+ )
884
+
885
+ class RWKV6Qwen2ForCausalLM(RWKV6Qwen2PreTrainedModel, GenerationMixin):
886
+ _tied_weights_keys = ["lm_head.weight"]
887
+
888
+ def __init__(self, config):
889
+ super().__init__(config)
890
+ self.model = RWKV6Qwen2Model(config)
891
+ self.vocab_size = config.vocab_size
892
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
893
+
894
+ # Initialize weights and apply final processing
895
+ self.post_init()
896
+
897
+ def get_input_embeddings(self):
898
+ return self.model.embed_tokens
899
+
900
+ def set_input_embeddings(self, value):
901
+ self.model.embed_tokens = value
902
+
903
+ def get_output_embeddings(self):
904
+ return self.lm_head
905
+
906
+ def set_output_embeddings(self, new_embeddings):
907
+ self.lm_head = new_embeddings
908
+
909
+ def set_decoder(self, decoder):
910
+ self.model = decoder
911
+
912
+ def get_decoder(self):
913
+ return self.model
914
+
915
+ @add_start_docstrings_to_model_forward(RWKV6QWEN2_INPUTS_DOCSTRING)
916
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
917
+ def forward(
918
+ self,
919
+ input_ids: torch.LongTensor = None,
920
+ attention_mask: Optional[torch.Tensor] = None,
921
+ position_ids: Optional[torch.LongTensor] = None,
922
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
923
+ inputs_embeds: Optional[torch.FloatTensor] = None,
924
+ labels: Optional[torch.LongTensor] = None,
925
+ use_cache: Optional[bool] = None,
926
+ output_attentions: Optional[bool] = None,
927
+ output_hidden_states: Optional[bool] = None,
928
+ return_dict: Optional[bool] = None,
929
+ cache_position: Optional[torch.LongTensor] = None,
930
+ num_logits_to_keep: int = 0,
931
+ **loss_kwargs,
932
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
933
+ r"""
934
+ Args:
935
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
936
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
937
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
938
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
939
+
940
+ num_logits_to_keep (`int`, *optional*):
941
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
942
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
943
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
944
+
945
+ Returns:
946
+
947
+ Example:
948
+
949
+ ```python
950
+ >>> from transformers import AutoTokenizer, RWKV6Qwen2ForCausalLM
951
+
952
+ >>> model = RWKV6Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
953
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
954
+
955
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
956
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
957
+
958
+ >>> # Generate
959
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
960
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
961
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
962
+ ```"""
963
+
964
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
965
+ output_hidden_states = (
966
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
967
+ )
968
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
969
+
970
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
971
+ outputs = self.model(
972
+ input_ids=input_ids,
973
+ attention_mask=attention_mask,
974
+ position_ids=position_ids,
975
+ past_key_values=past_key_values,
976
+ inputs_embeds=inputs_embeds,
977
+ use_cache=use_cache,
978
+ output_attentions=output_attentions,
979
+ output_hidden_states=output_hidden_states,
980
+ return_dict=return_dict,
981
+ cache_position=cache_position,
982
+ )
983
+
984
+ hidden_states = outputs[0]
985
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
986
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
987
+
988
+ loss = None
989
+ if labels is not None:
990
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
991
+
992
+ if not return_dict:
993
+ output = (logits,) + outputs[1:]
994
+ return (loss,) + output if loss is not None else output
995
+
996
+ return CausalLMOutputWithPast(
997
+ loss=loss,
998
+ logits=logits,
999
+ past_key_values=outputs.past_key_values,
1000
+ hidden_states=outputs.hidden_states,
1001
+ attentions=outputs.attentions,
1002
+ )
1003
+
1004
+ def prepare_inputs_for_generation(
1005
+ self,
1006
+ input_ids: torch.LongTensor,
1007
+ past_key_values: Optional[Cache] = None,
1008
+ attention_mask: Optional[torch.LongTensor] = None,
1009
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1010
+ cache_position: Optional[torch.LongTensor] = None,
1011
+ **kwargs,
1012
+ ):
1013
+ # keep only the last token of `input_ids` if `past_key_values` is not empty.
1014
+ if past_key_values is not None and len(past_key_values) > 0:
1015
+ input_ids = input_ids[:, -1:]
1016
+
1017
+ model_inputs = {
1018
+ 'past_key_values': past_key_values,
1019
+ 'attention_mask': attention_mask,
1020
+ 'cache_position': cache_position,
1021
+ }
1022
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1023
+ if inputs_embeds is not None and past_key_values is None:
1024
+ model_inputs['inputs_embeds'] = inputs_embeds
1025
+ else:
1026
+ # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
1027
+ # recompiles graphs as the stride of the inputs is a guard.
1028
+ # Ref: https://github.com/huggingface/transformers/pull/29114
1029
+ # TODO: use `next_tokens` directly instead.
1030
+ model_inputs['input_ids'] = input_ids.contiguous()
1031
+
1032
+ model_inputs.update(**kwargs)
1033
+
1034
+ # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples)
1035
+ model_inputs.pop("labels", None)
1036
+
1037
+ return model_inputs
1038
+
1039
+ @add_start_docstrings(
1040
+ """
1041
+ The RWKV6Qwen2 Model transformer with a sequence classification head on top (linear layer).
1042
+
1043
+ [`RWKV6Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1044
+ (e.g. GPT-2) do.
1045
+
1046
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1047
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1048
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1049
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1050
+ each row of the batch).
1051
+ """,
1052
+ RWKV6QWEN2_START_DOCSTRING,
1053
+ )
1054
+ class RWKV6Qwen2ForSequenceClassification(RWKV6Qwen2PreTrainedModel):
1055
+ def __init__(self, config):
1056
+ super().__init__(config)
1057
+ self.num_labels = config.num_labels
1058
+ self.model = RWKV6Qwen2Model(config)
1059
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1060
+
1061
+ # Initialize weights and apply final processing
1062
+ self.post_init()
1063
+
1064
+ def get_input_embeddings(self):
1065
+ return self.model.embed_tokens
1066
+
1067
+ def set_input_embeddings(self, value):
1068
+ self.model.embed_tokens = value
1069
+
1070
+ @add_start_docstrings_to_model_forward(RWKV6QWEN2_INPUTS_DOCSTRING)
1071
+ def forward(
1072
+ self,
1073
+ input_ids: torch.LongTensor = None,
1074
+ attention_mask: Optional[torch.Tensor] = None,
1075
+ position_ids: Optional[torch.LongTensor] = None,
1076
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1077
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1078
+ labels: Optional[torch.LongTensor] = None,
1079
+ use_cache: Optional[bool] = None,
1080
+ output_attentions: Optional[bool] = None,
1081
+ output_hidden_states: Optional[bool] = None,
1082
+ return_dict: Optional[bool] = None,
1083
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1084
+ r"""
1085
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1086
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1087
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1088
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1089
+ """
1090
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1091
+
1092
+ transformer_outputs = self.model(
1093
+ input_ids,
1094
+ attention_mask=attention_mask,
1095
+ position_ids=position_ids,
1096
+ past_key_values=past_key_values,
1097
+ inputs_embeds=inputs_embeds,
1098
+ use_cache=use_cache,
1099
+ output_attentions=output_attentions,
1100
+ output_hidden_states=output_hidden_states,
1101
+ return_dict=return_dict,
1102
+ )
1103
+ hidden_states = transformer_outputs[0]
1104
+ logits = self.score(hidden_states)
1105
+
1106
+ if input_ids is not None:
1107
+ batch_size = input_ids.shape[0]
1108
+ else:
1109
+ batch_size = inputs_embeds.shape[0]
1110
+
1111
+ if self.config.pad_token_id is None and batch_size != 1:
1112
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1113
+ if self.config.pad_token_id is None:
1114
+ sequence_lengths = -1
1115
+ else:
1116
+ if input_ids is not None:
1117
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1118
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1119
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1120
+ sequence_lengths = sequence_lengths.to(logits.device)
1121
+ else:
1122
+ sequence_lengths = -1
1123
+
1124
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1125
+
1126
+ loss = None
1127
+ if labels is not None:
1128
+ labels = labels.to(logits.device)
1129
+ if self.config.problem_type is None:
1130
+ if self.num_labels == 1:
1131
+ self.config.problem_type = "regression"
1132
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1133
+ self.config.problem_type = "single_label_classification"
1134
+ else:
1135
+ self.config.problem_type = "multi_label_classification"
1136
+
1137
+ if self.config.problem_type == "regression":
1138
+ loss_fct = MSELoss()
1139
+ if self.num_labels == 1:
1140
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1141
+ else:
1142
+ loss = loss_fct(pooled_logits, labels)
1143
+ elif self.config.problem_type == "single_label_classification":
1144
+ loss_fct = CrossEntropyLoss()
1145
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1146
+ elif self.config.problem_type == "multi_label_classification":
1147
+ loss_fct = BCEWithLogitsLoss()
1148
+ loss = loss_fct(pooled_logits, labels)
1149
+ if not return_dict:
1150
+ output = (pooled_logits,) + transformer_outputs[1:]
1151
+ return ((loss,) + output) if loss is not None else output
1152
+
1153
+ return SequenceClassifierOutputWithPast(
1154
+ loss=loss,
1155
+ logits=pooled_logits,
1156
+ past_key_values=transformer_outputs.past_key_values,
1157
+ hidden_states=transformer_outputs.hidden_states,
1158
+ attentions=transformer_outputs.attentions,
1159
+ )
1160
+
1161
+
1162
+ @add_start_docstrings(
1163
+ """
1164
+ The RWKV6Qwen2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1165
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1166
+ """,
1167
+ RWKV6QWEN2_START_DOCSTRING,
1168
+ )
1169
+ # Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->RWKV6Qwen2, LLAMA->RWKV6QWEN2
1170
+ class RWKV6Qwen2ForTokenClassification(RWKV6Qwen2PreTrainedModel):
1171
+ def __init__(self, config):
1172
+ super().__init__(config)
1173
+ self.num_labels = config.num_labels
1174
+ self.model = RWKV6Qwen2Model(config)
1175
+ if getattr(config, "classifier_dropout", None) is not None:
1176
+ classifier_dropout = config.classifier_dropout
1177
+ elif getattr(config, "hidden_dropout", None) is not None:
1178
+ classifier_dropout = config.hidden_dropout
1179
+ else:
1180
+ classifier_dropout = 0.1
1181
+ self.dropout = nn.Dropout(classifier_dropout)
1182
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1183
+
1184
+ # Initialize weights and apply final processing
1185
+ self.post_init()
1186
+
1187
+ def get_input_embeddings(self):
1188
+ return self.model.embed_tokens
1189
+
1190
+ def set_input_embeddings(self, value):
1191
+ self.model.embed_tokens = value
1192
+
1193
+ @add_start_docstrings_to_model_forward(RWKV6QWEN2_INPUTS_DOCSTRING)
1194
+ @add_code_sample_docstrings(
1195
+ checkpoint=_CHECKPOINT_FOR_DOC,
1196
+ output_type=TokenClassifierOutput,
1197
+ config_class=_CONFIG_FOR_DOC,
1198
+ )
1199
+ def forward(
1200
+ self,
1201
+ input_ids: Optional[torch.LongTensor] = None,
1202
+ attention_mask: Optional[torch.Tensor] = None,
1203
+ position_ids: Optional[torch.LongTensor] = None,
1204
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1205
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1206
+ labels: Optional[torch.LongTensor] = None,
1207
+ use_cache: Optional[bool] = None,
1208
+ output_attentions: Optional[bool] = None,
1209
+ output_hidden_states: Optional[bool] = None,
1210
+ return_dict: Optional[bool] = None,
1211
+ ) -> Union[Tuple, TokenClassifierOutput]:
1212
+ r"""
1213
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1214
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
1215
+ config.num_labels - 1]`.
1217
+ """
1218
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1219
+
1220
+ outputs = self.model(
1221
+ input_ids,
1222
+ attention_mask=attention_mask,
1223
+ position_ids=position_ids,
1224
+ past_key_values=past_key_values,
1225
+ inputs_embeds=inputs_embeds,
1226
+ use_cache=use_cache,
1227
+ output_attentions=output_attentions,
1228
+ output_hidden_states=output_hidden_states,
1229
+ return_dict=return_dict,
1230
+ )
1231
+ sequence_output = outputs[0]
1232
+ sequence_output = self.dropout(sequence_output)
1233
+ logits = self.score(sequence_output)
1234
+
1235
+ loss = None
1236
+ if labels is not None:
1237
+ loss = self.loss_function(logits, labels, self.config)
1238
+
1239
+ if not return_dict:
1240
+ output = (logits,) + outputs[2:]
1241
+ return ((loss,) + output) if loss is not None else output
1242
+
1243
+ return TokenClassifierOutput(
1244
+ loss=loss,
1245
+ logits=logits,
1246
+ hidden_states=outputs.hidden_states,
1247
+ attentions=outputs.attentions,
1248
+ )
1249
+
1250
+
1251
+ @add_start_docstrings(
1252
+ """
1253
+ The RWKV6Qwen2 Model transformer with a span classification head on top for extractive question-answering tasks like
1254
+ SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1255
+ """,
1256
+ RWKV6QWEN2_START_DOCSTRING,
1257
+ )
1258
+ # Copied from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->RWKV6Qwen2, MISTRAL->RWKV6QWEN2
1259
+ class RWKV6Qwen2ForQuestionAnswering(RWKV6Qwen2PreTrainedModel):
1260
+ base_model_prefix = "model"
1261
+
1262
+ # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->RWKV6Qwen2
1263
+ def __init__(self, config):
1264
+ super().__init__(config)
1265
+ self.model = RWKV6Qwen2Model(config)
1266
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
1267
+
1268
+ # Initialize weights and apply final processing
1269
+ self.post_init()
1270
+
1271
+ def get_input_embeddings(self):
1272
+ return self.model.embed_tokens
1273
+
1274
+ def set_input_embeddings(self, value):
1275
+ self.model.embed_tokens = value
1276
+
1277
+ @add_start_docstrings_to_model_forward(RWKV6QWEN2_INPUTS_DOCSTRING)
1278
+ def forward(
1279
+ self,
1280
+ input_ids: Optional[torch.LongTensor] = None,
1281
+ attention_mask: Optional[torch.FloatTensor] = None,
1282
+ position_ids: Optional[torch.LongTensor] = None,
1283
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1284
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1285
+ start_positions: Optional[torch.LongTensor] = None,
1286
+ end_positions: Optional[torch.LongTensor] = None,
1287
+ output_attentions: Optional[bool] = None,
1288
+ output_hidden_states: Optional[bool] = None,
1289
+ return_dict: Optional[bool] = None,
1290
+ **kwargs,
1291
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1292
+ r"""
1293
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1294
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1295
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1296
+ are not taken into account for computing the loss.
1297
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1298
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1299
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1300
+ are not taken into account for computing the loss.
1301
+ """
1302
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1303
+
1304
+ outputs = self.model(
1305
+ input_ids,
1306
+ attention_mask=attention_mask,
1307
+ position_ids=position_ids,
1308
+ past_key_values=past_key_values,
1309
+ inputs_embeds=inputs_embeds,
1310
+ output_attentions=output_attentions,
1311
+ output_hidden_states=output_hidden_states,
1312
+ return_dict=return_dict,
1313
+ )
1314
+
1315
+ sequence_output = outputs[0]
1316
+
1317
+ logits = self.qa_outputs(sequence_output)
1318
+ start_logits, end_logits = logits.split(1, dim=-1)
1319
+ start_logits = start_logits.squeeze(-1).contiguous()
1320
+ end_logits = end_logits.squeeze(-1).contiguous()
1321
+
1322
+ loss = None
1323
+ if start_positions is not None and end_positions is not None:
1324
+ loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
1325
+
1326
+ if not return_dict:
1327
+ output = (start_logits, end_logits) + outputs[2:]
1328
+ return ((loss,) + output) if loss is not None else output
1329
+
1330
+ return QuestionAnsweringModelOutput(
1331
+ loss=loss,
1332
+ start_logits=start_logits,
1333
+ end_logits=end_logits,
1334
+ hidden_states=outputs.hidden_states,
1335
+ attentions=outputs.attentions,
1336
+ )
qwen2.py ADDED
@@ -0,0 +1,670 @@
1
+ import os, math, gc, importlib.util
2
+ import torch
3
+ import torch.utils.checkpoint
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+ from torch import Tensor
7
+ from typing import Tuple, Optional
8
+
9
+ #from src.state import ModelState, BlockState, ChannelMixState, TimeMixState, Shared
10
+
11
+ #from configs import TrainerCLI_Config, Model_Config, Transformer_Config, Train_Config
12
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
13
+
14
+ #from src.rotary import generate_rotary_embedding, generate_binary_rotary_embedding, apply_rotary_embedding
15
+
16
+ #from src.CoreDependencies import *
17
+
18
+ from dataclasses import dataclass
19
+
20
+ import torch.utils.checkpoint
21
+ if importlib.util.find_spec('deepspeed'):
22
+ import deepspeed
23
+
24
+ from logger import print0 as print
25
+
26
+ from fla.ops.gla.chunk import chunk_gla
27
+ from fla.ops.gla.fused_recurrent import fused_recurrent_gla
28
+
29
+ class ModelState:
30
+ def __init__(self):
31
+ self.seq_pos = 0
32
+ self.input_tokens_cache = torch.tensor([])
33
+ self.k_cache = torch.tensor([])
34
+ self.block_states:list[BlockState] = []
35
+
36
+ class TimeMixState:
37
+ def __init__(self, wkv_state=torch.tensor([]), shift_state=torch.tensor([])):
38
+ self.wkv_state = wkv_state
39
+ self.shift_state = shift_state
40
+
41
+ class ChannelMixState:
42
+ def __init__(self, shift_state=torch.tensor([])):
43
+ self.shift_state = shift_state
44
+
45
+ class BlockState:
46
+ def __init__(self, time_mix_state: TimeMixState, channel_mix_state: ChannelMixState):
47
+ self.time_mix_state = time_mix_state
48
+ self.channel_mix_state = channel_mix_state
49
+
50
+ class Shared:
51
+ def __init__(self):
52
+ self.angles = torch.tensor([])
53
+ self.bias_mask = torch.tensor([])
54
+
55
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
56
+ class Qwen2RMSNorm(nn.Module):
57
+ def __init__(self, hidden_size, eps=1e-6):
58
+ """
59
+ Qwen2RMSNorm is equivalent to T5LayerNorm
60
+ """
61
+ super().__init__()
62
+ self.weight = nn.Parameter(torch.ones(hidden_size))
63
+ self.variance_epsilon = eps
64
+
65
+ def forward(self, hidden_states):
66
+ input_dtype = hidden_states.dtype
67
+ hidden_states = hidden_states.to(torch.float32)
68
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
69
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
70
+ return self.weight * hidden_states.to(input_dtype)
71
+
72
+ def extra_repr(self):
73
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
74
+
75
+ def generate_rotary_embedding(max_seqlen:int, dim:int, theta:float = 10000.0, scale:float = 1):
76
+ #inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float).to(device) / dim))
77
+
78
+ angular_velocity = theta ** -(torch.arange(0, dim, 2, dtype=torch.float) / dim) / scale # frequencies from 1.0 ... 1/theta
79
+ angles = torch.outer(torch.arange(max_seqlen), angular_velocity)
80
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
81
+ emb = torch.cat((angles, angles), dim=-1)
82
+ return torch.stack([emb.cos(), emb.sin()], dim=0)
83
+ #return torch.polar(torch.ones_like(angles), angles)
84
+
85
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
86
+ def rotate_half(x):
87
+ """Rotates half the hidden dims of the input."""
88
+ x1 = x[..., : x.shape[-1] // 2]
89
+ x2 = x[..., x.shape[-1] // 2 :]
90
+ return torch.cat((-x2, x1), dim=-1)
91
+
92
+
93
+ # Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
94
+ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim:int=1):
95
+ B, L = q.size(0), q.size(-2)
96
+ cos = cos[:L].unsqueeze(0).expand(B,L,-1).unsqueeze(unsqueeze_dim)
97
+ sin = sin[:L].unsqueeze(0).expand(B,L,-1).unsqueeze(unsqueeze_dim)
98
+ q_embed = (q * cos) + (rotate_half(q) * sin)
99
+ k_embed = (k * cos) + (rotate_half(k) * sin)
100
+ return q_embed, k_embed
101
+
102
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
103
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
104
+ """
105
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
106
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
107
+ """
108
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
109
+ if n_rep == 1:
110
+ return hidden_states
111
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
112
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
113
+
114
+ def get_tmix_default_state(x:Tensor, config:Qwen2Config, requires_grad:bool):
115
+ B, T, C = x.size()
116
+ return TimeMixState(
117
+ torch.zeros([B, config.num_attention_heads, config.hidden_size // config.num_attention_heads, config.hidden_size // config.num_attention_heads], dtype=x.dtype, device=x.device, requires_grad=requires_grad),
118
+ torch.zeros([B, C], dtype=x.dtype, device=x.device, requires_grad=requires_grad)
119
+ )
120
+
121
+ @dataclass
122
+ class LLMOutput:
123
+ logits: torch.FloatTensor = None
124
+ model_state: ModelState = None
125
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
126
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
127
+ post_attention_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
128
+ student_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
129
+ student_post_attention_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
130
+
131
+ class TMix_qwen2(nn.Module):
132
+ def get_default_state_factory(self): return get_tmix_default_state
133
+
134
+ def __init__(self, config:Qwen2Config, layer_id):
135
+ super().__init__()
136
+ self.config = config
137
+ self.layer_id = layer_id
138
+ self.ctx_len = config.max_position_embeddings
139
+
140
+ self.head_dim = config.hidden_size // config.num_attention_heads
141
+
142
+ self.hidden_size = config.hidden_size
143
+ self.num_heads = config.num_attention_heads
144
+ self.num_key_value_heads = config.num_key_value_heads if config.num_key_value_heads > 0 else self.num_heads
145
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
146
+ # self.max_position_embeddings = config.max_position_embeddings
147
+ # self.rope_theta = config.rope_theta
148
+ # self.is_causal = True
149
+ # self.attention_dropout = config.attention_dropout
150
+
151
+ if (self.head_dim * self.num_heads) != self.hidden_size:
152
+ raise ValueError(
153
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
154
+ f" and `num_heads`: {self.num_heads})."
155
+ )
156
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
157
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
158
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
159
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
160
+
161
+ # self.rotary_emb = Qwen2RotaryEmbedding(
162
+ # self.head_dim,
163
+ # max_position_embeddings=config.rope.max_seqlen,
164
+ # base=config.rope.base,
165
+ # )
166
+
167
+ def forward(self, x, last_model_state:ModelState, shared:Shared, output_attentions:bool=False):
168
+ last_state = last_model_state.block_states[self.layer_id].time_mix_state
169
+ B, L, D = x.size()
170
+ QH = self.num_heads
171
+ KVH = self.num_key_value_heads
172
+
173
+ q = self.q_proj(x)
174
+ k = self.k_proj(x)
175
+ v = self.v_proj(x)
176
+
177
+ wkv_state = last_state.wkv_state
178
+
179
+ # handle recurrent inference via maintaining a kv cache
180
+ # if not self.training:
181
+ # new_kv_cache = torch.stack([k, v], dim=0)
182
+ # wkv_state = torch.cat([wkv_state, new_kv_cache], dim=-2)
183
+ # k, v = wkv_state.unbind(0)
184
+ # k, v = k.contiguous(), v.contiguous()
185
+
186
+ is_causal = q.size(1)==k.size(1)
187
+
188
+ q = q.view(B,L,QH,-1).transpose(1,2)
189
+ k = k.view(B,L,KVH,-1).transpose(1,2)
190
+ v = v.view(B,L,KVH,-1).transpose(1,2)
191
+
192
+ #q, k = apply_rotary_embedding(q, k, shared.angles)
193
+ #kv_seq_len, position_ids = L, torch.arange(L, dtype=torch.int, device=v.device).view(1, L).expand(B, L)
194
+ #cos, sin = self.rotary_emb(v, seq_len=kv_seq_len)
195
+ cos, sin = shared.angles.unbind(0)
196
+ q, k = apply_rotary_pos_emb(q, k, cos, sin)
197
+ q = q.to(v.dtype)
198
+ k = k.to(v.dtype)
199
+
200
+ # repeat k/v heads if n_kv_heads < n_heads
201
+ k = repeat_kv(k, self.num_key_value_groups)
202
+ v = repeat_kv(v, self.num_key_value_groups)
203
+
204
+ if output_attentions:
205
+ attn_weights = (q * (self.head_dim ** -0.5)) @ k.mT
206
+
207
+ #y = nn.functional.softmax(attn_weights + causal_mask, dim=-1, dtype=torch.float32).to(q.dtype) @ v
208
+ #attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
209
+ #y = torch.matmul(attn_weights, v)
210
+
211
+ # NOTE - we are outputting the non-softmaxed attention weights, just with exp() maxed to 1.0 since we're comparing against pre-normalized output of linear attention
212
+ # upcast attention to fp32
213
+ causal_mask = torch.full([L, L], fill_value=-torch.inf, device=attn_weights.device, dtype=attn_weights.dtype).triu(1)
214
+
215
+ attn_weights = nn.functional.softmax(attn_weights + causal_mask, dim=-1, dtype=torch.float32).to(q.dtype)
216
+
217
+ #attn_weights = attn_weights.tril()
218
+ #attn_weights = (attn_weights - attn_weights.max() + causal_mask).exp()
219
+ #attn_weights = (attn_weights - torch.max(attn_weights, dim=-1, keepdim=True).values + causal_mask).exp()
220
+ else:
221
+ attn_weights = torch.empty(0, device=x.device)
222
+
223
+ y = nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=is_causal)
224
+ y = y.transpose(1,2).reshape(B,L,D)
225
+ y = self.o_proj(y)
226
+ return y, TimeMixState(wkv_state, last_state.shift_state), attn_weights
227
+
228
+ class TMix_qwen2rwkv(TMix_qwen2):
229
+ """
230
+ Qwen2 RWKV-6cSimple attention module, following the Qwen2 attention module. This module inherits from `TMix_qwen2`
231
+ and adds RWKV-specific weights for tokenshift and data-dependent decay.
232
+ """
233
+
234
+ def __init__(self, config:Qwen2Config, layer_id):
235
+ super().__init__(config, layer_id)
236
+
237
+ n_layer = config.num_hidden_layers
238
+ n_embd = self.hidden_size
239
+ dim_att = self.num_heads * self.head_dim
240
+ layer_id = self.layer_id
241
+
242
+ with torch.no_grad():
243
+ ratio_0_to_1 = layer_id / (n_layer - 1) # 0 to 1
244
+ ratio_1_to_almost0 = 1.0 - (layer_id / n_layer) # 1 to ~0
245
+ ddd = torch.ones(1, 1, n_embd)
246
+ for i in range(n_embd):
247
+ ddd[0, 0, i] = i / n_embd
248
+
249
+ # self.time_maa_x = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0))
250
+ # self.time_maa_r = nn.Parameter(1.0 - torch.pow(ddd, 0.5 * ratio_1_to_almost0))
251
+ # self.time_maa_k = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0))
252
+ # self.time_maa_v = nn.Parameter(1.0 - (torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1))
253
+ # self.time_maa_w = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0))
254
+
255
+ ddd = torch.zeros(1, 1, n_embd)
256
+ self.time_maa_x = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0))
257
+ self.time_maa_r = nn.Parameter(torch.zeros_like(ddd))
258
+ self.time_maa_k = nn.Parameter(torch.zeros_like(ddd))
259
+ self.time_maa_v = nn.Parameter(torch.zeros_like(ddd))
260
+ self.time_maa_w = nn.Parameter(torch.zeros_like(ddd))
261
+ self.time_maa_g = nn.Parameter(torch.zeros_like(ddd))
262
+
263
+ D_MIX_LORA = 32 if n_embd < 4096 else 64
264
+ self.time_maa_w2 = nn.Parameter(torch.zeros(5, D_MIX_LORA, n_embd).uniform_(-0.01, 0.01))
265
+ self.time_maa_w1 = nn.Parameter(torch.zeros(n_embd, D_MIX_LORA*self.time_maa_w2.size(0)))
266
+
267
+ # # per-head RWKV-6
268
+ # H = self.num_heads
269
+ # # fancy time_decay
270
+ # decay_speed = torch.ones(H)
271
+ # for h in range(H):
272
+ # decay_speed[h] = -6 + 5 * (h / max(H - 1, 1)) ** (0.7 + 1.3 * ratio_0_to_1)
273
+ # self.time_decay = nn.Parameter(decay_speed)
274
+ # #self.time_decay = nn.Parameter(torch.empty(H)).uniform_(-8, -7)
275
+ # D_DECAY_LORA = 64 if n_embd < 4096 else 128
276
+ # self.time_decay_w1 = nn.Parameter(torch.zeros(n_embd, D_DECAY_LORA))
277
+ # self.time_decay_w2 = nn.Parameter(torch.zeros(D_DECAY_LORA, H).uniform_(-0.01, 0.01))
278
+
279
+ # RWKV-6
280
+ decay_speed = torch.ones(dim_att)
281
+ for n in range(dim_att):
282
+ decay_speed[n] = -6 + 5 * (n / (dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
283
+ self.time_decay = nn.Parameter(decay_speed.reshape(1,1,dim_att))
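+ # initializes per-channel decays in [-6, -1]; after w_log = -exp(.) the per-step decay factor
+ # spans roughly exp(-exp(-6)) ~ 0.998 (long memory) down to exp(-exp(-1)) ~ 0.69 (short memory),
+ # with deeper layers biased toward slower decay via ratio_0_to_1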
284
+ D_DECAY_LORA = 64 if n_embd < 4096 else 128
285
+ self.time_decay_w1 = nn.Parameter(torch.zeros(n_embd, D_DECAY_LORA))
286
+ self.time_decay_w2 = nn.Parameter(torch.zeros(D_DECAY_LORA, dim_att).uniform_(-0.01, 0.01))
287
+ # tmp = torch.zeros(dim_att)
288
+ # for n in range(dim_att):
289
+ # zigzag = ((n + 1) % 3 - 1) * 0.1
290
+ # tmp[n] = ratio_0_to_1 * (1 - (n / (dim_att - 1))) + zigzag
291
+ # self.time_faaaa = nn.Parameter(tmp.reshape(self.n_head, self.head_size))
292
+
293
+ self.gate = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
294
+ # start gate out with no effect
295
+ nn.init.zeros_(self.gate.weight)
296
+ #nn.init.ones_(self.gate.bias)
297
+
298
+ #self.ln_x = nn.LayerNorm(dim_att)
299
+
300
+ def segsum(self, w_log): # B H L 1
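+ # builds the lower-triangular cumulative-decay matrix: w_mask[..., i, j] = exp(sum of w_log over
+ # positions j+1..i) for j <= i, and 0 above the diagonal; used in the output_attentions branch
+ # to weight the raw q.k scores by how much each key has decayed by the time of the query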
301
+ w_log_cumsum = torch.cumsum(w_log, dim=-2) # (B, H, L, 1)
302
+ w_mask = torch.exp((w_log_cumsum - w_log_cumsum.mT).tril()).tril() # (B, H, L, L)
303
+ return w_mask
304
+
305
+ def forward(self, x, last_model_state:ModelState, shared:Shared, output_attentions:bool=False):
306
+ last_state = last_model_state.block_states[self.layer_id].time_mix_state
307
+ bsz, q_len, hidden_dim = x.size()
308
+
309
+ dxprev = torch.nn.functional.pad(x, (0, 0, 1, -1)) - x
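+ # padding by (0, 0, 1, -1) shifts the sequence right by one step (zeros at t=0), so dxprev is
+ # x_{t-1} - x_t: the token-shift delta that drives the data-dependent mixing below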
310
+
311
+ xxx = x + dxprev * self.time_maa_x
312
+ xxx = torch.tanh(xxx @ self.time_maa_w1).view(bsz*q_len, self.time_maa_w2.size(0), -1).transpose(0, 1)
313
+ xxx = torch.bmm(xxx, self.time_maa_w2).view(self.time_maa_w2.size(0), bsz, q_len, hidden_dim)
314
+
315
+ mr, mk, mv, mw, mg = xxx.unbind(dim=0)
316
+ xr = x + dxprev * (self.time_maa_r + mr)
317
+ xk = x + dxprev * (self.time_maa_k + mk)
318
+ xv = x + dxprev * (self.time_maa_v + mv)
319
+ xw = x + dxprev * (self.time_maa_w + mw)
320
+ xg = x + dxprev * (self.time_maa_g + mg)
321
+
322
+ query_states = self.q_proj(xr)
323
+ key_states = self.k_proj(xk)
324
+ value_states = self.v_proj(xv)
325
+ decay_states = (self.time_decay + torch.tanh(xw @ self.time_decay_w1) @ self.time_decay_w2).to(query_states.dtype)
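+ # per-token decay = learned per-channel base (time_decay) plus a low-rank, data-dependent
+ # correction projected from the token-shifted input xw, as in RWKV-6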
326
+ gate_states = torch.sigmoid(self.gate(xg))
327
+
328
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
329
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
330
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
331
+ decay_states = decay_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
332
+
333
+ # repeat k/v heads if n_kv_heads < n_heads
334
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
335
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
336
+ #dropout_rate = 0.0 if not self.training else self.attention_dropout
337
+
338
+ decay_states_log = -decay_states.float().exp()
339
+ decay_states_log = decay_states_log.clamp(-5) # FIXME - is this necessary?
340
+ key_states = (key_states * (1 - decay_states_log.exp())).to(key_states.dtype)
341
+
342
+ query_states = query_states.to(value_states.dtype)
343
+ key_states = key_states.to(value_states.dtype)
344
+
345
+ # # In PEFT, usually we cast the layer norms in float32 for training stability reasons
346
+ # # therefore the input hidden states gets silently casted in float32. Hence, we need
347
+ # # cast them back in float16 just to be sure everything works as expected.
348
+ # input_dtype = query_states.dtype
349
+ # if input_dtype == torch.float32:
350
+ # if torch.is_autocast_enabled():
351
+ # target_dtype = torch.get_autocast_gpu_dtype()
352
+ # # Handle the case where the model is quantized
353
+ # elif hasattr(self.config, "_pre_quantization_dtype"):
354
+ # target_dtype = self.config._pre_quantization_dtype
355
+ # else:
356
+ # target_dtype = self.q_proj.weight.dtype
357
+
358
+ # logger.warning_once(
359
+ # f"The input hidden states seems to be silently casted in float32, this might be related to"
360
+ # f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
361
+ # f" {target_dtype}."
362
+ # )
363
+
364
+ # query_states = query_states.to(target_dtype)
365
+ # key_states = key_states.to(target_dtype)
366
+ # value_states = value_states.to(target_dtype)
367
+
368
+ # decay_states_log.view is to match fla_chunk_simple_gla's requirements
369
+ #print("layer", self.layer_id, "pre ", bool(query_states.isnan().any()), bool(key_states.isnan().any()), bool(value_states.isnan().any()), bool(decay_states_log.isnan().any()))
370
+ #o = chunk_simple_gla(q.contiguous(), k.contiguous(), v.contiguous(), g.contiguous(), scale)
371
+
372
+ #print("layer", self.layer_id, "post", bool(query_states.isnan().any()), bool(key_states.isnan().any()), bool(value_states.isnan().any()), bool(decay_states_log.isnan().any()))
373
+
374
+ if not output_attentions:
375
+ attn_weights = torch.empty(0, device=x.device)
376
+
377
+ #attn_output = fla_chunk_simple_gla(query_states, key_states, value_states, decay_states_log.view(bsz, self.num_heads, q_len))[0]
378
+ #attn_output = chunk_gla(query_states, key_states, value_states, decay_states_log)[0]
379
+ attn_output = fused_recurrent_gla(query_states, key_states, value_states, decay_states_log)[0]
380
+ attn_output = attn_output.transpose(1, 2).contiguous()
381
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
382
+ #attn_output = self.ln_x(attn_output)
383
+ attn_output = self.o_proj(attn_output * gate_states)
384
+ else:
385
+ attn_weights = (query_states * (key_states.size(-1) ** -0.5)) @ key_states.mT
386
+
387
+ decay_states_log = decay_states_log.mean(-1, keepdim=True)
388
+ attn_weights = attn_weights.float() * self.segsum(decay_states_log.float()) # NOTE - without the explicit cast to float, DDP results mismatched DeepSpeed here
389
+
390
+ attn_weights = attn_weights.to(query_states.dtype)
391
+ attn_output = torch.empty(0, device=x.device)
392
+
393
+ return attn_output, TimeMixState(last_state.wkv_state, last_state.shift_state), attn_weights #, past_key_value
394
+
395
+ def get_cmix_default_state(x:Tensor, config:Qwen2Config, requires_grad:bool):
396
+ B, T, C = x.size()
397
+ return ChannelMixState(
398
+ torch.zeros([B, C], dtype=x.dtype, device=x.device, requires_grad=requires_grad)
399
+ )
400
+
401
+ class CMix_qwen2(nn.Module):
402
+ def get_default_state_factory(self): return get_cmix_default_state
403
+
404
+ def __init__(self, config:Qwen2Config, layer_id):
405
+ super().__init__()
406
+ self.config = config
407
+ self.layer_id = layer_id
408
+
409
+ self.hidden_size = config.hidden_size
410
+ self.intermediate_size = config.intermediate_size
411
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
412
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
413
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
414
+ self.act_fn = torch.nn.SiLU() #ACT2FN[config.hidden_act]
415
+
416
+ def forward(self, x, last_model_state:ModelState):
417
+ last_state = last_model_state.block_states[self.layer_id].channel_mix_state
418
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)), last_state
419
+
420
+ class Qwen2DecoderLayer(nn.Module):
421
+ def __init__(self, config:Qwen2Config, layer_id:int):
422
+ super().__init__()
423
+
424
+ self.config = config
425
+
426
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
427
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
428
+
429
+ cmix = CMix_qwen2(config, layer_id)
430
+
431
+ #if args.attention_type == 'rwkv':
432
+ self.self_attn = TMix_qwen2rwkv(config, layer_id)
433
+ #else:
434
+ # self.self_attn = TMix_qwen2(config, layer_id)
435
+ self.default_time_mix_state_factory = self.self_attn.get_default_state_factory() if hasattr(self.self_attn, 'get_default_state_factory') else lambda x, c, r: TimeMixState()
436
+
437
+ self.teacher_attn = None
438
+ #if config.train is not None:
439
+ # if config.train.attention_distillation_stage in (1, 2):
440
+ # self.teacher_attn = TMix_qwen2(config, layer_id)
441
+
442
+ self.default_channel_mix_state_factory = cmix.get_default_state_factory() if hasattr(cmix, 'get_default_state_factory') else lambda x, c, r: ChannelMixState()
443
+ self.mlp = cmix
444
+
445
+ def forward(self, x:Tensor, last_model_state:ModelState, shared:Shared, output_attentions:bool, output_post_attention_hidden_states:bool):
446
+ s = last_model_state
447
+ if self.teacher_attn is not None:
448
+ dx, last_timemix_state, attentions = self.teacher_attn(self.input_layernorm(x), s, shared, output_attentions)
449
+ student_dx, student_last_timemix_state, student_attentions = self.self_attn(self.input_layernorm(x), s, shared, output_attentions)
450
+ else:
451
+ dx, last_timemix_state, attentions = self.self_attn(self.input_layernorm(x), s, shared, output_attentions)
452
+ student_dx, student_last_timemix_state, student_attentions = None, None, None
453
+ if output_post_attention_hidden_states:
454
+ post_attention_hidden_states = dx
455
+ student_post_attention_hidden_states = student_dx
456
+ else:
457
+ post_attention_hidden_states = torch.empty(0, device=x.device)
458
+ student_post_attention_hidden_states = torch.empty(0, device=x.device)
459
+
460
+ x = x + dx
461
+ dx, last_chanmix_state = self.mlp(self.post_attention_layernorm(x), s)
462
+ x = x + dx
463
+ return x, s, attentions, post_attention_hidden_states, student_attentions, student_post_attention_hidden_states
464
+
465
+ def ckpt(block:Qwen2DecoderLayer, *block_args):
466
+ # if block.training and block.config.train.grad_cp == 1 and 'fsdp' not in block.config.train.strategy: # FSDP has its own checkpointing wrapper
467
+ #if "deepspeed" in block.config.train.strategy:
468
+ # results = deepspeed.checkpointing.checkpoint(block, *block_args)
469
+ #else:
470
+ # NOTE - both deepspeed.checkpointing.checkpoint and use_reentrant=True failed miserably (bad loss) when used in conjunction with requires_grad=False params and grad_cp with deepspeed
471
+ results = torch.utils.checkpoint.checkpoint(block, *block_args, use_reentrant=False)
472
+ # else:
473
+ # results = block(*block_args)
474
+ return results
475
+
476
+ class Qwen2Decoder(nn.Module):
477
+ def __init__(self, config:Qwen2Config):
478
+ super().__init__()
479
+
480
+ self.config = config
481
+
482
+ self.shared = Shared()
483
+
484
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) #, config.vocab_padding_idx)
485
+ self.layers = nn.ModuleList(
486
+ [Qwen2DecoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)]
487
+ )
488
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
489
+
490
+ def forward_preamble(self, x, last_model_state:ModelState|None = None):
491
+ config = self.config
492
+
493
+ B, T, C = x.size()
494
+
495
+ shared = self.shared
496
+ #if config.rope is not None and shared.angles.size(0) == 0:
497
+ # shared.angles = generate_rotary_embedding(config.max_position_embeddings, config.head_size, config.rope_theta).to(self.norm.weight)
498
+
499
+ assert (shared.angles.size(0) == 0 or T <= shared.angles.size(0)) or (shared.bias_mask.size(0) == 0 or T <= shared.bias_mask.size(0))
500
+
501
+ # might need to be true in the future for BPTT support
502
+ requires_grad = self.training
503
+ if last_model_state is None:
504
+ last_model_state = ModelState()
505
+ for layer_id in range(config.num_hidden_layers):
506
+ layer = self.layers[layer_id]
507
+ last_model_state.block_states.append(BlockState(
508
+ layer.default_time_mix_state_factory(x, config, requires_grad),
509
+ layer.default_channel_mix_state_factory(x, config, requires_grad),
510
+ ))
511
+
512
+ return last_model_state
513
+
514
+ def forward(self, token_ids:Tensor|list, last_model_state:ModelState|None = None, output_hidden_states:bool=False, output_attentions:bool=False, output_post_attention_hidden_states:bool=False):
515
+ config = self.config
516
+ if isinstance(token_ids, Tensor):
517
+ B, T = token_ids.size()
518
+ else:
519
+ B = 1
520
+ T = len(token_ids)
521
+ token_ids = torch.tensor(token_ids, device=self.embed_tokens.weight.device, dtype=torch.long, requires_grad=False)[None, :]
522
+
523
+ x = self.embed_tokens(token_ids)
524
+
525
+ last_model_state = self.forward_preamble(x, last_model_state)
526
+
527
+ hidden_states_outputs, attentions_outputs, post_attention_hidden_states_outputs = (), (), ()
528
+ student_hidden_states_outputs, student_attentions_outputs, student_post_attention_hidden_states_outputs = (), (), ()
529
+ if output_hidden_states:
530
+ hidden_states_outputs += (x,)
531
+ student_hidden_states_outputs += (x,)
532
+ for decoder_layer in self.layers:
533
+ x, s, attentions, post_attention_hidden_states, student_attentions, student_post_attention_hidden_states = ckpt(decoder_layer, x, last_model_state, self.shared, output_attentions, output_post_attention_hidden_states)
534
+ hidden_states_outputs += (x,)
535
+ student_hidden_states_outputs += (x,)
536
+ if output_attentions:
537
+ attentions_outputs += (attentions,)
538
+ student_attentions_outputs += (student_attentions,)
539
+ if output_post_attention_hidden_states:
540
+ post_attention_hidden_states_outputs += (post_attention_hidden_states,)
541
+ student_post_attention_hidden_states_outputs += (student_post_attention_hidden_states,)
542
+
543
+ x = self.norm(x)
544
+ return LLMOutput(x, last_model_state, hidden_states_outputs, attentions_outputs, post_attention_hidden_states_outputs, student_attentions_outputs, student_post_attention_hidden_states_outputs)
545
+ #return x, last_model_state, hidden_states_outputs, attentions_outputs, post_attention_hidden_states_outputs # FIXME - not updating state at all
546
+
547
+ class Model_qwen2(nn.Module): # Qwen2CausalLM
548
+ def __init__(self, config:Qwen2Config):
549
+ super().__init__()
550
+
551
+ self.config = config
552
+
553
+ self.model = None
554
+
555
+ self.configure_model()
556
+
557
+ def configure_model(self):
558
+ if self.model is not None:
559
+ return
560
+
561
+ self.model = Qwen2Decoder(self.config)
562
+
563
+ self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
564
+
565
+ def forward(self, input_ids:Tensor|list, last_model_state:ModelState|None = None, output_hidden_states:bool=False, output_attentions:bool=False, output_post_attention_hidden_states:bool=False):
566
+ #print("teacher q min, max", float(self.model.layers[0].self_attn.q_proj.weight.min()), float(self.model.layers[0].self_attn.q_proj.weight.max()))
567
+ results = self.model(input_ids, last_model_state=last_model_state, output_hidden_states=output_hidden_states, output_attentions=output_attentions, output_post_attention_hidden_states=output_post_attention_hidden_states)
568
+ results.logits = self.lm_head(results.logits)
569
+ return results
570
+
571
+ def get_optim_groups(self):
572
+ # separates groups for weight decay and non-weight decay
573
+
574
+ config = self.config
575
+
576
+ weight_decay = 0.0 # train_config.weight_decay
577
+
578
+ # if train_config.attention_distillation_stage in (1, 2):
579
+ # self.requires_grad_(False)
580
+ # for decoder_layer in self.model.layers:
581
+ # if train_config.attention_distillation_stage == 1:
582
+ # decoder_layer.self_attn.time_maa_x.requires_grad_(True)
583
+ # decoder_layer.self_attn.time_maa_r.requires_grad_(True)
584
+ # decoder_layer.self_attn.time_maa_k.requires_grad_(True)
585
+ # decoder_layer.self_attn.time_maa_w.requires_grad_(True)
586
+ # decoder_layer.self_attn.time_maa_w1.requires_grad_(True)
587
+ # decoder_layer.self_attn.time_maa_w2.requires_grad_(True)
588
+ # decoder_layer.self_attn.time_decay.requires_grad_(True)
589
+ # decoder_layer.self_attn.time_decay_w1.requires_grad_(True)
590
+ # decoder_layer.self_attn.time_decay_w2.requires_grad_(True)
591
+ # # FIXME - wow we removed q, k here by accident and it.. helped??!?!
592
+ # # decoder_layer.self_attn.q_proj.requires_grad_(True)
593
+ # # decoder_layer.self_attn.k_proj.requires_grad_(True)
594
+ # elif train_config.attention_distillation_stage == 2:
595
+ # decoder_layer.self_attn.requires_grad_(True)
596
+
597
+ # FIXME - remove these for full training
598
+ # for decoder_layer in self.model.layers:
599
+ # decoder_layer.post_attention_layernorm.requires_grad_(False)
600
+ # decoder_layer.mlp.requires_grad_(False)
601
+ # self.model.embed_tokens.requires_grad_(False)
602
+ # self.model.norm.requires_grad_(False)
603
+ # self.lm_head.requires_grad_(False)
604
+
605
+ # JIT at last minute
606
+ for decoder_layer in self.model.layers:
607
+ decoder_layer.self_attn = TJIT(decoder_layer.self_attn)
608
+ decoder_layer.mlp = TJIT(decoder_layer.mlp)
609
+
610
+ lr_decay = set()
611
+ lr_1x = set()
612
+ lr_fp32 = set()
613
+ for n, p in self.named_parameters():
614
+ if not p.requires_grad:
615
+ continue
616
+ if 'lm_head.weight' in n or 'embed_tokens.weight' in n:
617
+ lr_fp32.add(n)
618
+ elif (len(p.squeeze().shape) >= 2) and (weight_decay > 0):
619
+ lr_decay.add(n)
620
+ else:
621
+ lr_1x.add(n)
622
+
623
+ param_dict = {n: p for n, p in self.named_parameters()}
624
+ param_check = list(lr_decay) + list(lr_1x) + list(lr_fp32)
625
+ #if not train_config.load_partial and (train_config.teacher is None or train_config.teacher.attention_distillation_stage ==3):
626
+ # assert sorted(param_dict) == sorted(param_check)
627
+
628
+ lr_decay = sorted(list(lr_decay))
629
+ lr_1x = sorted(list(lr_1x))
630
+ lr_fp32 = sorted(list(lr_fp32))
631
+
632
+ print('decay', lr_decay, '\n')
633
+ print('1x', lr_1x, '\n')
634
+ print('fp32', lr_fp32, '\n')
635
+
636
+
637
+ optim_groups = [
638
+ {"params": [param_dict[n] for n in lr_1x], "weight_decay": 0.0, "use_fp16": True, "my_lr_scale": 1.0, 'name':'lr_1x'},
639
+ ]
640
+ if len(lr_fp32) > 0:
641
+ optim_groups += [{"params": [param_dict[n] for n in lr_fp32], "weight_decay": weight_decay, "my_lr_scale": 1.0, 'name':'lr_tiny'}]
642
+ if len(lr_decay) > 0:
643
+ optim_groups += [{"params": [param_dict[n] for n in lr_decay], "weight_decay": weight_decay, "use_fp16": True, "my_lr_scale": 1.0, 'name':'lr_decay'}]
644
+
645
+ return optim_groups
646
+
647
+ # def _init_weights(self, module):
648
+ # std = 0.02 #self.config.initializer_range
649
+ # if isinstance(module, nn.Linear):
650
+ # module.weight.data.normal_(mean=0.0, std=std)
651
+ # if module.bias is not None:
652
+ # module.bias.data.zero_()
653
+ # elif isinstance(module, nn.Embedding):
654
+ # module.weight.data.normal_(mean=0.0, std=std)
655
+ # if module.padding_idx is not None:
656
+ # module.weight.data[module.padding_idx].zero_()
657
+
658
+ def init_all_weights(self):
659
+ # FIXME - ideally the gate would be initialized to identity (zero weights, ones bias); instead we skip weight init entirely, since all weights are loaded or overwritten elsewhere anyway
660
+ pass
661
+ # self.apply(self._init_weights)
662
+ # for n, p in self.named_parameters():
663
+ # requires_grad_temp = p.requires_grad
664
+ # p.requires_grad_(False)
665
+ # if n.endswith('.ln_x.weight'):
666
+ # layer_scale = (1+int(n.split('.')[2])) / self.config.model.n_layer
667
+ # print('.ln_x.weight layer', int(n.split('.')[2]), "scale", (layer_scale ** 0.7))
668
+ # p *= 0.0
669
+ # p += (layer_scale ** 0.7)
670
+ # p.requires_grad = requires_grad_temp
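
For context, here is a minimal sketch of how the parameter groups returned by `get_optim_groups()` above could be handed to a standard optimizer. The base learning rate, the choice of AdamW, and the handling of the custom `my_lr_scale` / `use_fp16` / `name` keys are assumptions for illustration; the actual trainer is not part of this diff.

```python
# Hypothetical consumer of get_optim_groups(); the real training loop is not
# shown in this repo's diff. Extra keys such as "my_lr_scale" and "use_fp16"
# are hints for a custom optimizer, so they are stripped (or applied, in the
# case of the lr scale) before handing the groups to torch.optim.AdamW.
import torch

def build_optimizer(model: "Model_qwen2", base_lr: float = 1e-5) -> torch.optim.Optimizer:
    groups = []
    for g in model.get_optim_groups():
        lr_scale = g.pop("my_lr_scale", 1.0)   # per-group learning-rate multiplier
        g.pop("use_fp16", None)                # not understood by torch.optim.AdamW
        g.pop("name", None)
        g["lr"] = base_lr * lr_scale
        groups.append(g)
    return torch.optim.AdamW(groups, lr=base_lr, betas=(0.9, 0.95), eps=1e-8)
```
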
run_lm_eval.py ADDED
@@ -0,0 +1,96 @@
1
+ import sys, os
2
+ import transformers # imported only to work around a bug in lm_eval 0.4.2
3
+
4
+ import torch
5
+ torch.backends.cudnn.benchmark = True
6
+ torch.backends.cudnn.allow_tf32 = True
7
+ torch.backends.cuda.matmul.allow_tf32 = True
8
+
9
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
10
+ from configs import parse_cmdline_configs
11
+ from pydoc import locate
12
+
13
+ from lm_eval import evaluator
14
+ from lm_eval.models.huggingface import HFLM
15
+
16
+ from dataclasses import dataclass
17
+ from typing import Any, Callable
18
+
19
+ @dataclass
20
+ class CLI_Config:
21
+ tokenizer_path: str
22
+ model_path: str
23
+ attn_path: str = 'rwkv6attn.RWKV6Attention'
24
+ tasks: str = 'lambada_openai' # arc_challenge, arc_easy, headqa, openbookqa, hellaswag, winogrande, piqa, record, copa, storycloze_2016
25
+ bsz: int|str = 'auto'
26
+ precision: int | str = 'bf16'
27
+ num_fewshot: int = 0
28
+ attn_classes_path: str = 'transformers.models.qwen2.modeling_qwen2.QWEN2_ATTENTION_CLASSES' # 'transformers.models.llama.modeling_llama.LLAMA_ATTENTION_CLASSES'
29
+ seed: int | None = None
30
+ train:Any = None
31
+
32
+ config, errors = parse_cmdline_configs(sys.argv[1:], CLI_Config)
33
+ if errors != '':
34
+ print(errors)
35
+ exit()
36
+
37
+ match config.precision:
38
+ case 32:
39
+ dtype = torch.float32
40
+ case '32':
41
+ dtype = torch.float32
42
+ case 16:
43
+ dtype = torch.float16
44
+ case '16':
45
+ dtype = torch.float16
46
+ case 'bf16':
47
+ dtype = torch.bfloat16
48
+ case _:
49
+ print("Bad precision type specified")
50
+ exit()
51
+
52
+ # avoid 1000 huggingface warnings: "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks..."
53
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
54
+
55
+ print(f'Loading model - {config.model_path}')
56
+
57
+ model_config = AutoConfig.from_pretrained(config.model_path)
58
+
59
+ if config.model_path.startswith('.') or config.model_path.startswith('/'):
60
+ # replace attention classes
61
+ ReplacementSelfAttentionType = locate(config.attn_path)
62
+ assert isinstance(ReplacementSelfAttentionType, Callable)
63
+ attn_classes_dict = locate(config.attn_classes_path)
64
+ assert isinstance(attn_classes_dict, dict), 'could not find attention classes dict at path provided'
65
+ for key in list(attn_classes_dict.keys()):
66
+ attn_classes_dict[key] = ReplacementSelfAttentionType
67
+
68
+ model = AutoModelForCausalLM.from_pretrained(config.model_path, config=model_config, torch_dtype=dtype, device_map='cuda')
69
+
70
+ tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path)
71
+
72
+ #device = 'cuda'
73
+ #model = model.to(device=device, dtype=dtype)
74
+ model.eval()
75
+
76
+ eval_tasks = config.tasks.split(',')
77
+
78
+ if config.seed is None:
79
+ config.seed = 1234
80
+
81
+ adapter = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=config.bsz)
82
+ with torch.no_grad():
83
+ with torch.amp.autocast(device_type='cuda', dtype=dtype):
84
+ results = evaluator.simple_evaluate(
85
+ model=adapter,
86
+ tasks=eval_tasks,
87
+ #provide_description=False,
88
+ num_fewshot=config.num_fewshot,
89
+ limit=None,
90
+ bootstrap_iters=10000,
91
+ numpy_random_seed = config.seed,
92
+ torch_random_seed = config.seed,
93
+ fewshot_random_seed = config.seed,
94
+ )
95
+
96
+ print(results['results'])
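
The attention-swap step above is worth reading as a standalone pattern. Below is a minimal sketch of it, assuming the installed transformers version still exposes the `QWEN2_ATTENTION_CLASSES` registry and that `rwkv6attn.RWKV6Attention` is importable; both names are simply the script's defaults.

```python
# Standalone version of the attention-class swap done in run_lm_eval.py:
# every registered Qwen2 attention implementation ("eager", "sdpa", ...) is
# replaced with the RWKV-6 module before from_pretrained() builds the model.
from pydoc import locate

def swap_attention_classes(
    attn_classes_path: str = "transformers.models.qwen2.modeling_qwen2.QWEN2_ATTENTION_CLASSES",
    attn_path: str = "rwkv6attn.RWKV6Attention",
) -> None:
    replacement = locate(attn_path)
    registry = locate(attn_classes_path)
    assert callable(replacement), "could not locate replacement attention class"
    assert isinstance(registry, dict), "could not locate attention classes dict"
    for key in list(registry.keys()):
        registry[key] = replacement
```
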
tokenization_rwkv6qwen2.py ADDED
@@ -0,0 +1,4 @@
1
+ from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
2
+
3
+ class RWKV6Qwen2Tokenizer(Qwen2Tokenizer):
4
+ pass
tokenization_rwkv6qwen2_fast.py ADDED
@@ -0,0 +1,4 @@
1
+ from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
2
+
3
+ class RWKV6Qwen2TokenizerFast(Qwen2TokenizerFast):
4
+ pass
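
Because the tokenizer and model classes here are custom, a typical way to load this checkpoint is through the Auto classes with remote code enabled. A minimal loading sketch follows; the path is a placeholder, and the bf16/CUDA settings are illustrative assumptions.

```python
# Minimal loading sketch; "path/to/this/checkpoint" is a placeholder, and
# trust_remote_code=True is needed because the model/tokenizer classes ship
# with the repo rather than with the transformers library itself.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "path/to/this/checkpoint"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    trust_remote_code=True,
).eval()
```
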
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
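
The chat_template above implements the ChatML-style format (`<|im_start|>` / `<|im_end|>`). A short usage sketch, assuming the tokenizer was loaded as in the earlier example:

```python
# Renders a conversation with the ChatML template from tokenizer_config.json;
# add_generation_prompt=True appends "<|im_start|>assistant\n" so the model
# can continue from there.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Give me a one-line summary of RWKV."},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```
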
vocab.json ADDED
The diff for this file is too large to render. See raw diff