Upload folder using huggingface_hub

Browse files

Files changed (16) hide show

.gitattributes +1 -0
added_tokens.json +28 -0
chat_template.jinja +86 -0
config.json +66 -0
configuration_aimv2_navit_rope.py +59 -0
configuration_andesvl.py +34 -0
generation_config.json +12 -0
merges.txt +0 -0
modeling_aimv2_navit_rope.py +388 -0
modeling_andesvl.py +288 -0
preprocessor_config.json +32 -0
pytorch_model.bin +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +239 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</img>": 151653,
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<img>": 151652,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_pad|>": 151654
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,86 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n<think>\n' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "AndesVLForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_andesvl.AndesVLConfig",
+    "AutoModel": "modeling_andesvl.AndesVLForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_andesvl.AndesVLForConditionalGeneration"
+  },
+  "model_type": "andesvl-aimv2-qwen3",
+  "text_config": {
+    "vocab_size": 151936,
+    "max_position_embeddings": 262144,
+    "hidden_size": 2560,
+    "intermediate_size": 9728,
+    "num_hidden_layers": 36,
+    "num_attention_heads": 32,
+    "use_sliding_window": false,
+    "sliding_window": null,
+    "max_window_layers": 36,
+    "num_key_value_heads": 8,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "initializer_range": 0.02,
+    "rms_norm_eps": 1e-06,
+    "use_cache": true,
+    "rope_theta": 5000000,
+    "rope_scaling": null,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "tie_word_embeddings": true,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "model_type": "qwen3"
+  },
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "disable_rope": false,
+    "fullatt_block_indexes": null,
+    "hidden_size": 1024,
+    "hidden_stride": 2,
+    "image_size": 448,
+    "intermediate_size": 2816,
+    "interpolate_pe_method": "two_dim",
+    "model_type": "aimv2",
+    "num_attention_heads": 8,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "preserve_original_pe": true,
+    "projection_dropout": 0.0,
+    "qkv_bias": false,
+    "rms_norm_eps": 1e-05,
+    "temporal_patch_size": 1,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.52.4",
+    "use_bias": false,
+    "window_size": 112
+  },
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.0"
+}

configuration_aimv2_navit_rope.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from typing import Any
+from transformers.configuration_utils import PretrainedConfig
+__all__ = ["Aimv2VisionConfig"]
+class Aimv2VisionConfig(PretrainedConfig):
+    """Configuration container for the AIMv2 vision encoder.
+
+    Each named keyword argument is stored unchanged as an attribute of the
+    same name. Any extra keyword arguments are forwarded to
+    ``PretrainedConfig.__init__``.
+
+    ``interpolate_pe_method`` defaults to ``'one_dim'``.
+    ``fullatt_block_indexes`` defaults to ``None``.
+
+    NOTE(review): ``min_pixels`` / ``max_pixels`` are only stored by this
+    class; their consumer is not visible here -- confirm where they are read.
+    """
+    model_type: str = "aimv2"
+    def __init__(
+        self,
+        hidden_size: int = 1024,
+        intermediate_size: int = 2816,
+        num_hidden_layers: int = 24,
+        num_attention_heads: int = 8,
+        num_channels: int = 3,
+        image_size: int = 224,
+        patch_size: int = 14,
+        rms_norm_eps: float = 1e-5,
+        attention_dropout: float = 0.0,
+        projection_dropout: float = 0.0,
+        qkv_bias: bool = False,
+        use_bias: bool = False,
+        hidden_stride: int = 2,
+        window_size: int = 112,
+        fullatt_block_indexes: list = None,
+        temporal_patch_size: int = 1,
+        preserve_original_pe: bool = False,
+        interpolate_pe_method: str = 'one_dim',
+        disable_rope: bool = False,
+        min_pixels: int = 3136,
+        max_pixels: int = 1960000,
+        **kwargs: Any,
+    ):
+        # Let the base class consume the generic options first.
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.rms_norm_eps = rms_norm_eps
+        self.projection_dropout = projection_dropout
+        self.qkv_bias = qkv_bias
+        self.use_bias = use_bias
+        self.hidden_stride = hidden_stride
+        self.window_size = window_size
+        self.fullatt_block_indexes = fullatt_block_indexes
+        self.temporal_patch_size = temporal_patch_size
+        self.preserve_original_pe = preserve_original_pe
+        self.interpolate_pe_method = interpolate_pe_method
+        self.disable_rope = disable_rope
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels

configuration_andesvl.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import copy
+from transformers import  Qwen3Config
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from .configuration_aimv2_navit_rope import Aimv2VisionConfig
+logger = logging.get_logger(__name__)
+class AndesVLConfig(PretrainedConfig):
+    """Composite configuration holding a vision and a text sub-configuration.
+
+    Args:
+        vision_config: ``None`` (use ``Aimv2VisionConfig`` defaults), a dict of
+            keyword arguments for ``Aimv2VisionConfig``, or an already-built
+            config object, which is used as-is.
+        text_config: ``None`` (use ``Qwen3Config`` defaults), a dict of keyword
+            arguments for ``Qwen3Config``, or an already-built config object,
+            which is used as-is.
+        **kwargs: forwarded to ``PretrainedConfig.__init__``.
+    """
+    model_type = 'andesvl-aimv2-qwen3'
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        **kwargs):
+        super().__init__(**kwargs)
+        self.vision_config = self._build_sub_config(Aimv2VisionConfig, vision_config)
+        self.text_config = self._build_sub_config(Qwen3Config, text_config)
+    @staticmethod
+    def _build_sub_config(config_cls, value):
+        """Return a sub-config built from ``None``, a kwargs dict, or a config object."""
+        if value is None:
+            return config_cls()
+        # An already-built config cannot be splatted with ``**``; keep it as-is.
+        if isinstance(value, PretrainedConfig):
+            return value
+        return config_cls(**value)
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            with both sub-configurations expanded to plain dictionaries.
+        """
+        output = copy.deepcopy(self.__dict__)
+        # Replace the nested config objects with their own dictionary form.
+        output['vision_config'] = self.vision_config.to_dict()
+        output['text_config'] = self.text_config.to_dict()
+        output['model_type'] = self.__class__.model_type
+        return output

generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "do_sample": true,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "pad_token_id": 151643,
+  "bos_token_id": 151643,
+  "eos_token_id": [
+    151645,
+    151643
+  ]
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_aimv2_navit_rope.py ADDED Viewed

	@@ -0,0 +1,388 @@

+# adapted from https://huggingface.co/apple/aimv2-huge-patch14-448 (modification: add gradient checkpoint support)
+from typing import Optional, Tuple, Union
+import torch
+from torch import nn
+from torch.nn import functional as F
+from transformers.modeling_outputs import BaseModelOutputWithNoAttention
+from transformers.modeling_utils import PreTrainedModel
+from flash_attn.layers.rotary import apply_rotary_emb
+from flash_attn import flash_attn_varlen_func
+from .configuration_aimv2_navit_rope import Aimv2VisionConfig
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learned scale."""
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        # Divide by sqrt(mean(x^2) + eps), computed along the last axis.
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Normalise in float32, cast back to the input dtype, then scale.
+        normalised = self._norm(x.float())
+        return normalised.type_as(x) * self.weight
+    def extra_repr(self) -> str:
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+# Prefer the RMSNorm shipped with flash_attn when it can be imported: a
+# successful import rebinds the module-level name RMSNorm, so the classes
+# defined below pick it up. Any failure is deliberately ignored, leaving the
+# pure-PyTorch RMSNorm class defined above in effect.
+try:
+    from flash_attn.ops.rms_norm import RMSNorm
+except Exception as e:
+    pass
+class AIMv2SwiGLUFFN(nn.Module):
+    """Gated feed-forward block computing ``fc2(silu(fc1(x)) * fc3(x))``."""
+    def __init__(self, config: Aimv2VisionConfig):
+        super().__init__()
+        width, inner = config.hidden_size, config.intermediate_size
+        with_bias = config.use_bias
+        # Creation order (fc1, fc2, fc3) fixes the parameter registration order.
+        self.fc1 = nn.Linear(width, inner, bias=with_bias)
+        self.fc2 = nn.Linear(inner, width, bias=with_bias)
+        self.fc3 = nn.Linear(width, inner, bias=with_bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate = F.silu(self.fc1(x))
+        value = self.fc3(x)
+        return self.fc2(gate * value)
+# copied from qwen2.5-vl
+class VisionRotaryEmbedding(nn.Module):
+    """Builds a table of rotary angles: the outer product of positions and inverse frequencies."""
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
+        # Registered as non-persistent, so it is not written to the state dict.
+        self.register_buffer("inv_freq", 1.0 / (theta ** exponents), persistent=False)
+    def forward(self, seqlen: int) -> torch.Tensor:
+        inv_freq = self.inv_freq
+        positions = torch.arange(seqlen, device=inv_freq.device, dtype=inv_freq.dtype)
+        return torch.outer(positions, inv_freq)
+# Note: in qwen2-vl and qwen2.5-vl, 3d convolution is used.
+class AIMv2PatchEmbed(nn.Module):
+    """Projects flattened patches to ``hidden_size`` and RMS-normalises them.
+
+    The projection weights live in an ``nn.Conv2d`` whose kernel and stride
+    both equal the patch size, but ``forward`` applies them as a plain linear
+    layer on patches that have already been flattened by the caller.
+    """
+    def __init__(self, config: Aimv2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.proj = nn.Conv2d(
+            config.num_channels,
+            config.hidden_size,
+            kernel_size=(config.patch_size, config.patch_size),
+            stride=(config.patch_size, config.patch_size),
+        )
+        assert self.config.temporal_patch_size == 1  # only temporal_patch_size == 1 is supported
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    # NOTE: the conv2d is evaluated as a linear projection, which is more efficient.
+    def _get_2d_weight(self):
+        """Return the conv weight flattened to 2-D plus a bias usable by ``F.linear``."""
+        # Get 2d conv weight and bias, convert to format that linear function can use directly
+        weight = self.proj.weight.view(self.config.hidden_size, -1)  # [hidden_size, c*patch_size*patch_size]
+        bias = self.proj.bias
+        if bias is None:
+            # Match the weight's dtype as well as its device; a default float32
+            # zero bias would not agree with half-precision weights.
+            bias = torch.zeros(self.config.hidden_size, device=weight.device, dtype=weight.dtype)
+        return weight, bias
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Embed flattened patches.
+
+        Args:
+            x: expected shape ``(num_patches, c*temporal_patch_size*patch_size*patch_size)``;
+                with ``temporal_patch_size=1`` that is ``(num_patches, c*patch_size*patch_size)``.
+
+        Returns:
+            The projected and normalised patch tokens.
+        """
+        x = torch.nn.functional.linear(x, *self._get_2d_weight())
+        x = self.norm(x)
+        return x
+class AIMv2ViTPreprocessor(nn.Module):
+    """Patch embedding followed by an optional interpolated learned position embedding.
+
+    When ``config.preserve_original_pe`` is true, a learned table with
+    ``(image_size // patch_size) ** 2`` entries is resized per image and added
+    to the patch tokens. Otherwise the tokens are returned as produced by the
+    patch embedder.
+    """
+    def __init__(self, config: Aimv2VisionConfig):
+        super().__init__()
+        num_patches = (config.image_size // config.patch_size) ** 2
+        self.patchifier = AIMv2PatchEmbed(config)
+        self.preserve_original_pe = config.preserve_original_pe
+        self.hidden_stride = config.hidden_stride
+        # The table and interpolation mode only exist when the learned
+        # position embedding is kept.
+        if self.preserve_original_pe:
+            self.interpolate_pe_method = config.interpolate_pe_method
+            self.pos_embed = nn.Parameter(torch.zeros((1, num_patches, config.hidden_size)))
+    def forward(self, x: torch.Tensor, grid_thws: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Embed patches and, if enabled, add the resized position embedding.
+
+        Args:
+            x: flattened patches, passed straight to the patch embedder.
+            grid_thws: rows of ``(t, h, w)`` grid sizes, one per image; required
+                when the learned position embedding is enabled.
+
+        Raises:
+            TypeError: if ``interpolate_pe_method`` is neither ``'one_dim'`` nor ``'two_dim'``.
+        """
+        tokens = self.patchifier(x)
+        if self.preserve_original_pe:
+            assert grid_thws is not None
+            pos_embed_new = torch.zeros_like(tokens)
+            # Put the embedding dimension where F.interpolate expects channels.
+            if self.interpolate_pe_method == 'one_dim':
+                pos_embed = self.pos_embed.transpose(1,2).to(tokens.device)
+            elif self.interpolate_pe_method == 'two_dim':
+                # The table is treated as a square grid.
+                ori_h = ori_w = int(self.pos_embed.shape[1] ** 0.5)
+                pos_embed = self.pos_embed.reshape(1, ori_h, ori_w, -1).permute(0,3,1,2)
+            else:
+                raise TypeError("The interpolation method for pe should be one_dim, two_dim.")
+            # cnt is the write offset into the concatenated token sequence.
+            cnt = 0
+            for t, h, w in grid_thws:
+                num_patches = h * w
+                thw = t * h * w
+                if self.interpolate_pe_method == 'one_dim':
+                    pe = F.interpolate(pos_embed, size=num_patches, mode='linear', align_corners=False).transpose(1,2)
+                elif self.interpolate_pe_method == 'two_dim':
+                    # example shape: 1, 1024, 32, 32
+                    pe = F.interpolate(pos_embed, size=(h,w), mode='bicubic', align_corners=False)
+                    # example shape: 1, 1024, 1024
+                    pe = pe.permute(0,2,3,1).reshape(1, h*w, -1)
+                # example shape: 1024, 1024 (repeated t times along the first axis)
+                pe = pe[0].repeat(t,1)
+                # example shape: 1, 16, 2, 16, 2, 1024
+                pe = pe.reshape(t, h//self.hidden_stride, self.hidden_stride, w//self.hidden_stride, self.hidden_stride, -1)
+                # Reorder so each hidden_stride x hidden_stride group of patches is
+                # contiguous; example shape after flattening: 1024, 1024
+                pe = pe.permute(0,1,3,2,4,5).reshape(thw,-1)
+                pos_embed_new[cnt:cnt+thw] = pe
+                cnt += thw
+            tokens = tokens + pos_embed_new
+        return tokens
+# copied from qwen2.5-vl
+def apply_rotary_pos_emb_flashatt(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Rotate ``q`` and ``k`` with ``apply_rotary_emb`` and return them in their input dtypes.
+
+    Only the first half of the last dimension of ``cos`` / ``sin`` is used.
+    The rotation itself is carried out in float32.
+    """
+    cos_half = cos.chunk(2, dim=-1)[0].contiguous().float()
+    sin_half = sin.chunk(2, dim=-1)[0].contiguous().float()
+    rotated = tuple(
+        apply_rotary_emb(tensor.float(), cos_half, sin_half).type_as(tensor)
+        for tensor in (q, k)
+    )
+    return rotated
+class AIMv2FlashAttention2(nn.Module):
+    """Self-attention over a packed sequence, computed with ``flash_attn_varlen_func``."""
+    def __init__(self, config: Aimv2VisionConfig) -> None:
+        super().__init__()
+        width = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        # One fused projection yields queries, keys and values.
+        self.qkv = nn.Linear(width, width * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(width, width, bias=config.use_bias)
+        self.use_rope = not config.disable_rope
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        """Attend within the segments delimited by ``cu_seqlens``.
+
+        Args:
+            hidden_states: packed tokens; the first dimension is the sequence.
+            cu_seqlens: cumulative segment boundaries handed to flash-attn.
+            position_embeddings: ``(cos, sin)`` pair; read only when rotary
+                embeddings are enabled.
+        """
+        total_len = hidden_states.shape[0]
+        fused = self.qkv(hidden_states).reshape(total_len, 3, self.num_heads, -1)
+        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
+        if self.use_rope:
+            cos, sin = position_embeddings
+            # The rotary helper is called with a leading batch dimension of one.
+            q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
+            q, k = q.squeeze(0), k.squeeze(0)
+        segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = segment_lengths.max().item()
+        attended = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen)
+        return self.proj(attended.reshape(total_len, -1))
+class AIMv2Block(nn.Module):
+    """Pre-norm transformer block: attention and feed-forward, each with a residual connection."""
+    def __init__(self, config: Aimv2VisionConfig):
+        super().__init__()
+        eps = config.rms_norm_eps
+        width = config.hidden_size
+        # Submodules are created in the order attn, norm_1, mlp, norm_2.
+        self.attn = AIMv2FlashAttention2(config)
+        self.norm_1 = RMSNorm(width, eps=eps)
+        self.mlp = AIMv2SwiGLUFFN(config)
+        self.norm_2 = RMSNorm(width, eps=eps)
+    def forward(
+        self, x: torch.Tensor, cu_seqlens: torch.Tensor, position_embeddings: torch.Tensor
+    ) -> torch.Tensor:
+        attn_out = self.attn(
+            self.norm_1(x),
+            cu_seqlens=cu_seqlens,
+            position_embeddings=position_embeddings,
+        )
+        x = x + attn_out
+        mlp_out = self.mlp(self.norm_2(x))
+        return x + mlp_out
+class AIMv2Transformer(nn.Module):
+    """Stack of ``AIMv2Block`` layers over packed patch tokens, with optional windowed attention.
+
+    Tokens are permuted into window order before the blocks run and permuted
+    back afterwards. Each block attends either over whole images or over
+    windows, depending on ``fullatt_block_indexes``.
+    """
+    def __init__(self, config: Aimv2VisionConfig):
+        super().__init__()
+        self.blocks = nn.ModuleList(
+            [AIMv2Block(config) for _ in range(config.num_hidden_layers)]
+        )
+        self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Rotary table dimension is half the per-head dimension.
+        self.rotary_pos_emb = VisionRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
+        self.hidden_stride = config.hidden_stride
+        self.patch_size = config.patch_size
+        self.window_size = config.window_size
+        # Number of patches in one hidden_stride x hidden_stride group.
+        self.spatial_merge_unit = config.hidden_stride * config.hidden_stride
+        self.fullatt_block_indexes = config.fullatt_block_indexes
+    # copied from qwen2.5_vl
+    def rot_pos_emb(self, grid_thw):
+        """Return rotary angles for every patch, indexed by its (row, column) position.
+
+        Row and column indices are laid out in the same hidden_stride-grouped
+        order as the tokens, repeated ``t`` times per image.
+        """
+        pos_ids = []
+        for t, h, w in grid_thw:
+            # Row index of every patch, regrouped into hidden_stride x hidden_stride tiles.
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.hidden_stride,
+                self.hidden_stride,
+                w // self.hidden_stride,
+                self.hidden_stride,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+            # Column index of every patch, regrouped the same way.
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.hidden_stride,
+                self.hidden_stride,
+                w // self.hidden_stride,
+                self.hidden_stride,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        # One table sized for the largest grid side serves both axes.
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+    def get_window_index(self, grid_thw):
+        """Compute the window-order permutation and cumulative window lengths.
+
+        Returns:
+            ``(window_index, cu_window_seqlens)``: a tensor of merged-patch
+            indices in window order, and a Python list of cumulative window
+            lengths (in patches) starting with 0.
+        """
+        window_index: list = []
+        cu_window_seqlens: list = [0]
+        window_index_id = 0
+        vit_merger_window_size = self.window_size // self.hidden_stride // self.patch_size # patch (after merge) number in each window
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h, llm_grid_w = (
+                grid_h // self.hidden_stride, # number of patch after merge
+                grid_w // self.hidden_stride,
+            )
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
+            # Pad up to a multiple of the window size. A side that is already a
+            # multiple receives one full extra window of padding.
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            # -100 marks padded slots; they are filtered out again below.
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
+            index_padded = index_padded.reshape(
+                grid_t,
+                num_windows_h,
+                vit_merger_window_size,
+                num_windows_w,
+                vit_merger_window_size,
+            )
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t,
+                num_windows_h * num_windows_w,
+                vit_merger_window_size,
+                vit_merger_window_size,
+            )
+            # Count of real (non-padding) entries in each window.
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]
+            window_index.append(index_new + window_index_id)
+            # Convert merged-patch counts to patch counts and continue the running total.
+            cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+        window_index = torch.cat(window_index, dim=0)
+        return window_index, cu_window_seqlens
+    def forward(
+        self,
+        tokens: torch.Tensor,
+        grid_thws: torch.Tensor,
+        output_hidden_states: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
+        """Run all blocks over the packed tokens.
+
+        Args:
+            tokens: 2-D packed patch tokens ``(seq_len, hidden)``.
+            grid_thws: rows of ``(t, h, w)`` grid sizes, one per image.
+            output_hidden_states: when true, also collect each block's output
+                (restored to the input token order).
+
+        Returns:
+            ``(tokens, hidden_states)``: the normalised output in the input
+            token order, and a tuple of per-block outputs or ``None``.
+        """
+        # RoPE, modified from qwen2.5_vl
+        rotary_pos_emb = self.rot_pos_emb(grid_thws)
+        window_index, cu_window_seqlens = self.get_window_index(grid_thws)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=tokens.device,
+            dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        # Windows with no real entries repeat the previous boundary; drop the repeats.
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
+        seq_len, _ = tokens.size()
+        # Permute tokens into window order, one merged group at a time.
+        tokens = tokens.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        tokens = tokens[window_index, :, :]
+        tokens = tokens.reshape(seq_len, -1)
+        # Apply the same permutation to the rotary angles.
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+        # Full-attention boundaries: one segment of h*w patches per frame.
+        cu_seqlens = torch.repeat_interleave(grid_thws[:, 1] * grid_thws[:, 2], grid_thws[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        # Inverse permutation, used to restore the input token order.
+        reverse_indices = torch.argsort(window_index)
+        hidden_states = () if output_hidden_states else None
+        for index, block in enumerate(self.blocks):
+            # With fullatt_block_indexes unset, every block uses full attention.
+            if self.fullatt_block_indexes is None or index in self.fullatt_block_indexes:
+                cu_seqlens_tmp = cu_seqlens
+            else:
+                cu_seqlens_tmp = cu_window_seqlens
+            if self.gradient_checkpointing and self.training:
+                tokens = self._gradient_checkpointing_func(block.__call__, tokens, cu_seqlens_tmp, position_embeddings)
+            else:
+                tokens = block(tokens, cu_seqlens_tmp, position_embeddings)
+            if output_hidden_states:
+                tokens_ = tokens.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+                hidden_states += (tokens_[reverse_indices,:].reshape(seq_len, -1),)
+        tokens = self.post_trunk_norm(tokens)
+        # Undo the window-order permutation.
+        tokens = tokens.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        tokens = tokens[reverse_indices,:].reshape(seq_len, -1)
+        return tokens, hidden_states
+class AIMv2PretrainedModel(PreTrainedModel):
+    """Base class binding ``Aimv2VisionConfig`` to the ``PreTrainedModel`` machinery."""
+    config_class = Aimv2VisionConfig
+    base_model_prefix = "aimv2"
+    supports_gradient_checkpointing = True
+    main_input_name = "pixel_values"
+    _no_split_modules = ["AIMv2ViTPreprocessor", "AIMv2Block"]
+    # NOTE(review): only a flash-attention based attention module is visible in
+    # this file -- confirm that advertising SDPA support is intended.
+    _supports_sdpa = True
+    _supports_flash_attn_2 = True
+class Aimv2VisionModel(AIMv2PretrainedModel):
+    """Vision encoder: patch preprocessing followed by the transformer trunk."""
+    def __init__(self, config: Aimv2VisionConfig):
+        super().__init__(config)
+        self.preprocessor = AIMv2ViTPreprocessor(config)
+        self.trunk = AIMv2Transformer(config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        grid_hws: torch.Tensor,
+    ):
+        """Encode flattened patches.
+
+        Args:
+            hidden_states: flattened pixel patches, packed along dim 0.
+            grid_hws: rows of ``(h, w)`` grid sizes, one per image.
+
+        Returns:
+            The trunk output tokens.
+        """
+        # NOTE: this is the in-house ViT input interface.
+        # Repeat the patches along dim 1 once per temporal patch.
+        copies = [hidden_states] * self.config.temporal_patch_size
+        pixel_values = torch.cat(copies, dim=1)
+        # Prepend a temporal size of one to every (h, w) row.
+        num_images = grid_hws.shape[0]
+        ones_column = torch.ones(num_images, 1, device=grid_hws.device, dtype=grid_hws.dtype)
+        grid_thws = torch.cat([ones_column, grid_hws], dim=1)
+        embedded = self.preprocessor(pixel_values, grid_thws=grid_thws)
+        encoded, _ = self.trunk(embedded, grid_thws=grid_thws, output_hidden_states=False)
+        return encoded
+__all__ = ["Aimv2VisionModel"]

modeling_andesvl.py ADDED Viewed

	@@ -0,0 +1,288 @@

+from torch import nn
+import torch.utils.checkpoint
+from transformers import Qwen3ForCausalLM
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import  logging
+from .configuration_andesvl import AndesVLConfig
+from .modeling_aimv2_navit_rope import Aimv2VisionModel
+logger = logging.get_logger(__name__)
+class AndesVLForConditionalGeneration(PreTrainedModel):
+    config_class = AndesVLConfig
+    main_input_name = 'pixel_values'
+    _supports_flash_attn_2 = True
+    _no_split_modules = ['Aimv2VisionModel','Qwen3DecoderLayer']
+    def __init__(self, config: AndesVLConfig):
+        """Build the vision encoder, the language model and the projector between them.
+
+        Args:
+            config: composite configuration providing ``vision_config`` and ``text_config``.
+        """
+        super().__init__(config)
+        self.config = config
+        self.vision_encoder = Aimv2VisionModel(config.vision_config)
+        self.language_model = Qwen3ForCausalLM(config.text_config)
+        vit_hidden_size = self.vision_encoder.config.hidden_size
+        llm_hidden_size = self.language_model.config.hidden_size
+        self.patch_size = self.vision_encoder.config.patch_size
+        # Two-layer projector from 4x the vision width to the language-model width.
+        # NOTE(review): the factor 4 presumably corresponds to hidden_stride ** 2
+        # with hidden_stride == 2 -- confirm before changing hidden_stride.
+        self.mlp = nn.Sequential(
+            nn.Linear(vit_hidden_size * 4, vit_hidden_size * 4),
+            nn.GELU(),
+            nn.Linear(vit_hidden_size * 4, llm_hidden_size),
+        )
+    def get_input_embeddings(self):
+        """Return the language model's token-embedding module."""
+        return self.language_model.model.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the language model's token-embedding module with ``value``."""
+        self.language_model.model.embed_tokens = value
+    def get_output_embeddings(self):
+        """Return the language model's ``lm_head`` module."""
+        return self.language_model.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the language model's ``lm_head`` module with ``new_embeddings``."""
+        self.language_model.lm_head = new_embeddings
+    def get_flated_pixel_values(self, pixel_values):
+        """Cut every image into patches and pack them into one 2-D tensor.
+
+        Patches are emitted in groups of 2x2 neighbouring patches, so both
+        image sides must be multiples of ``2 * patch_size``.
+
+        Args:
+            pixel_values: iterable of image tensors, each of shape ``(3, H, W)``.
+
+        Returns:
+            ``(flated_pixel_values, image_grid_hw)``: the packed patches of
+            shape ``(Len_img, 3 * patch_size * patch_size)`` and a
+            ``(N_img, 2)`` tensor holding each image's patch grid
+            ``(H // patch_size, W // patch_size)``.
+
+        Raises:
+            AssertionError: if an image does not have 3 channels or a side is
+                not a multiple of ``2 * patch_size``.
+        """
+        merged_patch = 2 * self.patch_size
+        flated_pixel_values = []
+        image_grid_hw = []
+        for pv in pixel_values:
+            c, h, w = pv.shape
+            # The reshape below splits each side into (blocks, 2, patch_size), so
+            # divisibility by patch_size alone is not sufficient.
+            assert c==3 and h%merged_patch==0 and w%merged_patch==0, f"{c}, {w}, {h}, {self.patch_size}"
+            image_grid_hw.append((h//self.patch_size, w//self.patch_size))
+            fpv = pv.reshape(c, h//merged_patch, 2, self.patch_size, w//merged_patch, 2, self.patch_size)
+            # Order: block row, block col, row in block, col in block, then the patch's own pixels.
+            flated_pixel_values.append(fpv.permute(1, 4, 2, 5, 0, 3, 6).reshape(-1, c*self.patch_size*self.patch_size))
+        flated_pixel_values = torch.cat(flated_pixel_values, dim=0) # (Len_img, C * patch_size * patch_size)
+        image_grid_hw = torch.tensor(image_grid_hw, device=flated_pixel_values.device) # (N_img, 2)
+        return flated_pixel_values, image_grid_hw
+    def get_vit_embeds_and_merge(self, pixel_values, image_grid_hw, input_embeds, image_flags):
+        """
+        Encode the image patches and write them into the image positions of the token embeddings.
+
+        Args:
+            pixel_values: (Len_img, H_vit0), flattened initial patch features, concatenated along the sequence dimension
+            image_grid_hw: (N_img, 2), the width and height of each image
+            input_embeds: (Bt, Lt, Ht), the embedding of each token
+            image_flags: (Bt, Lt), whether each token is an image token
+        Returns:
+            The token embeddings, shape (Bt, Lt, Ht), with positions where ``image_flags == 1``
+            replaced by projected vision features.
+        """
+        vit_embeds = self.vision_encoder(pixel_values, image_grid_hw)  # (Len_img, H_vit)
+        # Fold every 4 consecutive patch features into one row.
+        vit_embeds = vit_embeds.view(-1, vit_embeds.shape[-1]*4) # (Len_img//4, H_vit*4)
+        vit_embeds = self.mlp(vit_embeds) # (Len_img//4, H_llm)
+        # Keep only as many vision rows as there are flagged positions.
+        vit_embeds = vit_embeds[:image_flags.sum()]
+        Bt, Lt, Ht = input_embeds.shape
+        input_embeds = input_embeds.reshape(-1, Ht)
+        image_flags = image_flags.view(-1)
+        # In-place masked write into the flattened embeddings.
+        input_embeds[image_flags == 1] = vit_embeds
+        input_embeds = input_embeds.view(Bt, Lt, Ht)
+        return input_embeds
+    @torch.inference_mode()
+    @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+    def generate(
+        self,
+        pixel_values=None,
+        input_ids=None,
+        attention_mask=None,
+        image_flags=None,  # (Bt, Lt)
+        generation_config=None,
+        **generate_kwargs,
+    ) -> torch.LongTensor:
+        """Generate with the language model, optionally injecting image features.
+
+        Args:
+            pixel_values: images handed to ``get_flated_pixel_values``; read
+                only when ``image_flags`` marks at least one position.
+            input_ids: token ids, embedded with the language model's input embeddings.
+            attention_mask: forwarded to the language model's ``generate``.
+            image_flags: (Bt, Lt) mask; positions equal to 1 receive vision features.
+            generation_config: forwarded to the language model's ``generate``.
+            **generate_kwargs: forwarded to the language model's ``generate``.
+
+        Returns:
+            Whatever the language model's ``generate`` returns.
+        """
+        input_embeds = self.language_model.get_input_embeddings()(input_ids)  # (Bt, Lt, Ht)
+        # Identity check: `!= None` on a tensor relies on operator overloading.
+        if image_flags is not None and (image_flags == 1).sum() > 0:
+            flated_pixel_values, image_grid_hw = self.get_flated_pixel_values(pixel_values)
+            input_embeds = self.get_vit_embeds_and_merge(flated_pixel_values, image_grid_hw, input_embeds, image_flags)
+        outputs = self.language_model.generate(
+            input_ids=input_ids,
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            generation_config=generation_config,
+            use_cache=True,
+            **generate_kwargs,
+        )
+        return outputs
+    # NOTE: the completion and chat interfaces do not support batched inference yet; for batches, build the inputs of self.generate manually.
+    def completion(self, prompt, images, tokenizer, image_processor, **kwargs):
+        """输入一段文字和一组图片（其中文字中的图片用占位符标记为<image>），输出补全的文本"""
+        assert prompt.count("<image>") == len(images), "图片数量和占位符数量不匹配"
+        def replacement(m):
+            token_count = image_tokens.pop(0)
+            return f"<img>{'<|vision_pad|>' * token_count}</img>"
+        #首先对所有的图像进行处理，获取对应的size
+        max_size = kwargs.get("max_size", 733) # max_size**2为支持的最大的面积
+        base = self.patch_size*2
+        image_token_id = tokenizer.vocab['<|vision_pad|>'] # 图像token的占位符
+        background_color = tuple(int(x*255) for x in image_processor.image_mean)
+        transform = T.Compose([T.ToTensor(),T.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)])
+        pixel_values = []
+        image_tokens = []
+        for image in images:
+            if isinstance(image, (tuple, list)):
+                image, detail = image
+            else:
+                detail = "low"
+            image = load_image(image)
+            if detail=="low":
+                image = native_preprocess(image, max_size, base, background_color, min_tokens=4)
+                pixel_values.append(transform(image))
+                image_tokens.append(image.size[0]*image.size[1]//(base*base))
+            else:
+                raise NotImplementedError("暂未实现")
+        new_prompt = re.sub(r"<image>", replacement, prompt)
+        input_ids = tokenizer(new_prompt, return_tensors="pt", add_special_tokens=False).input_ids.to(self.device)
+        image_flags = (input_ids == image_token_id).int()
+        input_ids = input_ids.to(self.vision_encoder.device)
+        pixel_values = [pv.to(self.vision_encoder.device) for pv in pixel_values]
+        image_flags = image_flags.to(self.vision_encoder.device)
+        output_ids = self.generate(pixel_values=pixel_values, input_ids=input_ids, image_flags=image_flags, **kwargs)[0][input_ids.shape[1]:]
+        return tokenizer.decode(output_ids, skip_special_tokens=True)
+    def chat(self, messages, tokenizer, image_processor, **kwargs):
+        """输入是一组对话信息（openai格式），输出是回复"""
+        prompt = ""
+        images = []
+        for message in messages:
+            role = message["role"]
+            assert role in ["user", "assistant", "system"], f"非法的角色{role}"
+            content = message['content']
+            if isinstance(content, str):
+                prompt += f"<|im_start|>{role}\n{content}{tokenizer.eos_token}\n"
+            elif isinstance(content, list):
+                temp = ""
+                for sub_content in content:
+                    if sub_content['type']=='text':
+                        temp += f"{sub_content['text']}"
+                    elif sub_content['type']=='image_url':
+                        temp += "<image>"
+                        images.append([load_image(sub_content['image_url']['url']), sub_content['image_url'].get("detail",'low')])
+                prompt += f"<|im_start|>{role}\n{temp}{tokenizer.eos_token}\n"
+            else:
+                raise ValueError(f"非法的内容{content}")
+        thinking = 'thinking' in kwargs and kwargs['thinking']
+        if 'thinking' in kwargs:
+            kwargs.pop('thinking')
+        prompt += f"<|im_start|>assistant\n" + ('<think>' if thinking else '')
+        return ('<think>' if thinking else '') + self.completion(prompt, images, tokenizer, image_processor, **kwargs)
+################################
+### Image preprocessing code ###
+################################
+import os
+import math
+import re
+from typing import Union
+import requests
+import base64
+from io import BytesIO
+from PIL import Image
+import torchvision.transforms as T
+def load_image(source: Union[str, Image.Image]) -> Image.Image:
+    """加载图像"""
+    if isinstance(source, Image.Image):
+        img = source
+    elif isinstance(source, str):
+        if source.startswith('http'):
+            response = requests.get(source)
+            response.raise_for_status()
+            img = Image.open(BytesIO(response.content))
+        elif os.path.exists(source):
+            img = Image.open(source)
+        elif source.startswith('data:image'):
+            img = Image.open(BytesIO(base64.b64decode(source.split(',')[1])))
+        else:
+            raise ValueError("Unsupported image source")
+    else:
+        raise ValueError("Unsupported image source")
+    return img.convert('RGB')
+def get_scaled_img_size(image_size, max_area, base, max_resolution=4172, upper=True):
+    """计算缩放后的图片大小和包裹矩形的大小"""
+    # 计算原始图片的宽高比
+    aspect_ratio = image_size[0] / image_size[1]
+    # 计算包裹矩形的最大可能宽度和高度
+    max_width = math.floor(math.sqrt(max_area * aspect_ratio))
+    max_height = math.floor(math.sqrt(max_area / aspect_ratio))
+    max_width, max_height = min(max_width, max_resolution), min(
+        max_height, max_resolution
+    )
+    max_width, max_height = max(max_width, base), max(max_height, base)
+    # 确保包裹矩形的宽度和高度都是base的整数倍
+    if not upper:
+        # 向下取整, 保证面积不会超过max_area
+        max_width = max_width - max_width % base
+        max_height = max_height - max_height % base
+    else:
+        # 向上取整，同时不超过max_resolution（单边最大长度）
+        max_width = min(max_width + (base - max_width % base), max_resolution)
+        max_height = min(max_height + (base - max_height % base), max_resolution)
+    # 计算缩放因子
+    scale_factor = min(max_width / image_size[0], max_height / image_size[1])
+    # 计算缩放后的图片大小
+    new_image_size = (
+        round(image_size[0] * scale_factor),
+        round(image_size[1] * scale_factor),
+    )
+    # 计算包裹矩形的大小
+    bounding_box_size = (max_width, max_height)
+    return new_image_size, bounding_box_size
+def max_preprocess(
+    img, max_size, base, background_color, max_resolution=4172, upper=True, force_resize=False
+):
+    """对图片进行预处理，使其面积接近max_size**2"""
+    # 首先把图片resize到长度和宽度都低于max_resolution
+    w, h = img.size
+    if max(w, h) > max_resolution:
+        scale = max_resolution / max(w, h)
+        w, h = int(w * scale), int(h * scale)
+    # 获取缩放后的图片大小和包裹矩形的大小
+    new_image_size, bounding_box_size = get_scaled_img_size(
+        (w, h), max_size**2, base, max_resolution, upper
+    )
+    if force_resize:
+        return img.resize(bounding_box_size)
+    # 创建一个新的画布
+    canvas = Image.new("RGB", bounding_box_size, background_color)
+    # 计算将图像粘贴到画布上的位置
+    paste_width = (bounding_box_size[0] - new_image_size[0]) // 2
+    paste_height = (bounding_box_size[1] - new_image_size[1]) // 2
+    # 将图像粘贴到画布上
+    canvas.paste(img.resize(new_image_size), (paste_width, paste_height))
+    return canvas
+def native_preprocess(
+    img, max_size, base, background_color, max_resolution=4172, min_tokens=64
+):
+    # 对图片进行处理，使其宽度和高度都是base的整数倍
+    # 如果图片的最长边超过max_resolution，就把图片resize到max_resolution以内
+    w, h = img.size
+    # 首先保证图片的最长边不超过max_resolution(ViT在极限长度)
+    if max(w, h) > max_resolution:
+        scale = max_resolution / max(w, h)
+        w, h = int(w * scale), int(h * scale)
+        img = img.resize((w, h))
+    if w * h > max_size**2:
+        return max_preprocess(img, max_size, base, background_color, max_resolution)
+    if w * h < (base * base * min_tokens):
+        return max_preprocess(
+            img,
+            int(base * (min_tokens**0.5)),
+            base,
+            background_color,
+            max_resolution,
+        )
+    w1, h1 = w + base - w % base, h + base - h % base
+    if w1 == w and h1 == h:
+        return img
+    else:
+        # 创建一个新的(w1, h1)的画布，并把图片resize保证只有一侧存在白边的情况
+        scale = min(w1 / w, h1 / h)
+        new_w, new_h = int(w * scale), int(h * scale)
+        img = img.resize((new_w, new_h))
+        canvas = Image.new("RGB", (w1, h1), background_color)
+        canvas.paste(img, ((w1 - new_w) // 2, (h1 - new_h) // 2))
+        return canvas

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "crop_size": {
+    "height": -1,
+    "width": -1
+  },
+  "do_center_crop": false,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": false,
+  "hidden_stride": 2,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 2408448,
+  "min_pixels": 200704,
+  "patch_size": 14,
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": -1
+  },
+  "temporal_patch_size": 1
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48bdeafd07da18afd3031097f745b0e91937d48c17d3282e78efbe88be86550d
+size 9497554527

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<img>",
+    "</img>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e0d3ee707b399f44f189e1abfb2b3cd844b96407e9b2a5a21cb3e0b5f57bb05
+size 11422629

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<img>",
+    "</img>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 262144,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff