Hyggge committed
Commit 7e9d312 · 1 Parent(s): 4817fd0

feat: add modeling code

config.json CHANGED
@@ -3,6 +3,11 @@
  "architectures": [
    "ValleyQwen2ForCausalLM"
  ],
+ "auto_map": {
+   "AutoConfig": "modeling_valley.ValleyConfig",
+   "AutoModel": "modeling_valley.ValleyQwen2ForCausalLM",
+   "AutoModelForCausalLM": "modeling_valley.ValleyQwen2ForCausalLM"
+ },
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eagle_vision_tower": "Qwen/Qwen2-VL-7B-Instruct",
@@ -32,7 +37,6 @@
  "mm_vision_select_layer": -2,
  "mm_vision_siglip_select_layer": -1,
  "mm_vision_tower": "google/siglip-so400m-patch14-384",
- "model_class": "valley-product",
  "model_type": "valley",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
modeling_projector.py ADDED
@@ -0,0 +1,162 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ def build_vision_projector(config, delay_load=False, **kwargs):
6
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
7
+
8
+ if projector_type == 'conv_adapter':
9
+ return ConvAdapter(config.mm_hidden_size, config.hidden_size, getattr(config, "mlp_hidden_dim", None))
10
+ elif projector_type == 'mlp_pixel_shuffle':
11
+ return MlpPixelShuffle(config.mm_hidden_size, config.hidden_size,
12
+ config.pixelshuffle_downsample_ratio, getattr(config, "mlp_hidden_dim", None))
13
+ elif projector_type == 'ovis_conv_adapter':
14
+ return OvisConvAdapter(config.mm_hidden_size, config.hidden_size, getattr(config, "mlp_hidden_dim", 32000),
15
+ getattr(config, "tokenize_function", "softmax"))
16
+ raise ValueError(f'Unknown projector type: {projector_type}')
17
+
18
+
19
+ class ConvAdapter(nn.Module):
20
+ def __init__(self, dim_in, dim_out, mlp_hidden_dim=None):
21
+ super().__init__()
22
+ self.mm_projector_type = 'conv_adapter'
23
+ if mlp_hidden_dim is None:
24
+ self.mlp = nn.Sequential(
25
+ nn.Linear(dim_in, dim_out),
26
+ nn.GELU(),
27
+ nn.Linear(dim_out, dim_out)
28
+ )
29
+ else:
30
+ self.mlp = nn.Sequential(
31
+ nn.Linear(dim_in, mlp_hidden_dim),
32
+ nn.GELU(),
33
+ nn.Linear(mlp_hidden_dim, dim_out)
34
+ )
35
+ self.conv = nn.Conv2d(dim_out, dim_out, kernel_size=(3, 3), stride=(2, 2), padding=1)
36
+
37
+ def forward(self, x):
38
+ """
39
+ Args:
40
+ x (torch.Tensor): image features
41
+ shape (F, v, D)
42
+ Returns:
43
+ shape (F, n, D) where n is the reduced number of tokens
44
+ """
45
+ x = self.mlp(x)
46
+
47
+ f, v, d = x.shape
48
+ s = int(math.sqrt(v - 1))
49
+ x = x[:, 1:, :] # remove cls_token
50
+ x = x.reshape(f, s, s, d).permute([0, 3, 1, 2])
51
+ x = self.conv(x)
52
+ x = x.permute([0, 2, 3, 1]).reshape(f, -1, d)
53
+ return x
54
+
55
+
56
+ class MlpPixelShuffle(nn.Module):
57
+ def __init__(self, dim_in, dim_out, pixelshuffle_downsample_ratio, mlp_hidden_dim=None):
58
+ super().__init__()
59
+ self.mm_projector_type = 'mlp_pixel_shuffle'
60
+ if mlp_hidden_dim is None:
61
+ self.mlp = nn.Sequential(
62
+ nn.Linear(int(dim_in * (pixelshuffle_downsample_ratio ** 2)), dim_out),
63
+ nn.GELU(),
64
+ nn.Linear(dim_out, dim_out)
65
+ )
66
+ else:
67
+ self.mlp = nn.Sequential(
68
+ nn.Linear(int(dim_in * (pixelshuffle_downsample_ratio ** 2)), mlp_hidden_dim),
69
+ nn.GELU(),
70
+ nn.Linear(mlp_hidden_dim, dim_out)
71
+ )
72
+ self.scale_factor = pixelshuffle_downsample_ratio
73
+
74
+ def pixel_shuffle(self, x, scale_factor=2):
75
+ # fold each (scale_factor x scale_factor) spatial block into the channel dimension
76
+
77
+ n, w, h, c = x.size()
78
+ # N, W, H, C --> N, W, H / scale, C * scale
79
+ x = x.view(n, w, int(h / scale_factor), int(c * scale_factor))
80
+ # N, W, H / scale, C * scale --> N, H / scale, W, C * scale
81
+ x = x.permute(0, 2, 1, 3).contiguous()
82
+ # N, H / scale, W, C * scale --> N, H / scale, W / scale, C * (scale ** 2)
83
+ x = x.view(n, int(h / scale_factor), int(w / scale_factor),
84
+ int(c * (scale_factor * scale_factor)))
85
+
86
+ x = x.permute(0, 2, 1, 3).contiguous()
87
+
88
+ return x
89
+
90
+ def forward(self, x):
91
+ """
92
+ Args:
93
+ x (torch.Tensor): image features
94
+ shape (F, v, D)
95
+ Returns:
96
+ shape (F, n, D) where n is the reduced number of tokens
97
+ """
98
+ x = x[:, 1:, :] # remove cls_token
99
+ h = w = int(x.shape[1] ** 0.5)
100
+ x = x.view(x.shape[0], h, w, -1)
101
+ x = self.pixel_shuffle(x, self.scale_factor)
102
+ x = self.mlp(x)
103
+ x = x.view(x.shape[0],-1,x.shape[-1])
104
+ return x
105
+
106
+
107
+ class OvisConvAdapter(nn.Module):
108
+ def __init__(self, dim_in, dim_out, vocab_size, tokenize_function="softmax"):
109
+ super().__init__()
110
+ self.mm_projector_type = 'ovis_conv_adapter'
111
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), padding=1)
112
+ self.mlp = torch.nn.Sequential(
113
+ torch.nn.Linear(dim_in, vocab_size, bias=False),
114
+ torch.nn.LayerNorm(vocab_size)
115
+ )
116
+ self.embedding = torch.nn.Embedding(vocab_size, dim_out)
117
+ self.tokenize_function = tokenize_function
118
+
119
+ def tokenize(self, logits):
120
+ def st_argmax(y_soft, dim):  # straight-through argmax
121
+ index = y_soft.max(dim, keepdim=True)[1]
122
+ y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
123
+ ret = y_hard - y_soft.detach() + y_soft
124
+ return ret
125
+
126
+ if self.tokenize_function == 'softmax':
127
+ tokens = torch.nn.functional.softmax(logits, dim=-1)
128
+ elif self.tokenize_function == 'gumbel_argmax':
129
+ tokens = torch.nn.functional.gumbel_softmax(logits, tau=getattr(self, "tau", 1.0), hard=True)  # this module has no self.config; tau falls back to 1.0
130
+ elif self.tokenize_function == 'st_argmax':
131
+ tokens = st_argmax(logits, dim=-1)
132
+ else:
133
+ raise ValueError(
134
+ 'Invalid `tokenize_function`, expected softmax, gumbel_argmax or st_argmax,'
135
+ f' but got {self.tokenize_function}'
136
+ )
137
+ return tokens
138
+
139
+ def forward(self, x):
140
+ """
141
+ Args:
142
+ x (torch.Tensor): image features
143
+ shape (F, v, D)
144
+ Returns:
145
+ shape (F, n, D) where n is the reduced number of tokens
146
+ """
147
+ # conv
148
+ f, v, d = x.shape
149
+ s = int(math.sqrt(v - 1))
150
+ x = x[:, 1:, :] # remove cls_token
151
+ x = x.reshape(f, s, s, d).permute([0, 3, 1, 2])
152
+ x = self.conv(x)
153
+ x = x.permute([0, 2, 3, 1]).reshape(f, -1, d)
154
+
155
+ # tokenize
156
+ logits = self.mlp(x)
157
+ visual_tokens = self.tokenize(logits)
158
+
159
+ # get embeddings
160
+ out = torch.matmul(visual_tokens, self.embedding.weight)
161
+
162
+ return out
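As a quick sanity check of the projectors above, here is a hedged sketch that builds the conv_adapter variant from a minimal config object; the sizes are illustrative (SigLIP-so400m yields 1152-d features with one cls token plus a 27x27 patch grid, projected into a 3584-d LLM space):

import torch
from types import SimpleNamespace

# Illustrative config; real values come from config.json.
cfg = SimpleNamespace(mm_projector_type="conv_adapter", mm_hidden_size=1152, hidden_size=3584)
projector = build_vision_projector(cfg)
feats = torch.randn(2, 1 + 27 * 27, 1152)  # (frames, cls + patches, dim)
print(projector(feats).shape)  # torch.Size([2, 196, 3584]) after the stride-2 conv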
modeling_valley.py ADDED
@@ -0,0 +1,520 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import numpy as np
17
+ from torch import nn
18
+ from torch.nn import CrossEntropyLoss
19
+ from abc import ABC, abstractmethod
20
+ from typing import List, Optional, Tuple, Union
21
+ from transformers.modeling_outputs import CausalLMOutputWithPast
22
+ from transformers import AutoConfig, AutoModelForCausalLM, Qwen2Config, Qwen2ForCausalLM, Qwen2Model
23
+
24
+ from .modeling_vision_tower import build_vision_tower
25
+ from .modeling_projector import build_vision_projector
26
+ from .utils import get_anyres_image_grid_shape, unpad_image, IGNORE_INDEX, IMAGE_TOKEN_INDEX
27
+
28
+
29
+ class ValleyConfig(Qwen2Config):
30
+ model_type = "valley"
31
+
32
+ class ValleyMetaModel:
33
+ def __init__(self, config):
34
+ super(ValleyMetaModel, self).__init__(config)
35
+ # Build vision tower
36
+ if hasattr(config, "mm_vision_tower"):
37
+ if getattr(config, "eagle_vision_tower", None) is not None:
38
+ self.vision_tower, self.qwen2vl_vision_tower = build_vision_tower(config, delay_load=False)
39
+ else:
40
+ self.vision_tower = build_vision_tower(config, delay_load=False)
41
+ # Build Projector
42
+ if hasattr(config, "mm_projector_type"):
43
+ self.mm_projector = build_vision_projector(config)
44
+
45
+ def get_vision_tower(self):
46
+ vision_tower = getattr(self, "vision_tower", None)
47
+ if getattr(self.config, "eagle_vision_tower", None) is not None:
48
+ qwen2vl_vision_tower = getattr(self, "qwen2vl_vision_tower", None)
49
+ return vision_tower, qwen2vl_vision_tower
50
+ else:
51
+ return vision_tower
52
+
53
+ class ValleyMetaForCausalLM(ABC):
54
+ @abstractmethod
55
+ def get_model(self):
56
+ pass
57
+
58
+ def get_vision_tower(self):
59
+ return self.get_model().get_vision_tower()
60
+
61
+ def split_by_instance(self, original_list, split_sizes):
62
+ start = 0
63
+ sub_lists = []
64
+ for size in split_sizes:
65
+ end = start + size
66
+ sub_list = original_list[start:end]
67
+ sub_lists.append([x.to(self.device) for x in sub_list])
68
+ start = end
69
+ return sub_lists
70
+
71
+ def encode_images_qwen2vl(self, pixel_values = None, grid_thw = None, split_sizes=None):
72
+ _, qwen2vl_vision_tower = self.get_model().get_vision_tower()
73
+ qwen2vl_image_features = qwen2vl_vision_tower(pixel_values, grid_thw)
74
+ qwen2vl_image_split_sizes = torch.prod(grid_thw[:, 1:3]//2, dim=1)
75
+ qwen2vl_image_features = torch.split(qwen2vl_image_features, qwen2vl_image_split_sizes.tolist(), dim=0)
76
+ qwen2vl_image_features = self.split_by_instance(qwen2vl_image_features, split_sizes)
77
+ return qwen2vl_image_features
78
+
79
+ def encode_images(self, images = None, split_sizes = None):
80
+ """
81
+ images: (if not anyres) images.shape = [n,3,336,336] , n = number of images + (number of video) * 8
82
+ images: (if anyres) images.shape = [n,3,336,336] , n = number of tiles * number of images
83
+ """
84
+ if getattr(self.config, "eagle_vision_tower", None) is not None:
85
+ siglip_vision_tower, _ = self.get_model().get_vision_tower()
86
+ image_features = siglip_vision_tower(images)
87
+ image_features = self.get_model().mm_projector(image_features)
88
+ else:
89
+ image_features = self.get_model().get_vision_tower()(images)
90
+ image_features = self.get_model().mm_projector(image_features)
91
+
92
+ if getattr(self.config,'anyres', False) and getattr(self.config, 'max_vision_token', None) is not None:
93
+ assert split_sizes is not None
94
+ image_features = list(torch.split(image_features, split_sizes, dim=0))
95
+ for i, image_feature in enumerate(image_features):
96
+ hidden_dim = image_feature.shape[-1]
97
+ image_tokens = image_feature.shape[0]*image_feature.shape[1]
98
+ if getattr(self.config, "eagle_vision_tower", None) is not None:
99
+ pass # the max_vision_token will be processed in the unpad image token part
100
+ else:
101
+ if image_tokens > self.config.max_vision_token:
102
+ intput_shape = int((image_feature.shape[1])**0.5)
103
+ output_shape = int((self.config.max_vision_token/image_feature.shape[0])**0.5)
104
+ image_feature = image_feature.view(image_feature.shape[0],intput_shape, intput_shape, -1).permute(0,3,1,2)
105
+ m = nn.AdaptiveAvgPool2d(output_shape) # different from roi pooling, but in square image, it seems the same
106
+ pooling_feature = m(image_feature).permute(0,2,3,1)
107
+ image_features[i] = pooling_feature.view(image_feature.shape[0], -1, hidden_dim)
108
+ split_sizes = None # have already split, set the flag
109
+
110
+ if getattr(self.config, 'mm_use_im_start_end', False):
111
+ raise ValueError('mm_use_im_start is not support')
112
+ if split_sizes is not None:
113
+ image_features = torch.split(image_features, split_sizes, dim=0)
114
+
115
+ return image_features
116
+
117
+
118
+ def prepare_inputs_labels_for_multimodal(
119
+ self, input_ids, position_ids, attention_mask, past_key_values, labels, images,
120
+ image_sizes, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw):
121
+
122
+ vision_tower = self.get_vision_tower()
123
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
124
+ if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
125
+ target_shape = past_key_values[-1][-1].shape[-2] + 1
126
+ attention_mask = torch.cat((attention_mask, torch.ones(
127
+ (attention_mask.shape[0], target_shape - attention_mask.shape[1]),
128
+ dtype=attention_mask.dtype,
129
+ device=attention_mask.device
130
+ )), dim=1)
131
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
132
+ return input_ids, position_ids, attention_mask, past_key_values, None, labels
133
+
134
+ # Step1: Get image embedings
135
+ if type(images) is list or images.ndim == 5:
136
+ # Without slicing the image
137
+ if not getattr(self.config,'anyres', False):
138
+ concat_images = torch.cat([image for image in images], dim=0) # to do batch compute
139
+ split_sizes = [image.shape[0] for image in images]
140
+
141
+ # Get vision tower feature, check whether only use navit firstly
142
+ if getattr(self.config, 'eagle_vision_tower', None) is not None and getattr(self.config, 'only_navit', False):
143
+ image_features = None
144
+ else:
145
+ image_features = self.encode_images(concat_images, split_sizes)
146
+ image_features = [x.to(self.device) for x in image_features]
147
+
148
+ # Get Eagle features
149
+ if getattr(self.config, 'eagle_vision_tower', None) is not None:
150
+ if pixel_values is not None:
151
+ qwen2vl_image_features = self.encode_images_qwen2vl(pixel_values, image_grid_thw, split_sizes)
152
+ elif pixel_values_videos is not None:
153
+ qwen2vl_image_features = self.encode_images_qwen2vl(pixel_values_videos, video_grid_thw, split_sizes)
154
+ else:
155
+ qwen2vl_image_features = None
156
+
157
+ # Slicing the image, each image contains some sub_images:
158
+ # images = [
159
+ # [image1_tiles(n1,3,336,336), image2_tiles(n2,3,336,336), ...],
160
+ # [image1_tiles(n1,3,336,336), image2_tiles(n2,3,336,336), ...], ...
161
+ # ]
162
+ else:
163
+ split_sizes = [len(image) for image in images]
164
+ # Get Eagle features
165
+ if getattr(self.config, "eagle_vision_tower", None) is not None:
166
+ if pixel_values is not None:
167
+ qwen2vl_image_features = self.encode_images_qwen2vl(pixel_values, image_grid_thw, split_sizes)
168
+ elif pixel_values_videos is not None:
169
+ qwen2vl_image_features = self.encode_images_qwen2vl(pixel_values_videos, video_grid_thw, split_sizes)
170
+ else:
171
+ qwen2vl_image_features = None
172
+
173
+ # Get vision tower feature, check whether only use navit firstly
174
+ if getattr(self.config, 'eagle_vision_tower', None) is not None and getattr(self.config, 'only_navit', False):
175
+ image_features = None
176
+ else:
177
+ image_features = []
178
+ all_concat_images = []
179
+ all_split_sizes = []
180
+ for batch_images in images:
181
+ concat_images = torch.cat([image for image in batch_images], dim=0) # to do batch compute
182
+ split_sizes = [image.shape[0] for image in batch_images]
183
+ all_concat_images.append(concat_images)
184
+ all_split_sizes.append(split_sizes)
185
+ all_image_features = self.encode_images(images=torch.cat(all_concat_images, dim=0), split_sizes=sum(all_split_sizes, []))
186
+
187
+ idx = 0
188
+ for split_sizes in all_split_sizes:
189
+ batch_image_features = all_image_features[idx:idx+len(split_sizes)]
190
+ idx += len(split_sizes)
191
+ if type(batch_image_features[0]) is list:
192
+ batch_image_features = [torch.cat(x).to(self.device) for x in batch_image_features]
193
+ else:
194
+ batch_image_features = [x.view(-1,x.shape[-1]).to(self.device) for x in batch_image_features] # tiles feature need to flatten in token dimention, [n_tiles, T, d] -> [n_tiles * T, d]
195
+ image_features.append(batch_image_features)
196
+
197
+ if getattr(self.config, "eagle_vision_tower", None) is not None and getattr(self.config, 'only_navit', False) == False:
198
+ # unpad image tokens
199
+ height = width = self.config.num_patches_per_side
200
+ new_image_features = []
201
+ for batch_image_features, batch_image_sizes in zip(image_features, image_sizes):
202
+ batch_image_features_list = []
203
+ for cur_image_feature, cur_image_size in zip(batch_image_features, batch_image_sizes):
204
+ base_image_feature = cur_image_feature[:width*height, :]
205
+ image_feature = cur_image_feature[width*height:, :]
206
+ if image_feature.shape[0] != 0:
207
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(
208
+ cur_image_size,
209
+ self.config.grid_pinpoints,
210
+ self.config.vit_crop_size
211
+ )
212
+ image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) # (num_patch_H, num_patch_W, H, W, C)
213
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() # (C, num_patch_H, H, num_patch_W, W)
214
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3) # (C, num_token_H, num_token_W)
215
+ image_feature = unpad_image(image_feature, cur_image_size) # (C, num_token_H_unpad, num_token_W_unpad)
216
+ input_shape = (image_feature.shape[-2], image_feature.shape[-1])
217
+ subimage_tokens = np.prod(input_shape)
218
+
219
+ # adaptive avg 2d pool for reducing token num
220
+ max_subimage_tokens = self.config.max_vision_token-width*height
221
+ if subimage_tokens > max_subimage_tokens:
222
+ aspect_ratio = input_shape[0] / input_shape[1]
223
+ output_shape = (
224
+ int((max_subimage_tokens/aspect_ratio)**0.5*aspect_ratio),
225
+ int((max_subimage_tokens/aspect_ratio)**0.5)
226
+ )
227
+ m = nn.AdaptiveAvgPool2d(output_shape)
228
+ image_feature = m(image_feature)
229
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
230
+ image_feature = torch.cat((base_image_feature, image_feature), dim=0)
231
+ else:
232
+ image_feature = cur_image_feature
233
+ batch_image_features_list.append(image_feature)
234
+ new_image_features.append(batch_image_features_list)
235
+
236
+ image_features = new_image_features
237
+
238
+ else:
239
+ image_features = self.encode_images(images).to(self.device)
240
+
241
+
242
+ # Step2: Iterate through each sample in the batch, insert image embedings into input_embeds
243
+ # and filling labels, attention mask at the same time. Finally, get `new_input_embed`,
244
+ # `new_labels`, new_attention_mask`.
245
+ _labels = labels
246
+ _position_ids = position_ids
247
+ _attention_mask = attention_mask
248
+ if attention_mask is None:
249
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
250
+ if position_ids is None:
251
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
252
+ if labels is None:
253
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
254
+
255
+ input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask.bool())]
256
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask.bool())]
257
+ attention_mask = [cur_attention_mask[cur_attention_mask.bool()] for cur_attention_mask in attention_mask]
258
+ new_input_embeds = []
259
+ new_labels = []
260
+ new_attention_mask = []
261
+
262
+ for batch_idx, cur_input_ids in enumerate(input_ids):
263
+ cur_batch_image_idx = 0
264
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
265
+
266
+ # Step2-1: If this piece of data is pure text, then concat a dummy image to ensure the whole compute graph is same on all device
267
+ if num_images == 0:
268
+ if getattr(self.config, "eagle_vision_tower", None) is not None:
269
+ if getattr(self.config, 'only_navit', False):
270
+ cur_image_features = qwen2vl_image_features[batch_idx][cur_batch_image_idx]
271
+ else:
272
+ siglip_feat = image_features[batch_idx][cur_batch_image_idx]
273
+ try:
274
+ qwen2vl_feat = qwen2vl_image_features[batch_idx][cur_batch_image_idx]
275
+ cur_image_features = torch.cat((siglip_feat, qwen2vl_feat), dim=0)
276
+ except Exception as e:
277
+ print(e)
278
+ print("only siglip feature:", siglip_feat.shape)
279
+ cur_image_features = siglip_feat
280
+ else:
281
+ cur_image_features = image_features[batch_idx][cur_batch_image_idx]
282
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
283
+ cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features.squeeze(0)[0:0]], dim=0)
284
+ new_input_embeds.append(cur_input_embeds)
285
+ new_labels.append(labels[batch_idx])
286
+ new_attention_mask.append(attention_mask[batch_idx])
287
+ cur_batch_image_idx += 1
288
+ continue
289
+
290
+ # Step2-2: Split input_ids, labels, attention_mask by IMAGE_TOKEN_INDEX
291
+ cur_input_ids_noim, cur_labels_noim, cur_attention_mask_noim = [], [], []
292
+ cur_labels = labels[batch_idx]
293
+ cur_attention_mask = attention_mask[batch_idx]
294
+ cur_img_attention_mask = [
295
+ attention_mask[batch_idx][i].item()
296
+ for i in torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
297
+ ]
298
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
299
+ for i in range(len(image_token_indices) - 1):
300
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
301
+ cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
302
+ cur_attention_mask_noim.append(cur_attention_mask[image_token_indices[i]+1:image_token_indices[i+1]])
303
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
304
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
305
+ cur_input_embeds_no_im = list(torch.split(cur_input_embeds, split_sizes, dim=0))# get text features
306
+
307
+ # Step2-3: Insert image embedings
308
+ cur_new_input_embeds, cur_new_labels, cur_new_attention_mask = [], [], []
309
+ for i in range(num_images + 1): # to add multimodal feature internal the text feature
310
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
311
+ cur_new_labels.append(cur_labels_noim[i])
312
+ cur_new_attention_mask.append(cur_attention_mask_noim[i])
313
+ if i < num_images:
314
+ if getattr(self.config, "eagle_vision_tower", None) is not None:
315
+ if getattr(self.config, 'only_navit', False):
316
+ cur_image_features = qwen2vl_image_features[batch_idx][cur_batch_image_idx]
317
+ else:
318
+ siglip_feat = image_features[batch_idx][cur_batch_image_idx]
319
+ try:
320
+ qwen2vl_feat = qwen2vl_image_features[batch_idx][cur_batch_image_idx]
321
+ cur_image_features = torch.cat((siglip_feat, qwen2vl_feat), dim=0)
322
+ except Exception as e:
323
+ print(e)
324
+ print("only siglip feature:", siglip_feat.shape)
325
+ cur_image_features = siglip_feat
326
+ else:
327
+ cur_image_features = image_features[batch_idx][cur_batch_image_idx]
328
+ cur_batch_image_idx += 1
329
+ cur_new_input_embeds.append(cur_image_features)
330
+ cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
331
+ cur_new_attention_mask.append(torch.full((cur_image_features.shape[0],), True, device=cur_attention_mask.device, dtype=cur_attention_mask.dtype))
332
+
333
+ # Step2-4: Concat image embedings and text embedings
334
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
335
+ cur_new_labels = torch.cat(cur_new_labels)
336
+ cur_new_attention_mask = torch.cat(cur_new_attention_mask)
337
+ new_input_embeds.append(cur_new_input_embeds)
338
+ new_labels.append(cur_new_labels)
339
+ new_attention_mask.append(cur_new_attention_mask)
340
+
341
+ # Step3: Truncate sequences to max length as image embeddings can make the sequence longer
342
+ tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
343
+ if tokenizer_model_max_length is not None:
344
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
345
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
346
+ new_attention_mask = [x[:tokenizer_model_max_length] for x in new_attention_mask]
347
+
348
+ # Step4: Pad and stack input_embeds, labels, attention_mask
349
+ max_len = max(x.shape[0] for x in new_input_embeds)
350
+ batch_size = len(new_input_embeds)
351
+ new_input_embeds_padded = []
352
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
353
+ new_attention_mask_padded = torch.zeros((batch_size, max_len), dtype=new_attention_mask[0].dtype, device=new_attention_mask[0].device)
354
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
355
+
356
+ for i, (cur_new_embed, cur_new_labels, cur_attention_mask) in enumerate(zip(new_input_embeds, new_labels, new_attention_mask)):
357
+ cur_len = cur_new_embed.shape[0]
358
+ # Right padding when inferencing
359
+ if not self.training and not getattr(self, "right_padding", None):
360
+ new_input_embeds_padded.append(torch.cat((
361
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
362
+ cur_new_embed
363
+ ), dim=0))
364
+ if cur_len > 0:
365
+ new_labels_padded[i, -cur_len:] = cur_new_labels
366
+ new_attention_mask_padded[i, -cur_len:] = cur_attention_mask
367
+ position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
368
+
369
+ # Left padding while training
370
+ else:
371
+ new_input_embeds_padded.append(torch.cat((
372
+ cur_new_embed,
373
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
374
+ ), dim=0))
375
+ if cur_len > 0:
376
+ new_labels_padded[i, :cur_len] = cur_new_labels
377
+ new_attention_mask_padded[i, :cur_len] = cur_attention_mask
378
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
379
+
380
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
381
+ new_labels = new_labels_padded if _labels is not None else None
382
+ new_attention_mask = new_attention_mask_padded if _attention_mask is not None else None
383
+ if _position_ids is None:
384
+ position_ids = None
385
+
386
+ return None, position_ids, new_attention_mask, past_key_values, new_input_embeds, new_labels
387
+
388
+
389
+ class ValleyQwen2Model(ValleyMetaModel, Qwen2Model):
390
+ config_class = ValleyConfig
391
+ def __init__(self, config: Qwen2Config):
392
+ super(ValleyQwen2Model, self).__init__(config)
393
+
394
+
395
+ class ValleyQwen2ForCausalLM(Qwen2ForCausalLM, ValleyMetaForCausalLM):
396
+ config_class = ValleyConfig
397
+
398
+ def __init__(self, config):
399
+ super(Qwen2ForCausalLM, self).__init__(config)
400
+ self.model = ValleyQwen2Model(config)
401
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
402
+ self.post_init()
403
+
404
+ def get_model(self):
405
+ return self.model
406
+
407
+ def forward(
408
+ self,
409
+ input_ids: torch.LongTensor = None,
410
+ attention_mask: Optional[torch.Tensor] = None,
411
+ position_ids: Optional[torch.LongTensor] = None,
412
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
413
+ inputs_embeds: Optional[torch.FloatTensor] = None,
414
+ labels: Optional[torch.LongTensor] = None,
415
+ use_cache: Optional[bool] = None,
416
+ output_attentions: Optional[bool] = None,
417
+ output_hidden_states: Optional[bool] = None,
418
+ images: Optional[torch.FloatTensor] = None,
419
+ return_dict: Optional[bool] = None,
420
+ image_sizes: Optional[List[List[int]]] = None,
421
+ pixel_values: Optional[torch.Tensor] = None,
422
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
423
+ image_grid_thw: Optional[torch.LongTensor] = None,
424
+ video_grid_thw: Optional[torch.LongTensor] = None,
425
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
426
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
427
+ output_hidden_states = (
428
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
429
+ )
430
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
431
+
432
+ if inputs_embeds is None:
433
+ (
434
+ input_ids,
435
+ position_ids,
436
+ attention_mask,
437
+ past_key_values,
438
+ inputs_embeds,
439
+ labels
440
+ ) = self.prepare_inputs_labels_for_multimodal(
441
+ input_ids,
442
+ position_ids,
443
+ attention_mask,
444
+ past_key_values,
445
+ labels,
446
+ images,
447
+ image_sizes,
448
+ pixel_values,
449
+ pixel_values_videos,
450
+ image_grid_thw,
451
+ video_grid_thw,
452
+ )
453
+
454
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
455
+ outputs = self.model(
456
+ input_ids=input_ids,
457
+ attention_mask=attention_mask,
458
+ position_ids=position_ids,
459
+ past_key_values=past_key_values,
460
+ inputs_embeds=inputs_embeds,
461
+ use_cache=use_cache,
462
+ output_attentions=output_attentions,
463
+ output_hidden_states=output_hidden_states,
464
+ return_dict=return_dict,
465
+ )
466
+
467
+ hidden_states = outputs[0]
468
+ logits = self.lm_head(hidden_states)
469
+
470
+ loss = None
471
+ if labels is not None:
472
+ # Shift so that tokens < n predict n
473
+ shift_logits = logits[..., :-1, :].contiguous()
474
+ shift_labels = labels[..., 1:].contiguous()
475
+ loss_fct = CrossEntropyLoss(reduction='mean')
476
+ bs = shift_labels.shape[0]
477
+ shift_labels = shift_labels.to(shift_logits.device)
478
+ loss = torch.stack([loss_fct(shift_logits[i], shift_labels[i]) for i in range(bs)])
479
+
480
+ if not return_dict:
481
+ output = (logits,) + outputs[1:]
482
+ return (loss,) + output if loss is not None else output
483
+
484
+ return CausalLMOutputWithPast(
485
+ loss=loss,
486
+ logits=logits,
487
+ past_key_values=outputs.past_key_values,
488
+ hidden_states=outputs.hidden_states,
489
+ attentions=outputs.attentions,
490
+ )
491
+
492
+ def prepare_inputs_for_generation(
493
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
494
+ ):
495
+ if past_key_values:
496
+ input_ids = input_ids[:, -1:]
497
+
498
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
499
+ if inputs_embeds is not None and past_key_values is None:
500
+ model_inputs = {"inputs_embeds": inputs_embeds}
501
+ else:
502
+ model_inputs = {"input_ids": input_ids}
503
+
504
+ model_inputs.update(
505
+ {
506
+ "past_key_values": past_key_values,
507
+ "use_cache": kwargs.get("use_cache"),
508
+ "attention_mask": attention_mask,
509
+ "images": kwargs.get("images", None),
510
+ "image_sizes": kwargs.get("image_sizes", None),
511
+ "pixel_values": kwargs.get("pixel_values", None),
512
+ "pixel_values_videos": kwargs.get("pixel_values_videos", None),
513
+ "image_grid_thw": kwargs.get("image_grid_thw", None),
514
+ "video_grid_thw": kwargs.get("video_grid_thw", None),
515
+ }
516
+ )
517
+ return model_inputs
518
+
519
+ AutoConfig.register("valley", ValleyConfig)
520
+ AutoModelForCausalLM.register(ValleyConfig, ValleyQwen2ForCausalLM)
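Note that forward() above returns a per-sample loss vector (one cross-entropy value per batch element) rather than a scalar, so a training step has to reduce it explicitly. A hedged sketch, assuming `model` and `batch` (with input_ids, labels, images, ...) were prepared elsewhere, e.g. with the processor below:

outputs = model(**batch)      # batch includes labels, so a loss vector is returned
loss = outputs.loss.mean()    # reduce the per-sample cross-entropy values
loss.backward()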
modeling_vision_tower.py ADDED
@@ -0,0 +1,161 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
4
+ from transformers import PretrainedConfig
5
+
6
+ siglip_config = PretrainedConfig.from_dict(
7
+ {
8
+ "attention_dropout": 0.0,
9
+ "hidden_act": "gelu_pytorch_tanh",
10
+ "hidden_size": 1152,
11
+ "image_size": 384,
12
+ "intermediate_size": 4304,
13
+ "layer_norm_eps": 1e-06,
14
+ "model_type": "siglip_vision_model",
15
+ "num_attention_heads": 16,
16
+ "num_channels": 3,
17
+ "num_hidden_layers": 27,
18
+ "patch_size": 14,
19
+ }
20
+ )
21
+
22
+ qwen2vl_vit_config = PretrainedConfig.from_dict(
23
+ {
24
+ "depth": 32,
25
+ "embed_dim": 1280,
26
+ "hidden_act": "quick_gelu",
27
+ "hidden_size": 3584,
28
+ "in_channels": 3,
29
+ "in_chans": 3,
30
+ "mlp_ratio": 4,
31
+ "model_type": "qwen2_vl",
32
+ "num_heads": 16,
33
+ "patch_size": 14,
34
+ "spatial_merge_size": 2,
35
+ "spatial_patch_size": 14,
36
+ "temporal_patch_size": 2,
37
+ "_attn_implementation": "flash_attention_2",
38
+ "_attn_implementation_internal": "flash_attention_2"
39
+ }
40
+ )
41
+
42
+ def build_vision_tower(vision_tower_cfg, **kwargs):
43
+ vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
44
+ if "siglip-so400m-patch14-384" in vision_tower:
45
+ # Eagle
46
+ if getattr(vision_tower_cfg, "eagle_vision_tower", None) is not None:
47
+ qwen2vl_vision_tower = Qwen2VisionTransformerPretrainedModel._from_config(qwen2vl_vit_config)
48
+
49
+ if getattr(vision_tower_cfg, "navit_merger_hidden_dim", None) is not None:
50
+ del qwen2vl_vision_tower.merger
51
+ qwen2vl_vision_tower.merger = CustomPatchMerger(
52
+ vision_tower_cfg.hidden_size,
53
+ context_dim=1280,
54
+ hidden_dim=getattr(vision_tower_cfg, "navit_merger_hidden_dim", None)
55
+ ) # random initialize
56
+ qwen2vl_vision_tower.requires_grad_(False)
57
+
58
+ # If only use navit, delete siglip_vision_tower
59
+ if getattr(vision_tower_cfg, "only_navit", False):
60
+ siglip_vision_tower = None
61
+ else:
62
+ siglip_vision_tower = SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
63
+
64
+ return siglip_vision_tower, qwen2vl_vision_tower
65
+ # Non-Eagle
66
+ else:
67
+ siglip_vision_tower = SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
68
+ return siglip_vision_tower
69
+ else:
70
+ raise ValueError(f"Unknown vision tower: {vision_tower}")
71
+
72
+ class SigLipVisionTower(nn.Module):
73
+ def __init__(self, vision_tower, args, delay_load=False, cache_dir="./cache_dir"):
74
+ super().__init__()
75
+ self.is_loaded = False
76
+ self.image_tower_name = vision_tower
77
+ self.select_layer = args.mm_vision_select_layer
78
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
79
+ self.cache_dir = cache_dir
80
+
81
+ if not delay_load:
82
+ self.load_model()
83
+ else:
84
+ from transformers import SiglipVisionModel
85
+ self.cfg_only = siglip_config
86
+ self.vision_tower = SiglipVisionModel._from_config(siglip_config) # dummy-load
87
+
88
+ def load_model(self):
89
+ from transformers import SiglipVisionModel
90
+ self.vision_tower = SiglipVisionModel._from_config(siglip_config)
91
+ self.vision_tower.requires_grad_(False)
92
+ self.is_loaded = True
93
+
94
+ def feature_select(self, image_forward_outs):
95
+ assert self.select_feature == "cls_patch"
96
+ image_features = torch.cat([image_forward_outs[:, :1, :], image_forward_outs], dim=1)
97
+ return image_features
98
+
99
+ def forward(self, images):
100
+ if type(images) is list:
101
+ image_features = []
102
+ for image in images:
103
+ image_forward_out = self.vision_tower(
104
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
105
+ output_hidden_states=True,
106
+ return_dict=True,
107
+ )
108
+ image_feature = self.feature_select(image_forward_out.last_hidden_state).to(image.dtype)
109
+ image_features.append(image_feature)
110
+ else:
111
+ image_forward_outs = self.vision_tower(
112
+ images.to(device=self.device, dtype=self.dtype),
113
+ output_hidden_states=True,
114
+ return_dict=True,
115
+ )
116
+ image_features = self.feature_select(image_forward_outs.last_hidden_state).to(images.dtype)
117
+
118
+ return image_features
119
+
120
+ @property
121
+ def dummy_feature(self):
122
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
123
+
124
+ @property
125
+ def dtype(self):
126
+ return self.vision_tower.dtype
127
+
128
+ @property
129
+ def device(self):
130
+ return self.vision_tower.device
131
+
132
+ @property
133
+ def config(self):
134
+ if self.is_loaded:
135
+ return self.vision_tower.config
136
+ else:
137
+ return self.cfg_only
138
+
139
+ @property
140
+ def hidden_size(self):
141
+ return self.config.hidden_size
142
+
143
+ @property
144
+ def num_patches(self):
145
+ return (self.config.image_size // self.config.patch_size) ** 2
146
+
147
+
148
+ class CustomPatchMerger(nn.Module):
149
+ def __init__(self, dim: int, context_dim: int, hidden_dim: int, spatial_merge_size: int = 2) -> None:
150
+ super().__init__()
151
+ self.input_dim = context_dim * (spatial_merge_size**2)
152
+ self.ln_q = nn.LayerNorm(context_dim, eps=1e-6)
153
+ self.mlp = nn.Sequential(
154
+ nn.Linear(self.input_dim, hidden_dim),
155
+ nn.GELU(),
156
+ nn.Linear(hidden_dim, dim),
157
+ )
158
+
159
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
160
+ x = self.mlp(self.ln_q(x).view(-1, self.input_dim))
161
+ return x
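A hedged sketch of wiring the dual towers from a minimal config object. Field values are illustrative, and the Qwen2-VL branch expects flash-attn to be installed since qwen2vl_vit_config requests flash_attention_2:

from types import SimpleNamespace

cfg = SimpleNamespace(
    mm_vision_tower="google/siglip-so400m-patch14-384",
    eagle_vision_tower="Qwen/Qwen2-VL-7B-Instruct",
    mm_vision_select_layer=-2,
    mm_vision_select_feature="cls_patch",
)
# Both towers are built from their hard-coded configs (randomly initialized here);
# in practice the weights come from the checkpoint's state_dict.
siglip_tower, qwen2vl_tower = build_vision_tower(cfg)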
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "processor_class": "ValleyProcessor",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_valley.ValleyProcessor"
5
+ },
6
+ "min_pixels": 1,
7
+ "qwen2vl_processor_config": {
8
+ "min_pixels": 3136,
9
+ "max_pixels": 12845056,
10
+ "patch_size": 14,
11
+ "temporal_patch_size": 2,
12
+ "merge_size": 2,
13
+ "image_mean": [
14
+ 0.48145466,
15
+ 0.4578275,
16
+ 0.40821073
17
+ ],
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "image_processor_type": "Qwen2VLImageProcessor",
24
+ "processor_class": "Qwen2VLProcessor"
25
+ }
26
+ }
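Because of the auto_map entry above, the processor can be fetched through AutoProcessor once remote code is enabled; a minimal sketch with a placeholder repo id:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("your-org/valley-checkpoint", trust_remote_code=True)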
processing_valley.py ADDED
@@ -0,0 +1,312 @@
1
+ import re
2
+ import types
3
+ import io
4
+ import torch
5
+ from PIL import Image
6
+ from qwen_vl_utils import fetch_image
7
+
8
+ from transformers import (
9
+ ProcessorMixin,
10
+ SiglipImageProcessor,
11
+ BatchFeature,
12
+ Qwen2VLImageProcessor,
13
+ PreTrainedTokenizer
14
+ )
15
+
16
+ from .utils import (
17
+ process_anyres_image,
18
+ BLACK_IMG_ENV,
19
+ DEFAULT_IM_END_TOKEN,
20
+ DEFAULT_IM_START_TOKEN,
21
+ DEFAULT_IMAGE_TOKEN,
22
+ DEFAULT_VI_END_TOKEN,
23
+ DEFAULT_VI_START_TOKEN,
24
+ DEFAULT_VIDEO_TOKEN,
25
+ IMAGE_TOKEN_INDEX,
26
+ SEQ_MAX_LEN,
27
+ )
28
+
29
+ siglip_processor_config = {
30
+ "do_normalize": True,
31
+ "do_rescale": True,
32
+ "do_resize": True,
33
+ "image_mean": [
34
+ 0.5,
35
+ 0.5,
36
+ 0.5
37
+ ],
38
+ "image_processor_type": "SiglipImageProcessor",
39
+ "image_std": [
40
+ 0.5,
41
+ 0.5,
42
+ 0.5
43
+ ],
44
+ "processor_class": "SiglipProcessor",
45
+ "resample": 3,
46
+ "rescale_factor": 0.00392156862745098,
47
+ "size": {
48
+ "height": 384,
49
+ "width": 384
50
+ }
51
+ }
52
+
53
+ qwen2vl_processor_config = {
54
+ "min_pixels": 3136,
55
+ "max_pixels": 12845056,
56
+ "patch_size": 14,
57
+ "temporal_patch_size": 2,
58
+ "merge_size": 2,
59
+ "image_mean": [
60
+ 0.48145466,
61
+ 0.4578275,
62
+ 0.40821073
63
+ ],
64
+ "image_std": [
65
+ 0.26862954,
66
+ 0.26130258,
67
+ 0.27577711
68
+ ],
69
+ "image_processor_type": "Qwen2VLImageProcessor",
70
+ "processor_class": "Qwen2VLProcessor"
71
+ }
72
+
73
+ class ValleyProcessor(ProcessorMixin):
74
+ attributes = ["tokenizer"]
75
+ optional_attributes = [
76
+ "max_pixels",
77
+ "min_pixels",
78
+ "anyres",
79
+ "only_crop_single_image",
80
+ "grid_pinpoints",
81
+ "use_special_start_end_token",
82
+ ]
83
+ tokenizer_class = "AutoTokenizer"
84
+
85
+ def __init__(self, tokenizer=None, **kwargs):
86
+ super().__init__(tokenizer, **kwargs)
87
+ self.black_img = BLACK_IMG_ENV
88
+ self.siglip_image_processor = SiglipImageProcessor.from_dict(siglip_processor_config)
89
+ self.qwen2vl_image_processor = Qwen2VLImageProcessor.from_dict(
90
+ qwen2vl_processor_config,
91
+ max_pixels=kwargs.get("max_pixels", 1280*28*28),
92
+ min_pixels=kwargs.get("min_pixels", 4*28*28)
93
+ )
94
+
95
+ self.anyres = kwargs.get("anyres", True)
96
+ self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
97
+ self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
98
+ self.use_special_start_end_token = kwargs.get("use_special_start_end_token", True)
99
+
100
+ def preprocess_images_siglip(self, images) -> torch.FloatTensor:
101
+ if isinstance(images[0], str):
102
+ images_pil = [Image.open(img).convert("RGB") for img in images]
103
+ elif isinstance(images[0], Image.Image):
104
+ images_pil = [img.convert("RGB") for img in images]
105
+ elif isinstance(images[0], bytes):
106
+ images_pil = [Image.open(io.BytesIO(img)).convert("RGB") for img in images]
107
+ else:
108
+ raise ValueError("unsupported type")
109
+
110
+ processed_images = []
111
+ have_multi_images = len(images_pil) > 1
112
+ for img in images_pil:
113
+ if self.anyres:
114
+ if not self.only_crop_single_image or not have_multi_images:
115
+ image = process_anyres_image(img, self.siglip_image_processor, self.grid_pinpoints)
116
+ else:
117
+ image = [self.siglip_image_processor(img, return_tensors="pt")["pixel_values"][0]]
118
+ else:
119
+ image = self.siglip_image_processor(img, return_tensors="pt")["pixel_values"][0]
120
+
121
+ processed_images.append(image)
122
+
123
+ if not self.anyres:
124
+ return torch.stack(processed_images, dim=0)
125
+ else:
126
+ return [torch.stack(img, dim=0) for img in processed_images]
127
+
128
+ def preprocess_images_qwen2vl(self, images) -> dict:
129
+ if isinstance(images[0], str):
130
+ images_pil = [Image.open(img).convert("RGB") for img in images]
131
+ elif isinstance(images[0], Image.Image):
132
+ images_pil = [img.convert("RGB") for img in images]
133
+ elif isinstance(images[0], bytes):
134
+ images_pil = [Image.open(io.BytesIO(img)).convert("RGB") for img in images]
135
+ else:
136
+ raise ValueError("unsupported type")
137
+
138
+ image_sizes = [[x.size for x in images_pil]]
139
+ data_dict_qwen2vl = self.qwen2vl_image_processor(
140
+ [fetch_image({"image": img}) for img in images_pil],
141
+ return_tensors="pt"
142
+ )
143
+
144
+ data_dict_qwen2vl["image_sizes"] = image_sizes
145
+
146
+ return data_dict_qwen2vl
147
+
148
+ def preprocess_multimodal(self, conversations, img_num):
149
+ for sentence in conversations:
150
+ if sentence["role"] == "system":
151
+ continue
152
+ if DEFAULT_VIDEO_TOKEN in sentence["content"]:
153
+ if self.use_special_start_end_token:
154
+ video_replace_token = (DEFAULT_VI_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_VI_END_TOKEN) * img_num
155
+ else:
156
+ video_replace_token = DEFAULT_IMAGE_TOKEN * img_num
157
+ sentence["content"] = sentence["content"].replace(DEFAULT_VIDEO_TOKEN, "").strip()
158
+ sentence["content"] = video_replace_token + "\n" + sentence["content"]
159
+ else:
160
+ segs = re.split(DEFAULT_IMAGE_TOKEN, sentence["content"])
161
+ if self.use_special_start_end_token:
162
+ sentence["content"] = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(
163
+ segs[: img_num + 1]
164
+ ) + "".join(segs[img_num + 1 :])
165
+ else:
166
+ sentence["content"] = DEFAULT_IMAGE_TOKEN.join(segs[: img_num + 1]) + "".join(segs[img_num + 1 :])
167
+
168
+ return conversations
169
+
170
+ def preprocess_qwen2(
171
+ self,
172
+ conversations,
173
+ tokenizer: PreTrainedTokenizer,
174
+ has_image: bool = False,
175
+ inference: bool = False,
176
+ only_mask_system: bool = False,
177
+ ) -> dict:
178
+ conv = types.SimpleNamespace(
179
+ system="You are a helpful assistant.",
180
+ roles=("user", "assistant"),
181
+ version="qwen2",
182
+ offset=0,
183
+ sep="<|im_start|>",
184
+ sep2="<|im_end|>\n",
185
+ )
186
+
187
+ # Check system prompt
188
+ assert conversations[0]["role"] == "system"
189
+ if conversations[0]["content"] == None:
190
+ conversations[0]["content"] = conv.system # use default system prompt
191
+
192
+ # Check conversation sequence
193
+ for j, sentence in enumerate(conversations[1:]):
194
+ role = sentence["role"]
195
+ assert role == conv.roles[j % 2], "The conversation sequence is incorrect."
196
+
197
+ conversation_str = tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=inference)
198
+
199
+ # Mask targets
200
+ rounds = conversation_str.split(conv.sep2)
201
+ input_ids_ = torch.tensor([], dtype=torch.int64)
202
+ targets_ = torch.tensor([], dtype=torch.int64)
203
+ for i, rou in enumerate(rounds):
204
+ if rou == "":
205
+ continue
206
+ if (not inference) or (i < (len(rounds) - 1)):
207
+ rou += conv.sep2
208
+ if has_image:
209
+ cur_input_ids_ = self.tokenizer_image_token(rou, tokenizer, return_tensors='pt')
210
+ input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
211
+ if only_mask_system:
212
+ mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[0]}\n[\s\S]*', f'{conv.roles[0]}:', rou),
213
+ tokenizer))
214
+ else:
215
+ mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[1]}\n[\s\S]*', f'{conv.roles[1]}:', rou),
216
+ tokenizer))
217
+ targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)
218
+ else:
219
+ cur_input_ids_ = tokenizer(rou, return_tensors='pt')["input_ids"][0, :]
220
+ input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
221
+ mask_len = len(tokenizer(re.sub(rf'{conv.roles[1]}\n[\s\S]*', rf'{conv.roles[1]}:', rou))["input_ids"][:])
222
+ targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)
223
+
224
+ return {"input_ids": input_ids_, "labels": targets_}
225
+
226
+
227
+ def tokenizer_image_token(
228
+ self,
229
+ prompt,
230
+ tokenizer,
231
+ image_token_index=IMAGE_TOKEN_INDEX,
232
+ return_tensors=None,
233
+ ):
234
+ def split_with_token(string, token):
235
+ result = string.split(token)
236
+ for i in range(len(result) - 1):
237
+ result.insert(i * 2 + 1, token)
238
+ return result
239
+
240
+ if len(prompt) > SEQ_MAX_LEN:
241
+ raise ValueError(f"Sequence is too long (more than {SEQ_MAX_LEN} characters)")
242
+
243
+ prompt_chunks = split_with_token(prompt, DEFAULT_IMAGE_TOKEN)
244
+ input_ids, offset = ([tokenizer.bos_token_id], 1) if getattr(tokenizer,'bos_token',None) else ([], 0)
245
+ token2index = {DEFAULT_IMAGE_TOKEN: image_token_index}
246
+ for chunk in prompt_chunks:
247
+ if chunk in token2index:
248
+ input_ids.append(token2index[chunk])
249
+ else:
250
+ chunk_ids = tokenizer(chunk).input_ids
251
+ if chunk_ids[0] != getattr(tokenizer,'bos_token_id', None):
252
+ offset = 0
253
+ input_ids.extend(chunk_ids[offset:])
254
+
255
+ if return_tensors is not None:
256
+ if return_tensors == "pt":
257
+ return torch.tensor(input_ids, dtype=torch.long)
258
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
259
+ return input_ids
260
+
261
+
262
+ def __call__(self, messages, inference=True) -> BatchFeature:
263
+ # Deal with images
264
+ if "images" not in messages or not messages["images"] or not messages["images"][0]:
265
+ images = [self.black_img]
266
+ elif type(messages["images"]) == str:
267
+ images = [messages["images"]]
268
+ else:
269
+ images = messages["images"][:16] # support 16 images
270
+
271
+ # Deal with conversations
272
+ conversations = messages["conversations"]
273
+ if conversations[0]["role"] != "system":
274
+ conversations = [{"role":"system", "content": None}] + conversations # dummy system prompt
275
+
276
+ # Insert special token `<image>`
277
+ assert conversations[1]["role"] == "user"
278
+ if images and "<image>" not in conversations[1]["content"]:
279
+ image_token = " ".join(["<image>"] * len(images))
280
+ conversations[1]["content"] = f"{image_token}\n{conversations[1]['content']}"
281
+
282
+ # The last message should be assistant if inference=True
283
+ if inference:
284
+ assert conversations[-1]["role"] == "user", "the last message should be assistant if inference=True"
285
+
286
+ # Image preprocess
287
+ precessed_images_siglip = self.preprocess_images_siglip(images)
288
+ processed_data_dict_qwen2vl = self.preprocess_images_qwen2vl(images)
289
+ source = self.preprocess_multimodal(conversations, len(processed_images_siglip))
290
+ data_dict = self.preprocess_qwen2(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference)
291
+
292
+ # Construct batch data
293
+ data_dict["input_ids"] = data_dict["input_ids"].unsqueeze(0) # batch_size = 1
294
+ data_dict["labels"] = data_dict["labels"].unsqueeze(0)
295
+ data_dict["images"] = [processed_images_siglip]
296
+
297
+ return BatchFeature(data={**data_dict, **processed_data_dict_qwen2vl})
298
+
299
+ def batch_decode(self, *args, **kwargs):
300
+ """
301
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
302
+ refer to the docstring of this method for more information.
303
+ """
304
+ return self.tokenizer.batch_decode(*args, **kwargs)
305
+
306
+
307
+ def decode(self, *args, **kwargs):
308
+ """
309
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
310
+ the docstring of this method for more information.
311
+ """
312
+ return self.tokenizer.decode(*args, **kwargs)
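End to end, ValleyProcessor takes a messages dict with "images" and "conversations" and returns a BatchFeature ready for the model. A hedged sketch with placeholder paths:

messages = {
    "images": ["demo.jpg"],  # placeholder path; str, bytes, or PIL.Image entries are accepted
    "conversations": [
        {"role": "user", "content": "Describe this image."},
    ],
}
batch = processor(messages, inference=True)
# batch holds input_ids, labels, images (SigLIP tiles), plus pixel_values,
# image_grid_thw and image_sizes from the Qwen2-VL image processor.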
utils.py ADDED
@@ -0,0 +1,251 @@
1
+ from PIL import Image
2
+ from io import BytesIO
3
+ import base64
4
+ import math
5
+ import ast
6
+ import re
7
+ import torch
8
+ from transformers import StoppingCriteria
9
+
10
+ IGNORE_INDEX = -100
11
+ IMAGE_TOKEN_INDEX = -200
12
+ GANDALF_TOKEN_INDEX = -300
13
+ DEFAULT_PAD_TOKEN = "[PAD]"
14
+ DEFAULT_EOS_TOKEN = "</s>"
15
+ DEFAULT_BOS_TOKEN = "</s>"
16
+ DEFAULT_UNK_TOKEN = "<unk>"
17
+ DEFAULT_IMAGE_TOKEN = "<image>"
18
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
19
+ DEFAULT_IM_START_TOKEN = "<im_start>"
20
+ DEFAULT_IM_END_TOKEN = "<im_end>"
21
+ DEFAULT_VIDEO_TOKEN = "<video>"
22
+ DEFAULT_VIDEO_FRAME_TOKEN = "<vi_frame>"
23
+ DEFAULT_VI_START_TOKEN = "<vi_start>"
24
+ DEFAULT_VI_END_TOKEN = "<vi_end>"
25
+ DEFAULT_EOC_TOKEN = "<eoc>"
26
+ COR_START_TOKEN = "<cor>"
27
+ COR_END_TOKEN = "<\cor>"
28
+ SEQ_MAX_LEN = 50000
29
+ BLACK_IMG_ENV = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x03\x00\x00\x00\x03\x08\x02\x00\x00\x00\xd9J"\xe8\x00\x00\x00\x12IDAT\x08\x1dcd\x80\x01F\x06\x18`d\x80\x01\x00\x00Z\x00\x04we\x03N\x00\x00\x00\x00IEND\xaeB`\x82'
30
+
31
+
32
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
33
+ """
34
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
35
+
36
+ Args:
37
+ image_size (tuple): The size of the input image in the format (width, height).
38
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
39
+ patch_size (int): The size of each image patch.
40
+
41
+ Returns:
42
+ tuple: The shape of the image patch grid in the format (width, height).
43
+ """
44
+ if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
45
+ assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
46
+ # Use regex to extract the range from the input string
47
+ matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
48
+ range_start = tuple(map(int, matches[0]))
49
+ range_end = tuple(map(int, matches[-1]))
50
+ # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
51
+ grid_pinpoints = [
52
+ (i, j)
53
+ for i in range(range_start[0], range_end[0] + 1)
54
+ for j in range(range_start[1], range_end[1] + 1)
55
+ ]
56
+ # Multiply all elements by patch_size
57
+ grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
58
+ if type(grid_pinpoints) is list:
59
+ possible_resolutions = grid_pinpoints
60
+ else:
61
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
62
+ width, height = select_best_resolution(image_size, possible_resolutions)
63
+ return width // patch_size, height // patch_size
64
+
65
+ def select_best_resolution(original_size, possible_resolutions):
66
+ """
67
+ Selects the best resolution from a list of possible resolutions based on the original size.
68
+
69
+ Args:
70
+ original_size (tuple): The original size of the image in the format (width, height).
71
+ possible_resolutions (list): A list of possible resolutions in the format
72
+ [(width1, height1), (width2, height2), ...].
73
+
74
+ Returns:
75
+ tuple: The best fit resolution in the format (width, height).
76
+ """
77
+ original_width, original_height = original_size
78
+ best_fit = None
79
+ max_effective_resolution = 0
80
+ min_wasted_resolution = float("inf")
81
+
82
+ for width, height in possible_resolutions:
83
+ # Calculate the downscaled size to keep the aspect ratio
84
+ scale = min(width / original_width, height / original_height)
85
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
86
+
87
+ # Calculate effective and wasted resolutions
88
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
89
+ wasted_resolution = (width * height) - effective_resolution
90
+
91
+ if effective_resolution > max_effective_resolution or \
92
+ (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
93
+ max_effective_resolution = effective_resolution
94
+ min_wasted_resolution = wasted_resolution
95
+ best_fit = (width, height)
96
+
97
+ return best_fit
98
+
99
+
100
+ def unpad_image(tensor, original_size):
101
+ """
102
+ Unpads a PyTorch tensor of a padded and resized image.
103
+
104
+ Args:
105
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
106
+ original_size (tuple): The original size of the image (height, width).
107
+
108
+ Returns:
109
+ torch.Tensor: The unpadded image tensor.
110
+ """
111
+ original_width, original_height = original_size
112
+ current_height, current_width = tensor.shape[1:]
113
+
114
+ # Compute aspect ratios
115
+ original_aspect_ratio = original_width / original_height
116
+ current_aspect_ratio = current_width / current_height
117
+
118
+ # Determine padding size and direction
119
+ if original_aspect_ratio > current_aspect_ratio:
120
+ # Padding was added to the height
121
+ scale_factor = current_width / original_width
122
+ new_height = int(original_height * scale_factor)
123
+ padding = (current_height - new_height) // 2
124
+ unpadded_tensor = tensor[:, padding: current_height - padding, :]
125
+ else:
126
+ # Padding was added to the width
127
+ scale_factor = current_height / original_height
128
+ new_width = int(original_width * scale_factor)
129
+ padding = (current_width - new_width) // 2
130
+ unpadded_tensor = tensor[:, :, padding: current_width - padding]
131
+
132
+ return unpadded_tensor
133
+
134
+
135
+ def process_anyres_image(image, processor, grid_pinpoints):
136
+ """
137
+ Process an image with variable resolutions.
138
+
139
+ Args:
140
+ image (PIL.Image.Image): The input image to be processed.
141
+ processor: The image processor object.
142
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
143
+
144
+ Returns:
145
+ torch.Tensor: A tensor containing the processed image patches.
146
+ """
147
+ # Convert grid_pinpoints from string to list
148
+ if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
149
+ try:
150
+ patch_size = processor.size["height"]
151
+ except Exception:
152
+ patch_size = processor.size["shortest_edge"]
153
+ assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
154
+ # Use regex to extract the range from the input string
155
+ matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
156
+ range_start = tuple(map(int, matches[0]))
157
+ range_end = tuple(map(int, matches[-1]))
158
+ # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
159
+ grid_pinpoints = [
160
+ (i, j)
161
+ for i in range(range_start[0], range_end[0] + 1)
162
+ for j in range(range_start[1], range_end[1] + 1)
163
+ ]
164
+ # Multiply all elements by patch_size
165
+ grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
166
+
167
+ if type(grid_pinpoints) is list:
168
+ possible_resolutions = grid_pinpoints
169
+ else:
170
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
171
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
172
+ image_padded = resize_and_pad_image(image, best_resolution)
173
+
174
+ patches = divide_to_patches(image_padded, processor.size["height"])
175
+
176
+ # FIXME: this seems to be a bug that it resizes instead of pad.
177
+ # but to keep it consistent with previous, i will keep it as it is
178
+ # TODO: uncomment below to ablate with the padding
179
+ if isinstance(processor.size, dict):
180
+ shortest_edge = processor.size["height"]
181
+ else:
182
+ shortest_edge = min(processor.size)
183
+ image_original_resize = image.resize((shortest_edge, shortest_edge))
184
+ # image_padded_square = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
185
+
186
+ image_patches = [image_original_resize] + patches
187
+ image_patches = [
188
+ processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0]
189
+ for image_patch in image_patches
190
+ ]
191
+ # return torch.stack(image_patches, dim=0)
192
+ return image_patches
193
+
194
+ def resize_and_pad_image(image, target_resolution):
195
+ """
196
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
197
+
198
+ Args:
199
+ image (PIL.Image.Image): The input image.
200
+ target_resolution (tuple): The target resolution (width, height) of the image.
201
+
202
+ Returns:
203
+ PIL.Image.Image: The resized and padded image.
204
+ """
205
+ original_width, original_height = image.size
206
+ target_width, target_height = target_resolution
207
+
208
+ # Determine which dimension (width or height) to fill
209
+ scale_w = target_width / original_width
210
+ scale_h = target_height / original_height
211
+
212
+ if scale_w < scale_h:
213
+ # Width will be filled completely
214
+ new_width = target_width
215
+ new_height = min(math.ceil(original_height * scale_w), target_height)
216
+ else:
217
+ # Height will be filled completely
218
+ new_height = target_height
219
+ new_width = min(math.ceil(original_width * scale_h), target_width)
220
+
221
+ # Resize the image
222
+ resized_image = image.resize((new_width, new_height))
223
+
224
+ # Create a new image with the target size and paste the resized image onto it
225
+ new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
226
+ paste_x = (target_width - new_width) // 2
227
+ paste_y = (target_height - new_height) // 2
228
+ new_image.paste(resized_image, (paste_x, paste_y))
229
+
230
+ return new_image
231
+
232
+ def divide_to_patches(image, patch_size):
233
+ """
234
+ Divides an image into patches of a specified size.
235
+
236
+ Args:
237
+ image (PIL.Image.Image): The input image.
238
+ patch_size (int): The size of each patch.
239
+
240
+ Returns:
241
+ list: A list of PIL.Image.Image objects representing the patches.
242
+ """
243
+ patches = []
244
+ width, height = image.size
245
+ for i in range(0, height, patch_size):
246
+ for j in range(0, width, patch_size):
247
+ box = (j, i, j + patch_size, i + patch_size)
248
+ patch = image.crop(box)
249
+ patches.append(patch)
250
+
251
+ return patches
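For reference, a small worked example of the anyres helpers above; the numbers are illustrative (a 1024x768 image against the default "(1x1),...,(3x3)" pinpoints at the SigLIP input resolution of 384):

grid = get_anyres_image_grid_shape(
    image_size=(1024, 768),            # (width, height)
    grid_pinpoints="(1x1),...,(3x3)",
    patch_size=384,
)
print(grid)  # (3, 2): best-fit resolution 1152x768, i.e. 3 patches wide, 2 high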