hcwei commited on
Commit
1ef5fe2
·
verified ·
1 Parent(s): 515296e

Upload 14 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</box>": 151673,
3
+ "</img>": 151666,
4
+ "</quad>": 151669,
5
+ "</ref>": 151671,
6
+ "<box>": 151672,
7
+ "<img>": 151665,
8
+ "<quad>": 151668,
9
+ "<ref>": 151670,
10
+ "</think>": 151668,
11
+ "</tool_call>": 151658,
12
+ "</tool_response>": 151666,
13
+ "<think>": 151667,
14
+ "<tool_call>": 151657,
15
+ "<tool_response>": 151665,
16
+ "<|box_end|>": 151649,
17
+ "<|box_start|>": 151648,
18
+ "<|endoftext|>": 151643,
19
+ "<|file_sep|>": 151664,
20
+ "<|fim_middle|>": 151660,
21
+ "<|fim_pad|>": 151662,
22
+ "<|fim_prefix|>": 151659,
23
+ "<|fim_suffix|>": 151661,
24
+ "<|im_end|>": 151645,
25
+ "<|im_start|>": 151644,
26
+ "<|image_pad|>": 151655,
27
+ "<|object_ref_end|>": 151647,
28
+ "<|object_ref_start|>": 151646,
29
+ "<|quad_end|>": 151651,
30
+ "<|quad_start|>": 151650,
31
+ "<|repo_name|>": 151663,
32
+ "<|video_pad|>": 151656,
33
+ "<|vision_end|>": 151653,
34
+ "<|vision_pad|>": 151654,
35
+ "<|vision_start|>": 151652
36
+ }
config.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "InternVLChatModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
8
+ "AutoModel": "modeling_internvl_chat.InternVLChatModel",
9
+ "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
10
+ },
11
+ "downsample_ratio": 0.5,
12
+ "dynamic_image_size": true,
13
+ "force_image_size": 448,
14
+ "llm_config": {
15
+ "_name_or_path": "Qwen/Qwen2.5-32B-Instruct",
16
+ "add_cross_attention": false,
17
+ "architectures": [
18
+ "Qwen2ForCausalLM"
19
+ ],
20
+ "_attn_implementation": "sdpa",
21
+ "attention_dropout": 0.0,
22
+ "bad_words_ids": null,
23
+ "begin_suppress_tokens": null,
24
+ "bos_token_id": 151643,
25
+ "chunk_size_feed_forward": 0,
26
+ "cross_attention_hidden_size": null,
27
+ "decoder_start_token_id": null,
28
+ "diversity_penalty": 0.0,
29
+ "do_sample": false,
30
+ "early_stopping": false,
31
+ "encoder_no_repeat_ngram_size": 0,
32
+ "eos_token_id": 151645,
33
+ "exponential_decay_length_penalty": null,
34
+ "finetuning_task": null,
35
+ "forced_bos_token_id": null,
36
+ "forced_eos_token_id": null,
37
+ "hidden_act": "silu",
38
+ "hidden_size": 5120,
39
+ "id2label": {
40
+ "0": "LABEL_0",
41
+ "1": "LABEL_1"
42
+ },
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 27648,
45
+ "is_decoder": false,
46
+ "is_encoder_decoder": false,
47
+ "label2id": {
48
+ "LABEL_0": 0,
49
+ "LABEL_1": 1
50
+ },
51
+ "length_penalty": 1.0,
52
+ "max_length": 20,
53
+ "max_position_embeddings": 32768,
54
+ "max_window_layers": 70,
55
+ "min_length": 0,
56
+ "model_type": "qwen2",
57
+ "no_repeat_ngram_size": 0,
58
+ "num_attention_heads": 40,
59
+ "num_beam_groups": 1,
60
+ "num_beams": 1,
61
+ "num_hidden_layers": 64,
62
+ "num_key_value_heads": 8,
63
+ "num_return_sequences": 1,
64
+ "output_attentions": false,
65
+ "output_hidden_states": false,
66
+ "output_scores": false,
67
+ "pad_token_id": null,
68
+ "prefix": null,
69
+ "problem_type": null,
70
+ "pruned_heads": {},
71
+ "remove_invalid_values": false,
72
+ "repetition_penalty": 1.0,
73
+ "return_dict": true,
74
+ "return_dict_in_generate": false,
75
+ "rms_norm_eps": 1e-06,
76
+ "rope_theta": 1000000.0,
77
+ "sep_token_id": null,
78
+ "sliding_window": 131072,
79
+ "suppress_tokens": null,
80
+ "task_specific_params": null,
81
+ "temperature": 1.0,
82
+ "tf_legacy_loss": false,
83
+ "tie_encoder_decoder": false,
84
+ "tie_word_embeddings": false,
85
+ "tokenizer_class": null,
86
+ "top_k": 50,
87
+ "top_p": 1.0,
88
+ "torch_dtype": "bfloat16",
89
+ "torchscript": false,
90
+ "transformers_version": "4.37.2",
91
+ "typical_p": 1.0,
92
+ "use_bfloat16": true,
93
+ "use_cache": true,
94
+ "use_sliding_window": false,
95
+ "vocab_size": 152064
96
+ },
97
+ "max_dynamic_patch": 12,
98
+ "min_dynamic_patch": 1,
99
+ "model_type": "internvl_chat",
100
+ "ps_version": "v2",
101
+ "select_layer": -1,
102
+ "template": "internvl2_5",
103
+ "torch_dtype": "bfloat16",
104
+ "use_backbone_lora": 0,
105
+ "use_llm_lora": 0,
106
+ "use_thumbnail": true,
107
+ "vision_config": {
108
+ "architectures": [
109
+ "InternVisionModel"
110
+ ],
111
+ "attention_dropout": 0.0,
112
+ "drop_path_rate": 0.0,
113
+ "dropout": 0.0,
114
+ "hidden_act": "gelu",
115
+ "hidden_size": 3200,
116
+ "image_size": 448,
117
+ "initializer_factor": 0.1,
118
+ "initializer_range": 1e-10,
119
+ "intermediate_size": 12800,
120
+ "layer_norm_eps": 1e-06,
121
+ "model_type": "intern_vit_6b",
122
+ "norm_type": "rms_norm",
123
+ "num_attention_heads": 25,
124
+ "num_channels": 3,
125
+ "num_hidden_layers": 45,
126
+ "output_attentions": false,
127
+ "output_hidden_states": false,
128
+ "patch_size": 14,
129
+ "qk_normalization": true,
130
+ "qkv_bias": false,
131
+ "return_dict": true,
132
+ "torch_dtype": "bfloat16",
133
+ "transformers_version": "4.37.2",
134
+ "use_bfloat16": true,
135
+ "use_flash_attn": false
136
+ }
137
+ }
configuration_intern_vit.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import os
8
+ from typing import Union
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class InternVisionConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
19
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+ Args:
25
+ num_channels (`int`, *optional*, defaults to 3):
26
+ Number of color channels in the input images (e.g., 3 for RGB).
27
+ patch_size (`int`, *optional*, defaults to 14):
28
+ The size (resolution) of each patch.
29
+ image_size (`int`, *optional*, defaults to 224):
30
+ The size (resolution) of each image.
31
+ qkv_bias (`bool`, *optional*, defaults to `False`):
32
+ Whether to add a bias to the queries and values in the self-attention layers.
33
+ hidden_size (`int`, *optional*, defaults to 3200):
34
+ Dimensionality of the encoder layers and the pooler layer.
35
+ num_attention_heads (`int`, *optional*, defaults to 25):
36
+ Number of attention heads for each attention layer in the Transformer encoder.
37
+ intermediate_size (`int`, *optional*, defaults to 12800):
38
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
39
+ qk_normalization (`bool`, *optional*, defaults to `True`):
40
+ Whether to normalize the queries and keys in the self-attention layers.
41
+ num_hidden_layers (`int`, *optional*, defaults to 48):
42
+ Number of hidden layers in the Transformer encoder.
43
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
44
+ Whether to use flash attention mechanism.
45
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
46
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
47
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
48
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
49
+ The epsilon used by the layer normalization layers.
50
+ dropout (`float`, *optional*, defaults to 0.0):
51
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
52
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
53
+ Dropout rate for stochastic depth.
54
+ attention_dropout (`float`, *optional*, defaults to 0.0):
55
+ The dropout ratio for the attention probabilities.
56
+ initializer_range (`float`, *optional*, defaults to 0.02):
57
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
58
+ initializer_factor (`float`, *optional*, defaults to 0.1):
59
+ A factor for layer scale.
60
+ """
61
+
62
+ model_type = 'intern_vit_6b'
63
+
64
+ def __init__(
65
+ self,
66
+ num_channels=3,
67
+ patch_size=14,
68
+ image_size=224,
69
+ qkv_bias=False,
70
+ hidden_size=3200,
71
+ num_attention_heads=25,
72
+ intermediate_size=12800,
73
+ qk_normalization=True,
74
+ num_hidden_layers=48,
75
+ use_flash_attn=True,
76
+ hidden_act='gelu',
77
+ norm_type='rms_norm',
78
+ layer_norm_eps=1e-6,
79
+ dropout=0.0,
80
+ drop_path_rate=0.0,
81
+ attention_dropout=0.0,
82
+ initializer_range=0.02,
83
+ initializer_factor=0.1,
84
+ **kwargs,
85
+ ):
86
+ super().__init__(**kwargs)
87
+
88
+ self.hidden_size = hidden_size
89
+ self.intermediate_size = intermediate_size
90
+ self.dropout = dropout
91
+ self.drop_path_rate = drop_path_rate
92
+ self.num_hidden_layers = num_hidden_layers
93
+ self.num_attention_heads = num_attention_heads
94
+ self.num_channels = num_channels
95
+ self.patch_size = patch_size
96
+ self.image_size = image_size
97
+ self.initializer_range = initializer_range
98
+ self.initializer_factor = initializer_factor
99
+ self.attention_dropout = attention_dropout
100
+ self.layer_norm_eps = layer_norm_eps
101
+ self.hidden_act = hidden_act
102
+ self.norm_type = norm_type
103
+ self.qkv_bias = qkv_bias
104
+ self.qk_normalization = qk_normalization
105
+ self.use_flash_attn = use_flash_attn
106
+
107
+ @classmethod
108
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
109
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
110
+
111
+ if 'vision_config' in config_dict:
112
+ config_dict = config_dict['vision_config']
113
+
114
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
115
+ logger.warning(
116
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
117
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
118
+ )
119
+
120
+ return cls.from_dict(config_dict, **kwargs)
configuration_internvl_chat.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ from .configuration_intern_vit import InternVisionConfig
14
+
15
+ logger = logging.get_logger(__name__)
16
+
17
+
18
+ class InternVLChatConfig(PretrainedConfig):
19
+ model_type = 'internvl_chat'
20
+ is_composition = True
21
+
22
+ def __init__(
23
+ self,
24
+ vision_config=None,
25
+ llm_config=None,
26
+ use_backbone_lora=0,
27
+ use_llm_lora=0,
28
+ select_layer=-1,
29
+ force_image_size=None,
30
+ downsample_ratio=0.5,
31
+ template=None,
32
+ dynamic_image_size=False,
33
+ use_thumbnail=False,
34
+ ps_version='v1',
35
+ min_dynamic_patch=1,
36
+ max_dynamic_patch=6,
37
+ **kwargs):
38
+ super().__init__(**kwargs)
39
+
40
+ if vision_config is None:
41
+ vision_config = {'architectures': ['InternVisionModel']}
42
+ logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
43
+
44
+ if llm_config is None:
45
+ llm_config = {'architectures': ['Qwen2ForCausalLM']}
46
+ logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
47
+
48
+ self.vision_config = InternVisionConfig(**vision_config)
49
+ if llm_config.get('architectures')[0] == 'LlamaForCausalLM':
50
+ self.llm_config = LlamaConfig(**llm_config)
51
+ elif llm_config.get('architectures')[0] == 'Qwen2ForCausalLM':
52
+ self.llm_config = Qwen2Config(**llm_config)
53
+ else:
54
+ raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0]))
55
+ self.use_backbone_lora = use_backbone_lora
56
+ self.use_llm_lora = use_llm_lora
57
+ self.select_layer = select_layer
58
+ self.force_image_size = force_image_size
59
+ self.downsample_ratio = downsample_ratio
60
+ self.template = template
61
+ self.dynamic_image_size = dynamic_image_size
62
+ self.use_thumbnail = use_thumbnail
63
+ self.ps_version = ps_version # pixel shuffle version
64
+ self.min_dynamic_patch = min_dynamic_patch
65
+ self.max_dynamic_patch = max_dynamic_patch
66
+
67
+ logger.info(f'vision_select_layer: {self.select_layer}')
68
+ logger.info(f'ps_version: {self.ps_version}')
69
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
70
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
71
+
72
+ def to_dict(self):
73
+ """
74
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
75
+
76
+ Returns:
77
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
78
+ """
79
+ output = copy.deepcopy(self.__dict__)
80
+ output['vision_config'] = self.vision_config.to_dict()
81
+ output['llm_config'] = self.llm_config.to_dict()
82
+ output['model_type'] = self.__class__.model_type
83
+ output['use_backbone_lora'] = self.use_backbone_lora
84
+ output['use_llm_lora'] = self.use_llm_lora
85
+ output['select_layer'] = self.select_layer
86
+ output['force_image_size'] = self.force_image_size
87
+ output['downsample_ratio'] = self.downsample_ratio
88
+ output['template'] = self.template
89
+ output['dynamic_image_size'] = self.dynamic_image_size
90
+ output['use_thumbnail'] = self.use_thumbnail
91
+ output['ps_version'] = self.ps_version
92
+ output['min_dynamic_patch'] = self.min_dynamic_patch
93
+ output['max_dynamic_patch'] = self.max_dynamic_patch
94
+
95
+ return output
conversation.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+
7
+ Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
8
+ """
9
+
10
+ import dataclasses
11
+ from enum import IntEnum, auto
12
+ from typing import Dict, List, Tuple, Union
13
+
14
+
15
+ class SeparatorStyle(IntEnum):
16
+ """Separator styles."""
17
+
18
+ ADD_COLON_SINGLE = auto()
19
+ ADD_COLON_TWO = auto()
20
+ ADD_COLON_SPACE_SINGLE = auto()
21
+ NO_COLON_SINGLE = auto()
22
+ NO_COLON_TWO = auto()
23
+ ADD_NEW_LINE_SINGLE = auto()
24
+ LLAMA2 = auto()
25
+ CHATGLM = auto()
26
+ CHATML = auto()
27
+ CHATINTERN = auto()
28
+ DOLLY = auto()
29
+ RWKV = auto()
30
+ PHOENIX = auto()
31
+ ROBIN = auto()
32
+ FALCON_CHAT = auto()
33
+ CHATGLM3 = auto()
34
+ INTERNVL_ZH = auto()
35
+ MPT = auto()
36
+
37
+
38
+ @dataclasses.dataclass
39
+ class Conversation:
40
+ """A class that manages prompt templates and keeps all conversation history."""
41
+
42
+ # The name of this template
43
+ name: str
44
+ # The template of the system prompt
45
+ system_template: str = '{system_message}'
46
+ # The system message
47
+ system_message: str = ''
48
+ # The names of two roles
49
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
50
+ # All messages. Each item is (role, message).
51
+ messages: List[List[str]] = ()
52
+ # The number of few shot examples
53
+ offset: int = 0
54
+ # The separator style and configurations
55
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
56
+ sep: str = '\n'
57
+ sep2: str = None
58
+ # Stop criteria (the default one is EOS token)
59
+ stop_str: Union[str, List[str]] = None
60
+ # Stops generation if meeting any token in this list
61
+ stop_token_ids: List[int] = None
62
+
63
+ def get_prompt(self) -> str:
64
+ """Get the prompt for generation."""
65
+ system_prompt = self.system_template.format(system_message=self.system_message)
66
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
67
+ ret = system_prompt + self.sep
68
+ for role, message in self.messages:
69
+ if message:
70
+ ret += role + ': ' + message + self.sep
71
+ else:
72
+ ret += role + ':'
73
+ return ret
74
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
75
+ seps = [self.sep, self.sep2]
76
+ ret = system_prompt + seps[0]
77
+ for i, (role, message) in enumerate(self.messages):
78
+ if message:
79
+ ret += role + ': ' + message + seps[i % 2]
80
+ else:
81
+ ret += role + ':'
82
+ return ret
83
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
84
+ ret = system_prompt + self.sep
85
+ for role, message in self.messages:
86
+ if message:
87
+ ret += role + ': ' + message + self.sep
88
+ else:
89
+ ret += role + ': ' # must be end with a space
90
+ return ret
91
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
92
+ ret = '' if system_prompt == '' else system_prompt + self.sep
93
+ for role, message in self.messages:
94
+ if message:
95
+ ret += role + '\n' + message + self.sep
96
+ else:
97
+ ret += role + '\n'
98
+ return ret
99
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
100
+ ret = system_prompt
101
+ for role, message in self.messages:
102
+ if message:
103
+ ret += role + message + self.sep
104
+ else:
105
+ ret += role
106
+ return ret
107
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
108
+ seps = [self.sep, self.sep2]
109
+ ret = system_prompt
110
+ for i, (role, message) in enumerate(self.messages):
111
+ if message:
112
+ ret += role + message + seps[i % 2]
113
+ else:
114
+ ret += role
115
+ return ret
116
+ elif self.sep_style == SeparatorStyle.RWKV:
117
+ ret = system_prompt
118
+ for i, (role, message) in enumerate(self.messages):
119
+ if message:
120
+ ret += (
121
+ role
122
+ + ': '
123
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
124
+ )
125
+ ret += '\n\n'
126
+ else:
127
+ ret += role + ':'
128
+ return ret
129
+ elif self.sep_style == SeparatorStyle.LLAMA2:
130
+ seps = [self.sep, self.sep2]
131
+ if self.system_message:
132
+ ret = system_prompt
133
+ else:
134
+ ret = '[INST] '
135
+ for i, (role, message) in enumerate(self.messages):
136
+ tag = self.roles[i % 2]
137
+ if message:
138
+ if i == 0:
139
+ ret += message + ' '
140
+ else:
141
+ ret += tag + ' ' + message + seps[i % 2]
142
+ else:
143
+ ret += tag
144
+ return ret
145
+ elif self.sep_style == SeparatorStyle.CHATGLM:
146
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
147
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
148
+ round_add_n = 1 if self.name == 'chatglm2' else 0
149
+ if system_prompt:
150
+ ret = system_prompt + self.sep
151
+ else:
152
+ ret = ''
153
+
154
+ for i, (role, message) in enumerate(self.messages):
155
+ if i % 2 == 0:
156
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
157
+
158
+ if message:
159
+ ret += f'{role}:{message}{self.sep}'
160
+ else:
161
+ ret += f'{role}:'
162
+ return ret
163
+ elif self.sep_style == SeparatorStyle.CHATML:
164
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
165
+ for role, message in self.messages:
166
+ if message:
167
+ ret += role + '\n' + message + self.sep + '\n'
168
+ else:
169
+ ret += role + '\n'
170
+ return ret
171
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
172
+ ret = ''
173
+ if self.system_message:
174
+ ret += system_prompt
175
+ for role, message in self.messages:
176
+ if message:
177
+ ret += role + '\n' + ' ' + message
178
+ else:
179
+ ret += role
180
+ return ret
181
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
182
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
183
+ seps = [self.sep, self.sep2]
184
+ ret = system_prompt
185
+ for i, (role, message) in enumerate(self.messages):
186
+ # if i % 2 == 0:
187
+ # ret += "<s>"
188
+ if message:
189
+ ret += role + ':' + message + seps[i % 2] + '\n'
190
+ else:
191
+ ret += role + ':'
192
+ return ret
193
+ elif self.sep_style == SeparatorStyle.DOLLY:
194
+ seps = [self.sep, self.sep2]
195
+ ret = system_prompt
196
+ for i, (role, message) in enumerate(self.messages):
197
+ if message:
198
+ ret += role + ':\n' + message + seps[i % 2]
199
+ if i % 2 == 1:
200
+ ret += '\n\n'
201
+ else:
202
+ ret += role + ':\n'
203
+ return ret
204
+ elif self.sep_style == SeparatorStyle.PHOENIX:
205
+ ret = system_prompt
206
+ for role, message in self.messages:
207
+ if message:
208
+ ret += role + ': ' + '<s>' + message + '</s>'
209
+ else:
210
+ ret += role + ': ' + '<s>'
211
+ return ret
212
+ elif self.sep_style == SeparatorStyle.ROBIN:
213
+ ret = system_prompt + self.sep
214
+ for role, message in self.messages:
215
+ if message:
216
+ ret += role + ':\n' + message + self.sep
217
+ else:
218
+ ret += role + ':\n'
219
+ return ret
220
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
221
+ ret = ''
222
+ if self.system_message:
223
+ ret += system_prompt + self.sep
224
+ for role, message in self.messages:
225
+ if message:
226
+ ret += role + ': ' + message + self.sep
227
+ else:
228
+ ret += role + ':'
229
+
230
+ return ret
231
+ elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
232
+ seps = [self.sep, self.sep2]
233
+ ret = self.system_message + seps[0]
234
+ for i, (role, message) in enumerate(self.messages):
235
+ if message:
236
+ ret += role + ': ' + message + seps[i % 2]
237
+ else:
238
+ ret += role + ':'
239
+ return ret
240
+ elif self.sep_style == SeparatorStyle.MPT:
241
+ ret = system_prompt + self.sep
242
+ for role, message in self.messages:
243
+ if message:
244
+ if type(message) is tuple:
245
+ message, _, _ = message
246
+ ret += role + message + self.sep
247
+ else:
248
+ ret += role
249
+ return ret
250
+ else:
251
+ raise ValueError(f'Invalid style: {self.sep_style}')
252
+
253
+ def set_system_message(self, system_message: str):
254
+ """Set the system message."""
255
+ self.system_message = system_message
256
+
257
+ def append_message(self, role: str, message: str):
258
+ """Append a new message."""
259
+ self.messages.append([role, message])
260
+
261
+ def update_last_message(self, message: str):
262
+ """Update the last output.
263
+
264
+ The last message is typically set to be None when constructing the prompt,
265
+ so we need to update it in-place after getting the response from a model.
266
+ """
267
+ self.messages[-1][1] = message
268
+
269
+ def to_gradio_chatbot(self):
270
+ """Convert the conversation to gradio chatbot format."""
271
+ ret = []
272
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
273
+ if i % 2 == 0:
274
+ ret.append([msg, None])
275
+ else:
276
+ ret[-1][-1] = msg
277
+ return ret
278
+
279
+ def to_openai_api_messages(self):
280
+ """Convert the conversation to OpenAI chat completion format."""
281
+ ret = [{'role': 'system', 'content': self.system_message}]
282
+
283
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
284
+ if i % 2 == 0:
285
+ ret.append({'role': 'user', 'content': msg})
286
+ else:
287
+ if msg is not None:
288
+ ret.append({'role': 'assistant', 'content': msg})
289
+ return ret
290
+
291
+ def copy(self):
292
+ return Conversation(
293
+ name=self.name,
294
+ system_template=self.system_template,
295
+ system_message=self.system_message,
296
+ roles=self.roles,
297
+ messages=[[x, y] for x, y in self.messages],
298
+ offset=self.offset,
299
+ sep_style=self.sep_style,
300
+ sep=self.sep,
301
+ sep2=self.sep2,
302
+ stop_str=self.stop_str,
303
+ stop_token_ids=self.stop_token_ids,
304
+ )
305
+
306
+ def dict(self):
307
+ return {
308
+ 'template_name': self.name,
309
+ 'system_message': self.system_message,
310
+ 'roles': self.roles,
311
+ 'messages': self.messages,
312
+ 'offset': self.offset,
313
+ }
314
+
315
+
316
+ # A global registry for all conversation templates
317
+ conv_templates: Dict[str, Conversation] = {}
318
+
319
+
320
+ def register_conv_template(template: Conversation, override: bool = False):
321
+ """Register a new conversation template."""
322
+ if not override:
323
+ assert (
324
+ template.name not in conv_templates
325
+ ), f'{template.name} has been registered.'
326
+
327
+ conv_templates[template.name] = template
328
+
329
+
330
+ def get_conv_template(name: str) -> Conversation:
331
+ """Get a conversation template."""
332
+ return conv_templates[name].copy()
333
+
334
+
335
+ # Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
336
+ # is that during training, the preprocessing function for the Hermes-2 template doesn't add
337
+ # <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
338
+ # Therefore, they are completely equivalent during inference.
339
+ register_conv_template(
340
+ Conversation(
341
+ name='Hermes-2',
342
+ system_template='<|im_start|>system\n{system_message}',
343
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
344
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
345
+ # system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
346
+ system_message='',
347
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
348
+ sep_style=SeparatorStyle.MPT,
349
+ sep='<|im_end|>',
350
+ stop_str='<|endoftext|>',
351
+ )
352
+ )
353
+
354
+
355
+ register_conv_template(
356
+ Conversation(
357
+ name='internlm2-chat',
358
+ system_template='<|im_start|>system\n{system_message}',
359
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
360
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
361
+ # system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
362
+ system_message='',
363
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
364
+ sep_style=SeparatorStyle.MPT,
365
+ sep='<|im_end|>',
366
+ )
367
+ )
368
+
369
+
370
+ register_conv_template(
371
+ Conversation(
372
+ name='phi3-chat',
373
+ system_template='<|system|>\n{system_message}',
374
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
375
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
376
+ # system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
377
+ system_message='',
378
+ roles=('<|user|>\n', '<|assistant|>\n'),
379
+ sep_style=SeparatorStyle.MPT,
380
+ sep='<|end|>',
381
+ )
382
+ )
383
+
384
+
385
+ register_conv_template(
386
+ Conversation(
387
+ name='internvl2_5',
388
+ system_template='<|im_start|>system\n{system_message}',
389
+ # system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
390
+ system_message='',
391
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
392
+ sep_style=SeparatorStyle.MPT,
393
+ sep='<|im_end|>\n',
394
+ )
395
+ )
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.0,
10
+ "temperature": 0.6,
11
+ "top_k": 40,
12
+ "top_p": 0.95,
13
+ "transformers_version": "4.45.2"
14
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_intern_vit.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.models.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
+ try:
24
+ from flash_attn.bert_padding import pad_input, unpad_input
25
+ from flash_attn.flash_attn_interface import \
26
+ flash_attn_varlen_qkvpacked_func
27
+ has_flash_attn = True
28
+ except:
29
+ print('FlashAttention2 is not installed.')
30
+ has_flash_attn = False
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ class FlashAttention(nn.Module):
36
+ """Implement the scaled dot product attention with softmax.
37
+ Arguments
38
+ ---------
39
+ softmax_scale: The temperature to use for the softmax attention.
40
+ (default: 1/sqrt(d_keys) where d_keys is computed at
41
+ runtime)
42
+ attention_dropout: The dropout rate to apply to the attention
43
+ (default: 0.0)
44
+ """
45
+
46
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
47
+ super().__init__()
48
+ self.softmax_scale = softmax_scale
49
+ self.dropout_p = attention_dropout
50
+
51
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
52
+ max_s=None, need_weights=False):
53
+ """Implements the multihead softmax attention.
54
+ Arguments
55
+ ---------
56
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
57
+ if unpadded: (nnz, 3, h, d)
58
+ key_padding_mask: a bool tensor of shape (B, S)
59
+ """
60
+ assert not need_weights
61
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
62
+ assert qkv.is_cuda
63
+
64
+ if cu_seqlens is None:
65
+ batch_size = qkv.shape[0]
66
+ seqlen = qkv.shape[1]
67
+ if key_padding_mask is None:
68
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
69
+ max_s = seqlen
70
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
71
+ device=qkv.device)
72
+ output = flash_attn_varlen_qkvpacked_func(
73
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
74
+ softmax_scale=self.softmax_scale, causal=causal
75
+ )
76
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
77
+ else:
78
+ nheads = qkv.shape[-2]
79
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
80
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
81
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
82
+ output_unpad = flash_attn_varlen_qkvpacked_func(
83
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
84
+ softmax_scale=self.softmax_scale, causal=causal
85
+ )
86
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
87
+ indices, batch_size, seqlen),
88
+ 'b s (h d) -> b s h d', h=nheads)
89
+ else:
90
+ assert max_s is not None
91
+ output = flash_attn_varlen_qkvpacked_func(
92
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
93
+ softmax_scale=self.softmax_scale, causal=causal
94
+ )
95
+
96
+ return output, None
97
+
98
+
99
+ class InternRMSNorm(nn.Module):
100
+ def __init__(self, hidden_size, eps=1e-6):
101
+ super().__init__()
102
+ self.weight = nn.Parameter(torch.ones(hidden_size))
103
+ self.variance_epsilon = eps
104
+
105
+ def forward(self, hidden_states):
106
+ input_dtype = hidden_states.dtype
107
+ hidden_states = hidden_states.to(torch.float32)
108
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
109
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
110
+ return self.weight * hidden_states.to(input_dtype)
111
+
112
+
113
+ try:
114
+ from apex.normalization import FusedRMSNorm
115
+
116
+ InternRMSNorm = FusedRMSNorm # noqa
117
+
118
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
119
+ except ImportError:
120
+ # using the normal InternRMSNorm
121
+ pass
122
+ except Exception:
123
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
124
+ pass
125
+
126
+
127
+ NORM2FN = {
128
+ 'rms_norm': InternRMSNorm,
129
+ 'layer_norm': nn.LayerNorm,
130
+ }
131
+
132
+
133
+ class InternVisionEmbeddings(nn.Module):
134
+ def __init__(self, config: InternVisionConfig):
135
+ super().__init__()
136
+ self.config = config
137
+ self.embed_dim = config.hidden_size
138
+ self.image_size = config.image_size
139
+ self.patch_size = config.patch_size
140
+
141
+ self.class_embedding = nn.Parameter(
142
+ torch.randn(1, 1, self.embed_dim),
143
+ )
144
+
145
+ self.patch_embedding = nn.Conv2d(
146
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
147
+ )
148
+
149
+ self.num_patches = (self.image_size // self.patch_size) ** 2
150
+ self.num_positions = self.num_patches + 1
151
+
152
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
153
+
154
+ def _get_pos_embed(self, pos_embed, H, W):
155
+ target_dtype = pos_embed.dtype
156
+ pos_embed = pos_embed.float().reshape(
157
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
158
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
159
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
160
+ return pos_embed
161
+
162
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
163
+ target_dtype = self.patch_embedding.weight.dtype
164
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
165
+ batch_size, _, height, width = patch_embeds.shape
166
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
167
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
168
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
169
+ position_embedding = torch.cat([
170
+ self.position_embedding[:, :1, :],
171
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
172
+ ], dim=1)
173
+ embeddings = embeddings + position_embedding.to(target_dtype)
174
+ return embeddings
175
+
176
+
177
+ class InternAttention(nn.Module):
178
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
179
+
180
+ def __init__(self, config: InternVisionConfig):
181
+ super().__init__()
182
+ self.config = config
183
+ self.embed_dim = config.hidden_size
184
+ self.num_heads = config.num_attention_heads
185
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
186
+ if config.use_flash_attn and not has_flash_attn:
187
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
188
+ self.head_dim = self.embed_dim // self.num_heads
189
+ if self.head_dim * self.num_heads != self.embed_dim:
190
+ raise ValueError(
191
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
192
+ f' {self.num_heads}).'
193
+ )
194
+
195
+ self.scale = self.head_dim ** -0.5
196
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
197
+ self.attn_drop = nn.Dropout(config.attention_dropout)
198
+ self.proj_drop = nn.Dropout(config.dropout)
199
+
200
+ self.qk_normalization = config.qk_normalization
201
+
202
+ if self.qk_normalization:
203
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
204
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
205
+
206
+ if self.use_flash_attn:
207
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
208
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
209
+
210
+ def _naive_attn(self, x):
211
+ B, N, C = x.shape
212
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
213
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
214
+
215
+ if self.qk_normalization:
216
+ B_, H_, N_, D_ = q.shape
217
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
218
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
219
+
220
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
221
+ attn = attn.softmax(dim=-1)
222
+ attn = self.attn_drop(attn)
223
+
224
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
225
+ x = self.proj(x)
226
+ x = self.proj_drop(x)
227
+ return x
228
+
229
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
230
+ qkv = self.qkv(x)
231
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
232
+
233
+ if self.qk_normalization:
234
+ q, k, v = qkv.unbind(2)
235
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
236
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
237
+ qkv = torch.stack([q, k, v], dim=2)
238
+
239
+ context, _ = self.inner_attn(
240
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
241
+ )
242
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
243
+ outs = self.proj_drop(outs)
244
+ return outs
245
+
246
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
247
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
248
+ return x
249
+
250
+
251
+ class InternMLP(nn.Module):
252
+ def __init__(self, config: InternVisionConfig):
253
+ super().__init__()
254
+ self.config = config
255
+ self.act = ACT2FN[config.hidden_act]
256
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
257
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
258
+
259
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
260
+ hidden_states = self.fc1(hidden_states)
261
+ hidden_states = self.act(hidden_states)
262
+ hidden_states = self.fc2(hidden_states)
263
+ return hidden_states
264
+
265
+
266
+ class InternVisionEncoderLayer(nn.Module):
267
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
268
+ super().__init__()
269
+ self.embed_dim = config.hidden_size
270
+ self.intermediate_size = config.intermediate_size
271
+ self.norm_type = config.norm_type
272
+
273
+ self.attn = InternAttention(config)
274
+ self.mlp = InternMLP(config)
275
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
276
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
277
+
278
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
279
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
280
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
281
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
282
+
283
+ def forward(
284
+ self,
285
+ hidden_states: torch.Tensor,
286
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
287
+ """
288
+ Args:
289
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
290
+ """
291
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1)
292
+
293
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2)
294
+
295
+ return hidden_states
296
+
297
+
298
+ class InternVisionEncoder(nn.Module):
299
+ """
300
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
301
+ [`InternEncoderLayer`].
302
+
303
+ Args:
304
+ config (`InternConfig`):
305
+ The corresponding vision configuration for the `InternEncoder`.
306
+ """
307
+
308
+ def __init__(self, config: InternVisionConfig):
309
+ super().__init__()
310
+ self.config = config
311
+ # stochastic depth decay rule
312
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
313
+ self.layers = nn.ModuleList([
314
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
315
+ self.gradient_checkpointing = True
316
+
317
+ def forward(
318
+ self,
319
+ inputs_embeds,
320
+ output_hidden_states: Optional[bool] = None,
321
+ return_dict: Optional[bool] = None,
322
+ ) -> Union[Tuple, BaseModelOutput]:
323
+ r"""
324
+ Args:
325
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
326
+ Embedded representation of the inputs. Should be float, not int tokens.
327
+ output_hidden_states (`bool`, *optional*):
328
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
329
+ for more detail.
330
+ return_dict (`bool`, *optional*):
331
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
332
+ """
333
+ output_hidden_states = (
334
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
335
+ )
336
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
337
+
338
+ encoder_states = () if output_hidden_states else None
339
+ hidden_states = inputs_embeds
340
+
341
+ for idx, encoder_layer in enumerate(self.layers):
342
+ if output_hidden_states:
343
+ encoder_states = encoder_states + (hidden_states,)
344
+ if self.gradient_checkpointing and self.training:
345
+ layer_outputs = torch.utils.checkpoint.checkpoint(
346
+ encoder_layer,
347
+ hidden_states)
348
+ else:
349
+ layer_outputs = encoder_layer(
350
+ hidden_states,
351
+ )
352
+ hidden_states = layer_outputs
353
+
354
+ if output_hidden_states:
355
+ encoder_states = encoder_states + (hidden_states,)
356
+
357
+ if not return_dict:
358
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
359
+ return BaseModelOutput(
360
+ last_hidden_state=hidden_states, hidden_states=encoder_states
361
+ )
362
+
363
+
364
+ class InternVisionModel(PreTrainedModel):
365
+ main_input_name = 'pixel_values'
366
+ _supports_flash_attn_2 = True
367
+ config_class = InternVisionConfig
368
+ _no_split_modules = ['InternVisionEncoderLayer']
369
+
370
+ def __init__(self, config: InternVisionConfig):
371
+ super().__init__(config)
372
+ self.config = config
373
+
374
+ self.embeddings = InternVisionEmbeddings(config)
375
+ self.encoder = InternVisionEncoder(config)
376
+
377
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
378
+ pos_emb = self.embeddings.position_embedding
379
+ _, num_positions, embed_dim = pos_emb.shape
380
+ cls_emb = pos_emb[:, :1, :]
381
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
382
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
383
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
384
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
385
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
386
+ self.embeddings.image_size = new_size
387
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
388
+
389
+ def get_input_embeddings(self):
390
+ return self.embeddings
391
+
392
+ def forward(
393
+ self,
394
+ pixel_values: Optional[torch.FloatTensor] = None,
395
+ output_hidden_states: Optional[bool] = None,
396
+ return_dict: Optional[bool] = None,
397
+ pixel_embeds: Optional[torch.FloatTensor] = None,
398
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
399
+ output_hidden_states = (
400
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
401
+ )
402
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
403
+
404
+ if pixel_values is None and pixel_embeds is None:
405
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
406
+
407
+ if pixel_embeds is not None:
408
+ hidden_states = pixel_embeds
409
+ else:
410
+ if len(pixel_values.shape) == 4:
411
+ hidden_states = self.embeddings(pixel_values)
412
+ else:
413
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
414
+ encoder_outputs = self.encoder(
415
+ inputs_embeds=hidden_states,
416
+ output_hidden_states=output_hidden_states,
417
+ return_dict=return_dict,
418
+ )
419
+ last_hidden_state = encoder_outputs.last_hidden_state
420
+ pooled_output = last_hidden_state[:, 0, :]
421
+
422
+ if not return_dict:
423
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
424
+
425
+ return BaseModelOutputWithPooling(
426
+ last_hidden_state=last_hidden_state,
427
+ pooler_output=pooled_output,
428
+ hidden_states=encoder_outputs.hidden_states,
429
+ attentions=encoder_outputs.attentions,
430
+ )
modeling_internvl_chat.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import warnings
8
+ from typing import List, Optional, Tuple, Union
9
+
10
+ import torch.utils.checkpoint
11
+ import transformers
12
+ from torch import nn
13
+ from torch.nn import CrossEntropyLoss
14
+ from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM, TextStreamer)
15
+
16
+ import sys
17
+ sys.path.append('your_infer_path')
18
+ from qwen2.modeling_qwen2 import Qwen2ForCausalLM
19
+
20
+ from transformers.modeling_outputs import CausalLMOutputWithPast
21
+ from transformers.modeling_utils import PreTrainedModel
22
+ from transformers.utils import ModelOutput, logging
23
+
24
+ from .configuration_internvl_chat import InternVLChatConfig
25
+ from .conversation import get_conv_template
26
+ from .modeling_intern_vit import InternVisionModel, has_flash_attn
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ def version_cmp(v1, v2, op='eq'):
32
+ import operator
33
+
34
+ from packaging import version
35
+ op_func = getattr(operator, op)
36
+ return op_func(version.parse(v1), version.parse(v2))
37
+
38
+
39
+ class InternVLChatModel(PreTrainedModel):
40
+ config_class = InternVLChatConfig
41
+ main_input_name = 'pixel_values'
42
+ base_model_prefix = 'language_model'
43
+ _supports_flash_attn_2 = True
44
+ _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'Qwen2DecoderLayer']
45
+
46
+ def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
47
+ super().__init__(config)
48
+
49
+ assert version_cmp(transformers.__version__, '4.37.0', 'ge')
50
+ image_size = config.force_image_size or config.vision_config.image_size
51
+ patch_size = config.vision_config.patch_size
52
+ self.patch_size = patch_size
53
+ self.select_layer = config.select_layer
54
+ self.template = config.template
55
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
56
+ self.downsample_ratio = config.downsample_ratio
57
+ self.ps_version = config.ps_version
58
+ use_flash_attn = use_flash_attn if has_flash_attn else False
59
+ config.vision_config.use_flash_attn = True if use_flash_attn else False
60
+ config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
61
+
62
+ logger.info(f'num_image_token: {self.num_image_token}')
63
+ logger.info(f'ps_version: {self.ps_version}')
64
+ if vision_model is not None:
65
+ self.vision_model = vision_model
66
+ else:
67
+ self.vision_model = InternVisionModel(config.vision_config)
68
+ if language_model is not None:
69
+ self.language_model = language_model
70
+ else:
71
+ if config.llm_config.architectures[0] == 'LlamaForCausalLM':
72
+ self.language_model = LlamaForCausalLM(config.llm_config)
73
+ elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
74
+ self.language_model = Qwen2ForCausalLM(config.llm_config)
75
+ else:
76
+ raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
77
+
78
+ vit_hidden_size = config.vision_config.hidden_size
79
+ llm_hidden_size = config.llm_config.hidden_size
80
+
81
+ self.mlp1 = nn.Sequential(
82
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
83
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
84
+ nn.GELU(),
85
+ nn.Linear(llm_hidden_size, llm_hidden_size)
86
+ )
87
+
88
+ self.img_context_token_id = None
89
+ self.conv_template = get_conv_template(self.template)
90
+ self.system_message = self.conv_template.system_message
91
+
92
+ def forward(
93
+ self,
94
+ pixel_values: torch.FloatTensor,
95
+ input_ids: torch.LongTensor = None,
96
+ attention_mask: Optional[torch.Tensor] = None,
97
+ position_ids: Optional[torch.LongTensor] = None,
98
+ image_flags: Optional[torch.LongTensor] = None,
99
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
100
+ labels: Optional[torch.LongTensor] = None,
101
+ use_cache: Optional[bool] = None,
102
+ output_attentions: Optional[bool] = None,
103
+ output_hidden_states: Optional[bool] = None,
104
+ return_dict: Optional[bool] = None,
105
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
106
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
107
+
108
+ image_flags = image_flags.squeeze(-1)
109
+ input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
110
+
111
+ vit_embeds = self.extract_feature(pixel_values)
112
+ vit_embeds = vit_embeds[image_flags == 1]
113
+ vit_batch_size = pixel_values.shape[0]
114
+
115
+ B, N, C = input_embeds.shape
116
+ input_embeds = input_embeds.reshape(B * N, C)
117
+
118
+ if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
119
+ print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
120
+
121
+ input_ids = input_ids.reshape(B * N)
122
+ selected = (input_ids == self.img_context_token_id)
123
+ try:
124
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
125
+ except Exception as e:
126
+ vit_embeds = vit_embeds.reshape(-1, C)
127
+ print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
128
+ f'vit_embeds.shape={vit_embeds.shape}')
129
+ n_token = selected.sum()
130
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]
131
+
132
+ input_embeds = input_embeds.reshape(B, N, C)
133
+
134
+ outputs = self.language_model(
135
+ inputs_embeds=input_embeds,
136
+ attention_mask=attention_mask,
137
+ position_ids=position_ids,
138
+ past_key_values=past_key_values,
139
+ use_cache=use_cache,
140
+ output_attentions=output_attentions,
141
+ output_hidden_states=output_hidden_states,
142
+ return_dict=return_dict,
143
+ )
144
+ logits = outputs.logits
145
+
146
+ loss = None
147
+ if labels is not None:
148
+ # Shift so that tokens < n predict n
149
+ shift_logits = logits[..., :-1, :].contiguous()
150
+ shift_labels = labels[..., 1:].contiguous()
151
+ # Flatten the tokens
152
+ loss_fct = CrossEntropyLoss()
153
+ shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
154
+ shift_labels = shift_labels.view(-1)
155
+ # Enable model parallelism
156
+ shift_labels = shift_labels.to(shift_logits.device)
157
+ loss = loss_fct(shift_logits, shift_labels)
158
+
159
+ if not return_dict:
160
+ output = (logits,) + outputs[1:]
161
+ return (loss,) + output if loss is not None else output
162
+
163
+ return CausalLMOutputWithPast(
164
+ loss=loss,
165
+ logits=logits,
166
+ past_key_values=outputs.past_key_values,
167
+ hidden_states=outputs.hidden_states,
168
+ attentions=outputs.attentions,
169
+ )
170
+
171
+ def pixel_shuffle(self, x, scale_factor=0.5):
172
+ n, w, h, c = x.size()
173
+ # N, W, H, C --> N, W, H * scale, C // scale
174
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
175
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
176
+ x = x.permute(0, 2, 1, 3).contiguous()
177
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
178
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
179
+ int(c / (scale_factor * scale_factor)))
180
+ if self.ps_version == 'v1':
181
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
182
+ 'which results in a transposed image.')
183
+ else:
184
+ x = x.permute(0, 2, 1, 3).contiguous()
185
+ return x
186
+
187
+ def extract_feature(self, pixel_values):
188
+ if self.select_layer == -1:
189
+ vit_embeds = self.vision_model(
190
+ pixel_values=pixel_values,
191
+ output_hidden_states=False,
192
+ return_dict=True).last_hidden_state
193
+ else:
194
+ vit_embeds = self.vision_model(
195
+ pixel_values=pixel_values,
196
+ output_hidden_states=True,
197
+ return_dict=True).hidden_states[self.select_layer]
198
+ vit_embeds = vit_embeds[:, 1:, :]
199
+
200
+ h = w = int(vit_embeds.shape[1] ** 0.5)
201
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
202
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
203
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
204
+ vit_embeds = self.mlp1(vit_embeds)
205
+ return vit_embeds
206
+
207
+ def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
208
+ history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
209
+ IMG_CONTEXT_TOKEN='</tool_call>', verbose=False, image_counts=None):
210
+ if history is not None or return_history:
211
+ print('Now multi-turn chat is not supported in batch_chat.')
212
+ raise NotImplementedError
213
+
214
+ if image_counts is not None:
215
+ num_patches_list = image_counts
216
+ print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
217
+
218
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
219
+ self.img_context_token_id = img_context_token_id
220
+
221
+ if verbose and pixel_values is not None:
222
+ image_bs = pixel_values.shape[0]
223
+ print(f'dynamic ViT batch size: {image_bs}')
224
+
225
+ queries = []
226
+ for idx, num_patches in enumerate(num_patches_list):
227
+ question = questions[idx]
228
+ if pixel_values is not None and '<image>' not in question:
229
+ question = '<image>\n' + question
230
+ template = get_conv_template(self.template)
231
+ template.system_message = self.system_message
232
+ template.append_message(template.roles[0], question)
233
+ template.append_message(template.roles[1], None)
234
+ query = template.get_prompt()
235
+
236
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
237
+ query = query.replace('<image>', image_tokens, 1)
238
+ queries.append(query)
239
+
240
+ tokenizer.padding_side = 'left'
241
+ model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
242
+ input_ids = model_inputs['input_ids'].to(self.device)
243
+ attention_mask = model_inputs['attention_mask'].to(self.device)
244
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
245
+ generation_config['eos_token_id'] = eos_token_id
246
+ generation_output = self.generate(
247
+ pixel_values=pixel_values,
248
+ input_ids=input_ids,
249
+ attention_mask=attention_mask,
250
+ **generation_config
251
+ )
252
+ responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
253
+ responses = [response.split(template.sep.strip())[0].strip() for response in responses]
254
+ return responses
255
+
256
+ def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
257
+ num_patches_list=None, use_streamer=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='</tool_call>',
258
+ verbose=False):
259
+
260
+ if history is None and pixel_values is not None and '<image>' not in question:
261
+ question = '<image>\n' + question
262
+
263
+ if num_patches_list is None:
264
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
265
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
266
+
267
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
268
+ self.img_context_token_id = img_context_token_id
269
+
270
+ template = get_conv_template(self.template)
271
+ template.system_message = self.system_message
272
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
273
+
274
+ history = [] if history is None else history
275
+ for (old_question, old_answer) in history:
276
+ template.append_message(template.roles[0], old_question)
277
+ template.append_message(template.roles[1], old_answer)
278
+ template.append_message(template.roles[0], question)
279
+ template.append_message(template.roles[1], None)
280
+ query = template.get_prompt() + '<think>'
281
+ # print(query)
282
+
283
+ if verbose and pixel_values is not None:
284
+ image_bs = pixel_values.shape[0]
285
+ print(f'dynamic ViT batch size: {image_bs}')
286
+
287
+ for num_patches in num_patches_list:
288
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
289
+ query = query.replace('<image>', image_tokens, 1)
290
+
291
+ if use_streamer:
292
+ streamer = TextStreamer(tokenizer)
293
+ else:
294
+ streamer = None
295
+
296
+ model_inputs = tokenizer(query, return_tensors='pt')
297
+ input_ids = model_inputs['input_ids'].to(self.device)
298
+ attention_mask = model_inputs['attention_mask'].to(self.device)
299
+ generation_config['eos_token_id'] = eos_token_id
300
+ generation_output = self.generate(
301
+ pixel_values=pixel_values,
302
+ input_ids=input_ids,
303
+ attention_mask=attention_mask,
304
+ streamer=streamer,
305
+ **generation_config
306
+ )
307
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
308
+ response = response.split(template.sep.strip())[0].strip()
309
+ history.append((question, response))
310
+ if return_history:
311
+ return response, history
312
+ else:
313
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
314
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
315
+ if verbose:
316
+ print(query_to_print, response)
317
+ return response
318
+
319
+ @torch.no_grad()
320
+ def generate(
321
+ self,
322
+ pixel_values: Optional[torch.FloatTensor] = None,
323
+ input_ids: Optional[torch.FloatTensor] = None,
324
+ attention_mask: Optional[torch.LongTensor] = None,
325
+ visual_features: Optional[torch.FloatTensor] = None,
326
+ generation_config: Optional[GenerationConfig] = None,
327
+ output_hidden_states: Optional[bool] = None,
328
+ **generate_kwargs,
329
+ ) -> torch.LongTensor:
330
+
331
+ assert self.img_context_token_id is not None
332
+ if pixel_values is not None:
333
+ if visual_features is not None:
334
+ vit_embeds = visual_features
335
+ else:
336
+ vit_embeds = self.extract_feature(pixel_values)
337
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
338
+ B, N, C = input_embeds.shape
339
+ input_embeds = input_embeds.reshape(B * N, C)
340
+
341
+ input_ids = input_ids.reshape(B * N)
342
+ selected = (input_ids == self.img_context_token_id)
343
+ assert selected.sum() != 0
344
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
345
+
346
+ input_embeds = input_embeds.reshape(B, N, C)
347
+ else:
348
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
349
+
350
+ outputs = self.language_model.generate(
351
+ inputs_embeds=input_embeds,
352
+ attention_mask=attention_mask,
353
+ generation_config=generation_config,
354
+ output_hidden_states=output_hidden_states,
355
+ use_cache=True,
356
+ **generate_kwargs,
357
+ )
358
+
359
+ return outputs
special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<img>",
17
+ "</img>",
18
+ "<IMG_CONTEXT>",
19
+ "<quad>",
20
+ "</quad>",
21
+ "<ref>",
22
+ "</ref>",
23
+ "<box>",
24
+ "</box>",
25
+ "</think>",
26
+ "<think>"
27
+ ],
28
+ "eos_token": {
29
+ "content": "<|im_end|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- '' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" and not message.tool_calls %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n<think>\\n' }}\n{%- endif %}\n",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff