reubk committed (verified)
Commit f87c79f · 1 Parent(s): 414df2d

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model:
+ - allenai/MolmoE-1B-0924
+ ---
+
+ # MolmoE-1B-0924 NF4 Quant
+ Only the LLM portion was quantized; the CLIP vision encoder is left in its original precision.
+
+
+ See the base model for more information:
+
+ https://huggingface.co/allenai/MolmoE-1B-0924
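A minimal loading sketch for this quantized upload (assuming `bitsandbytes` and `accelerate` are installed): the NF4 settings are stored in `config.json`, so `from_pretrained` picks them up automatically. The local path `"."` mirrors the bundled `example.py`; substitute the Hub repo id when loading remotely.

```python
# Sketch: load the NF4-quantized checkpoint; the quantization settings come from config.json.
from transformers import AutoModelForCausalLM, AutoProcessor

load_path = "."  # or the Hub repo id of this upload

processor = AutoProcessor.from_pretrained(load_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    load_path,
    trust_remote_code=True,
    torch_dtype="auto",   # unquantized modules (e.g. the CLIP tower) keep their saved dtype
    device_map="auto",
)
```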
added_tokens.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "<im_col>": 100281,
+   "<im_end>": 100279,
+   "<im_patch>": 100280,
+   "<im_start>": 100278,
+   "<|image|>": 100282
+ }
config.json ADDED
@@ -0,0 +1,154 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoConfig": "config_molmoe.MolmoConfig",
4
+ "AutoModelForCausalLM": "modeling_molmoe.MolmoForCausalLM"
5
+ },
6
+ "activation_type": "swiglu",
7
+ "additional_vocab_size": 128,
8
+ "alibi": false,
9
+ "alibi_bias_max": 8.0,
10
+ "always_start_with_space": true,
11
+ "architectures": [
12
+ "OLMoForCausalLM"
13
+ ],
14
+ "attention_dropout": 0.0,
15
+ "attention_layer_norm": true,
16
+ "attention_layer_norm_with_affine": true,
17
+ "attention_type": "sdpa",
18
+ "attn_logit_softcapping": null,
19
+ "bias_for_layer_norm": false,
20
+ "block_group_size": 1,
21
+ "block_type": "moe",
22
+ "clip_qkv": null,
23
+ "crop_mode": "overlap-and-resize-c2",
24
+ "d_model": 2048,
25
+ "default_inference_len": 65,
26
+ "do_random_scale": false,
27
+ "embedding_dropout": 0.0,
28
+ "embedding_size": 50304,
29
+ "final_logit_softcapping": null,
30
+ "fix_image_input_idx": 2,
31
+ "float32_attention": true,
32
+ "gin_bindings": null,
33
+ "head_dim": null,
34
+ "image_feature_dropout": 0.0,
35
+ "image_padding_embed": "pad_and_partial_pad",
36
+ "image_pooling_2d": "attention-meanq",
37
+ "image_pooling_h": 2,
38
+ "image_pooling_w": 2,
39
+ "image_projector": "mlp",
40
+ "include_bias": false,
41
+ "init_cutoff_factor": 3.0,
42
+ "init_device": "meta",
43
+ "init_fn": "normal",
44
+ "init_std": 0.02,
45
+ "initializer_range": 0.02,
46
+ "layer_norm_eps": 1e-05,
47
+ "layer_norm_type": "rms",
48
+ "layer_norm_with_affine": true,
49
+ "llm_load_path": null,
50
+ "loss_token_weighting": "root_subsegments",
51
+ "low_cpu_fsdp": true,
52
+ "max_crops": 12,
53
+ "max_position_embeddings": 32768,
54
+ "max_sequence_length": 4096,
55
+ "message_formatting": "role",
56
+ "mlp_hidden_size": null,
57
+ "mlp_ratio": 1,
58
+ "model_type": "molmo",
59
+ "moe_capacity_factor": 1.25,
60
+ "moe_dropless": true,
61
+ "moe_interleave": false,
62
+ "moe_lbl_in_fp32": false,
63
+ "moe_log_expert_assignment": false,
64
+ "moe_loss_weight": 0.0,
65
+ "moe_mlp_impl": "sparse",
66
+ "moe_num_experts": 64,
67
+ "moe_shared_expert": false,
68
+ "moe_top_k": 8,
69
+ "moe_zloss_weight": 0.0,
70
+ "multi_query_attention": null,
71
+ "n_heads": 16,
72
+ "n_kv_heads": null,
73
+ "n_layers": 16,
74
+ "new_embedding_init_range": 0.02,
75
+ "norm_after": false,
76
+ "normalize_input_embeds": false,
77
+ "overlap_margins": [
78
+ 4,
79
+ 4
80
+ ],
81
+ "pad_to": null,
82
+ "pad_token_id": 1,
83
+ "pad_tokenizer": false,
84
+ "precision": "amp_bf16",
85
+ "prompt_override": null,
86
+ "prompt_type": "uber_model",
87
+ "qkv_bias": false,
88
+ "quantization_config": {
89
+ "_load_in_4bit": true,
90
+ "_load_in_8bit": false,
91
+ "bnb_4bit_compute_dtype": "float16",
92
+ "bnb_4bit_quant_storage": "uint8",
93
+ "bnb_4bit_quant_type": "nf4",
94
+ "bnb_4bit_use_double_quant": false,
95
+ "llm_int8_enable_fp32_cpu_offload": false,
96
+ "llm_int8_has_fp16_weight": false,
97
+ "llm_int8_skip_modules": [
98
+ "model.vision_backbone",
99
+ "model.transformer.ff_out",
100
+ "model.transformer.ln_f"
101
+ ],
102
+ "llm_int8_threshold": 6.0,
103
+ "load_in_4bit": true,
104
+ "load_in_8bit": false,
105
+ "quant_method": "bitsandbytes"
106
+ },
107
+ "query_pre_attn_scalar": 224,
108
+ "residual_dropout": 0.1,
109
+ "response_attention_dropout": 0.0,
110
+ "response_residual_dropout": 0.0,
111
+ "rope": true,
112
+ "rope_full_precision": true,
113
+ "rope_impl": "llama",
114
+ "rope_theta": 10000.0,
115
+ "scale_logits": false,
116
+ "system_prompt_kind": "demo_or_style",
117
+ "transformers_version": "4.45.0.dev0",
118
+ "unconditioned": false,
119
+ "use_cache": true,
120
+ "use_cls_feature": false,
121
+ "use_col_tokens": true,
122
+ "use_position_ids": true,
123
+ "vision_backbone": {
124
+ "attention_dropout": 0.0,
125
+ "fsdp_wrap": false,
126
+ "image_default_input_size": [
127
+ 336,
128
+ 336
129
+ ],
130
+ "image_dropout_rate": 0.0,
131
+ "image_emb_dim": 1024,
132
+ "image_head_dim": 64,
133
+ "image_mlp_activations": "quick_gelu",
134
+ "image_mlp_dim": 4096,
135
+ "image_model_type": "openai",
136
+ "image_norm_eps": 1e-05,
137
+ "image_num_heads": 16,
138
+ "image_num_key_value_heads": 16,
139
+ "image_num_layers": 23,
140
+ "image_num_pos": 577,
141
+ "image_patch_size": 14,
142
+ "image_pos_patch_size": 14,
143
+ "initializer_range": 0.02,
144
+ "residual_dropout": 0.0,
145
+ "resize_mode": "default"
146
+ },
147
+ "vit_layers": [
148
+ -2,
149
+ -9
150
+ ],
151
+ "vit_load_path": null,
152
+ "vocab_size": 50280,
153
+ "weight_tying": false
154
+ }
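The `quantization_config` block above records how the checkpoint was produced with bitsandbytes: 4-bit NF4 weights, float16 compute, no double quantization, and a skip list that leaves the CLIP vision backbone plus the final output projection and norm unquantized. A hedged sketch of an equivalent quantization of the base model (assuming `bitsandbytes` is installed; this mirrors the config above rather than the uploader's actual script):

```python
# Sketch: quantize the base MolmoE-1B-0924 checkpoint to NF4, LLM weights only.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    # Keep the vision backbone and the final output projection/norm in their original precision.
    llm_int8_skip_modules=[
        "model.vision_backbone",
        "model.transformer.ff_out",
        "model.transformer.ln_f",
    ],
)

model = AutoModelForCausalLM.from_pretrained(
    "allenai/MolmoE-1B-0924",
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)
model.save_pretrained("MolmoE-1B-0924-nf4")  # writes 4-bit weights plus this config
```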
config_molmoe - Copy.py ADDED
@@ -0,0 +1,670 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from dataclasses import asdict, dataclass, field
5
+ from enum import Enum
6
+ from glob import glob
7
+ from os import PathLike
8
+ from pathlib import Path
9
+ from typing import (
10
+ Any,
11
+ Dict,
12
+ Iterable,
13
+ List,
14
+ Optional,
15
+ Tuple,
16
+ Type,
17
+ TypeVar,
18
+ Union,
19
+ cast,
20
+ )
21
+
22
+ from transformers import PretrainedConfig
23
+
24
+
25
+ C = TypeVar("C", bound="BaseConfig")
26
+ D = TypeVar("D", bound="DictConfig|ListConfig")
27
+
28
+
29
+ PathOrStr = Union[str, PathLike]
30
+
31
+
32
+ class StrEnum(str, Enum):
33
+ """
34
+ This is equivalent to Python's :class:`enum.StrEnum` since version 3.11.
35
+ We include this here for compatibility with older versions of Python.
36
+ """
37
+
38
+ def __str__(self) -> str:
39
+ return self.value
40
+
41
+ def __repr__(self) -> str:
42
+ return f"'{str(self)}'"
43
+
44
+
45
+
46
+ class AttentionType(StrEnum):
47
+ sdpa = "sdpa"
48
+ direct = "direct"
49
+ flash = "flash"
50
+
51
+
52
+ class LayerNormType(StrEnum):
53
+ default = "default"
54
+ """
55
+ The default LayerNorm implementation, equivalent to PyTorch's built-in version.
56
+ """
57
+
58
+ low_precision = "low_precision"
59
+ """
60
+ A low-precision version of the default LayerNorm.
61
+ """
62
+
63
+ rms = "rms"
64
+ """
65
+ An RMSNorm implementation. When using ``torch.compile`` this is
66
+ probably the fastest implementation.
67
+ """
68
+
69
+ gemma_rms = "gemma_rms"
70
+ """
71
+ A GemmaRMSNorm implementation. When using ``torch.compile`` this is
72
+ probably the fastest implementation.
73
+ """
74
+
75
+
76
+ class ActivationType(StrEnum):
77
+ quick_gelu = "quick_gelu"
78
+ gelu = "gelu"
79
+ gelu_tanh = "gelu_tanh"
80
+ relu = "relu"
81
+ silu = "silu"
82
+ llama_geglu = "llama_geglu"
83
+ llama_geglu_tanh = "llama_geglu_tanh"
84
+ llama_swiglu = "llama_swiglu"
85
+ swiglu = "swiglu"
86
+
87
+
88
+ class BlockType(StrEnum):
89
+ sequential = "sequential"
90
+
91
+ llama = "llama"
92
+ """
93
+ A block similar to the sequential block with slightly different
94
+ implementations of operations like attention to imitate the behavior of Llama.
95
+ """
96
+
97
+ gemma = "gemma"
98
+ """
99
+ A block similar to the sequential block with slightly different
100
+ implementations of operations like attention to imitate the behavior of Gemma.
101
+ """
102
+
103
+ moe = "moe"
104
+
105
+
106
+ class InitFnType(StrEnum):
107
+ mitchell = "mitchell"
108
+ """
109
+ The strategy suggested to us by Mitchell Wortsman from UW.
110
+ This uses a truncated normal distribution with an adaptive standard deviation that depends
111
+ on the size of the weights as well as the depth of the layer.
112
+ """
113
+
114
+ normal = "normal"
115
+ """
116
+ All weights are initialized from the same normal distribution.
117
+ """
118
+
119
+ kaiming_normal = "kaiming_normal"
120
+ """
121
+ All weights are initialized with the Kaiming method from a normal distribution.
122
+ Note this currently won't work with FSDP.
123
+ """
124
+
125
+ fan_in = "fan_in"
126
+ """
127
+ "Fan-in variance scaling", i.e. normal with a standard deviation of ``1/sqrt(d_in)`` where ``d_in``
128
+ is the input dimensionality of the kernel.
129
+ """
130
+
131
+ full_megatron = "full_megatron"
132
+ """
133
+ This is what metaseq calls "full megatron init". It is the init used for Llama 2.
134
+ """
135
+
136
+
137
+ class VisionBackboneType(StrEnum):
138
+ openai = "openai"
139
+
140
+
141
+ class ImagePaddingEmbed(StrEnum):
142
+ pad_and_partial_pad = "pad_and_partial_pad"
143
+ pad_embed = "pad_embed"
144
+ regress = "regress"
145
+
146
+
147
+ class ImagePooling2DType(StrEnum):
148
+ attention = "attention"
149
+ attention_meanq = "attention-meanq"
150
+ attention_2wide = "attention_2wide"
151
+ attention_v2 = "attention-v2"
152
+ none = "none"
153
+ stack = "stack"
154
+
155
+
156
+ class ImageProjectType(StrEnum):
157
+ mlp = "mlp"
158
+ mlpx2 = "2mlp"
159
+ linear = "linear"
160
+
161
+
162
+ @dataclass
163
+ class VisionBackboneConfig:
164
+ image_model_type: VisionBackboneType = VisionBackboneType.openai
165
+ image_default_input_size: Tuple[int, int] = (336, 336)
166
+ image_patch_size: int = 14
167
+ image_pos_patch_size: int = 14
168
+ image_emb_dim: int = 1024
169
+ image_num_heads: int = 16
170
+ image_num_key_value_heads: int = 16
171
+ image_num_layers: int = 24
172
+ image_head_dim: int = 64
173
+ image_mlp_dim: int = 4096
174
+ image_mlp_activations: ActivationType = ActivationType.gelu
175
+ image_dropout_rate: float = 0.0
176
+ image_num_pos: int = 577
177
+ image_norm_eps: float = 1e-5
178
+ attention_dropout: float = 0.0
179
+ residual_dropout: float = 0.0
180
+ initializer_range: float = 0.02
181
+ fsdp_wrap: bool = False
182
+
183
+ # how to preprocess images for this ViT
184
+ resize_mode: str = "default"
185
+
186
+ def __post_init__(self):
187
+ self.image_default_input_size = tuple(self.image_default_input_size) # type: ignore[assignment]
188
+
189
+ @property
190
+ def image_num_patch(self):
191
+ h, w = self.image_default_input_size
192
+ return h // self.image_patch_size, w // self.image_patch_size
193
+
194
+
195
+ class TruncationDirection(StrEnum):
196
+ right = "right"
197
+ left = "left"
198
+
199
+
200
+ @dataclass
201
+ class ModelConfig:
202
+ """
203
+ OLMo (model) configuration.
204
+ """
205
+
206
+ # Note that the defaults for these attributes are equivalent to the base GPT2 model.
207
+
208
+ d_model: int = 768
209
+ """
210
+ The hidden size of the model.
211
+ """
212
+
213
+ n_heads: int = 12
214
+ """
215
+ The number of self-attention heads.
216
+ """
217
+
218
+ n_kv_heads: Optional[int] = None
219
+ """
220
+ The number of heads to use for keys and values. Defaults to `n_heads`.
221
+ Set this to ``None`` or ``n_heads`` for normal multi-head attention.
222
+ Set this to 1 for multi-query attention.
223
+ Set it to some in-between value for Llama2-style grouped query attention.
224
+ """
225
+
226
+ qkv_bias: bool = False # qwen models use bias in kvq layers
227
+
228
+ clip_qkv: Optional[float] = None
229
+ """
230
+ Clip QKV to this value when set.
231
+ """
232
+
233
+ n_layers: int = 12
234
+ """
235
+ The number of layers/blocks.
236
+ """
237
+
238
+ mlp_ratio: int = 4
239
+ """
240
+ The ratio of the inner MLP dimensionality to ``d_model``.
241
+ This is only used when ``mlp_hidden_size`` is not set.
242
+ """
243
+
244
+ mlp_hidden_size: Optional[int] = None
245
+ """
246
+ Set the exact hidden size for the MLP. Otherwise the inner MLP hidden size will be set to `mlp_ratio * d_model`.
247
+ """
248
+
249
+ activation_type: ActivationType = ActivationType.swiglu
250
+ """
251
+ The activation function to use within the MLP layers.
252
+ """
253
+
254
+ block_type: BlockType = BlockType.sequential
255
+ """
256
+ The transformer block implementation.
257
+ """
258
+
259
+ block_group_size: int = 1
260
+ """
261
+ The number of blocks to group together into a single parent block.
262
+ This has no effect on the number of parameters in the model and is only used to wrap groups
263
+ of blocks together with a single FSDP wrapper during training.
264
+ """
265
+
266
+ alibi: bool = False
267
+ """
268
+ If ``True``, use ALiBi embeddings. Mutually exclusive with ``rope``.
269
+ """
270
+
271
+ alibi_bias_max: float = 8.0
272
+ """
273
+ Maximum absolute value of ALiBi bias.
274
+ """
275
+
276
+ rope: bool = False
277
+ """
278
+ Use rotary positional embeddings (RoPE). Mutually exclusive with ``alibi``.
279
+ """
280
+
281
+ rope_full_precision: bool = True
282
+ """
283
+ If ``True``, apply RoPE embeddings at full precision regardless of the input type. Otherwise,
284
+ apply RoPE at the precision of the input.
285
+ """
286
+
287
+ rope_theta: float = 10000.
288
+
289
+ rope_impl: str = "cockatoo"
290
+
291
+ vit_load_path: Optional[str] = None
292
+ """
293
+ Use this to load the vit model.
294
+ """
295
+
296
+ llm_load_path: Optional[str] = None
297
+ """
298
+ Use this to partially load the llm transformer.
299
+ """
300
+
301
+ low_cpu_fsdp: bool = True
302
+ """
303
+ If ``True``, we save CPU memory by loading the pretrained vision model on rank 0 only
304
+ when init_device is `meta`.
305
+ If TrainConfig.load_path is set, this should be set to ``False`` (default: True)
306
+ """
307
+
308
+ attention_type: AttentionType = AttentionType.sdpa
309
+ """
310
+ Attention implementation to use.
311
+ """
312
+
313
+ float32_attention: bool = True
314
+ """
315
+ Compute attention in float32
316
+ """
317
+
318
+ attention_dropout: float = 0.1
319
+ """
320
+ The dropout probability within the attention modules.
321
+ """
322
+
323
+ # Only apply dropout to response tokens
324
+ response_attention_dropout: float = 0.0
325
+
326
+ multi_query_attention: Optional[bool] = None
327
+ """
328
+ Deprecated. Use n_kv_heads instead.
329
+ """
330
+
331
+ attention_layer_norm: bool = False
332
+ """
333
+ Apply layer norm to the keys and queries within the attention mechanism.
334
+ This can help stabilize training.
335
+ """
336
+
337
+ residual_dropout: float = 0.1
338
+ """
339
+ The dropout probability for the MLP and attention output within each block.
340
+ """
341
+
342
+ # Only apply dropout to response tokens
343
+ response_residual_dropout: float = 0.0
344
+
345
+ embedding_dropout: float = 0.1
346
+ """
347
+ The dropout probability for embeddings.
348
+ """
349
+
350
+ layer_norm_type: LayerNormType = LayerNormType.default
351
+ """
352
+ The layernorm implementation to use.
353
+ """
354
+
355
+ layer_norm_with_affine: bool = True
356
+ """
357
+ Whether to include bias and weight parameters for the layer norms.
358
+ This only affects layer norms that are immediately followed by a linear layer in the forward pass,
359
+ so everything except QK-norms. To turn off affines for QK norms as well, set :attr:`attention_layer_norm_with_affine`
360
+ to ``False``.
361
+ """
362
+
363
+ layer_norm_eps: Optional[float] = None
364
+
365
+ attention_layer_norm_with_affine: bool = True
366
+ """
367
+ Toggle affine transform for the QK norms.
368
+ """
369
+
370
+ max_sequence_length: int = 1024
371
+ """
372
+ The maximum input sequence length supported by the model.
373
+ """
374
+
375
+ max_position_embeddings: Optional[int] = None
376
+
377
+ include_bias: bool = True
378
+ """
379
+ Whether or not to include bias parameters in linear layers.
380
+ In PaLM, they got rid of all bias terms because they found that large
381
+ models tend to have near 0 bias terms anyway.
382
+ """
383
+
384
+ bias_for_layer_norm: Optional[bool] = None
385
+ """
386
+ Whether or not to include bias parameters in layer norm.
387
+ This is separate from the include_bias parameter, because of a ROCm crash when biases are disabled in
388
+ layer norm.
389
+ When this is None (the default), it inherits the setting from include_bias.
390
+ """
391
+
392
+ scale_logits: bool = False
393
+ """
394
+ If ``True``, scale the output logits by ``1 / sqrt(d_model)``.
395
+ """
396
+
397
+ vocab_size: int = 50257
398
+ """
399
+ Vocabulary size of the model.
400
+ """
401
+
402
+ embedding_size: Optional[int] = 50304
403
+ """
404
+ The number of embeddings, i.e. the number of tokens. If set to ``None`` it will default
405
+ to ``vocab_size``. If ``vocab_size`` is not a multiple of 128, setting this to the
406
+ next multiple of 128 that's greater than ``vocab_size`` can improve throughput
407
+ substantially.
408
+ """
409
+
410
+ # For new special tokens
411
+ additional_vocab_size: Optional[int] = None
412
+
413
+ new_embedding_init_range: float = 0.02
414
+ """
415
+ The range used to initialize the embeddings for new (additional) tokens.
416
+ """
417
+
418
+ weight_tying: bool = True
419
+ """
420
+ Whether to tie output linear weights to the input embedding.
421
+ """
422
+
423
+ pad_token_id: int = -1
424
+ """
425
+ The ID of the token to use for padding. Defaults to the ID of the EOS token.
426
+ """
427
+
428
+ init_device: Optional[str] = None
429
+ """
430
+ The torch device to use when initializing the model parameters, e.g. "cpu", "cuda:0", "meta".
431
+ """
432
+
433
+ init_fn: InitFnType = InitFnType.normal
434
+ """
435
+ The weight initialization strategy.
436
+ """
437
+
438
+ init_std: float = 0.02
439
+ """
440
+ The standard deviation to use when initializing weights with a "fixed distribution" ``init_fn``, such
441
+ as "normal".
442
+ """
443
+
444
+ init_cutoff_factor: Optional[float] = None
445
+ """
446
+ A positive factor used to scale the cutoff values when initializing weights with a "fixed distribution" ``init_fn``, such
447
+ as "normal". Setting this to None means values are not cut off.
448
+ """
449
+
450
+ norm_after: bool = False
451
+ """
452
+ Apply norm after the attention/feedforward layers rather than before, as introduced in the Swin transformer paper (Liu et al).
453
+ """
454
+
455
+ precision: Optional[str] = None
456
+ """
457
+ Precision used to train/evaluate with. You shouldn't set this directly.
458
+ See :data:`TrainConfig.precision` instead.
459
+ """
460
+
461
+ moe_num_experts: Optional[int] = 8
462
+ """
463
+ The number of experts to use in the MoE block.
464
+ """
465
+
466
+ moe_top_k: Optional[int] = 2
467
+ """
468
+ The number of experts to select for each token.
469
+ """
470
+
471
+ moe_mlp_impl: Optional[str] = "sparse"
472
+ """
473
+ Choose "grouped" for grouped GEMM installable via `pip install git+https://[email protected]/tgale96/grouped_gemm.git@66c7195e35e8c4f22fa6a014037ef511bfa397cb`.
474
+ """
475
+
476
+ moe_log_expert_assignment: Optional[bool] = False
477
+ """
478
+ Whether to log the expert assignment.
479
+ """
480
+
481
+ moe_shared_expert: Optional[bool] = False
482
+ """
483
+ Whether to have an always-used expert like in [DeepSeekMoE](https://arxiv.org/abs/2401.06066).
484
+ """
485
+
486
+ moe_lbl_in_fp32: Optional[bool] = False
487
+ """
488
+ Whether to perform load balancing in FP32.
489
+ """
490
+
491
+ moe_interleave: Optional[bool] = False
492
+ """
493
+ Interleave sequential with MoE blocks starting with sequential.
494
+ """
495
+
496
+ moe_loss_weight: Optional[float] = 0.1
497
+ """
498
+ The weight to use for the MoE load balancing loss.
499
+ """
500
+
501
+ moe_zloss_weight: Optional[float] = None
502
+ """
503
+ Weight for MoE router z-loss where None means no router z-loss. 0.001 is a common value.
504
+ """
505
+
506
+ moe_dropless: Optional[bool] = True
507
+ """
508
+ Whether to use [dMoE](https://arxiv.org/abs/2211.15841).
509
+ """
510
+
511
+ moe_capacity_factor: Optional[float] = 1.25
512
+ """
513
+ The capacity factor to use in the MoE block. Only applies if not using dMoE.
514
+ """
515
+
516
+ # Image pre-processing options.
517
+ max_crops: int = 12
518
+
519
+ crop_mode: str = "patchify-v2-and-resize-c2"
520
+
521
+ do_random_scale: bool = True
522
+
523
+ use_col_tokens: bool = True
524
+
525
+ # How to prompt the model
526
+ prompt_type: str = "none"
527
+
528
+ # System prompt to use
529
+ system_prompt_kind: str = "style"
530
+
531
+ # How to format messages
532
+ message_formatting: str = "none"
533
+
534
+ always_start_with_space: bool = True
535
+
536
+ prompt_override: Optional[str] = None
537
+
538
+ default_inference_len: Optional[int] = 65
539
+
540
+ overlap_margins: Tuple[int, int] = (4, 4)
541
+
542
+ image_padding_embed: Optional[ImagePaddingEmbed] = None
543
+
544
+ # What layers to get from the image encoder
545
+ vit_layers: Tuple = (-1,)
546
+
547
+ # Controls the image/language connector
548
+ image_pooling_h: int = 2
549
+
550
+ image_pooling_w: int = 2
551
+
552
+ image_pooling_2d: ImagePooling2DType = ImagePooling2DType.attention
553
+
554
+ image_projector: ImageProjectType = ImageProjectType.mlp
555
+
556
+ image_feature_dropout: float = 0.0
557
+
558
+ use_cls_feature: bool = False
559
+
560
+ fix_image_input_idx: int = 2
561
+
562
+ # Makes the model ignore the image
563
+ unconditioned: bool = False
564
+
565
+ # Use in combination with sub-sequence experts to make image/text tokens always
566
+ # occupy particular sub-sequences of the input
567
+ pad_to: Optional[int] = None
568
+
569
+ # LLM Transformer settings
570
+ initializer_range: float = 0.02
571
+
572
+ pad_tokenizer: bool = False
573
+
574
+ normalize_input_embeds: bool = False
575
+
576
+ use_position_ids: bool = True
577
+ """
578
+ Whether to use position IDs in the model.
579
+ The model operation regarding positional embeddings changes depending on this variable.
580
+ """
581
+
582
+ query_pre_attn_scalar: int = 224
583
+ """
584
+ Scalar to apply to the queries before attention.
585
+ Used for Gemma-2.
586
+ """
587
+
588
+ attn_logit_softcapping: Optional[float] = None
589
+ """
590
+ Softcap the logits in the attention mechanism.
591
+ Used for Gemma-2.
592
+ """
593
+
594
+ final_logit_softcapping: Optional[float] = None
595
+ """
596
+ Softcap the final logits.
597
+ Used for Gemma-2.
598
+ """
599
+
600
+ head_dim: Optional[int] = None
601
+ """
602
+ The head dimensionality for the attention mechanism.
603
+ Used for Gemma-2.
604
+ """
605
+
606
+ loss_token_weighting: Optional[str] = None
607
+
608
+ gin_bindings: Optional[str] = None
609
+
610
+
611
+ class MolmoConfig(PretrainedConfig):
612
+ model_type = "molmo"
613
+ keys_to_ignore_at_inference = ["past_key_values"] # TODO: confirm
614
+
615
+ def __init__(self, use_cache: bool = False, **kwargs):
616
+ model_config = ModelConfig()
617
+ all_kwargs = asdict(model_config)
618
+ all_kwargs.update(kwargs)
619
+ all_kwargs.update({"use_cache": use_cache})
620
+ all_kwargs.update(
621
+ {"architectures": all_kwargs.get("architectures", ["OLMoForCausalLM"]) or ["OLMoForCausalLM"]}
622
+ )
623
+ super().__init__(**all_kwargs)
624
+
625
+ @property
626
+ def num_attention_heads(self):
627
+ return self.n_heads
628
+
629
+ @property
630
+ def num_hidden_layers(self):
631
+ return self.n_layers
632
+
633
+ @property
634
+ def hidden_size(self):
635
+ return self.d_model
636
+
637
+ @property
638
+ def image_num_patch(self):
639
+ h, w = (336, 336)
640
+ return h // 14, w // 14
641
+
642
+ @property
643
+ def llm_patches_per_crop(self):
644
+ h, w = self.image_num_patch
645
+ # Round up in case we need to pad the image features for pooling
646
+ h = (h + self.image_pooling_h - 1) // self.image_pooling_h
647
+ w = (w + self.image_pooling_w - 1) // self.image_pooling_w
648
+ return h, w
649
+
650
+ @property
651
+ def effective_n_kv_heads(self) -> int:
652
+ if self.n_kv_heads is None:
653
+ if self.multi_query_attention is True:
654
+ return 1
655
+ else:
656
+ return self.n_heads
657
+ else:
658
+ if self.multi_query_attention is None:
659
+ return self.n_kv_heads
660
+ if self.multi_query_attention:
661
+ n_kv_heads_should_be = 1
662
+ else:
663
+ n_kv_heads_should_be = self.n_heads
664
+ if self.n_kv_heads == n_kv_heads_should_be:
665
+ return n_kv_heads_should_be
666
+ else:
667
+ raise ValueError(
668
+ "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
669
+ )
670
+
config_molmoe.py ADDED
@@ -0,0 +1,670 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from dataclasses import asdict, dataclass, field
5
+ from enum import Enum
6
+ from glob import glob
7
+ from os import PathLike
8
+ from pathlib import Path
9
+ from typing import (
10
+ Any,
11
+ Dict,
12
+ Iterable,
13
+ List,
14
+ Optional,
15
+ Tuple,
16
+ Type,
17
+ TypeVar,
18
+ Union,
19
+ cast,
20
+ )
21
+
22
+ from transformers import PretrainedConfig
23
+
24
+
25
+ C = TypeVar("C", bound="BaseConfig")
26
+ D = TypeVar("D", bound="DictConfig|ListConfig")
27
+
28
+
29
+ PathOrStr = Union[str, PathLike]
30
+
31
+
32
+ class StrEnum(str, Enum):
33
+ """
34
+ This is equivalent to Python's :class:`enum.StrEnum` since version 3.11.
35
+ We include this here for compatibility with older versions of Python.
36
+ """
37
+
38
+ def __str__(self) -> str:
39
+ return self.value
40
+
41
+ def __repr__(self) -> str:
42
+ return f"'{str(self)}'"
43
+
44
+
45
+
46
+ class AttentionType(StrEnum):
47
+ sdpa = "sdpa"
48
+ direct = "direct"
49
+ flash = "flash"
50
+
51
+
52
+ class LayerNormType(StrEnum):
53
+ default = "default"
54
+ """
55
+ The default LayerNorm implementation, equivalent to PyTorch's built-in version.
56
+ """
57
+
58
+ low_precision = "low_precision"
59
+ """
60
+ A low-precision version of the default LayerNorm.
61
+ """
62
+
63
+ rms = "rms"
64
+ """
65
+ An RMSNorm implementation. When using ``torch.compile`` this is
66
+ probably the fastest implementation.
67
+ """
68
+
69
+ gemma_rms = "gemma_rms"
70
+ """
71
+ A GemmaRMSNorm implementation. When using ``torch.compile`` this is
72
+ probably the fastest implementation.
73
+ """
74
+
75
+
76
+ class ActivationType(StrEnum):
77
+ quick_gelu = "quick_gelu"
78
+ gelu = "gelu"
79
+ gelu_tanh = "gelu_tanh"
80
+ relu = "relu"
81
+ silu = "silu"
82
+ llama_geglu = "llama_geglu"
83
+ llama_geglu_tanh = "llama_geglu_tanh"
84
+ llama_swiglu = "llama_swiglu"
85
+ swiglu = "swiglu"
86
+
87
+
88
+ class BlockType(StrEnum):
89
+ sequential = "sequential"
90
+
91
+ llama = "llama"
92
+ """
93
+ A block similar to the sequential block with slightly different
94
+ implementations of operations like attention to imitate the behavior of Llama.
95
+ """
96
+
97
+ gemma = "gemma"
98
+ """
99
+ A block similar to the sequential block with slightly different
100
+ implementations of operations like attention to imitate the behavior of Gemma.
101
+ """
102
+
103
+ moe = "moe"
104
+
105
+
106
+ class InitFnType(StrEnum):
107
+ mitchell = "mitchell"
108
+ """
109
+ The strategy suggested to us by Mitchell Wortsman from UW.
110
+ This uses a truncated normal distribution with an adaptive standard deviation that depends
111
+ on the size of the weights as well as the depth of the layer.
112
+ """
113
+
114
+ normal = "normal"
115
+ """
116
+ All weights are initialized from the same normal distribution.
117
+ """
118
+
119
+ kaiming_normal = "kaiming_normal"
120
+ """
121
+ All weights are initialized with the Kaiming method from a normal distribution.
122
+ Note this currently won't work with FSDP.
123
+ """
124
+
125
+ fan_in = "fan_in"
126
+ """
127
+ "Fan-in variance scaling", i.e. normal with a standard deviation of ``1/sqrt(d_in)`` where ``d_in``
128
+ is the input dimensionality of the kernel.
129
+ """
130
+
131
+ full_megatron = "full_megatron"
132
+ """
133
+ This is what metaseq calls "full megatron init". It is the init used for Llama 2.
134
+ """
135
+
136
+
137
+ class VisionBackboneType(StrEnum):
138
+ openai = "openai"
139
+
140
+
141
+ class ImagePaddingEmbed(StrEnum):
142
+ pad_and_partial_pad = "pad_and_partial_pad"
143
+ pad_embed = "pad_embed"
144
+ regress = "regress"
145
+
146
+
147
+ class ImagePooling2DType(StrEnum):
148
+ attention = "attention"
149
+ attention_meanq = "attention-meanq"
150
+ attention_2wide = "attention_2wide"
151
+ attention_v2 = "attention-v2"
152
+ none = "none"
153
+ stack = "stack"
154
+
155
+
156
+ class ImageProjectType(StrEnum):
157
+ mlp = "mlp"
158
+ mlpx2 = "2mlp"
159
+ linear = "linear"
160
+
161
+
162
+ @dataclass
163
+ class VisionBackboneConfig:
164
+ image_model_type: VisionBackboneType = VisionBackboneType.openai
165
+ image_default_input_size: Tuple[int, int] = (336, 336)
166
+ image_patch_size: int = 14
167
+ image_pos_patch_size: int = 14
168
+ image_emb_dim: int = 1024
169
+ image_num_heads: int = 16
170
+ image_num_key_value_heads: int = 16
171
+ image_num_layers: int = 24
172
+ image_head_dim: int = 64
173
+ image_mlp_dim: int = 4096
174
+ image_mlp_activations: ActivationType = ActivationType.gelu
175
+ image_dropout_rate: float = 0.0
176
+ image_num_pos: int = 577
177
+ image_norm_eps: float = 1e-5
178
+ attention_dropout: float = 0.0
179
+ residual_dropout: float = 0.0
180
+ initializer_range: float = 0.02
181
+ fsdp_wrap: bool = False
182
+
183
+ # how to preprocess images for this ViT
184
+ resize_mode: str = "default"
185
+
186
+ def __post_init__(self):
187
+ self.image_default_input_size = tuple(self.image_default_input_size) # type: ignore[assignment]
188
+
189
+ @property
190
+ def image_num_patch(self):
191
+ h, w = self.image_default_input_size
192
+ return h // self.image_patch_size, w // self.image_patch_size
193
+
194
+
195
+ class TruncationDirection(StrEnum):
196
+ right = "right"
197
+ left = "left"
198
+
199
+
200
+ @dataclass
201
+ class ModelConfig:
202
+ """
203
+ OLMo (model) configuration.
204
+ """
205
+
206
+ # Note that the defaults for these attributes are equivalent to the base GPT2 model.
207
+
208
+ d_model: int = 768
209
+ """
210
+ The hidden size of the model.
211
+ """
212
+
213
+ n_heads: int = 12
214
+ """
215
+ The number of self-attention heads.
216
+ """
217
+
218
+ n_kv_heads: Optional[int] = None
219
+ """
220
+ The number of heads to use for keys and values. Defaults to `n_heads`.
221
+ Set this to ``None`` or ``n_heads`` for normal multi-head attention.
222
+ Set this to 1 for multi-query attention.
223
+ Set it to some in-between value for Llama2-style grouped query attention.
224
+ """
225
+
226
+ qkv_bias: bool = False # qwen models use bias in kvq layers
227
+
228
+ clip_qkv: Optional[float] = None
229
+ """
230
+ Clip QKV to this value when set.
231
+ """
232
+
233
+ n_layers: int = 12
234
+ """
235
+ The number of layers/blocks.
236
+ """
237
+
238
+ mlp_ratio: int = 4
239
+ """
240
+ The ratio of the inner MLP dimensionality to ``d_model``.
241
+ This is only used when ``mlp_hidden_size`` is not set.
242
+ """
243
+
244
+ mlp_hidden_size: Optional[int] = None
245
+ """
246
+ Set the exact hidden size for the MLP. Otherwise the inner MLP hidden size will be set to `mlp_ratio * d_model`.
247
+ """
248
+
249
+ activation_type: ActivationType = ActivationType.swiglu
250
+ """
251
+ The activation function to use within the MLP layers.
252
+ """
253
+
254
+ block_type: BlockType = BlockType.sequential
255
+ """
256
+ The transformer block implementation.
257
+ """
258
+
259
+ block_group_size: int = 1
260
+ """
261
+ The number of blocks to group together into a single parent block.
262
+ This has no effect on the number of parameters in the model and is only used to wrap groups
263
+ of blocks together with a single FSDP wrapper during training.
264
+ """
265
+
266
+ alibi: bool = False
267
+ """
268
+ If ``True``, use ALiBi embeddings. Mutually exclusive with ``rope``.
269
+ """
270
+
271
+ alibi_bias_max: float = 8.0
272
+ """
273
+ Maximum absolute value of ALiBi bias.
274
+ """
275
+
276
+ rope: bool = False
277
+ """
278
+ Use rotary positional embeddings (RoPE). Mutually exclusive with ``alibi``.
279
+ """
280
+
281
+ rope_full_precision: bool = True
282
+ """
283
+ If ``True``, apply RoPE embeddings at full precision regardless of the input type. Otherwise,
284
+ apply RoPE at the precision of the input.
285
+ """
286
+
287
+ rope_theta: float = 10000.
288
+
289
+ rope_impl: str = "cockatoo"
290
+
291
+ vit_load_path: Optional[str] = None
292
+ """
293
+ Use this to load the vit model.
294
+ """
295
+
296
+ llm_load_path: Optional[str] = None
297
+ """
298
+ Use this to partially load the llm transformer.
299
+ """
300
+
301
+ low_cpu_fsdp: bool = True
302
+ """
303
+ If ``True``, we save CPU memory by loading the pretrained vision model on rank 0 only
304
+ when init_device is `meta`.
305
+ If TrainConfig.load_path is set, this should be set to ``False`` (default: True)
306
+ """
307
+
308
+ attention_type: AttentionType = AttentionType.sdpa
309
+ """
310
+ Attention implementation to use.
311
+ """
312
+
313
+ float32_attention: bool = True
314
+ """
315
+ Compute attention in float32
316
+ """
317
+
318
+ attention_dropout: float = 0.1
319
+ """
320
+ The dropout probability within the attention modules.
321
+ """
322
+
323
+ # Only apply dropout to response tokens
324
+ response_attention_dropout: float = 0.0
325
+
326
+ multi_query_attention: Optional[bool] = None
327
+ """
328
+ Deprecated. Use n_kv_heads instead.
329
+ """
330
+
331
+ attention_layer_norm: bool = False
332
+ """
333
+ Apply layer norm to the keys and queries within the attention mechanism.
334
+ This can help stabilize training.
335
+ """
336
+
337
+ residual_dropout: float = 0.1
338
+ """
339
+ The dropout probability for the MLP and attention output within each block.
340
+ """
341
+
342
+ # Only apply dropout to response tokens
343
+ response_residual_dropout: float = 0.0
344
+
345
+ embedding_dropout: float = 0.1
346
+ """
347
+ The dropout probability for embeddings.
348
+ """
349
+
350
+ layer_norm_type: LayerNormType = LayerNormType.default
351
+ """
352
+ The layernorm implementation to use.
353
+ """
354
+
355
+ layer_norm_with_affine: bool = True
356
+ """
357
+ Whether to include bias and weight parameters for the layer norms.
358
+ This only affects layer norms that are immediately followed by a linear layer in the forward pass,
359
+ so everything except QK-norms. To turn off affines for QK norms as well, set :attr:`attention_layer_norm_with_affine`
360
+ to ``False``.
361
+ """
362
+
363
+ layer_norm_eps: Optional[float] = None
364
+
365
+ attention_layer_norm_with_affine: bool = True
366
+ """
367
+ Toggle affine transform for the QK norms.
368
+ """
369
+
370
+ max_sequence_length: int = 1024
371
+ """
372
+ The maximum input sequence length supported by the model.
373
+ """
374
+
375
+ max_position_embeddings: Optional[int] = None
376
+
377
+ include_bias: bool = True
378
+ """
379
+ Whether or not to include bias parameters in linear layers.
380
+ In PaLM, they got rid of all bias terms because they found that large
381
+ models tend to have near 0 bias terms anyway.
382
+ """
383
+
384
+ bias_for_layer_norm: Optional[bool] = None
385
+ """
386
+ Whether or not to include bias parameters in layer norm.
387
+ This is separate from the include_bias parameter, because of a ROCm crash when biases are disabled in
388
+ layer norm.
389
+ When this is None (the default), it inherits the setting from include_bias.
390
+ """
391
+
392
+ scale_logits: bool = False
393
+ """
394
+ If ``True``, scale the output logits by ``1 / sqrt(d_model)``.
395
+ """
396
+
397
+ vocab_size: int = 50257
398
+ """
399
+ Vocabulary size of the model.
400
+ """
401
+
402
+ embedding_size: Optional[int] = 50304
403
+ """
404
+ The number of embeddings, i.e. the number of tokens. If set to ``None`` it will default
405
+ to ``vocab_size``. If ``vocab_size`` is not a multiple of 128, setting this to the
406
+ next multiple of 128 that's greater than ``vocab_size`` can improve throughput
407
+ substantially.
408
+ """
409
+
410
+ # For new special tokens
411
+ additional_vocab_size: Optional[int] = None
412
+
413
+ new_embedding_init_range: float = 0.02
414
+ """
415
+ The range used to initialize the embeddings for new (additional) tokens.
416
+ """
417
+
418
+ weight_tying: bool = True
419
+ """
420
+ Whether to tie output linear weights to the input embedding.
421
+ """
422
+
423
+ pad_token_id: int = -1
424
+ """
425
+ The ID of the token to use for padding. Defaults to the ID of the EOS token.
426
+ """
427
+
428
+ init_device: Optional[str] = None
429
+ """
430
+ The torch device to use when initializing the model parameters, e.g. "cpu", "cuda:0", "meta".
431
+ """
432
+
433
+ init_fn: InitFnType = InitFnType.normal
434
+ """
435
+ The weight initialization strategy.
436
+ """
437
+
438
+ init_std: float = 0.02
439
+ """
440
+ The standard deviation to use when initializing weights with a "fixed distribution" ``init_fn``, such
441
+ as "normal".
442
+ """
443
+
444
+ init_cutoff_factor: Optional[float] = None
445
+ """
446
+ A positive factor used to scale the cutoff values when initializing weights with a "fixed distribution" ``init_fn``, such
447
+ as "normal". Setting this to None means values are not cut off.
448
+ """
449
+
450
+ norm_after: bool = False
451
+ """
452
+ Apply norm after the attention/feedforward layers rather than before, as introduced in the Swin transformer paper (Liu et al).
453
+ """
454
+
455
+ precision: Optional[str] = None
456
+ """
457
+ Precision used to train/evaluate with. You shouldn't set this directly.
458
+ See :data:`TrainConfig.precision` instead.
459
+ """
460
+
461
+ moe_num_experts: Optional[int] = 8
462
+ """
463
+ The number of experts to use in the MoE block.
464
+ """
465
+
466
+ moe_top_k: Optional[int] = 2
467
+ """
468
+ The number of experts to select for each token.
469
+ """
470
+
471
+ moe_mlp_impl: Optional[str] = "sparse"
472
+ """
473
+ Choose "grouped" for grouped GEMM installable via `pip install git+https://[email protected]/tgale96/grouped_gemm.git@66c7195e35e8c4f22fa6a014037ef511bfa397cb`.
474
+ """
475
+
476
+ moe_log_expert_assignment: Optional[bool] = False
477
+ """
478
+ Whether to log the expert assignment.
479
+ """
480
+
481
+ moe_shared_expert: Optional[bool] = False
482
+ """
483
+ Whether to have an always-used expert like in [DeepSeekMoE](https://arxiv.org/abs/2401.06066).
484
+ """
485
+
486
+ moe_lbl_in_fp32: Optional[bool] = False
487
+ """
488
+ Whether to perform load balancing in FP32.
489
+ """
490
+
491
+ moe_interleave: Optional[bool] = False
492
+ """
493
+ Interleave sequential with MoE blocks starting with sequential.
494
+ """
495
+
496
+ moe_loss_weight: Optional[float] = 0.1
497
+ """
498
+ The weight to use for the MoE load balancing loss.
499
+ """
500
+
501
+ moe_zloss_weight: Optional[float] = None
502
+ """
503
+ Weight for MoE router z-loss where None means no router z-loss. 0.001 is a common value.
504
+ """
505
+
506
+ moe_dropless: Optional[bool] = True
507
+ """
508
+ Whether to use [dMoE](https://arxiv.org/abs/2211.15841).
509
+ """
510
+
511
+ moe_capacity_factor: Optional[float] = 1.25
512
+ """
513
+ The capacity factor to use in the MoE block. Only applies if not using dMoE.
514
+ """
515
+
516
+ # Image pre-processing options.
517
+ max_crops: int = 12
518
+
519
+ crop_mode: str = "patchify-v2-and-resize-c2"
520
+
521
+ do_random_scale: bool = True
522
+
523
+ use_col_tokens: bool = True
524
+
525
+ # How to prompt the model
526
+ prompt_type: str = "none"
527
+
528
+ # System prompt to use
529
+ system_prompt_kind: str = "style"
530
+
531
+ # How to format messages
532
+ message_formatting: str = "none"
533
+
534
+ always_start_with_space: bool = True
535
+
536
+ prompt_override: Optional[str] = None
537
+
538
+ default_inference_len: Optional[int] = 65
539
+
540
+ overlap_margins: Tuple[int, int] = (4, 4)
541
+
542
+ image_padding_embed: Optional[ImagePaddingEmbed] = None
543
+
544
+ # What layers to get from the image encoder
545
+ vit_layers: Tuple = (-1,)
546
+
547
+ # Controls the image/language connector
548
+ image_pooling_h: int = 2
549
+
550
+ image_pooling_w: int = 2
551
+
552
+ image_pooling_2d: ImagePooling2DType = ImagePooling2DType.attention
553
+
554
+ image_projector: ImageProjectType = ImageProjectType.mlp
555
+
556
+ image_feature_dropout: float = 0.0
557
+
558
+ use_cls_feature: bool = False
559
+
560
+ fix_image_input_idx: int = 2
561
+
562
+ # Makes the model ignore the image
563
+ unconditioned: bool = False
564
+
565
+ # Use in combination with sub-sequence experts to make image/text tokens always
566
+ # occupy particular sub-sequences of the input
567
+ pad_to: Optional[int] = None
568
+
569
+ # LLM Transformer settings
570
+ initializer_range: float = 0.02
571
+
572
+ pad_tokenizer: bool = False
573
+
574
+ normalize_input_embeds: bool = False
575
+
576
+ use_position_ids: bool = True
577
+ """
578
+ Whether to use position IDs in the model.
579
+ The model operation regarding positional embeddings changes depending on this variable.
580
+ """
581
+
582
+ query_pre_attn_scalar: int = 224
583
+ """
584
+ Scalar to apply to the queries before attention.
585
+ Used for Gemma-2.
586
+ """
587
+
588
+ attn_logit_softcapping: Optional[float] = None
589
+ """
590
+ Softcap the logits in the attention mechanism.
591
+ Used for Gemma-2.
592
+ """
593
+
594
+ final_logit_softcapping: Optional[float] = None
595
+ """
596
+ Softcap the final logits.
597
+ Used for Gemma-2.
598
+ """
599
+
600
+ head_dim: Optional[int] = None
601
+ """
602
+ The head dimensionality for the attention mechanism.
603
+ Used for Gemma-2.
604
+ """
605
+
606
+ loss_token_weighting: Optional[str] = None
607
+
608
+ gin_bindings: Optional[str] = None
609
+
610
+
611
+ class MolmoConfig(PretrainedConfig):
612
+ model_type = "molmo"
613
+ keys_to_ignore_at_inference = ["past_key_values"] # TODO: confirm
614
+
615
+ def __init__(self, use_cache: bool = False, **kwargs):
616
+ model_config = ModelConfig()
617
+ all_kwargs = asdict(model_config)
618
+ all_kwargs.update(kwargs)
619
+ all_kwargs.update({"use_cache": use_cache})
620
+ all_kwargs.update(
621
+ {"architectures": all_kwargs.get("architectures", ["OLMoForCausalLM"]) or ["OLMoForCausalLM"]}
622
+ )
623
+ super().__init__(**all_kwargs)
624
+
625
+ @property
626
+ def num_attention_heads(self):
627
+ return self.n_heads
628
+
629
+ @property
630
+ def num_hidden_layers(self):
631
+ return self.n_layers
632
+
633
+ @property
634
+ def hidden_size(self):
635
+ return self.d_model
636
+
637
+ @property
638
+ def image_num_patch(self):
639
+ h, w = (336, 336)
640
+ return h // 14, w // 14
641
+
642
+ @property
643
+ def llm_patches_per_crop(self):
644
+ h, w = self.image_num_patch
645
+ # Round up in case we need to pad the image features for pooling
646
+ h = (h + self.image_pooling_h - 1) // self.image_pooling_h
647
+ w = (w + self.image_pooling_w - 1) // self.image_pooling_w
648
+ return h, w
649
+
650
+ @property
651
+ def effective_n_kv_heads(self) -> int:
652
+ if self.n_kv_heads is None:
653
+ if self.multi_query_attention is True:
654
+ return 1
655
+ else:
656
+ return self.n_heads
657
+ else:
658
+ if self.multi_query_attention is None:
659
+ return self.n_kv_heads
660
+ if self.multi_query_attention:
661
+ n_kv_heads_should_be = 1
662
+ else:
663
+ n_kv_heads_should_be = self.n_heads
664
+ if self.n_kv_heads == n_kv_heads_should_be:
665
+ return n_kv_heads_should_be
666
+ else:
667
+ raise ValueError(
668
+ "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
669
+ )
670
+
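As a quick illustration of how `MolmoConfig` behaves (a sketch, assuming `config_molmoe.py` above is importable from the working directory): the constructor starts from the `ModelConfig` dataclass defaults, overlays any keyword overrides such as the values in `config.json`, and then exposes the usual Hugging Face aliases as read-only properties.

```python
# Sketch: exercise MolmoConfig locally (assumes config_molmoe.py is on sys.path).
from config_molmoe import MolmoConfig

cfg = MolmoConfig(d_model=2048, n_heads=16, n_layers=16)

print(cfg.hidden_size)           # 2048 -- alias for d_model
print(cfg.num_attention_heads)   # 16   -- alias for n_heads
print(cfg.num_hidden_layers)     # 16   -- alias for n_layers
print(cfg.effective_n_kv_heads)  # 16   -- falls back to n_heads when n_kv_heads is None
print(cfg.image_num_patch)       # (24, 24) for the fixed 336x336 input with 14-pixel patches
print(cfg.llm_patches_per_crop)  # (12, 12) after rounding up for 2x2 pooling
```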
example.py ADDED
@@ -0,0 +1,55 @@
+ from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
+ from PIL import Image
+ import requests
+
+
+ def main():
+     load_path = "."
+
+     # load the processor
+     print("Loading processor")
+     processor = AutoProcessor.from_pretrained(
+         load_path,
+         trust_remote_code=True,
+         torch_dtype='auto',
+         device_map='auto'
+     )
+
+     # load the model
+     print("Loading model")
+     model = AutoModelForCausalLM.from_pretrained(
+         load_path,
+         trust_remote_code=True,
+         torch_dtype='auto',
+         device_map='auto'
+     )
+
+     # process the image and text
+     print("Processing...")
+     inputs = processor.process(
+         images=[Image.open(requests.get("https://picsum.photos/id/237/536/354", stream=True).raw)],
+         text="Describe this image."
+     )
+
+     # move inputs to the correct device and make a batch of size 1
+     inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
+
+     # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
+     print("Generating....")
+     output = model.generate_from_batch(
+         inputs,
+         GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+         tokenizer=processor.tokenizer
+     )
+
+     # only get generated tokens; decode them to text
+     generated_tokens = output[0, inputs['input_ids'].size(1):]
+     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+     # print the generated text
+     print(generated_text)
+
+
+
+ if __name__ == '__main__':
+     main()
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "eos_token_id": 50279,
+   "pad_token_id": 1,
+   "transformers_version": "4.43.0.dev0"
+ }
image_preprocessing_molmo.py ADDED
@@ -0,0 +1,546 @@
1
+ """Image processor class for Molmo"""
2
+ from typing import List, Optional, Union, Mapping
3
+
4
+ import numpy as np
5
+ import einops
6
+ import torch
7
+ import torchvision.transforms
8
+ from torchvision.transforms import InterpolationMode
9
+ from torchvision.transforms.functional import convert_image_dtype
10
+
11
+ from transformers.image_utils import (
12
+ OPENAI_CLIP_MEAN,
13
+ OPENAI_CLIP_STD,
14
+ ImageInput,
15
+ is_valid_image,
16
+ )
17
+ from transformers.processing_utils import ImagesKwargs
18
+ from transformers.image_processing_utils import BaseImageProcessor
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ def pad_to_bounding_box(
26
+ image, offset_height, offset_width, target_height,
27
+ target_width, value=0
28
+ ):
29
+ height, width = image.shape[:2]
30
+ after_padding_width = target_width - offset_width - width
31
+ after_padding_height = target_height - offset_height - height
32
+ return np.pad(image, [
33
+ [offset_height, after_padding_height],
34
+ [offset_width, after_padding_width],
35
+ [0, 0]
36
+ ], constant_values=value)
37
+
38
+
39
+ def normalize_image(image, offset, scale):
40
+ image -= np.array(offset, dtype=np.float32)[None, None, :]
41
+ image /= np.array(scale, dtype=np.float32)[None, None, :]
42
+ return image
43
+
44
+
45
+ def resize_and_pad(
46
+ image,
47
+ desired_output_size,
48
+ resize_method="torch-bilinear",
49
+ pad_value=0,
50
+ normalize=True,
51
+ image_mean=OPENAI_CLIP_MEAN,
52
+ image_std=OPENAI_CLIP_STD,
53
+ ):
54
+ desired_height, desired_width = desired_output_size
55
+ height, width = image.shape[:2]
56
+
57
+ # Cast into float32 since the training code did this in float32 and it (very rarely) affects
58
+ # the results after rounding.
59
+ image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
60
+ image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
61
+ image_scale = min(image_scale_x, image_scale_y)
62
+ scaled_height = int(np.array(height, np.float32) * image_scale)
63
+ scaled_width = int(np.array(width, np.float32) * image_scale)
64
+
65
+ if resize_method == "tensorflow":
66
+ # This is how the original training code did resizing; it can produce slightly different
67
+ # results than using torch resize, so we keep it just in case
68
+ import tensorflow as tf
69
+ image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
70
+ image = tf.image.resize(
71
+ image,
72
+ [scaled_height, scaled_width],
73
+ method=tf.image.ResizeMethod.BILINEAR,
74
+ antialias=True,
75
+ )
76
+ image = tf.clip_by_value(image, 0.0, 1.0)
77
+ image = image.numpy()
78
+ elif resize_method == "torch-bilinear":
79
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
80
+ image = convert_image_dtype(image) # resize in float32 to match the training code
81
+ image = torchvision.transforms.Resize(
82
+ [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
83
+ )(image)
84
+ image = torch.clip(image, 0.0, 1.0)
85
+ image = torch.permute(image, [1, 2, 0]).numpy()
86
+ else:
87
+ raise NotImplementedError(resize_method)
88
+
89
+ top_pad = (desired_height - scaled_height) // 2
90
+ left_pad = (desired_width - scaled_width) // 2
91
+ padding = [
92
+ [top_pad, desired_height - scaled_height - top_pad],
93
+ [left_pad, desired_width - scaled_width - left_pad],
94
+ [0, 0]
95
+ ]
96
+ image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
97
+ image = np.pad(image, padding, constant_values=pad_value)
98
+ if normalize:
99
+ image = normalize_image(image, offset=image_mean, scale=image_std)
100
+ return image, image_mask
101
+
102
+
103
+ def select_tiling(h, w, patch_size, max_num_patches):
104
+ """Decide how best to divide an image of size [w, h] into up to max_num_patches patches of size patch_size"""
105
+ original_size = np.stack([h, w]) # [1, 2]
106
+ original_res = h * w
107
+ tilings = []
108
+ for i in range(1, max_num_patches+1):
109
+ for j in range(1, max_num_patches+1):
110
+ if i*j <= max_num_patches:
111
+ tilings.append((i, j))
112
+ # sort so argmin and argmax favour smaller tilings in the event of a tie
113
+ tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
114
+ candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
115
+ candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
116
+
117
+ # How much we would need to scale the image to fit exactly in each tiling
118
+ original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
119
+ required_scale_d = candidate_resolutions.astype(np.float32) / original_size
120
+ required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
121
+ if np.all(required_scale < 1):
122
+ # We are forced to downscale, so try to minimize the amount of downscaling
123
+ ix = np.argmax(required_scale)
124
+ else:
125
+ # Pick the resolution that required the least upscaling so that it most closely fits the image
126
+ required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
127
+ ix = np.argmin(required_scale)
128
+ return candidate_tilings[ix]
129
+
130
+
131
+ class MolmoImagesKwargs(ImagesKwargs, total=False):
132
+ max_crops: Optional[int]
133
+ overlap_margins: Optional[List[int]]
134
+ base_image_input_size: Optional[List[int]]
135
+ image_token_length_w: Optional[int]
136
+ image_token_length_h: Optional[int]
137
+ image_patch_size: Optional[int]
138
+ image_padding_mask: Optional[bool]
139
+
140
+
141
+ class MolmoImageProcessor(BaseImageProcessor):
142
+ """Preprocess images and multi-model inputs"""
143
+
144
+ def __init__(
145
+ self,
146
+ max_crops: int = 12,
147
+ overlap_margins: List[int] = (4, 4),
148
+ base_image_input_size: List[int] = (336, 336),
149
+ image_token_length_w: int = 12,
150
+ image_token_length_h: int = 12,
151
+ image_patch_size: int = 14,
152
+ image_padding_mask: bool = True,
153
+ do_normalize: bool = True,
154
+ image_mean: Optional[Union[float, List[float]]] = None,
155
+ image_std: Optional[Union[float, List[float]]] = None,
156
+ **kwargs,
157
+ ):
158
+ super().__init__(**kwargs)
159
+ self.max_crops = max_crops
160
+ self.overlap_margins = overlap_margins
161
+ self.base_image_input_size = base_image_input_size
162
+ self.image_token_length_w = image_token_length_w
163
+ self.image_token_length_h = image_token_length_h
164
+ self.image_patch_size = image_patch_size
165
+ self.image_padding_mask = image_padding_mask
166
+ self.do_normalize = do_normalize
167
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
168
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
169
+
170
+ def image_to_patches_and_tokens(
171
+ self,
172
+ image: ImageInput,
173
+ image_patch_token_id: int,
174
+ image_col_token_id: int,
175
+ image_start_token_id: int,
176
+ image_end_token_id: int,
177
+ max_crops: Optional[int] = None,
178
+ overlap_margins: Optional[List[int]] = None,
179
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
180
+ image_token_length_w: Optional[int] = None,
181
+ image_token_length_h: Optional[int] = None,
182
+ image_patch_size: Optional[int] = None,
183
+ ):
184
+ if isinstance(base_image_input_size, int):
185
+ base_image_input_size = (base_image_input_size, base_image_input_size)
186
+
187
+ base_image_input_d = image_patch_size
188
+ tokens_per_image = image_token_length_w * image_token_length_h
189
+ image_base_patch_w = base_image_input_size[1] // base_image_input_d
190
+ image_base_patch_h = base_image_input_size[0] // base_image_input_d
191
+
192
+ original_image_h, original_image_w = image.shape[:2]
193
+ crop_size = base_image_input_size[0]
194
+
195
+ # Discard this many patches from the (left/top, right/bottom) of crops
196
+ left_margin, right_margin = overlap_margins
197
+ # left_margin, right_margin = 2, 2
198
+ assert left_margin % 2 == 0 # Required for compatibility with 2x2 pooling
199
+ total_margin_pixels = base_image_input_d*(right_margin + left_margin) # pixels removed per dim
200
+ crop_patches = base_image_input_size[0] // base_image_input_d # patches per crop dim
201
+ crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
202
+ crop_window_size = crop_window_patches * base_image_input_d
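+ # For the defaults shipped with this repo (336 px crops, 14 px patches, (4, 4) overlap margins)
+ # this works out to crop_patches = 24, crop_window_patches = 16, crop_window_size = 224,
+ # with 112 margin pixels removed per dimension.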
203
+ tiling = select_tiling(
204
+ original_image_h - total_margin_pixels,
205
+ original_image_w - total_margin_pixels,
206
+ crop_window_size,
207
+ max_crops
208
+ )
209
+ src, img_mask = resize_and_pad(
210
+ image,
211
+ [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels]
212
+ )
213
+
214
+ # Now we have to split the image into crops, while keeping track of how each patch in
216
+ # each crop should be ordered in the global image; this requires a lot of tricky bookkeeping
216
+ n_crops = tiling[0] * tiling[1]
217
+ patches_arr = []
218
+ mask_arr = []
219
+ patch_ordering_arr = []
220
+
221
+ # We assume 2x2 pooling, but can allow padding the right/bottom with extra
222
+ # patches if the number of patches per side is not even
223
+ assert (crop_patches+1)//2 == image_token_length_h
224
+ assert (crop_patches+1)//2 == image_token_length_w
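+ # e.g. with the defaults, 24 patches per crop side pool (2x2) down to the 12 tokens per side
+ # given by image_token_length_h/w, which is what these asserts check.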
225
+ on = 0
226
+ on_patch = 0
227
+ for i in range(tiling[0]):
228
+ y0 = i*crop_window_size
229
+ if i == 0:
230
+ crop_y0 = 0
231
+ else:
232
+ crop_y0 = left_margin // 2
233
+
234
+ crop_h = image_base_patch_h - (right_margin + left_margin)
235
+ if i == 0:
236
+ crop_h += left_margin
237
+ if i == (tiling[0]-1):
238
+ crop_h += right_margin
239
+ for j in range(tiling[1]):
240
+ x0 = j*crop_window_size
241
+ if j == 0:
242
+ crop_x0 = 0
243
+ else:
244
+ crop_x0 = left_margin // 2
245
+
246
+ crop_w = image_base_patch_w - (right_margin + left_margin)
247
+ if j == 0:
248
+ crop_w += left_margin
249
+ if j == (tiling[1]-1):
250
+ crop_w += right_margin
251
+
252
+ pooled_w = (crop_w + 1) // 2
253
+ pooled_h = (crop_h + 1) // 2
254
+ patch_ordering_arr.append(
255
+ pad_to_bounding_box(
256
+ np.reshape(np.arange(on, on+pooled_h*pooled_w, dtype=np.int32), (pooled_h, pooled_w, 1)),
257
+ crop_y0, crop_x0, image_token_length_h, image_token_length_w, value=-1
258
+ )[:, :, 0]
259
+ )
260
+ patches_arr.append(src[y0:y0+crop_size, x0:x0+crop_size])
261
+ mask_arr.append(img_mask[y0:y0+crop_size, x0:x0+crop_size])
262
+
263
+ on += pooled_h*pooled_w
264
+ on_patch += 1
265
+ patches = np.stack(patches_arr)
266
+ patch_ordering = np.stack(patch_ordering_arr)
267
+ img_mask = np.stack(mask_arr)
268
+
269
+ # Switch to [n_crops, n_patches, pixels_per_patch] format
270
+ image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
271
+ patches = einops.rearrange(
272
+ patches, 'p (h dh) (w dw) c -> p (h w) (dh dw c)',
273
+ dh=base_image_input_d,
274
+ dw=base_image_input_d,
275
+ h=image_base_patch_h,
276
+ w=image_base_patch_w
277
+ )
278
+ img_mask = einops.rearrange(
279
+ img_mask, 'p (h dh) (w dw) -> p (h w) (dh dw)',
280
+ dh=base_image_input_d,
281
+ dw=base_image_input_d,
282
+ h=image_base_patch_h,
283
+ w=image_base_patch_w
284
+ )
285
+
286
+ img_mask = img_mask.astype(np.float32).mean(axis=-1)
287
+ patch_ordering = np.reshape(patch_ordering, [-1])
288
+ valid = patch_ordering >= 0
289
+
290
+ # Transpose order, to get left-to-right order instead of crop-by-crop order
291
+ patch_ordering_rh = np.reshape(
292
+ patch_ordering,
293
+ [tiling[0], tiling[1], image_token_length_h, image_token_length_w]
294
+ )
295
+ patch_ordering_rh = np.transpose(patch_ordering_rh, [0, 2, 1, 3])
296
+ patch_ordering_rh = np.reshape(patch_ordering_rh, [-1])
297
+
298
+ # The transpose will screw up which patches are masked, project the
299
+ # new order into sparse structure of `patch_ordering` to fix this
300
+ patch_ordering[valid] = patch_ordering_rh[patch_ordering_rh >= 0]
301
+
302
+ # Now build the output tokens
303
+ h = tiling[0] * crop_window_patches + (right_margin+left_margin)
304
+ w = tiling[1] * crop_window_patches + (right_margin+left_margin)
305
+ per_row = np.full(
306
+ ((w+1)//2,),
307
+ image_patch_token_id,
308
+ )
309
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
310
+
311
+ joint = np.tile(per_row, [(h+1)//2])
312
+ joint = [
313
+ [image_start_token_id],
314
+ joint,
315
+ [image_end_token_id]
316
+ ]
317
+
318
+ # Finally do the same for the global image
319
+ resized, _ = resize_and_pad(image, base_image_input_size)
320
+ resized = einops.rearrange(
321
+ resized, '(h dh) (w dw) c -> (h w) (dh dw c)',
322
+ dh=base_image_input_d,
323
+ dw=base_image_input_d,
324
+ h=image_base_patch_h,
325
+ w=image_base_patch_w
326
+ )
327
+ patches = np.concatenate([np.expand_dims(resized, 0), patches], 0)
328
+
329
+ # Global image goes first, so the order of patches in previous crops gets increased
330
+ patch_ordering = np.where(
331
+ patch_ordering >= 0,
332
+ patch_ordering + tokens_per_image,
333
+ -1
334
+ )
335
+ patch_ordering = np.concatenate([np.arange(0, tokens_per_image), patch_ordering], 0)
336
+ per_row = np.full(
337
+ (image_token_length_w,),
338
+ image_patch_token_id,
339
+ )
340
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
341
+ extra_tokens = np.tile(per_row, [image_token_length_h])
342
+ joint = [
343
+ [image_start_token_id],
344
+ extra_tokens,
345
+ [image_end_token_id],
346
+ ] + joint
347
+
348
+ joint = np.concatenate(joint, 0)
349
+ img_mask = np.pad(img_mask, [[0, 1], [0, 0]], constant_values=-1)
350
+ return patches, joint, patch_ordering, img_mask
351
+
352
+ def build_image_input_idx(
353
+ self,
354
+ image_tokens: np.ndarray,
355
+ patch_order: np.ndarray,
356
+ image_patch_token_id: int,
357
+ no_image: Optional[bool] = None,
358
+ image_token_length_w: Optional[int] = None,
359
+ image_token_length_h: Optional[int] = None,
360
+ ):
361
+ """Converts `patch_order` into a mapping of token_id -> patch_id"""
362
+
363
+ tokens_per_image = image_token_length_w * image_token_length_h
364
+ if no_image is not None and no_image:
365
+ return np.zeros((0, tokens_per_image), np.int32)
366
+
367
+ # Indices to insert the patches
368
+ image_input_idx = image_tokens == image_patch_token_id
369
+ image_input_idx = np.nonzero(image_input_idx)[0].astype(np.int32)
370
+
371
+ if patch_order is not None:
372
+ n_tokens = image_input_idx.shape[0]
373
+ patch_order = np.reshape(patch_order, [-1])
374
+ n_patches = patch_order.shape[0]
375
+
376
+ valid = patch_order >= 0
377
+ n_valid_patches = valid.sum()
378
+ assert len(image_input_idx) == n_valid_patches
379
+
380
+ sorted_patch_ixs = np.zeros([n_tokens], np.int32)
381
+ sorted_patch_ixs[patch_order[valid]] = np.arange(n_valid_patches, dtype=np.int32)
382
+
383
+ # Project the inverted mapping into the same sparse structure
384
+ sorted_patch_ixs_ex = np.full(np.shape(patch_order), -1)
385
+ sorted_patch_ixs_ex[valid] = sorted_patch_ixs
386
+
387
+ # Do the gather and then re-mask outputs that were masked in `sorted_patch_ixs`
388
+ valid = (sorted_patch_ixs_ex >= 0).astype(np.int32)
389
+ image_input_idx = image_input_idx[sorted_patch_ixs_ex*valid]
390
+ image_input_idx = image_input_idx*valid - 100*(1 - valid)
391
+ image_input_idx = np.reshape(image_input_idx, [-1, tokens_per_image])
392
+ return image_input_idx
393
+
394
+ def preprocess(
395
+ self,
396
+ image: np.ndarray,
397
+ image_patch_token_id: int,
398
+ image_col_token_id: int,
399
+ image_start_token_id: int,
400
+ image_end_token_id: int,
401
+ max_crops: Optional[int] = None,
402
+ overlap_margins: Optional[List[int]] = None,
403
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
404
+ image_token_length_w: Optional[int] = None,
405
+ image_token_length_h: Optional[int] = None,
406
+ image_patch_size: Optional[int] = None,
407
+ **kwargs,
408
+ ):
409
+ """Preprocesses an image
410
+
411
+ Returns:
412
+ crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
413
+ change between images but the other dimensions are fixed
414
+ tokens: (n_tokens,) int32 tokens, pad tokens indicate where to insert the
415
+ patch features, might include other special tokens as well
416
+ image_idx: (n_crops, n_patches) index in `tokens` to put the patch features from the
417
+ crops after pooling; negative values indicate patch features to exclude
418
+ padding_mask: (n_crops, n_patches) what percent of each crop is padding, can be None
419
+ if the image mask is not being used.
420
+ """
421
+
422
+ max_crops = max_crops or self.max_crops
423
+ overlap_margins = overlap_margins or self.overlap_margins
424
+ base_image_input_size = base_image_input_size or self.base_image_input_size
425
+ image_token_length_w = image_token_length_w or self.image_token_length_w
426
+ image_token_length_h = image_token_length_h or self.image_token_length_h
427
+ image_patch_size = image_patch_size or self.image_patch_size
428
+
429
+ crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(
430
+ image,
431
+ image_patch_token_id,
432
+ image_col_token_id,
433
+ image_start_token_id,
434
+ image_end_token_id,
435
+ max_crops,
436
+ overlap_margins,
437
+ base_image_input_size,
438
+ image_token_length_w,
439
+ image_token_length_h,
440
+ image_patch_size,
441
+ )
442
+ patch_idx = self.build_image_input_idx(
443
+ image_tokens,
444
+ patch_ordering,
445
+ image_patch_token_id,
446
+ image_token_length_w=image_token_length_w,
447
+ image_token_length_h=image_token_length_h,
448
+ )
449
+ return crops, image_tokens, patch_idx, img_mask
450
+
451
+ def multimodal_preprocess(
452
+ self,
453
+ images: np.ndarray,
454
+ tokens: List[int],
455
+ image_idx: np.ndarray,
456
+ sequence_length: int,
457
+ image_patch_token_id: int,
458
+ image_col_token_id: int,
459
+ image_start_token_id: int,
460
+ image_end_token_id: int,
461
+ **kwargs,
462
+ ):
463
+ """Merge images and text tokens into multi-modal features for the model
464
+
465
+ :param images: images to use as input
466
+ :param tokens: input text tokens
467
+ :param image_idx: where to insert the images into `tokens`
468
+ :param image_patch_token_id: token id for the tokens that will hold image patch features
469
+ :param image_col_token_id: token id for image column special tokens
470
+ :param image_start_token_id: token id for image start special tokens
471
+ :param image_end_token_id: token id for image end special tokens
472
+ :param kwargs: overrides for the preprocessor's default arguments
473
+ """
474
+ max_total_crops = kwargs.get("max_crops") or self.max_crops
475
+ image_token_length_w = kwargs.get("image_token_length_w") or self.image_token_length_w
476
+ image_token_length_h = kwargs.get("image_token_length_h") or self.image_token_length_h
477
+ image_patch_size = kwargs.get("image_patch_size") or self.image_patch_size
478
+ base_image_input_size = kwargs.get("base_image_input_size") or self.base_image_input_size
479
+ image_num_patch = (
480
+ base_image_input_size[0] // image_patch_size,
481
+ base_image_input_size[1] // image_patch_size,
482
+ )
483
+ image_padding_mask = kwargs.get("image_padding_mask") or self.image_padding_mask
484
+
485
+ tokens_per_image = image_token_length_w * image_token_length_h
486
+ n_pixels = image_patch_size * image_patch_size * 3
487
+ n_patches = image_num_patch[0] * image_num_patch[1]
488
+
489
+ if images is None:
490
+ return {
491
+ "input_ids": tokens,
492
+ }
493
+ else:
494
+ n = len(images)
495
+ all_crops = []
496
+ all_image_idx = []
497
+ out_tokens = []
498
+ all_crop_masks = []
499
+
500
+ for ix in range(n):
501
+ token_ix = image_idx[ix]
502
+ crops, image_tokens, patch_idx, img_mask = self.preprocess(
503
+ images[ix],
504
+ image_patch_token_id,
505
+ image_col_token_id,
506
+ image_start_token_id,
507
+ image_end_token_id,
508
+ **kwargs,
509
+ )
510
+
511
+ if token_ix == -1: # -1 means the image is inserted at the very start
512
+ start = 0
513
+ token_ix = 0
514
+ end = 0
515
+ else:
516
+ start = 0 if ix == 0 else image_idx[ix-1] + 1
517
+ end = token_ix + 1
518
+
519
+ all_image_idx.append(patch_idx + token_ix)
520
+ all_crops.append(crops)
521
+ out_tokens.append(tokens[start:token_ix])
522
+ out_tokens.append(image_tokens)
523
+ if ix == (n - 1):
524
+ out_tokens.append(tokens[end:])
525
+ if image_padding_mask:
526
+ all_crop_masks.append(img_mask)
527
+
528
+ input_ids = np.concatenate(out_tokens, 0)
529
+ images = np.concatenate(all_crops, 0)
530
+ image_input_idx = np.concatenate(all_image_idx, 0)
531
+ if image_padding_mask:
532
+ image_masks = np.concatenate(all_crop_masks, 0)
533
+ else:
534
+ image_masks = None
535
+
536
+ out = {
537
+ "input_ids": input_ids,
538
+ "images": images,
539
+ "image_input_idx": image_input_idx
540
+ }
541
+ if image_masks is not None:
542
+ out["image_masks"] = image_masks
543
+ return out
544
+
545
+
546
+ MolmoImageProcessor.register_for_auto_class()
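+ # Registering the class lets AutoImageProcessor resolve it via the auto_map entry in
+ # preprocessor_config.json.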
modeling_molmoe.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessing_molmo.py ADDED
@@ -0,0 +1,192 @@
1
+ """
2
+ Processor class for Molmo.
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ import PIL
8
+ from PIL import ImageOps
9
+ from PIL.Image import Image
10
+
11
+ try:
12
+ from typing import Unpack
13
+ except ImportError:
14
+ from typing_extensions import Unpack
15
+
16
+ import numpy as np
17
+ import torch
18
+
19
+ from transformers.image_utils import ImageInput
20
+ from transformers.processing_utils import (
21
+ TextKwargs,
22
+ ProcessingKwargs,
23
+ ProcessorMixin,
24
+ )
25
+
26
+ from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
27
+ from transformers.utils import logging
28
+
29
+ from transformers import AutoTokenizer
30
+ from .image_preprocessing_molmo import MolmoImagesKwargs, MolmoImageProcessor
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
37
+ DEFAULT_IM_START_TOKEN = "<im_start>"
38
+ DEFAULT_IM_END_TOKEN = "<im_end>"
39
+ DEFAULT_IM_COL_TOKEN = "<im_col>"
40
+ IMAGE_PROMPT = "<|image|>"
41
+
42
+ EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT)
43
+
44
+
45
+ def get_special_token_ids(tokenizer):
46
+ ids = tokenizer.encode("".join(EXTRA_TOKENS), add_special_tokens=False)
47
+ assert len(ids) == len(EXTRA_TOKENS)
48
+ return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
49
+
50
+
51
+ class MolmoTextKwargs(TextKwargs, total=False):
52
+ style: Optional[str]
53
+ system_prompt: Optional[str]
54
+ message_format: Optional[str]
55
+ always_start_with_space: Optional[bool]
56
+ sequence_length: Optional[int]
57
+
58
+
59
+ class MolmoProcessorKwargs(ProcessingKwargs, total=False):
60
+ text_kwargs: MolmoTextKwargs
61
+ images_kwargs: MolmoImagesKwargs
62
+ _defaults = {
63
+ "images_kwargs": {
64
+ "max_crops": 12,
65
+ "overlap_margins": [4, 4],
66
+ "base_image_input_size": [336, 336],
67
+ "image_token_length_w": 12,
68
+ "image_token_length_h": 12,
69
+ "image_patch_size": 14,
70
+ "image_padding_mask": True,
71
+ },
72
+ "text_kwargs": {
73
+ "style": "long_caption",
74
+ "system_prompt": "none",
75
+ "message_format": "role",
76
+ "always_start_with_space": True,
77
+ "sequence_length": 1536,
78
+ "padding": False,
79
+ },
80
+ }
81
+
82
+
83
+ class MolmoProcessor(ProcessorMixin):
84
+ attributes = ["image_processor", "tokenizer"]
85
+ image_processor_class = "AutoImageProcessor"
86
+ tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast")
87
+
88
+ def __init__(self, image_processor: MolmoImageProcessor = None, tokenizer: AutoTokenizer = None, **kwargs):
89
+ # self.image_processor = image_processor
90
+ # self.tokenizer = tokenizer
91
+ super().__init__(image_processor, tokenizer)
92
+ self._special_tokens = None
93
+
94
+ @property
95
+ def special_token_ids(self):
96
+ if self._special_tokens is None:
97
+ self._special_tokens = get_special_token_ids(self.tokenizer)
98
+ return self._special_tokens
99
+
100
+ def get_tokens_input(self, prompt, message_format, always_start_with_space):
101
+ if message_format == "none" or message_format is None:
102
+ pass
103
+ elif message_format == "role":
104
+ prompt = "User: " + prompt + " Assistant:"
105
+ else:
106
+ raise NotImplementedError(f"Message format {message_format} not implemented")
107
+
108
+ if always_start_with_space:
109
+ prompt = " " + prompt
110
+
111
+ tokens = self.tokenizer.encode(prompt, add_special_tokens=False)
112
+
113
+ return tokens
114
+
115
+ def process(
116
+ self,
117
+ text: TextInput = None,
118
+ images: ImageInput = None,
119
+ *,
120
+ tokens: Optional[PreTokenizedInput] = None,
121
+ **kwargs: Unpack[MolmoProcessorKwargs],
122
+ ):
123
+ output_kwargs = self._merge_kwargs(
124
+ MolmoProcessorKwargs,
125
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
126
+ **kwargs,
127
+ )
128
+
129
+ if tokens is None:
130
+ tokens = self.get_tokens_input(
131
+ text,
132
+ output_kwargs["text_kwargs"]["message_format"],
133
+ output_kwargs["text_kwargs"]["always_start_with_space"],
134
+ )
135
+
136
+ image_token_id = self.special_token_ids[IMAGE_PROMPT]
137
+
138
+ if images is not None:
139
+ if not isinstance(images, (list, tuple)):
140
+ images = [images]
141
+ image_arrays = []
142
+ for image in images:
143
+ if isinstance(image, Image):
144
+ image = image.convert("RGB")
145
+ # Handle images with EXIF orientation tags, which PIL will ignore by default
146
+ # https://github.com/python-pillow/Pillow/issues/4703
147
+ image = ImageOps.exif_transpose(image)
148
+ image_arrays.append(np.array(image))
149
+ else:
150
+ assert len(image.shape) == 3 and image.shape[-1] == 3
151
+ image_arrays.append(image.astype(np.uint8))
152
+ images = image_arrays
153
+ # For now only support inserting images at the start
154
+ image_idx = [-1]*len(images)
155
+ else:
156
+ image_idx = None
157
+
158
+ sequence_length = output_kwargs["text_kwargs"]["sequence_length"]
159
+
160
+ image_patch_token_id = self.special_token_ids[DEFAULT_IMAGE_PATCH_TOKEN]
161
+ image_col_token_id = self.special_token_ids[DEFAULT_IM_COL_TOKEN]
162
+ image_start_token_id = self.special_token_ids[DEFAULT_IM_START_TOKEN]
163
+ image_end_token_id = self.special_token_ids[DEFAULT_IM_END_TOKEN]
164
+ out = self.image_processor.multimodal_preprocess(
165
+ images=images,
166
+ image_idx=image_idx,
167
+ tokens=np.asarray(tokens).astype(np.int32),
168
+ sequence_length=sequence_length,
169
+ image_patch_token_id=image_patch_token_id,
170
+ image_col_token_id=image_col_token_id,
171
+ image_start_token_id=image_start_token_id,
172
+ image_end_token_id=image_end_token_id,
173
+ **output_kwargs["images_kwargs"]
174
+ )
175
+
176
+ # Prepend BOS
177
+ # qwen2 and olmo do not have a BOS, and instead use EOS as a generic separator token.
178
+ bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
179
+ decoder_input_tokens = np.pad(out["input_ids"], [[1, 0]], constant_values=bos)
180
+ out["input_ids"] = decoder_input_tokens
181
+ if "image_input_idx" in out:
182
+ # Shift patch mapping up by one since we added BOS
183
+ image_input_idx = out["image_input_idx"]
184
+ out["image_input_idx"] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
185
+
186
+ for k, v in out.items():
187
+ out[k] = torch.from_numpy(v)
188
+
189
+ return out
190
+
191
+
192
+ MolmoProcessor.register_for_auto_class()
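+ # Rough usage sketch (illustrative, untested; "<repo-id>" is a placeholder for this repository):
+ # processor = AutoProcessor.from_pretrained("<repo-id>", trust_remote_code=True)
+ # inputs = processor.process(text="Describe this image.", images=pil_image)
+ # which returns the input_ids / images / image_input_idx (and optional image_masks) tensors
+ # expected by the model's forward pass.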
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_preprocessing_molmo.MolmoImageProcessor",
4
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
5
+ },
6
+ "base_image_input_size": [
7
+ 336,
8
+ 336
9
+ ],
10
+ "do_normalize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_padding_mask": true,
17
+ "image_patch_size": 14,
18
+ "image_processor_type": "MolmoImageProcessor",
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "image_token_length_h": 12,
25
+ "image_token_length_w": 12,
26
+ "max_crops": 12,
27
+ "overlap_margins": [
28
+ 4,
29
+ 4
30
+ ],
31
+ "processor_class": "MolmoProcessor"
32
+ }
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
4
+ },
5
+ "processor_class": "MolmoProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<im_start>",
4
+ "<im_end>",
5
+ "<im_patch>",
6
+ "<im_col>",
7
+ "<|image|>"
8
+ ],
9
+ "bos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "<|pad|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,278 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "|||IP_ADDRESS|||",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "1": {
15
+ "content": "<|padding|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "50254": {
23
+ "content": " ",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "50255": {
31
+ "content": " ",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "50256": {
39
+ "content": " ",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "50257": {
47
+ "content": " ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "50258": {
55
+ "content": " ",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "50259": {
63
+ "content": " ",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "50260": {
71
+ "content": " ",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "50261": {
79
+ "content": " ",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "50262": {
87
+ "content": " ",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "50263": {
95
+ "content": " ",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "50264": {
103
+ "content": " ",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "50265": {
111
+ "content": " ",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "50266": {
119
+ "content": " ",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "50267": {
127
+ "content": " ",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "50268": {
135
+ "content": " ",
136
+ "lstrip": false,
137
+ "normalized": true,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "50269": {
143
+ "content": " ",
144
+ "lstrip": false,
145
+ "normalized": true,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "50270": {
151
+ "content": " ",
152
+ "lstrip": false,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "50271": {
159
+ "content": " ",
160
+ "lstrip": false,
161
+ "normalized": true,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "50272": {
167
+ "content": " ",
168
+ "lstrip": false,
169
+ "normalized": true,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "50273": {
175
+ "content": " ",
176
+ "lstrip": false,
177
+ "normalized": true,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "50274": {
183
+ "content": " ",
184
+ "lstrip": false,
185
+ "normalized": true,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "50275": {
191
+ "content": " ",
192
+ "lstrip": false,
193
+ "normalized": true,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "50276": {
199
+ "content": " ",
200
+ "lstrip": false,
201
+ "normalized": true,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "50277": {
207
+ "content": "|||EMAIL_ADDRESS|||",
208
+ "lstrip": false,
209
+ "normalized": true,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "50278": {
215
+ "content": "|||PHONE_NUMBER|||",
216
+ "lstrip": false,
217
+ "normalized": true,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ },
222
+ "50279": {
223
+ "content": "<|endoftext|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "50280": {
231
+ "content": "<im_start>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "50281": {
239
+ "content": "<im_end>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "50282": {
247
+ "content": "<im_patch>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "50283": {
255
+ "content": "<im_col>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "50284": {
263
+ "content": "<|image|>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ }
270
+ },
271
+ "bos_token": null,
272
+ "clean_up_tokenization_spaces": true,
273
+ "eos_token": "<|endoftext|>",
274
+ "model_max_length": 1000000000000000019884624838656,
275
+ "pad_token": "<|padding|>",
276
+ "tokenizer_class": "GPTNeoXTokenizer",
277
+ "unk_token": null
278
+ }