mrbesher committed
Commit 77f21f5 · verified · 1 parent: 9d6780c

Upload 6 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
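tokenizer.json is now tracked by Git LFS, so the file committed to the repository is only a pointer (see the tokenizer.json entry later in this commit); the ~17 MB payload lives in LFS storage. A minimal sketch of fetching the resolved file with huggingface_hub; the repo id is a placeholder for this repository:

from huggingface_hub import hf_hub_download

# hf_hub_download resolves the LFS pointer and returns a local path to the
# actual ~17 MB tokenizer.json; the repo id below is a placeholder.
local_path = hf_hub_download(
    repo_id="path-or-repo-id-of-this-checkpoint", filename="tokenizer.json"
)
print(local_path)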
config.json CHANGED
@@ -1,205 +1,70 @@
 {
-  "_commit_hash": "344d954da76eb8ad47a7aaff42d012e30c15b8fe",
-  "add_projections": false,
-  "architectures": [
-    "JinaCLIPModel"
-  ],
-  "auto_map": {
-    "AutoConfig": "jinaai/jina-clip-implementation--configuration_clip.JinaCLIPConfig",
-    "AutoModel": "jinaai/jina-clip-implementation--modeling_clip.JinaCLIPModel"
-  },
-  "initializer_factor": 1.0,
-  "logit_scale_init_value": 2.6592,
-  "matryoshka_dimensions": [
-    32,
-    64,
-    128,
-    256,
-    512,
-    768,
-    1024
-  ],
-  "model_type": "jina_clip",
-  "projection_dim": 1024,
-  "text_config": {
-    "_attn_implementation_autoset": false,
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "default_instruction_task": null,
-    "default_lora_task": "retrieval.query",
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "embed_dim": 1024,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hf_model_config_kwargs": {
-      "load_trained_adapters": false,
-      "lora_adaptations": [
-        "retrieval.query"
-      ],
-      "lora_alpha": 4,
-      "lora_dropout_p": 0.0,
-      "lora_main_params_trainable": false,
-      "lora_rank": 4,
-      "task_instructions": {
-        "retrieval.query": "Represent the query for retrieving evidence documents: "
-      },
-      "use_flash_attn": false
     },
-    "hf_model_name_or_path": "jinaai/jina-embeddings-v3",
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
     },
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
     },
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "jina_clip_text",
-    "no_repeat_ngram_size": 0,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "pooler_type": "mean_pooler",
-    "prefix": null,
-    "problem_type": null,
-    "proj_bias": false,
-    "proj_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.51.3",
-    "typical_p": 1.0,
-    "use_bfloat16": false
-  },
-  "torch_dtype": "bfloat16",
-  "transformers.js_config": {
-    "use_external_data_format": {
-      "model.onnx": true
     }
-  },
-  "transformers_version": null,
-  "truncate_dim": null,
-  "use_text_flash_attn": false,
-  "use_vision_xformers": false,
-  "vision_config": {
-    "_attn_implementation_autoset": false,
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "drop_path_rate": 0.0,
-    "early_stopping": false,
-    "embed_dim": 1024,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "fused_layer_norm": false,
-    "head_width": 64,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 512,
-    "intp_freq": true,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layers": 24,
-    "length_penalty": 1.0,
-    "ls_init_value": null,
-    "max_length": 20,
-    "min_length": 0,
-    "mlp_ratio": 2.6667,
-    "model_type": "jina_clip_vision",
-    "naive_swiglu": true,
-    "no_repeat_ngram_size": 0,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_dropout": 0.1,
-    "patch_size": 14,
-    "post_norm": false,
-    "prefix": null,
-    "problem_type": null,
-    "proj_type": null,
-    "pruned_heads": {},
-    "pt_hw_seq_len": 16,
-    "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "rope_embeddings": true,
-    "sep_token_id": null,
-    "subln": true,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.51.3",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "width": 1024,
-    "x_attention": false
-  }
-}
 {
+  "add_projections": false,
+  "architectures": [
+    "JinaCLIPModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "jinaai/jina-clip-implementation--configuration_clip.JinaCLIPConfig",
+    "AutoModel": "jinaai/jina-clip-implementation--modeling_clip.JinaCLIPModel"
   },
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "matryoshka_dimensions": [32, 64, 128, 256, 512, 768, 1024],
+  "model_type": "jina_clip",
+  "projection_dim": 1024,
+  "text_config": {
+    "default_instruction_task": null,
+    "default_lora_task": "retrieval.query",
+    "embed_dim": 1024,
+    "hf_model_config_kwargs": {
+      "load_trained_adapters": false,
+      "lora_adaptations": [
+        "retrieval.query"
+      ],
+      "lora_alpha": 4,
+      "lora_dropout_p": 0.0,
+      "lora_main_params_trainable": false,
+      "lora_rank": 4,
+      "task_instructions": {
+        "retrieval.query": "Represent the query for retrieving evidence documents: "
+      },
+      "use_flash_attn": true
+    },
+    "hf_model_name_or_path": "jinaai/jina-embeddings-v3",
+    "model_type": "jina_clip_text",
+    "pooler_type": "mean_pooler",
+    "proj_bias": false,
+    "proj_type": null
   },
+  "torch_dtype": "bfloat16",
+  "transformers.js_config": {
+    "use_external_data_format": {
+      "model.onnx": true
+    }
   },
+  "truncate_dim": null,
+  "use_text_flash_attn": null,
+  "use_vision_xformers": null,
+  "vision_config": {
+    "embed_dim": 1024,
+    "fused_layer_norm": false,
+    "head_width": 64,
+    "image_size": 512,
+    "intp_freq": true,
+    "layers": 24,
+    "ls_init_value": null,
+    "mlp_ratio": 2.6667,
+    "model_type": "jina_clip_vision",
+    "naive_swiglu": true,
+    "patch_dropout": 0.1,
+    "patch_size": 14,
+    "post_norm": false,
+    "proj_type": null,
+    "pt_hw_seq_len": 16,
+    "qkv_bias": true,
+    "rope_embeddings": true,
+    "subln": true,
+    "width": 1024,
+    "x_attention": true
   }
+}
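The rewritten config keeps only the JinaCLIP-specific fields and drops the generic generation defaults; auto_map still resolves to the jinaai/jina-clip-implementation remote code, so loading requires trust_remote_code=True. A minimal sketch of loading the checkpoint with plain transformers, assuming a placeholder repo id; get_text_features is the same call custom_st.py in this commit uses:

import torch
from transformers import AutoModel, AutoTokenizer

repo_id = "path-or-repo-id-of-this-checkpoint"  # placeholder for this checkpoint

# auto_map pulls the JinaCLIPModel implementation in as remote code
model = AutoModel.from_pretrained(
    repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

inputs = tokenizer(
    ["Represent the query for retrieving evidence documents: what is CLIP?"],
    return_tensors="pt",
)
with torch.no_grad():
    emb = model.get_text_features(inputs["input_ids"])  # (1, 1024), projection_dim

# matryoshka_dimensions: the 1024-d vector can be truncated to 32/64/.../768
# and re-normalized for cheaper storage or indexing
short = torch.nn.functional.normalize(emb[:, :256].float(), dim=-1)
print(emb.shape, short.shape)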
config_sentence_transformers.json ADDED
@@ -0,0 +1,12 @@
+{
+  "__version__": {
+    "sentence_transformers": "3.3.0",
+    "transformers": "4.46.2",
+    "pytorch": "2.2.2"
+  },
+  "prompts": {
+    "retrieval.query": "Represent the query for retrieving evidence documents: "
+  },
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}
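These values are read by sentence-transformers at load time: the retrieval.query prompt is prepended when prompt_name is passed to encode, and similarity_fn_name selects cosine similarity for model.similarity. A minimal usage sketch, assuming a placeholder repo id and trust_remote_code=True so that custom_st.py (next file) gets loaded:

from sentence_transformers import SentenceTransformer

# placeholder repo id for this checkpoint
model = SentenceTransformer("path-or-repo-id-of-this-checkpoint", trust_remote_code=True)

# prompt_name="retrieval.query" prepends the instruction defined in "prompts" above
query_emb = model.encode(["what is CLIP?"], prompt_name="retrieval.query")
doc_emb = model.encode(["CLIP aligns images and text in one embedding space."])

# similarity_fn_name is "cosine", so this computes cosine similarity
print(model.similarity(query_emb, doc_emb))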
custom_st.py ADDED
@@ -0,0 +1,275 @@
+import base64
+import json
+import os
+from io import BytesIO
+from typing import Any, Dict, List, Literal, Optional, Union
+
+import requests
+import torch
+from PIL import Image
+from torch import nn
+from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoTokenizer
+
+
+class Transformer(nn.Module):
+
+    save_in_root: bool = True
+
+    def __init__(
+        self,
+        model_name_or_path: str = 'jinaai/jina-clip-v2',
+        tokenizer_name_or_path: Optional[str] = None,
+        image_processor_name_or_path: Optional[str] = None,
+        max_seq_length: Optional[int] = None,
+        config_args: Optional[Dict[str, Any]] = None,
+        model_args: Optional[Dict[str, Any]] = None,
+        tokenizer_args: Optional[Dict[str, Any]] = None,
+        image_processor_args: Optional[Dict[str, Any]] = None,
+        assume_text_inputs: bool = False,
+        cache_dir: Optional[str] = None,
+        backend: Literal['torch', 'onnx', 'openvino'] = 'torch',
+        **_,
+    ) -> None:
+        """
+        Creates a custom SentenceTransformer module that uses `jinaai/jina-clip-v2` to
+        map sentences/images to embeddings
+
+        Args:
+            model_name_or_path (str, optional): If it is a filepath on disc, it loads
+                the model from that path. If it is not a path, tries to construct a
+                model from the Hugging Face Hub with that name. Defaults to
+                'jinaai/jina-clip-v2'
+            tokenizer_name_or_path (str, optional): If it is a filepath on disc, it
+                loads the tokenizer from that path. If it is not a path, tries to
+                construct a tokenizer from the Hugging Face Hub with that name.
+                If `None` it is automatically set to the value of `model_name_or_path`
+            image_processor_name_or_path (str, optional): If it is a filepath on disc,
+                it loads the image processor from that path. If it is not a path, tries
+                to construct an image processor from the Hugging Face Hub with that
+                name. If `None` it is automatically set to the value of
+                `model_name_or_path`
+            max_seq_length (int, optional): The maximum sequence length of the model.
+                If not provided, will be inferred from model or tokenizer
+            config_args (Dict[str, Any], optional): Additional model configuration
+                parameters to be passed to the Hugging Face Transformers config
+            model_args (Dict[str, Any], optional): Additional model configuration
+                parameters to be passed to the Hugging Face Transformers model
+            tokenizer_args (Dict[str, Any], optional): Additional tokenizer
+                configuration parameters to be passed to the Hugging Face Transformers
+                tokenizer
+            image_processor_args (Dict[str, Any], optional): Additional image processor
+                configuration parameters to be passed to the Hugging Face Transformers
+                image processor
+            assume_text_inputs (bool, optional): If set to `True`, all inputs are
+                treated as texts. Defaults to `False`
+            cache_dir (str, optional): The Hugging Face Hub cache directory
+            backend (str, optional): Computational backend, only 'torch' is supported
+
+        Example:
+            ::
+
+                from sentence_transformers import SentenceTransformer
+
+                model = SentenceTransformer(
+                    'jinaai/jina-clip-v2', trust_remote_code=True
+                )
+                sentences_or_images = [
+                    "The weather is lovely today.",
+                    "It's so sunny outside!",
+                    "/path/to/stadium.jpg",
+                ]
+                embeddings = model.encode(sentences_or_images)
+                print(embeddings.shape)
+                # (3, 1024)
+
+                # Get the similarity scores between all inputs
+                similarities = model.similarity(embeddings, embeddings)
+                print(similarities)
+                # tensor([[1.0000, 0.6817, 0.0492],
+                #         [0.6817, 1.0000, 0.0421],
+                #         [0.0492, 0.0421, 1.0000]])
+        """
+        super(Transformer, self).__init__()
+        if backend != 'torch':
+            raise ValueError(
+                f'Backend \'{backend}\' is not supported, please use \'torch\' instead'
+            )
+
+        config_kwargs = config_args or {}
+        model_kwargs = model_args or {}
+        tokenizer_kwargs = tokenizer_args or {}
+        image_processor_kwargs = {
+            'token': model_kwargs.get('token', None),
+            'trust_remote_code': model_kwargs.get('trust_remote_code', False),
+            'revision': model_kwargs.get('revision', None),
+            'local_files_only': model_kwargs.get('local_files_only', None),
+        }
+        image_processor_kwargs.update(image_processor_args or {})
+
+        config = AutoConfig.from_pretrained(
+            model_name_or_path, cache_dir=cache_dir, **config_kwargs
+        )
+        self.model = AutoModel.from_pretrained(
+            model_name_or_path, config=config, cache_dir=cache_dir, **model_kwargs
+        )
+        if max_seq_length is not None and 'model_max_length' not in tokenizer_kwargs:
+            tokenizer_kwargs['model_max_length'] = max_seq_length
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name_or_path or model_name_or_path,
+            cache_dir=cache_dir,
+            **tokenizer_kwargs,
+        )
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            image_processor_name_or_path or model_name_or_path,
+            cache_dir=cache_dir,
+            **image_processor_kwargs,
+        )
+        self.assume_text_inputs = assume_text_inputs
+
+        # No max_seq_length set. Try to infer from model
+        if max_seq_length is None:
+            if (
+                hasattr(self.model, 'config')
+                and hasattr(self.model.config, 'max_position_embeddings')
+                and hasattr(self.tokenizer, 'model_max_length')
+            ):
+                max_seq_length = min(
+                    self.model.config.max_position_embeddings,
+                    self.tokenizer.model_max_length,
+                )
+        self.max_seq_length = max_seq_length
+        if tokenizer_name_or_path is not None:
+            self.model.config.tokenizer_class = self.tokenizer.__class__.__name__
+
+    @staticmethod
+    def _decode_data_image(data_image_str: str) -> Image.Image:
+        header, data = data_image_str.split(',', 1)
+        image_data = base64.b64decode(data)
+        return Image.open(BytesIO(image_data))
+
+    def tokenize(
+        self, texts: List[Union[str, Image.Image]], padding: Union[str, bool] = True
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Encodes input samples. Text samples are tokenized. Image URLs, image data
+        buffers and PIL images are passed through the image processor.
+        """
+        _images = []
+        _texts = []
+        _image_or_text_descriptors = []
+
+        if self.assume_text_inputs:
+            for sample in texts:
+                if isinstance(sample, str):
+                    _texts.append(sample)
+                    _image_or_text_descriptors.append(1)
+        else:
+            for sample in texts:
+                if isinstance(sample, str):
+                    if sample.startswith('http'):
+                        try:
+                            response = requests.get(sample)
+                            _images.append(
+                                Image.open(BytesIO(response.content)).convert('RGB')
+                            )
+                            _image_or_text_descriptors.append(0)
+                        except Exception as e:
+                            _ = str(e)
+                            _texts.append(sample)
+                            _image_or_text_descriptors.append(1)
+                    elif sample.startswith('data:image/'):
+                        _images.append(self._decode_data_image(sample).convert('RGB'))
+                        _image_or_text_descriptors.append(0)
+                    else:
+                        try:
+                            _images.append(Image.open(sample).convert('RGB'))
+                            _image_or_text_descriptors.append(0)
+                        except Exception as e:
+                            _ = str(e)
+                            _texts.append(sample)
+                            _image_or_text_descriptors.append(1)
+                elif isinstance(sample, Image.Image):
+                    _images.append(sample.convert('RGB'))
+                    _image_or_text_descriptors.append(0)
+
+        encoding = {}
+        if len(_texts):
+            encoding['input_ids'] = self.tokenizer(
+                _texts,
+                padding=padding,
+                truncation='longest_first',
+                return_tensors='pt',
+                max_length=self.max_seq_length,
+            ).input_ids
+
+        if len(_images):
+            encoding['pixel_values'] = self.image_processor(
+                _images, return_tensors='pt'
+            ).pixel_values
+
+        encoding['image_text_info'] = _image_or_text_descriptors
+        return encoding
+
+    def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        image_embeddings = []
+        text_embeddings = []
+
+        if 'pixel_values' in features:
+            image_embeddings = self.model.get_image_features(features['pixel_values'])
+        if 'input_ids' in features:
+            text_embeddings = self.model.get_text_features(features['input_ids'])
+
+        sentence_embedding = []
+        image_features = iter(image_embeddings)
+        text_features = iter(text_embeddings)
+        for _, _input_type in enumerate(features['image_text_info']):
+            if _input_type == 0:
+                sentence_embedding.append(next(image_features))
+            else:
+                sentence_embedding.append(next(text_features))
+
+        features['sentence_embedding'] = torch.stack(sentence_embedding).float()
+        return features
+
+    def save(self, output_path: str, safe_serialization: bool = True) -> None:
+        self.model.save_pretrained(output_path, safe_serialization=safe_serialization)
+        self.tokenizer.save_pretrained(output_path)
+        self.image_processor.save_pretrained(output_path)
+
+    @staticmethod
+    def load(input_path: str) -> 'Transformer':
+        # Old classes used other config names than 'sentence_bert_config.json'
+        for config_name in [
+            'sentence_bert_config.json',
+            'sentence_roberta_config.json',
+            'sentence_distilbert_config.json',
+            'sentence_camembert_config.json',
+            'sentence_albert_config.json',
+            'sentence_xlm-roberta_config.json',
+            'sentence_xlnet_config.json',
+        ]:
+            sbert_config_path = os.path.join(input_path, config_name)
+            if os.path.exists(sbert_config_path):
+                break
+
+        with open(sbert_config_path) as fIn:
+            config = json.load(fIn)
+
+        # Don't allow configs to set trust_remote_code
+        if 'config_kwargs' in config and 'trust_remote_code' in config['config_kwargs']:
+            config['config_kwargs'].pop('trust_remote_code')
+        if 'model_kwargs' in config and 'trust_remote_code' in config['model_kwargs']:
+            config['model_kwargs'].pop('trust_remote_code')
+        if (
+            'tokenizer_kwargs' in config
+            and 'trust_remote_code' in config['tokenizer_kwargs']
+        ):
+            config['tokenizer_kwargs'].pop('trust_remote_code')
+        if (
+            'image_processor_kwargs' in config
+            and 'trust_remote_code' in config['image_processor_kwargs']
+        ):
+            config['image_processor_kwargs'].pop('trust_remote_code')
+
+        return Transformer(model_name_or_path=input_path, **config)
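sentence-transformers instantiates this module itself when the repository is loaded, but for illustration it can also be constructed directly. A rough sketch, assuming a placeholder repo id and that /path/to/photo.jpg exists on disk:

import torch
from custom_st import Transformer

# trust_remote_code is forwarded to AutoConfig/AutoModel (and, via model_args,
# to the image processor); the repo id below is a placeholder.
module = Transformer(
    model_name_or_path="path-or-repo-id-of-this-checkpoint",
    config_args={"trust_remote_code": True},
    model_args={"trust_remote_code": True},
)

# Mixed batch: tokenize() routes each sample to the tokenizer or the image
# processor and records the ordering in 'image_text_info'.
features = module.tokenize(["It's so sunny outside!", "/path/to/photo.jpg"])
with torch.no_grad():
    out = module.forward(features)
print(out["sentence_embedding"].shape)  # (2, 1024)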
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6601c4120779a1a3863897ba332fe3481d548e363bec2c91eba10ef8640a5e93
+size 17082997
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 8194,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
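The text side uses the XLM-RoBERTa tokenizer with an 8194-token context window. A minimal check, again with a placeholder repo id:

from transformers import AutoTokenizer

# placeholder repo id; resolves to the XLM-RoBERTa tokenizer configured above
tokenizer = AutoTokenizer.from_pretrained("path-or-repo-id-of-this-checkpoint")

print(tokenizer.model_max_length)  # 8194
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.mask_token)  # <s> </s> <mask>

enc = tokenizer("Represent the query for retrieving evidence documents: what is CLIP?")
print(enc["input_ids"][0])  # 0, the id of the <s> BOS/CLS token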