yonigozlan HF Staff committed on
Commit
599524e
·
verified ·
1 Parent(s): 64d722e

Upload tokenizer

Browse files
Files changed (3) hide show
  1. README.md +4 -4
  2. special_tokens_map.json +67 -10
  3. tokenizer_config.json +8 -2
README.md CHANGED
@@ -5,14 +5,14 @@ license_link: https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/LICENSE
5
  pipeline_tag: image-text-to-text
6
  library_name: transformers
7
  base_model:
8
- - OpenGVLab/InternVL3-14B-Instruct
9
  base_model_relation: finetune
10
  datasets:
11
- - OpenGVLab/MMPR-v1.2
12
  language:
13
- - multilingual
14
  tags:
15
- - internvl
16
  ---
17
 
18
  # InternVL3-14B Transformers 🤗 Implementation
 
5
  pipeline_tag: image-text-to-text
6
  library_name: transformers
7
  base_model:
8
+ - OpenGVLab/InternVL3-14B-Instruct
9
  base_model_relation: finetune
10
  datasets:
11
+ - OpenGVLab/MMPR-v1.2
12
  language:
13
+ - multilingual
14
  tags:
15
+ - internvl
16
  ---
17
 
18
  # InternVL3-14B Transformers 🤗 Implementation
special_tokens_map.json CHANGED
@@ -13,16 +13,72 @@
13
  "<|vision_pad|>",
14
  "<|image_pad|>",
15
  "<|video_pad|>",
16
- "<img>",
17
- "</img>",
18
- "<IMG_CONTEXT>",
19
- "<quad>",
20
- "</quad>",
21
- "<ref>",
22
- "</ref>",
23
- "<box>",
24
- "</box>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  ],
 
 
26
  "eos_token": {
27
  "content": "<|im_end|>",
28
  "lstrip": false,
@@ -36,5 +92,6 @@
36
  "normalized": false,
37
  "rstrip": false,
38
  "single_word": false
39
- }
 
40
  }
 
13
  "<|vision_pad|>",
14
  "<|image_pad|>",
15
  "<|video_pad|>",
16
+ {
17
+ "content": "<img>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ {
24
+ "content": "</img>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ {
31
+ "content": "<IMG_CONTEXT>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ {
38
+ "content": "<quad>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ {
45
+ "content": "</quad>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ },
51
+ {
52
+ "content": "<ref>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false
57
+ },
58
+ {
59
+ "content": "</ref>",
60
+ "lstrip": false,
61
+ "normalized": false,
62
+ "rstrip": false,
63
+ "single_word": false
64
+ },
65
+ {
66
+ "content": "<box>",
67
+ "lstrip": false,
68
+ "normalized": false,
69
+ "rstrip": false,
70
+ "single_word": false
71
+ },
72
+ {
73
+ "content": "</box>",
74
+ "lstrip": false,
75
+ "normalized": false,
76
+ "rstrip": false,
77
+ "single_word": false
78
+ }
79
  ],
80
+ "context_image_token": "<IMG_CONTEXT>",
81
+ "end_image_token": "</img>",
82
  "eos_token": {
83
  "content": "<|im_end|>",
84
  "lstrip": false,
 
92
  "normalized": false,
93
  "rstrip": false,
94
  "single_word": false
95
+ },
96
+ "start_image_token": "<img>"
97
  }
tokenizer_config.json CHANGED
@@ -277,14 +277,20 @@
277
  ],
278
  "bos_token": null,
279
  "clean_up_tokenization_spaces": false,
 
 
280
  "eos_token": "<|im_end|>",
281
  "errors": "replace",
282
- "extra_special_tokens": {},
 
 
 
 
283
  "model_max_length": 8192,
284
  "pad_token": "<|endoftext|>",
285
- "processor_class": "InternVLProcessor",
286
  "return_token_type_ids": false,
287
  "split_special_tokens": false,
 
288
  "tokenizer_class": "Qwen2Tokenizer",
289
  "unk_token": null
290
  }
 
277
  ],
278
  "bos_token": null,
279
  "clean_up_tokenization_spaces": false,
280
+ "context_image_token": "<IMG_CONTEXT>",
281
+ "end_image_token": "</img>",
282
  "eos_token": "<|im_end|>",
283
  "errors": "replace",
284
+ "extra_special_tokens": {
285
+ "context_image_token": "<IMG_CONTEXT>",
286
+ "end_image_token": "</img>",
287
+ "start_image_token": "<img>"
288
+ },
289
  "model_max_length": 8192,
290
  "pad_token": "<|endoftext|>",
 
291
  "return_token_type_ids": false,
292
  "split_special_tokens": false,
293
+ "start_image_token": "<img>",
294
  "tokenizer_class": "Qwen2Tokenizer",
295
  "unk_token": null
296
  }