Allen Poston committed on
Commit
1565eb8
·
verified ·
1 Parent(s): e31bd6b

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,3 +1,6 @@
1
  {
2
- "<|startoftext|>": 50257
 
 
 
3
  }
 
1
  {
2
+ "<|assistant|>": 50259,
3
+ "<|pad|>": 50257,
4
+ "<|system|>": 50260,
5
+ "<|user|>": 50258
6
  }
special_tokens_map.json CHANGED
@@ -1,14 +1,21 @@
1
  {
2
  "additional_special_tokens": [
3
  {
4
- "content": "<|startoftext|>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
8
  "single_word": false
9
  },
10
  {
11
- "content": "<|endoftext|>",
 
 
 
 
 
 
 
12
  "lstrip": false,
13
  "normalized": false,
14
  "rstrip": false,
@@ -25,11 +32,17 @@
25
  "eos_token": {
26
  "content": "<|endoftext|>",
27
  "lstrip": false,
28
- "normalized": true,
 
 
 
 
 
 
 
29
  "rstrip": false,
30
  "single_word": false
31
  },
32
- "pad_token": "<|endoftext|>",
33
  "unk_token": {
34
  "content": "<|endoftext|>",
35
  "lstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
  {
4
+ "content": "<|user|>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
8
  "single_word": false
9
  },
10
  {
11
+ "content": "<|assistant|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|system|>",
19
  "lstrip": false,
20
  "normalized": false,
21
  "rstrip": false,
 
32
  "eos_token": {
33
  "content": "<|endoftext|>",
34
  "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "<|pad|>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
  "rstrip": false,
44
  "single_word": false
45
  },
 
46
  "unk_token": {
47
  "content": "<|endoftext|>",
48
  "lstrip": false,
tokenizer.json CHANGED
@@ -7,14 +7,12 @@
7
  "stride": 0
8
  },
9
  "padding": {
10
- "strategy": {
11
- "Fixed": 256
12
- },
13
  "direction": "Right",
14
  "pad_to_multiple_of": null,
15
- "pad_id": 50256,
16
  "pad_type_id": 0,
17
- "pad_token": "<|endoftext|>"
18
  },
19
  "added_tokens": [
20
  {
@@ -28,7 +26,34 @@
28
  },
29
  {
30
  "id": 50257,
31
- "content": "<|startoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  "single_word": false,
33
  "lstrip": false,
34
  "rstrip": false,
 
7
  "stride": 0
8
  },
9
  "padding": {
10
+ "strategy": "BatchLongest",
 
 
11
  "direction": "Right",
12
  "pad_to_multiple_of": null,
13
+ "pad_id": 50257,
14
  "pad_type_id": 0,
15
+ "pad_token": "<|pad|>"
16
  },
17
  "added_tokens": [
18
  {
 
26
  },
27
  {
28
  "id": 50257,
29
+ "content": "<|pad|>",
30
+ "single_word": false,
31
+ "lstrip": false,
32
+ "rstrip": false,
33
+ "normalized": false,
34
+ "special": true
35
+ },
36
+ {
37
+ "id": 50258,
38
+ "content": "<|user|>",
39
+ "single_word": false,
40
+ "lstrip": false,
41
+ "rstrip": false,
42
+ "normalized": false,
43
+ "special": true
44
+ },
45
+ {
46
+ "id": 50259,
47
+ "content": "<|assistant|>",
48
+ "single_word": false,
49
+ "lstrip": false,
50
+ "rstrip": false,
51
+ "normalized": false,
52
+ "special": true
53
+ },
54
+ {
55
+ "id": 50260,
56
+ "content": "<|system|>",
57
  "single_word": false,
58
  "lstrip": false,
59
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -11,7 +11,31 @@
11
  "special": true
12
  },
13
  "50257": {
14
- "content": "<|startoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
@@ -20,8 +44,9 @@
20
  }
21
  },
22
  "additional_special_tokens": [
23
- "<|startoftext|>",
24
- "<|endoftext|>"
 
25
  ],
26
  "bos_token": "<|endoftext|>",
27
  "clean_up_tokenization_spaces": true,
@@ -29,7 +54,7 @@
29
  "errors": "replace",
30
  "extra_special_tokens": {},
31
  "model_max_length": 1024,
32
- "pad_token": "<|endoftext|>",
33
  "tokenizer_class": "GPT2Tokenizer",
34
  "unk_token": "<|endoftext|>"
35
  }
 
11
  "special": true
12
  },
13
  "50257": {
14
+ "content": "<|pad|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "50258": {
22
+ "content": "<|user|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "50259": {
30
+ "content": "<|assistant|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "50260": {
38
+ "content": "<|system|>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
 
44
  }
45
  },
46
  "additional_special_tokens": [
47
+ "<|user|>",
48
+ "<|assistant|>",
49
+ "<|system|>"
50
  ],
51
  "bos_token": "<|endoftext|>",
52
  "clean_up_tokenization_spaces": true,
 
54
  "errors": "replace",
55
  "extra_special_tokens": {},
56
  "model_max_length": 1024,
57
+ "pad_token": "<|pad|>",
58
  "tokenizer_class": "GPT2Tokenizer",
59
  "unk_token": "<|endoftext|>"
60
  }