yuchenxie committed
Commit 7f70092 · verified · 1 Parent(s): eb8c7fc

Initial commit: Uploading tokenizer directory

Files changed (6)
  1. README.md +29 -0
  2. merges.txt +0 -0
  3. special_tokens_map.json +53 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +22 -0
  6. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,29 @@
+ # ArlowGPT Tokenizer
+
+ This repository contains a custom-trained BPE tokenizer for ArlowGPT, created by Yuchen Xie.
+
+ ## Tokenizer Details
+
+ - Type: BPE (Byte-Pair Encoding)
+ - Vocabulary Size: 131,072 tokens
+ - Special Tokens:
+   - Start of Text: <|startoftext|>
+   - End of Text: <|endoftext|>
+   - Padding: <|pad|>
+   - Unknown: <|unk|>
+   - Mask: <|mask|>
+   - Message Start: <|im_start|>
+   - Message End: <|im_end|>
+
+ ## Usage
+
+ ```python
+ from transformers import PreTrainedTokenizerFast
+
+ tokenizer = PreTrainedTokenizerFast.from_pretrained("yuchenxie/arlowgpt-tokenizer-v2")
+ ```
+
+ ## Training Details
+
+ This tokenizer was trained on the FineWeb dataset using the Hugging Face tokenizers library.
+
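Since the README claims a 131,072-token vocabulary, a quick check after loading can confirm it and exercise a basic encode/decode round trip. A minimal sketch building on the README's own usage snippet (the sample string is illustrative only):

```python
from transformers import PreTrainedTokenizerFast

# Repo id taken from the README usage example above.
tokenizer = PreTrainedTokenizerFast.from_pretrained("yuchenxie/arlowgpt-tokenizer-v2")

# Full vocabulary size, including special tokens; the README states 131,072.
print(len(tokenizer))

# Round-trip a sample string to verify encoding and decoding agree.
ids = tokenizer.encode("Hello, ArlowGPT!")
print(ids)
print(tokenizer.decode(ids))
```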
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "pad_token": {
+     "content": "<|pad|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "unk_token": {
+     "content": "<|unk|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "mask_token": {
+     "content": "<|mask|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "additional_special_tokens": [
+     {
+       "content": "<|im_start|>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     },
+     {
+       "content": "<|im_end|>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": true
+     }
+   ]
+ }
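When `transformers` loads this repository, each entry in special_tokens_map.json becomes a named attribute on the tokenizer, and each special token resolves to a single id rather than being split by BPE. A short sketch, assuming the same repo id as in the README:

```python
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("yuchenxie/arlowgpt-tokenizer-v2")

# Named attributes populated from special_tokens_map.json.
print(tokenizer.bos_token)   # <|startoftext|>
print(tokenizer.eos_token)   # <|endoftext|>
print(tokenizer.pad_token)   # <|pad|>
print(tokenizer.mask_token)  # <|mask|>

# The chat-message delimiters live under additional_special_tokens.
print(tokenizer.additional_special_tokens)  # ['<|im_start|>', '<|im_end|>']

# Special tokens map to single ids instead of being split into subwords.
print(tokenizer.convert_tokens_to_ids("<|im_start|>"))
```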
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "bos_token": "<|startoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "mask_token": "<|mask|>",
+   "model_max_length": 2048,
+   "pad_token": "<|pad|>",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<|unk|>",
+   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "model_input_names": ["input_ids", "attention_mask"],
+   "use_fast": true,
+   "name_or_path": "./tokenizer.json",
+   "tokenizer_type": "bpe",
+   "auto_map": {
+     "AutoTokenizer": ["tokenizers.Tokenizer", null]
+   },
+   "system_prompt": "You are ArlowGPT, an AI assistant created by Yuchen Xie. You aim to be helpful, honest, and direct in your interactions while maintaining high ethical standards.",
+   "padding_side": "right"
+ }
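The `chat_template` above is a Jinja template that wraps each message in `<|im_start|>`/`<|im_end|>` delimiters, with an optional trailing assistant header when `add_generation_prompt` is set; `apply_chat_template` renders it. A minimal sketch, again assuming the repo id from the README (the two-turn conversation is illustrative only):

```python
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("yuchenxie/arlowgpt-tokenizer-v2")

# Hypothetical conversation used only to exercise the template.
messages = [
    {"role": "user", "content": "What tokenizer is this?"},
    {"role": "assistant", "content": "A 131,072-token BPE tokenizer."},
]

# tokenize=False returns the rendered string; add_generation_prompt=True
# appends the '<|im_start|>assistant\n' branch defined in the template.
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(text)
```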
vocab.json ADDED
The diff for this file is too large to render. See raw diff