GPT2_AR_200 / tokenizer.json
xiulinyang's picture
Add checkpoint
183e8c8
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Lowercase"
},
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<|endoftext|>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<|endoftext|>": {
"id": "<|endoftext|>",
"ids": [
1
],
"tokens": [
"<|endoftext|>"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<pad>": 0,
"<|endoftext|>": 1,
"<s>": 2,
"<unk>": 3,
"!": 4,
"\"": 5,
"#": 6,
"$": 7,
"%": 8,
"&": 9,
"'": 10,
"(": 11,
")": 12,
"*": 13,
"+": 14,
",": 15,
"-": 16,
".": 17,
"/": 18,
"0": 19,
"1": 20,
"2": 21,
"3": 22,
"4": 23,
"5": 24,
"6": 25,
"7": 26,
"8": 27,
"9": 28,
":": 29,
";": 30,
"<": 31,
"=": 32,
">": 33,
"?": 34,
"@": 35,
"[": 36,
"\\": 37,
"]": 38,
"^": 39,
"_": 40,
"`": 41,
"a": 42,
"b": 43,
"c": 44,
"d": 45,
"e": 46,
"f": 47,
"g": 48,
"h": 49,
"i": 50,
"j": 51,
"k": 52,
"l": 53,
"m": 54,
"n": 55,
"o": 56,
"p": 57,
"q": 58,
"r": 59,
"s": 60,
"t": 61,
"u": 62,
"v": 63,
"w": 64,
"x": 65,
"y": 66,
"z": 67,
"{": 68,
"|": 69,
"}": 70,
"~": 71,
"¡": 72,
"¢": 73,
"£": 74,
"¤": 75,
"¥": 76,
"¦": 77,
"§": 78,
"¨": 79,
"©": 80,
"ª": 81,
"«": 82,
"¬": 83,
"®": 84,
"¯": 85,
"°": 86,
"±": 87,
"²": 88,
"³": 89,
"´": 90,
"µ": 91,
"¶": 92,
"·": 93,
"¸": 94,
"¹": 95,
"º": 96,
"»": 97,
"¼": 98,
"½": 99,
"¾": 100,
"¿": 101,
"Â": 102,
"Ã": 103,
"Ä": 104,
"Å": 105,
"Æ": 106,
"Ç": 107,
"È": 108,
"É": 109,
"Ê": 110,
"Ì": 111,
"Î": 112,
"Ï": 113,
"Ð": 114,
"Ñ": 115,
"Ò": 116,
"×": 117,
"Ø": 118,
"Ù": 119,
"Ú": 120,
"Û": 121,
"Ü": 122,
"à": 123,
"á": 124,
"â": 125,
"ã": 126,
"ä": 127,
"å": 128,
"æ": 129,
"ç": 130,
"è": 131,
"é": 132,
"ê": 133,
"ë": 134,
"ì": 135,
"í": 136,
"î": 137,
"ï": 138,
"ð": 139,
"Ċ": 140,
"Ġ": 141,
"Ģ": 142,
"ģ": 143,
"Ĥ": 144,
"ĥ": 145,
"Ħ": 146,
"ħ": 147,
"Ĩ": 148,
"ĩ": 149,
"Ī": 150,
"ī": 151,
"Ĭ": 152,
"ĭ": 153,
"Į": 154,
"į": 155,
"İ": 156,
"ı": 157,
"IJ": 158,
"ij": 159,
"Ĵ": 160,
"ĵ": 161,
"Ķ": 162,
"ķ": 163,
"ĸ": 164,
"Ĺ": 165,
"ĺ": 166,
"Ļ": 167,
"ļ": 168,
"Ľ": 169,
"ľ": 170,
"Ŀ": 171,
"ŀ": 172,
"Ł": 173,
"ł": 174,
"Ń": 175,
"ا": 176,
"ÙĦ": 177,
"ĠØ": 178,
"ĠÙ": 179,
"ÙĬ": 180,
"ÙĨ": 181,
"اÙĦ": 182,
"Ùħ": 183,
"ĠاÙĦ": 184,
"ر": 185,
"ت": 186,
"ÙĪ": 187,
"د": 188,
"Ùĥ": 189,
"ع": 190,
"Ø©": 191,
"ب": 192,
"Ùĩ": 193,
"ĠÙħ": 194,
"ĠØ£": 195,
"س": 196,
"ÙĤ": 197,
"ĠÙĦ": 198,
"Ùģ": 199
},
"merges": [
[
"Ø",
"§"
],
[
"Ù",
"Ħ"
],
[
"Ġ",
"Ø"
],
[
"Ġ",
"Ù"
],
[
"Ù",
"Ĭ"
],
[
"Ù",
"Ĩ"
],
[
"ا",
"ÙĦ"
],
[
"Ù",
"ħ"
],
[
"Ġ",
"اÙĦ"
],
[
"Ø",
"±"
],
[
"Ø",
"ª"
],
[
"Ù",
"Ī"
],
[
"Ø",
"¯"
],
[
"Ù",
"ĥ"
],
[
"Ø",
"¹"
],
[
"Ø",
"©"
],
[
"Ø",
"¨"
],
[
"Ù",
"ĩ"
],
[
"ĠÙ",
"ħ"
],
[
"ĠØ",
"£"
],
[
"Ø",
"³"
],
[
"Ù",
"Ĥ"
],
[
"Ġ",
"ÙĦ"
],
[
"Ù",
"ģ"
]
]
}
}