Shaltiel committed on
Commit
b288887
·
verified ·
1 Parent(s): 2211d62

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. config.json +61 -0
  2. model.safetensors +3 -0
  3. special_tokens_map.json +37 -0
  4. tokenizer.json +1022 -0
  5. tokenizer_config.json +63 -0
  6. vocab.txt +0 -0
config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "largetav/",
3
+ "architectures": [
4
+ "BertForDiacritization"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4096,
13
+ "layer_norm_eps": 1e-12,
14
+ "mat_lect_token": "<MAT_LECT>",
15
+ "max_position_embeddings": 2048,
16
+ "model_type": "bert",
17
+ "nikud_classes": [
18
+ "",
19
+ "<MAT_LECT>",
20
+ "\u05bc",
21
+ "\u05b0",
22
+ "\u05b1",
23
+ "\u05b2",
24
+ "\u05b3",
25
+ "\u05b4",
26
+ "\u05b5",
27
+ "\u05b6",
28
+ "\u05b7",
29
+ "\u05b8",
30
+ "\u05b9",
31
+ "\u05ba",
32
+ "\u05bb",
33
+ "\u05bc\u05b0",
34
+ "\u05bc\u05b1",
35
+ "\u05bc\u05b2",
36
+ "\u05bc\u05b3",
37
+ "\u05bc\u05b4",
38
+ "\u05bc\u05b5",
39
+ "\u05bc\u05b6",
40
+ "\u05bc\u05b7",
41
+ "\u05bc\u05b8",
42
+ "\u05bc\u05b9",
43
+ "\u05bc\u05ba",
44
+ "\u05bc\u05bb",
45
+ "\u05c7",
46
+ "\u05bc\u05c7"
47
+ ],
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 24,
50
+ "pad_token_id": 0,
51
+ "position_embedding_type": "absolute",
52
+ "shin_classes": [
53
+ "\u05c1",
54
+ "\u05c2"
55
+ ],
56
+ "torch_dtype": "float32",
57
+ "transformers_version": "4.42.4",
58
+ "type_vocab_size": 2,
59
+ "use_cache": true,
60
+ "vocab_size": 1024
61
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2927643c61f408c7d5ff1652b605b322ed896fa07ded344bd508a02b76bf50e
3
+ size 1222010788
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
@@ -0,0 +1,1022 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[CLS]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[PAD]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "[BLANK]",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ }
60
+ ],
61
+ "normalizer": {
62
+ "type": "Sequence",
63
+ "normalizers": [
64
+ {
65
+ "type": "NFKC"
66
+ },
67
+ {
68
+ "type": "Lowercase"
69
+ },
70
+ {
71
+ "type": "StripAccents"
72
+ },
73
+ {
74
+ "type": "Replace",
75
+ "pattern": {
76
+ "String": "<foreign>"
77
+ },
78
+ "content": "[UNK]"
79
+ },
80
+ {
81
+ "type": "Replace",
82
+ "pattern": {
83
+ "Regex": "[^\u0590-\u05ff\u0000-\u007f\u200c-\u203f\u20a0-\u20bf\u2200-\u22ff\u2150-\u218b\ufb00-\ufb4f]+"
84
+ },
85
+ "content": "[UNK]"
86
+ }
87
+ ]
88
+ },
89
+ "pre_tokenizer": {
90
+ "type": "Split",
91
+ "pattern": {
92
+ "Regex": "(\\[UNK\\]|[\\s\\S])"
93
+ },
94
+ "behavior": "Removed",
95
+ "invert": true
96
+ },
97
+ "post_processor": {
98
+ "type": "TemplateProcessing",
99
+ "single": [
100
+ {
101
+ "SpecialToken": {
102
+ "id": "[CLS]",
103
+ "type_id": 0
104
+ }
105
+ },
106
+ {
107
+ "Sequence": {
108
+ "id": "A",
109
+ "type_id": 0
110
+ }
111
+ },
112
+ {
113
+ "SpecialToken": {
114
+ "id": "[SEP]",
115
+ "type_id": 0
116
+ }
117
+ }
118
+ ],
119
+ "pair": [
120
+ {
121
+ "SpecialToken": {
122
+ "id": "[CLS]",
123
+ "type_id": 0
124
+ }
125
+ },
126
+ {
127
+ "Sequence": {
128
+ "id": "A",
129
+ "type_id": 0
130
+ }
131
+ },
132
+ {
133
+ "SpecialToken": {
134
+ "id": "[SEP]",
135
+ "type_id": 0
136
+ }
137
+ },
138
+ {
139
+ "Sequence": {
140
+ "id": "B",
141
+ "type_id": 1
142
+ }
143
+ },
144
+ {
145
+ "SpecialToken": {
146
+ "id": "[SEP]",
147
+ "type_id": 1
148
+ }
149
+ }
150
+ ],
151
+ "special_tokens": {
152
+ "[CLS]": {
153
+ "id": "[CLS]",
154
+ "ids": [
155
+ 1
156
+ ],
157
+ "tokens": [
158
+ "[CLS]"
159
+ ]
160
+ },
161
+ "[SEP]": {
162
+ "id": "[SEP]",
163
+ "ids": [
164
+ 2
165
+ ],
166
+ "tokens": [
167
+ "[SEP]"
168
+ ]
169
+ }
170
+ }
171
+ },
172
+ "decoder": null,
173
+ "model": {
174
+ "type": "WordPiece",
175
+ "unk_token": "[UNK]",
176
+ "continuing_subword_prefix": "##",
177
+ "max_input_chars_per_word": 100,
178
+ "vocab": {
179
+ "[UNK]": 0,
180
+ "[CLS]": 1,
181
+ "[SEP]": 2,
182
+ "[PAD]": 3,
183
+ "[MASK]": 4,
184
+ "[BLANK]": 5,
185
+ "\u0000": 6,
186
+ "\u0001": 7,
187
+ "\u0002": 8,
188
+ "\u0003": 9,
189
+ "\u0004": 10,
190
+ "\u0005": 11,
191
+ "\u0006": 12,
192
+ "\u0007": 13,
193
+ "\b": 14,
194
+ "\t": 15,
195
+ "\n": 16,
196
+ "\u000b": 17,
197
+ "\u000e": 18,
198
+ "\u000f": 19,
199
+ "\u0010": 20,
200
+ "\u0011": 21,
201
+ "\u0012": 22,
202
+ "\u0013": 23,
203
+ "\u0014": 24,
204
+ "\u0015": 25,
205
+ "\u0016": 26,
206
+ "\u0017": 27,
207
+ "\u0018": 28,
208
+ "\u0019": 29,
209
+ "\u001a": 30,
210
+ "\u001b": 31,
211
+ "\u001c": 32,
212
+ "\u001d": 33,
213
+ "\u001e": 34,
214
+ "\u001f": 35,
215
+ " ": 36,
216
+ "!": 37,
217
+ "\"": 38,
218
+ "#": 39,
219
+ "$": 40,
220
+ "%": 41,
221
+ "&": 42,
222
+ "'": 43,
223
+ "(": 44,
224
+ ")": 45,
225
+ "*": 46,
226
+ "+": 47,
227
+ ",": 48,
228
+ "-": 49,
229
+ ".": 50,
230
+ "/": 51,
231
+ "0": 52,
232
+ "1": 53,
233
+ "2": 54,
234
+ "3": 55,
235
+ "4": 56,
236
+ "5": 57,
237
+ "6": 58,
238
+ "7": 59,
239
+ "8": 60,
240
+ "9": 61,
241
+ ":": 62,
242
+ ";": 63,
243
+ "<": 64,
244
+ "=": 65,
245
+ ">": 66,
246
+ "?": 67,
247
+ "@": 68,
248
+ "K": 69,
249
+ "N": 70,
250
+ "U": 71,
251
+ "[": 72,
252
+ "\\": 73,
253
+ "]": 74,
254
+ "^": 75,
255
+ "_": 76,
256
+ "`": 77,
257
+ "a": 78,
258
+ "b": 79,
259
+ "c": 80,
260
+ "d": 81,
261
+ "e": 82,
262
+ "f": 83,
263
+ "g": 84,
264
+ "h": 85,
265
+ "i": 86,
266
+ "j": 87,
267
+ "k": 88,
268
+ "l": 89,
269
+ "m": 90,
270
+ "n": 91,
271
+ "o": 92,
272
+ "p": 93,
273
+ "q": 94,
274
+ "r": 95,
275
+ "s": 96,
276
+ "t": 97,
277
+ "u": 98,
278
+ "v": 99,
279
+ "w": 100,
280
+ "x": 101,
281
+ "y": 102,
282
+ "z": 103,
283
+ "{": 104,
284
+ "|": 105,
285
+ "}": 106,
286
+ "~": 107,
287
+ "\u007f": 108,
288
+ "\u0080": 109,
289
+ "\u0081": 110,
290
+ "\u0082": 111,
291
+ "\u0083": 112,
292
+ "\u0084": 113,
293
+ "\u0086": 114,
294
+ "\u0088": 115,
295
+ "\u0089": 116,
296
+ "\u008c": 117,
297
+ "\u008d": 118,
298
+ "\u008e": 119,
299
+ "\u008f": 120,
300
+ "\u0090": 121,
301
+ "\u0091": 122,
302
+ "\u0092": 123,
303
+ "\u0093": 124,
304
+ "\u0094": 125,
305
+ "\u0095": 126,
306
+ "\u0096": 127,
307
+ "\u0097": 128,
308
+ "\u0098": 129,
309
+ "\u0099": 130,
310
+ "\u009a": 131,
311
+ "\u009b": 132,
312
+ "\u009c": 133,
313
+ "\u009d": 134,
314
+ "\u009e": 135,
315
+ "\u009f": 136,
316
+ "¡": 137,
317
+ "¢": 138,
318
+ "£": 139,
319
+ "¤": 140,
320
+ "¥": 141,
321
+ "¦": 142,
322
+ "§": 143,
323
+ "©": 144,
324
+ "«": 145,
325
+ "¬": 146,
326
+ "­": 147,
327
+ "®": 148,
328
+ "°": 149,
329
+ "±": 150,
330
+ "¶": 151,
331
+ "·": 152,
332
+ "»": 153,
333
+ "¿": 154,
334
+ "×": 155,
335
+ "ß": 156,
336
+ "à": 157,
337
+ "á": 158,
338
+ "â": 159,
339
+ "ã": 160,
340
+ "ä": 161,
341
+ "å": 162,
342
+ "æ": 163,
343
+ "ç": 164,
344
+ "è": 165,
345
+ "é": 166,
346
+ "ê": 167,
347
+ "ë": 168,
348
+ "ì": 169,
349
+ "í": 170,
350
+ "î": 171,
351
+ "ï": 172,
352
+ "ð": 173,
353
+ "ñ": 174,
354
+ "ò": 175,
355
+ "ó": 176,
356
+ "ô": 177,
357
+ "õ": 178,
358
+ "ö": 179,
359
+ "÷": 180,
360
+ "ø": 181,
361
+ "ù": 182,
362
+ "ú": 183,
363
+ "û": 184,
364
+ "ü": 185,
365
+ "ý": 186,
366
+ "þ": 187,
367
+ "ÿ": 188,
368
+ "ȼ": 189,
369
+ "˖": 190,
370
+ "˗": 191,
371
+ "ͱ": 192,
372
+ "ͳ": 193,
373
+ "͵": 194,
374
+ "ӏ": 195,
375
+ "ԝ": 196,
376
+ "֎": 197,
377
+ "־": 198,
378
+ "׀": 199,
379
+ "׃": 200,
380
+ "׆": 201,
381
+ "׈": 202,
382
+ "׉": 203,
383
+ "׊": 204,
384
+ "׋": 205,
385
+ "׍": 206,
386
+ "׎": 207,
387
+ "׏": 208,
388
+ "א": 209,
389
+ "ב": 210,
390
+ "ג": 211,
391
+ "ד": 212,
392
+ "ה": 213,
393
+ "ו": 214,
394
+ "ז": 215,
395
+ "ח": 216,
396
+ "ט": 217,
397
+ "י": 218,
398
+ "ך": 219,
399
+ "כ": 220,
400
+ "ל": 221,
401
+ "ם": 222,
402
+ "מ": 223,
403
+ "ן": 224,
404
+ "נ": 225,
405
+ "ס": 226,
406
+ "ע": 227,
407
+ "ף": 228,
408
+ "פ": 229,
409
+ "ץ": 230,
410
+ "צ": 231,
411
+ "ק": 232,
412
+ "ר": 233,
413
+ "ש": 234,
414
+ "ת": 235,
415
+ "׫": 236,
416
+ "װ": 237,
417
+ "ױ": 238,
418
+ "ײ": 239,
419
+ "׳": 240,
420
+ "״": 241,
421
+ "׸": 242,
422
+ "׹": 243,
423
+ "׺": 244,
424
+ "׿": 245,
425
+ "،": 246,
426
+ "؛": 247,
427
+ "؟": 248,
428
+ "٪": 249,
429
+ "٭": 250,
430
+ "۔": 251,
431
+ "۝": 252,
432
+ "۞": 253,
433
+ "۩": 254,
434
+ "ߋ": 255,
435
+ "ߐ": 256,
436
+ "ߕ": 257,
437
+ "ߗ": 258,
438
+ "ߜ": 259,
439
+ "ߝ": 260,
440
+ "ߞ": 261,
441
+ "ߟ": 262,
442
+ "ߠ": 263,
443
+ "ߡ": 264,
444
+ "ߢ": 265,
445
+ "ߨ": 266,
446
+ "ߩ": 267,
447
+ "ߪ": 268,
448
+ "।": 269,
449
+ "฿": 270,
450
+ "๏": 271,
451
+ "፡": 272,
452
+ "ᤞ": 273,
453
+ "᧐": 274,
454
+ "ᨁ": 275,
455
+ "ᨅ": 276,
456
+ "ᨔ": 277,
457
+ "ᨕ": 278,
458
+ "‌": 279,
459
+ "‍": 280,
460
+ "‎": 281,
461
+ "‏": 282,
462
+ "‐": 283,
463
+ "‒": 284,
464
+ "–": 285,
465
+ "—": 286,
466
+ "―": 287,
467
+ "‖": 288,
468
+ "‘": 289,
469
+ "’": 290,
470
+ "‚": 291,
471
+ "‛": 292,
472
+ "“": 293,
473
+ "”": 294,
474
+ "„": 295,
475
+ "‟": 296,
476
+ "†": 297,
477
+ "‡": 298,
478
+ "•": 299,
479
+ "‣": 300,
480
+ "‧": 301,
481
+ "\u2028": 302,
482
+ "\u2029": 303,
483
+ "‪": 304,
484
+ "‫": 305,
485
+ "‬": 306,
486
+ "‭": 307,
487
+ "‮": 308,
488
+ "‰": 309,
489
+ "′": 310,
490
+ "‹": 311,
491
+ "›": 312,
492
+ "※": 313,
493
+ "‽": 314,
494
+ "‿": 315,
495
+ "⁃": 316,
496
+ "⁄": 317,
497
+ "⁎": 318,
498
+ "⁠": 319,
499
+ "⁣": 320,
500
+ "⁦": 321,
501
+ "⁧": 322,
502
+ "⁨": 323,
503
+ "⁩": 324,
504
+ "₡": 325,
505
+ "₣": 326,
506
+ "₤": 327,
507
+ "₦": 328,
508
+ "₩": 329,
509
+ "₪": 330,
510
+ "₫": 331,
511
+ "€": 332,
512
+ "₭": 333,
513
+ "₮": 334,
514
+ "₱": 335,
515
+ "₴": 336,
516
+ "₵": 337,
517
+ "₸": 338,
518
+ "₹": 339,
519
+ "₺": 340,
520
+ "₼": 341,
521
+ "₽": 342,
522
+ "₾": 343,
523
+ "₿": 344,
524
+ "ↄ": 345,
525
+ "←": 346,
526
+ "↑": 347,
527
+ "→": 348,
528
+ "↓": 349,
529
+ "↔": 350,
530
+ "↗": 351,
531
+ "↘": 352,
532
+ "↙": 353,
533
+ "↩": 354,
534
+ "↳": 355,
535
+ "↵": 356,
536
+ "⇌": 357,
537
+ "⇐": 358,
538
+ "⇒": 359,
539
+ "⇓": 360,
540
+ "⇔": 361,
541
+ "⇦": 362,
542
+ "⇧": 363,
543
+ "⇨": 364,
544
+ "⇱": 365,
545
+ "∀": 366,
546
+ "∂": 367,
547
+ "∃": 368,
548
+ "∅": 369,
549
+ "∆": 370,
550
+ "∇": 371,
551
+ "∈": 372,
552
+ "∉": 373,
553
+ "∍": 374,
554
+ "∎": 375,
555
+ "∏": 376,
556
+ "∐": 377,
557
+ "∑": 378,
558
+ "−": 379,
559
+ "∕": 380,
560
+ "∗": 381,
561
+ "∘": 382,
562
+ "∙": 383,
563
+ "√": 384,
564
+ "∛": 385,
565
+ "∝": 386,
566
+ "∞": 387,
567
+ "∟": 388,
568
+ "∠": 389,
569
+ "∢": 390,
570
+ "∧": 391,
571
+ "∨": 392,
572
+ "∩": 393,
573
+ "∪": 394,
574
+ "∫": 395,
575
+ "∴": 396,
576
+ "∼": 397,
577
+ "≅": 398,
578
+ "≈": 399,
579
+ "≋": 400,
580
+ "≟": 401,
581
+ "≠": 402,
582
+ "≡": 403,
583
+ "≤": 404,
584
+ "≥": 405,
585
+ "≦": 406,
586
+ "≧": 407,
587
+ "≪": 408,
588
+ "≫": 409,
589
+ "⊂": 410,
590
+ "⊃": 411,
591
+ "⊆": 412,
592
+ "⊇": 413,
593
+ "⊕": 414,
594
+ "⊗": 415,
595
+ "⊙": 416,
596
+ "⊞": 417,
597
+ "⊠": 418,
598
+ "⊢": 419,
599
+ "⊤": 420,
600
+ "⊦": 421,
601
+ "⋃": 422,
602
+ "⋄": 423,
603
+ "⋅": 424,
604
+ "⋆": 425,
605
+ "⋇": 426,
606
+ "⋧": 427,
607
+ "⋮": 428,
608
+ "⋯": 429,
609
+ "⌀": 430,
610
+ "⌂": 431,
611
+ "⌘": 432,
612
+ "⌚": 433,
613
+ "⌛": 434,
614
+ "⌥": 435,
615
+ "⎙": 436,
616
+ "⏎": 437,
617
+ "⏪": 438,
618
+ "⏮": 439,
619
+ "⏰": 440,
620
+ "⏱": 441,
621
+ "⏳": 442,
622
+ "⏺": 443,
623
+ "─": 444,
624
+ "│": 445,
625
+ "┐": 446,
626
+ "└": 447,
627
+ "┴": 448,
628
+ "╋": 449,
629
+ "║": 450,
630
+ "╬": 451,
631
+ "█": 452,
632
+ "▌": 453,
633
+ "░": 454,
634
+ "■": 455,
635
+ "□": 456,
636
+ "▪": 457,
637
+ "▫": 458,
638
+ "▲": 459,
639
+ "△": 460,
640
+ "▶": 461,
641
+ "▷": 462,
642
+ "▸": 463,
643
+ "►": 464,
644
+ "▼": 465,
645
+ "▽": 466,
646
+ "▾": 467,
647
+ "◀": 468,
648
+ "◁": 469,
649
+ "◂": 470,
650
+ "◃": 471,
651
+ "◄": 472,
652
+ "◆": 473,
653
+ "◇": 474,
654
+ "◈": 475,
655
+ "◉": 476,
656
+ "◊": 477,
657
+ "○": 478,
658
+ "◌": 479,
659
+ "◎": 480,
660
+ "●": 481,
661
+ "◕": 482,
662
+ "◘": 483,
663
+ "◙": 484,
664
+ "◡": 485,
665
+ "◥": 486,
666
+ "◦": 487,
667
+ "◴": 488,
668
+ "◻": 489,
669
+ "◼": 490,
670
+ "◽": 491,
671
+ "◾": 492,
672
+ "☀": 493,
673
+ "☁": 494,
674
+ "☂": 495,
675
+ "☃": 496,
676
+ "☄": 497,
677
+ "★": 498,
678
+ "☆": 499,
679
+ "☉": 500,
680
+ "☎": 501,
681
+ "☏": 502,
682
+ "☐": 503,
683
+ "☑": 504,
684
+ "☒": 505,
685
+ "☔": 506,
686
+ "☕": 507,
687
+ "☘": 508,
688
+ "☚": 509,
689
+ "☜": 510,
690
+ "☝": 511,
691
+ "☠": 512,
692
+ "☢": 513,
693
+ "☯": 514,
694
+ "☰": 515,
695
+ "☹": 516,
696
+ "☺": 517,
697
+ "☻": 518,
698
+ "☼": 519,
699
+ "♀": 520,
700
+ "♂": 521,
701
+ "♔": 522,
702
+ "♕": 523,
703
+ "♚": 524,
704
+ "♛": 525,
705
+ "♟": 526,
706
+ "♠": 527,
707
+ "♡": 528,
708
+ "♢": 529,
709
+ "♣": 530,
710
+ "♥": 531,
711
+ "♦": 532,
712
+ "♧": 533,
713
+ "♨": 534,
714
+ "♪": 535,
715
+ "♫": 536,
716
+ "♬": 537,
717
+ "♭": 538,
718
+ "♯": 539,
719
+ "♰": 540,
720
+ "♻": 541,
721
+ "♿": 542,
722
+ "⚇": 543,
723
+ "⚒": 544,
724
+ "⚓": 545,
725
+ "⚔": 546,
726
+ "⚖": 547,
727
+ "⚘": 548,
728
+ "⚛": 549,
729
+ "⚜": 550,
730
+ "⚠": 551,
731
+ "⚡": 552,
732
+ "⚧": 553,
733
+ "⚪": 554,
734
+ "⚫": 555,
735
+ "⚽": 556,
736
+ "⛔": 557,
737
+ "⛰": 558,
738
+ "✂": 559,
739
+ "✅": 560,
740
+ "✆": 561,
741
+ "✈": 562,
742
+ "✉": 563,
743
+ "✊": 564,
744
+ "✋": 565,
745
+ "✌": 566,
746
+ "✍": 567,
747
+ "✎": 568,
748
+ "✏": 569,
749
+ "✓": 570,
750
+ "✔": 571,
751
+ "✖": 572,
752
+ "✗": 573,
753
+ "✙": 574,
754
+ "✛": 575,
755
+ "✡": 576,
756
+ "✦": 577,
757
+ "✧": 578,
758
+ "✨": 579,
759
+ "✩": 580,
760
+ "✪": 581,
761
+ "✫": 582,
762
+ "✭": 583,
763
+ "✮": 584,
764
+ "✯": 585,
765
+ "✰": 586,
766
+ "✱": 587,
767
+ "✲": 588,
768
+ "✳": 589,
769
+ "✴": 590,
770
+ "✶": 591,
771
+ "✸": 592,
772
+ "✺": 593,
773
+ "✻": 594,
774
+ "✽": 595,
775
+ "✾": 596,
776
+ "✿": 597,
777
+ "❀": 598,
778
+ "❁": 599,
779
+ "❂": 600,
780
+ "❃": 601,
781
+ "❄": 602,
782
+ "❇": 603,
783
+ "❈": 604,
784
+ "❋": 605,
785
+ "❌": 606,
786
+ "❎": 607,
787
+ "❏": 608,
788
+ "❑": 609,
789
+ "❒": 610,
790
+ "❓": 611,
791
+ "❔": 612,
792
+ "❕": 613,
793
+ "❖": 614,
794
+ "❗": 615,
795
+ "❝": 616,
796
+ "❞": 617,
797
+ "❣": 618,
798
+ "❤": 619,
799
+ "❥": 620,
800
+ "❦": 621,
801
+ "❭": 622,
802
+ "❯": 623,
803
+ "❶": 624,
804
+ "❷": 625,
805
+ "❸": 626,
806
+ "➊": 627,
807
+ "➋": 628,
808
+ "➌": 629,
809
+ "➍": 630,
810
+ "➎": 631,
811
+ "➔": 632,
812
+ "➕": 633,
813
+ "➖": 634,
814
+ "➡": 635,
815
+ "➢": 636,
816
+ "➤": 637,
817
+ "➦": 638,
818
+ "⟨": 639,
819
+ "⟩": 640,
820
+ "⠀": 641,
821
+ "⤵": 642,
822
+ "⤶": 643,
823
+ "⦁": 644,
824
+ "⦿": 645,
825
+ "⧼": 646,
826
+ "⧽": 647,
827
+ "⬅": 648,
828
+ "⬆": 649,
829
+ "⬇": 650,
830
+ "⬛": 651,
831
+ "⬜": 652,
832
+ "⭐": 653,
833
+ "⭕": 654,
834
+ "ⰲ": 655,
835
+ "ⰽ": 656,
836
+ "ⰾ": 657,
837
+ "ⱀ": 658,
838
+ "ⱁ": 659,
839
+ "ⱄ": 660,
840
+ "ⱏ": 661,
841
+ "ⱐ": 662,
842
+ "ⱑ": 663,
843
+ "ⱥ": 664,
844
+ "ⲟ": 665,
845
+ "ⴰ": 666,
846
+ "ⴻ": 667,
847
+ "ⵍ": 668,
848
+ "ⵏ": 669,
849
+ "ⵔ": 670,
850
+ "ⵢ": 671,
851
+ "ⵣ": 672,
852
+ "、": 673,
853
+ "。": 674,
854
+ "〈": 675,
855
+ "〉": 676,
856
+ "《": 677,
857
+ "》": 678,
858
+ "「": 679,
859
+ "」": 680,
860
+ "【": 681,
861
+ "】": 682,
862
+ "ꙭ": 683,
863
+ "": 684,
864
+ "": 685,
865
+ "": 686,
866
+ "": 687,
867
+ "": 688,
868
+ "": 689,
869
+ "": 690,
870
+ "": 691,
871
+ "": 692,
872
+ "": 693,
873
+ "": 694,
874
+ "": 695,
875
+ "": 696,
876
+ "": 697,
877
+ "": 698,
878
+ "": 699,
879
+ "": 700,
880
+ "": 701,
881
+ "": 702,
882
+ "": 703,
883
+ "": 704,
884
+ "": 705,
885
+ "": 706,
886
+ "": 707,
887
+ "": 708,
888
+ "": 709,
889
+ "": 710,
890
+ "": 711,
891
+ "": 712,
892
+ "": 713,
893
+ "": 714,
894
+ "": 715,
895
+ "": 716,
896
+ "": 717,
897
+ "": 718,
898
+ "": 719,
899
+ "": 720,
900
+ "": 721,
901
+ "": 722,
902
+ "": 723,
903
+ "": 724,
904
+ "": 725,
905
+ "": 726,
906
+ "": 727,
907
+ "": 728,
908
+ "": 729,
909
+ "": 730,
910
+ "": 731,
911
+ "": 732,
912
+ "": 733,
913
+ "": 734,
914
+ "": 735,
915
+ "": 736,
916
+ "": 737,
917
+ "": 738,
918
+ "": 739,
919
+ "": 740,
920
+ "": 741,
921
+ "": 742,
922
+ "": 743,
923
+ "": 744,
924
+ "": 745,
925
+ "": 746,
926
+ "": 747,
927
+ "": 748,
928
+ "": 749,
929
+ "": 750,
930
+ "": 751,
931
+ "": 752,
932
+ "": 753,
933
+ "": 754,
934
+ "": 755,
935
+ "": 756,
936
+ "": 757,
937
+ "": 758,
938
+ "": 759,
939
+ "": 760,
940
+ "": 761,
941
+ "": 762,
942
+ "": 763,
943
+ "": 764,
944
+ "": 765,
945
+ "": 766,
946
+ "": 767,
947
+ "": 768,
948
+ "": 769,
949
+ "": 770,
950
+ "": 771,
951
+ "": 772,
952
+ "": 773,
953
+ "": 774,
954
+ "": 775,
955
+ "": 776,
956
+ "": 777,
957
+ "": 778,
958
+ "": 779,
959
+ "": 780,
960
+ "": 781,
961
+ "": 782,
962
+ "": 783,
963
+ "": 784,
964
+ "": 785,
965
+ "": 786,
966
+ "": 787,
967
+ "": 788,
968
+ "": 789,
969
+ "": 790,
970
+ "": 791,
971
+ "": 792,
972
+ "": 793,
973
+ "": 794,
974
+ "": 795,
975
+ "": 796,
976
+ "": 797,
977
+ "": 798,
978
+ "": 799,
979
+ "": 800,
980
+ "": 801,
981
+ "": 802,
982
+ "": 803,
983
+ "": 804,
984
+ "": 805,
985
+ "": 806,
986
+ "": 807,
987
+ "": 808,
988
+ "": 809,
989
+ "": 810,
990
+ "": 811,
991
+ "": 812,
992
+ "": 813,
993
+ "": 814,
994
+ "": 815,
995
+ "": 816,
996
+ "": 817,
997
+ "": 818,
998
+ "": 819,
999
+ "": 820,
1000
+ "": 821,
1001
+ "": 822,
1002
+ "": 823,
1003
+ "": 824,
1004
+ "": 825,
1005
+ "": 826,
1006
+ "": 827,
1007
+ "": 828,
1008
+ "": 829,
1009
+ "": 830,
1010
+ "": 831,
1011
+ "": 832,
1012
+ "": 833,
1013
+ "": 834,
1014
+ "": 835,
1015
+ "": 836,
1016
+ "": 837,
1017
+ "": 838,
1018
+ "": 839,
1019
+ "": 840
1020
+ }
1021
+ }
1022
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[BLANK]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "clean_up_tokenization_spaces": true,
53
+ "cls_token": "[CLS]",
54
+ "do_lower_case": true,
55
+ "mask_token": "[MASK]",
56
+ "model_max_length": 2048,
57
+ "pad_token": "[PAD]",
58
+ "sep_token": "[SEP]",
59
+ "strip_accents": null,
60
+ "tokenize_chinese_chars": true,
61
+ "tokenizer_class": "BertTokenizer",
62
+ "unk_token": "[UNK]"
63
+ }
vocab.txt ADDED
Binary file (3.01 kB). View file