CrabInHoney committed on
Commit 9232dc6 · verified · 1 Parent(s): 9dbcc83

Upload 6 files

Files changed (6)
  1. config.json +79 -0
  2. model.safetensors +3 -0
  3. special_tokens_map.json +37 -0
  4. tokenizer.json +239 -0
  5. tokenizer_config.json +65 -0
  6. vocab.txt +75 -0
config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "_name_or_path": "./bert-morphology-token-classification-finetunedv2v3",
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "END",
+     "1": "END1",
+     "2": "HYPH",
+     "3": "HYPH1",
+     "4": "LINK",
+     "5": "LINK1",
+     "6": "LINK2",
+     "7": "LINK3",
+     "8": "POSTFIX",
+     "9": "PREF",
+     "10": "PREF1",
+     "11": "PREF2",
+     "12": "ROOT",
+     "13": "ROOT1",
+     "14": "ROOT2",
+     "15": "ROOT3",
+     "16": "ROOT4",
+     "17": "ROOT5",
+     "18": "SUFF",
+     "19": "SUFF1",
+     "20": "SUFF2",
+     "21": "SUFF3",
+     "22": "SUFF4",
+     "23": "SUFF5",
+     "24": "SUFF6"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "END": 0,
+     "END1": 1,
+     "HYPH": 2,
+     "HYPH1": 3,
+     "LINK": 4,
+     "LINK1": 5,
+     "LINK2": 6,
+     "LINK3": 7,
+     "POSTFIX": 8,
+     "PREF": 9,
+     "PREF1": 10,
+     "PREF2": 11,
+     "ROOT": 12,
+     "ROOT1": 13,
+     "ROOT2": 14,
+     "ROOT3": 15,
+     "ROOT4": 16,
+     "ROOT5": 17,
+     "SUFF": 18,
+     "SUFF1": 19,
+     "SUFF2": 20,
+     "SUFF3": 21,
+     "SUFF4": 22,
+     "SUFF5": 23,
+     "SUFF6": 24
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 75
+ }
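
The config above describes a 12-layer BERT encoder with a 25-tag token-classification head over morpheme labels (PREF*, ROOT*, SUFF*, END*, LINK*, HYPH*, POSTFIX). A minimal usage sketch follows; the repository id and the example word are illustrative assumptions, not part of this commit.

```python
# Minimal sketch: load this BertForTokenClassification checkpoint and tag the
# characters of one Russian word with the morpheme labels from id2label.
# The repo id below is hypothetical; substitute the actual model location.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

repo_id = "CrabInHoney/bert-morphology-token-classification"  # assumed id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForTokenClassification.from_pretrained(repo_id)
model.eval()

word = "подводный"  # example input; the tokenizer splits it into characters
inputs = tokenizer(word, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_ids = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

for token, label_id in zip(tokens, predicted_ids):
    if token in tokenizer.all_special_tokens:
        continue  # skip [CLS], [SEP], [PAD]
    print(token, model.config.id2label[label_id])
```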
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00ce21b18af4a2c040fda2e4af4e3b8f4b909c5c960c799ee4641d01a3eb3775
+ size 342133524
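
model.safetensors is stored through Git LFS, so the diff only shows the pointer file. A downloaded copy can be checked against the recorded digest; the local path in the sketch below is an assumption.

```python
# Sketch: recompute the sha256 of a locally downloaded model.safetensors and
# compare it to the oid recorded in the LFS pointer above.
import hashlib

EXPECTED = "00ce21b18af4a2c040fda2e4af4e3b8f4b909c5c960c799ee4641d01a3eb3775"

digest = hashlib.sha256()
with open("model.safetensors", "rb") as f:  # assumed local path
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

print(digest.hexdigest() == EXPECTED)  # True if the download is intact
```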
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,239 @@
+ {
+   "version": "1.0",
+   "truncation": {
+     "direction": "Right",
+     "max_length": 34,
+     "strategy": "LongestFirst",
+     "stride": 0
+   },
+   "padding": {
+     "strategy": {
+       "Fixed": 34
+     },
+     "direction": "Right",
+     "pad_to_multiple_of": null,
+     "pad_id": 0,
+     "pad_type_id": 0,
+     "pad_token": "[PAD]"
+   },
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "[PAD]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "[UNK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "[CLS]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "[SEP]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 4,
+       "content": "[MASK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": {
+     "type": "BertNormalizer",
+     "clean_text": true,
+     "handle_chinese_chars": true,
+     "strip_accents": false,
+     "lowercase": true
+   },
+   "pre_tokenizer": {
+     "type": "BertPreTokenizer"
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "[CLS]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "SpecialToken": {
+           "id": "[CLS]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "[CLS]": {
+         "id": "[CLS]",
+         "ids": [
+           2
+         ],
+         "tokens": [
+           "[CLS]"
+         ]
+       },
+       "[SEP]": {
+         "id": "[SEP]",
+         "ids": [
+           3
+         ],
+         "tokens": [
+           "[SEP]"
+         ]
+       }
+     }
+   },
+   "decoder": {
+     "type": "WordPiece",
+     "prefix": "##",
+     "cleanup": true
+   },
+   "model": {
+     "type": "WordPiece",
+     "unk_token": "[UNK]",
+     "continuing_subword_prefix": "##",
+     "max_input_chars_per_word": 100,
+     "vocab": {
+       "[PAD]": 0,
+       "[UNK]": 1,
+       "[CLS]": 2,
+       "[SEP]": 3,
+       "[MASK]": 4,
+       "а": 5,
+       "б": 6,
+       "в": 7,
+       "г": 8,
+       "д": 9,
+       "е": 10,
+       "ё": 11,
+       "ж": 12,
+       "з": 13,
+       "и": 14,
+       "й": 15,
+       "к": 16,
+       "л": 17,
+       "м": 18,
+       "н": 19,
+       "о": 20,
+       "п": 21,
+       "р": 22,
+       "с": 23,
+       "т": 24,
+       "у": 25,
+       "ф": 26,
+       "х": 27,
+       "ц": 28,
+       "ч": 29,
+       "ш": 30,
+       "щ": 31,
+       "ъ": 32,
+       "ы": 33,
+       "ь": 34,
+       "э": 35,
+       "ю": 36,
+       "я": 37,
+       "##а": 38,
+       "##б": 39,
+       "##в": 40,
+       "##г": 41,
+       "##д": 42,
+       "##е": 43,
+       "##ё": 44,
+       "##ж": 45,
+       "##з": 46,
+       "##и": 47,
+       "##й": 48,
+       "##к": 49,
+       "##л": 50,
+       "##м": 51,
+       "##н": 52,
+       "##о": 53,
+       "##п": 54,
+       "##р": 55,
+       "##с": 56,
+       "##т": 57,
+       "##у": 58,
+       "##ф": 59,
+       "##х": 60,
+       "##ц": 61,
+       "##ч": 62,
+       "##ш": 63,
+       "##щ": 64,
+       "##ъ": 65,
+       "##ы": 66,
+       "##ь": 67,
+       "##э": 68,
+       "##ю": 69,
+       "##я": 70,
+       "_": 71,
+       "-": 72,
+       "##_": 73,
+       "##-": 74
+     }
+   }
+ }
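
This fast tokenizer is effectively character-level: the WordPiece vocabulary holds only single Cyrillic letters (plus their "##"-prefixed word-internal forms, "_" and "-"), with fixed padding and truncation at 34 positions. A small sketch of what that implies, using the tokenizers library on a local copy of this file (the path and example word are assumptions):

```python
# Sketch: load tokenizer.json directly with the tokenizers library and show
# the character-by-character split plus fixed-length padding to 34.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")  # assumed local path

encoding = tok.encode("приставка")  # example Russian word
print(len(encoding.tokens))  # 34: [CLS] + characters + [SEP] + [PAD] padding
print(encoding.tokens[:11])
# Roughly: ['[CLS]', 'п', '##р', '##и', '##с', '##т', '##а', '##в', '##к', '##а', '[SEP]']
```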
tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "lowercase": true,
+   "mask_token": "[MASK]",
+   "max_length": 34,
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": false,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]"
+ }
vocab.txt ADDED
@@ -0,0 +1,75 @@
+ [PAD]
+ [UNK]
+ [CLS]
+ [SEP]
+ [MASK]
+ а
+ б
+ в
+ г
+ д
+ е
+ ё
+ ж
+ з
+ и
+ й
+ к
+ л
+ м
+ н
+ о
+ п
+ р
+ с
+ т
+ у
+ ф
+ х
+ ц
+ ч
+ ш
+ щ
+ ъ
+ ы
+ ь
+ э
+ ю
+ я
+ ##а
+ ##б
+ ##в
+ ##г
+ ##д
+ ##е
+ ##ё
+ ##ж
+ ##з
+ ##и
+ ##й
+ ##к
+ ##л
+ ##м
+ ##н
+ ##о
+ ##п
+ ##р
+ ##с
+ ##т
+ ##у
+ ##ф
+ ##х
+ ##ц
+ ##ч
+ ##ш
+ ##щ
+ ##ъ
+ ##ы
+ ##ь
+ ##э
+ ##ю
+ ##я
+ _
+ -
+ ##_
+ ##-
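
tokenizer_config.json and vocab.txt together also back the slow tokenizer (tokenizer_class is BertTokenizer, do_lower_case true, strip_accents false). A brief sketch, assuming the six files above sit in a local directory:

```python
# Sketch: build the slow tokenizer from vocab.txt + tokenizer_config.json and
# confirm that words are lowercased and split into single characters.
from transformers import BertTokenizer

slow_tok = BertTokenizer.from_pretrained(".")  # assumed directory holding these files

print(slow_tok.tokenize("Корень"))
# Expected: ['к', '##о', '##р', '##е', '##н', '##ь'] (lowercased, per-character)
```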