Commit 1828e27 · verified · committed by tformal · 1 parent: e6ea2c2

Upload modeling file

added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "[MASK]": 128000
+ }
config.json ADDED
@@ -0,0 +1 @@
+ {"_name_or_path": "provence", "architectures": ["Provence"], "auto_map": {"AutoConfig": "modeling_provence.ProvenceConfig", "AutoModel": "modeling_provence.Provence"}, "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "id2label": {"0": "LABEL_0"}, "initializer_range": 0.02, "intermediate_size": 4096, "label2id": {"LABEL_0": 0}, "layer_norm_eps": 1e-07, "max_position_embeddings": 512, "max_relative_positions": -1, "model_type": "Provence", "norm_rel_ebd": "layer_norm", "num_attention_heads": 16, "num_hidden_layers": 24, "pad_token_id": 0, "pooler_dropout": 0, "pooler_hidden_act": "gelu", "pooler_hidden_size": 1024, "pos_att_type": ["p2c", "c2p"], "position_biased_input": false, "position_buckets": 256, "relative_attention": true, "share_att_key": true, "torch_dtype": "float32", "transformers_version": "4.45.1", "type_vocab_size": 0, "vocab_size": 128100}
modeling_provence.py ADDED
@@ -0,0 +1,456 @@
+ import time
+ import string
+ from typing import Optional, Union, Tuple, List
+ from dataclasses import dataclass
+ from tqdm import tqdm
+ import warnings
+ import nltk
+ import numpy as np
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset
+ from torch.nn.utils.rnn import pad_sequence
+ from transformers import AutoTokenizer
+ from transformers import DebertaV2PreTrainedModel, DebertaV2Model, PretrainedConfig
+ from transformers.models.deberta_v2.modeling_deberta_v2 import (
+     StableDropout,
+     ContextPooler,
+ )
+ from transformers.modeling_outputs import ModelOutput
+
+
+ @dataclass
+ class RankingCompressionOutput(ModelOutput):
+
+     compression_logits: torch.FloatTensor = None
+     ranking_scores: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+     attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+ # adapted from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/deberta_v2/modeling_deberta_v2.py#L1357
+
+
+ class ProvenceConfig(PretrainedConfig):
+
+     model_type = "Provence"
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+
+ class Provence(DebertaV2PreTrainedModel):
+
+     config_class = ProvenceConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         num_labels = getattr(config, "num_labels", 2)
+         self.num_labels = num_labels
+         self.deberta = DebertaV2Model(config)
+         self.pooler = ContextPooler(config)
+         output_dim = self.pooler.output_dim
+
+         ### RANKING LAYER
+         self.classifier = nn.Linear(output_dim, num_labels)
+         drop_out = getattr(config, "cls_dropout", None)
+         drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+         self.dropout = StableDropout(drop_out)
+
+         ### COMPRESSION LAYER: another head
+         token_dropout = drop_out
+         self.token_dropout = nn.Dropout(token_dropout)
+         self.token_classifier = nn.Linear(
+             config.hidden_size, 2
+         )  # => hard-coded number of labels
+         self.name = "Provence"
+         self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
+         self.max_len = config.max_position_embeddings
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+     ) -> RankingCompressionOutput:
+         outputs = self.deberta(
+             input_ids,
+             attention_mask=attention_mask,
+         )
+
+         encoder_layer = outputs[0]
+         pooled_output = self.pooler(encoder_layer)
+         pooled_output = self.dropout(pooled_output)
+         ranking_logits = self.classifier(pooled_output)
+         compression_logits = self.token_classifier(self.token_dropout(encoder_layer))
+         ranking_scores = ranking_logits[
+             :, 0
+         ].squeeze()  # select first dim of logits for ranking scores
+
+         return RankingCompressionOutput(
+             compression_logits=compression_logits,
+             ranking_scores=ranking_scores,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def process(
+         self,
+         contexts: List[List[str]],
+         queries: List[str],
+         titles: Optional[Union[List[str], str]] = "first_sentence",
+         batch_size=32,
+         threshold=0.01,
+         always_select_title=True,
+         reorder=False,
+         top_k=5,
+         enable_warnings=True,
+     ):
+
+         assert (
+             titles == "first_sentence"
+             or titles is None
+             or (isinstance(titles, list) and len(titles) == len(queries))
+         ), "Variable 'titles' must be 'first_sentence', None, or a list of strings of the same length as 'queries'"
+         if isinstance(titles, list):
+             assert all(
+                 [
+                     len(titles_item) == len(contexts_item)
+                     for titles_item, contexts_item in zip(titles, contexts)
+                 ]
+             ), "Each list in 'titles' must have the same length as the corresponding list in 'contexts'"
+         assert len(queries) == len(
+             contexts
+         ), "Lists 'queries' and 'contexts' must have the same length"
+         times = []
+         t0 = time.time()
+         dataset = TestDataset(
+             queries=queries,
+             contexts=contexts,
+             titles=titles,
+             tokenizer=self.tokenizer,
+             max_len=self.max_len,
+             enable_warnings=enable_warnings,
+         )
+         times.append(["testdataset", time.time() - t0])
+         t0 = time.time()
+         selected_contexts = [
+             [{0: contexts[i][j]} for j in range(len(contexts[i]))]
+             for i in range(len(queries))
+         ]
+         reranking_scores = [
+             [None for j in range(len(contexts[i]))] for i in range(len(queries))
+         ]
+         times.append(["create arrays", time.time() - t0])
+         t0 = time.time()
+         with torch.no_grad():
+             for batch_start in tqdm(
+                 range(0, len(dataset), batch_size), desc="Pruning contexts..."
+             ):
+                 t1 = time.time()
+                 qis = dataset.qis[batch_start : batch_start + batch_size]
+                 cis = dataset.cis[batch_start : batch_start + batch_size]
+                 sis = dataset.sis[batch_start : batch_start + batch_size]
+                 sent_coords = dataset.sent_coords[
+                     batch_start : batch_start + batch_size
+                 ]
+                 ids_list = dataset.ids[batch_start : batch_start + batch_size]
+                 ids = pad_sequence(
+                     ids_list, batch_first=True, padding_value=dataset.pad_idx
+                 ).to(self.device)
+                 mask = (ids != dataset.pad_idx).to(self.device)
+                 times.append(["torch stack", time.time() - t1])
+                 t1 = time.time()
+                 outputs = self.forward(ids, mask)
+                 scores = F.softmax(outputs["compression_logits"].cpu(), dim=-1)[:, :, 1]
+                 token_preds = scores > threshold
+                 reranking_scrs = (
+                     outputs["ranking_scores"].cpu().numpy()
+                 )  # one ranking score per batch element
+                 if len(reranking_scrs.shape) == 0:
+                     reranking_scrs = reranking_scrs[None]
+                 times.append(["forward pass", time.time() - t1])
+                 t1 = time.time()
+                 for (
+                     ids_list_,
+                     token_preds_,
+                     rerank_score,
+                     qi,
+                     ci,
+                     si,
+                     sent_coords_,
+                 ) in zip(
+                     ids_list, token_preds, reranking_scrs, qis, cis, sis, sent_coords
+                 ):
+                     selected_mask = sentence_rounding(
+                         token_preds_.cpu().numpy(),
+                         np.array(sent_coords_),
+                         threshold=threshold,
+                         always_select_title=always_select_title
+                         and si == 0
+                         and titles is not None,
+                     )
+                     assert len(selected_mask) == len(token_preds_)
+                     selected_contexts[qi][ci][si] = ids_list_[
+                         selected_mask[: len(ids_list_)]
+                     ]
+                     if si == 0:
+                         reranking_scores[qi][ci] = rerank_score
+                 times.append(["postprocessing", time.time() - t1])
+         t1 = time.time()
+         for i in range(len(queries)):
+             for j in range(len(contexts[i])):
+                 if not isinstance(selected_contexts[i][j][0], str):
+                     toks = torch.cat(
+                         [
+                             ids_
+                             for _, ids_ in sorted(
+                                 selected_contexts[i][j].items(), key=lambda x: x[0]
+                             )
+                         ]
+                     )
+                     selected_contexts[i][j] = self.tokenizer.decode(
+                         toks,
+                         skip_special_tokens=True,
+                         clean_up_tokenization_spaces=False,
+                     )
+                 else:
+                     selected_contexts[i][j] = selected_contexts[i][j][0]
+             if reorder:
+                 idxs = np.argsort(reranking_scores[i])[::-1][:top_k]
+                 selected_contexts[i] = [selected_contexts[i][j] for j in idxs]
+                 reranking_scores[i] = [reranking_scores[i][j] for j in idxs]
+         times.append(["postpostprocessing", time.time() - t1])
+         times.append(["total inference", time.time() - t0])
+         return selected_contexts  # , reranking_scores, times
+
+
+ # Some utils functions
+
+
+ def sentence_rounding(predictions, chunks, threshold, always_select_title=True):
+     """
+     predictions: a binary vector containing 1 for tokens which were selected and 0s otherwise
+     chunks: a list of pairs [start, end] per sentence, i.e. a sentence occupies predictions[start:end]
+     the function averages the token predictions over each sentence and selects
+     all tokens of the sentences whose mean prediction exceeds the threshold
+     """
+     cumulative_sum = np.cumsum(predictions)
+     chunk_sums = cumulative_sum[chunks[:, 1] - 1] - np.where(
+         chunks[:, 0] > 0, cumulative_sum[chunks[:, 0] - 1], 0
+     )
+     chunk_lengths = chunks[:, 1] - chunks[:, 0]
+     chunk_means = chunk_sums / chunk_lengths
+     if always_select_title:
+         chunk_means[0] = 1
+     means = np.hstack((np.zeros(1), chunk_means, np.zeros(1)))
+     repeats = np.hstack(
+         ([chunks[0][0]], chunk_lengths, [predictions.shape[0] - chunks[-1][1]])
+     )
+     return np.repeat(means, repeats) > threshold
+
+
+ def normalize(s: str) -> str:
+     def white_space_fix(text):
+         return " ".join(text.split())
+
+     def remove_punc(text):
+         exclude = set(string.punctuation)
+         return "".join(ch for ch in text if ch not in exclude)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_punc(lower(s)))
+
+
+ def sent_split_and_tokenize(text, tokenizer, max_len):
+     # note: nltk.sent_tokenize requires the nltk "punkt" data to be available
+     sents_nltk = nltk.sent_tokenize(text)
+     sents = []
+     for j, sent_nltk in enumerate(sents_nltk):
+         tokinput = (" " if j != 0 else "") + sent_nltk
+         tok = tokenizer.encode(tokinput, add_special_tokens=False)
+         ltok = len(tok)
+         if ltok == 0:
+             continue
+         if ltok <= max_len:
+             sents.append(tok)
+         else:
+             # split sentences longer than max_len into max_len-sized pieces
+             for begin in range(0, ltok, max_len):
+                 sents.append(tok[begin : begin + max_len])
+     return sents
+
+
+ class TestDataset(Dataset):
+     def __init__(
+         self,
+         queries,
+         contexts,
+         tokenizer,
+         max_len=512,
+         titles="first_sentence",
+         enable_warnings=True,
+     ):
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+         self.pad_idx = 0
+         self.cls_idx = [1]
+         self.sep_idx = [2]
+         self.eos = [2]
+         # hardcoded DeBERTa-specific token indexes
+         self.nb_spe_tok = len(self.cls_idx) + len(self.sep_idx)
+         self.enable_warnings = enable_warnings
+         self.unusual_query_length = (
+             self.max_len // 2
+         )  # TODO: change to data-driven value
+         self.unusual_title_len = self.max_len // 2  # TODO: change to data-driven value
+         self.create_dataset(contexts, queries, titles)
+         self.len = len(self.cis)
+
+     def create_dataset(self, contexts, queries, titles="first_sentence"):
+         self.qis = []
+         self.cis = []
+         self.sis = []
+         self.sent_coords = []
+         self.cntx_coords = []
+         self.ids = []
+         if self.enable_warnings:
+             warnings_dict = {
+                 "zero_len_query": set(),
+                 "too_long_query": set(),
+                 "unusually_long_query": set(),
+                 "unusually_long_title": set(),
+                 "split_context": set(),
+             }
+         for i, query in enumerate(queries):
+             tokenized_query = self.tokenizer.encode(
+                 normalize(query), add_special_tokens=False
+             )
+             # normalize query because all training data has normalized queries
+             query_len = len(tokenized_query)
+             if query_len == 0:
+                 if self.enable_warnings:
+                     warnings_dict["zero_len_query"].add(i)
+                 continue
+             elif query_len >= self.max_len - self.nb_spe_tok - 1:  # -1 for eos
+                 if self.enable_warnings:
+                     warnings_dict["too_long_query"].add(i)
+                 continue
+             elif query_len >= self.unusual_query_length:
+                 if self.enable_warnings:
+                     warnings_dict["unusually_long_query"].add(i)
+             left_0 = len(tokenized_query) + self.nb_spe_tok
+             tokenized_seq_0 = self.cls_idx + tokenized_query + self.sep_idx
+             max_len = self.max_len - left_0 - 1
+             for j, cntx in enumerate(contexts[i]):
+                 title = titles[i][j] if isinstance(titles, list) else titles
+                 tokenized_sents = sent_split_and_tokenize(cntx, self.tokenizer, max_len)
+                 # each (sent + query + special tokens) <= max_len
+                 if title is not None and title != "first_sentence":
+                     tokenized_title = self.tokenizer.encode(
+                         title, add_special_tokens=False
+                     )
+                     ltok = len(tokenized_title)
+                     if ltok == 0:
+                         pass
+                     elif ltok <= max_len:
+                         tokenized_sents = [tokenized_title] + tokenized_sents
+                     else:
+                         if self.enable_warnings and ltok >= self.unusual_title_len:
+                             warnings_dict["unusually_long_title"].add(i)
+                         tokenized_sents = [
+                             tokenized_title[begin : begin + max_len]
+                             for begin in range(0, ltok, max_len)
+                         ] + tokenized_sents
+                 tokenized_seq = tokenized_seq_0
+                 left = left_0
+                 sent_coords = []
+                 block = 0
+                 for idx, tokenized_sent in enumerate(tokenized_sents):
+                     l = len(tokenized_sent)
+                     if left + l <= self.max_len - 1:
+                         sent_coords.append([left, left + l])
+                         tokenized_seq = tokenized_seq + tokenized_sent
+                         left += l
+                     else:
+                         if self.enable_warnings:
+                             warnings_dict["split_context"].add(i)
+                         if len(tokenized_seq) > left_0:
+                             tokenized_seq = tokenized_seq + self.eos
+                             self.qis.append(i)
+                             self.cis.append(j)
+                             self.sis.append(block)
+                             self.sent_coords.append(sent_coords)
+                             self.cntx_coords.append(
+                                 [sent_coords[0][0], sent_coords[-1][1]]
+                             )
+                             self.ids.append(torch.tensor(tokenized_seq))
+                         tokenized_seq = tokenized_seq_0 + tokenized_sent
+                         sent_coords = [[left_0, left_0 + l]]
+                         left = left_0 + l
+                         block += 1
+                 if len(tokenized_seq) > left_0:
+                     tokenized_seq = tokenized_seq + self.eos
+                     self.qis.append(i)
+                     self.cis.append(j)
+                     self.sis.append(block)
+                     self.sent_coords.append(sent_coords)
+                     self.cntx_coords.append([sent_coords[0][0], sent_coords[-1][1]])
+                     self.ids.append(torch.tensor(tokenized_seq))
+         if self.enable_warnings:
+             self.print_warnings(warnings_dict, len(queries))
+
+     def __len__(self):
+         return len(self.ids)
+
+     def print_warnings(self, warnings_dict, N):
+         n = len(warnings_dict["zero_len_query"])
+         info = " You can suppress Provence warnings by setting enable_warnings=False."
+         if n > 0:
+             ex = list(warnings_dict["zero_len_query"])[:10]
+             warnings.warn(
+                 f"{n} out of {N} queries have zero length, e.g. at indexes {ex}. "
+                 "These examples will be skipped in context pruning, "
+                 "their contexts will be kept as is." + info
+             )
+         n = len(warnings_dict["too_long_query"])
+         if n > 0:
+             ex = list(warnings_dict["too_long_query"])[:10]
+             warnings.warn(
+                 f"{n} out of {N} queries are too long for context length {self.max_len}, "
+                 f"e.g. at indexes {ex}. These examples will be skipped in context pruning, "
+                 "their contexts will be kept as is." + info
+             )
+         n = len(warnings_dict["unusually_long_query"])
+         if n > 0:
+             ex = list(warnings_dict["unusually_long_query"])[:10]
+             warnings.warn(
+                 f"{n} out of {N} queries are longer than {self.unusual_query_length} tokens, "
+                 f"e.g. at indexes {ex}. These examples will be processed as usual in context pruning, "
+                 "but the quality of context pruning could be reduced." + info
+             )
+         n = len(warnings_dict["unusually_long_title"])
+         if n > 0:
+             ex = list(warnings_dict["unusually_long_title"])[:10]
+             warnings.warn(
+                 f"{n} out of {N} titles are longer than {self.unusual_title_len} tokens, "
+                 f"e.g. at indexes {ex}. These examples will be processed as usual in context pruning, "
+                 "but the quality of context pruning could be reduced." + info
+             )
+         n = len(warnings_dict["split_context"])
+         if n > 0:
+             ex = list(warnings_dict["split_context"])[:10]
+             warnings.warn(
+                 f"{n} out of {N} contexts were split into several pieces for context pruning, "
+                 f"due to the limited context length of Provence, which is {self.max_len}. "
+                 "This could potentially reduce the quality of context pruning. "
+                 "You could consider checking and reducing the lengths of contexts, queries, or titles."
+                 + info
+             )
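For reference, a minimal sketch of calling Provence.process (the repository path "." is a placeholder for a local checkout; the nltk punkt download is assumed to be needed by sent_split_and_tokenize):

    import nltk
    from transformers import AutoModel

    nltk.download("punkt")  # sentence splitting data used by sent_split_and_tokenize
    model = AutoModel.from_pretrained(".", trust_remote_code=True)  # placeholder path
    queries = ["what is context pruning?"]
    contexts = [["Title. Context pruning removes irrelevant sentences. An unrelated sentence."]]
    pruned = model.process(contexts=contexts, queries=queries, threshold=0.1)
    print(pruned[0][0])  # pruned text for query 0, passage 0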
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128000": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "split_by_punct": false,
+   "tokenizer_class": "DebertaV2Tokenizer",
+   "unk_token": "[UNK]",
+   "vocab_type": "spm"
+ }
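These special-token definitions line up with the token ids hardcoded in TestDataset (pad_idx=0, cls_idx=1, sep_idx=2). A quick sanity check, assuming a local checkout of this repository:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")  # placeholder path
    assert tok.pad_token_id == 0
    assert tok.cls_token_id == 1
    assert tok.sep_token_id == 2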