Spaces:
Running
on
Zero
Running
on
Zero
from abc import ABC | |
from abc import abstractmethod | |
class AbstractTokenizer(ABC):
    """Base interface shared by the tokenizer wrappers in this file.

    ``vocab_size``, ``vocab``, ``inv_vocab`` and ``tokenize`` are placeholders
    that return ``None`` here. ``detokenize`` and the special-token accessors
    raise ``NotImplementedError`` (mentioning ``self.name``) until a subclass
    overrides them.

    NOTE(review): none of the accessors is decorated in this class, so they
    are regular methods and the class can be instantiated directly -- confirm
    this is intended.
    """

    def __init__(self, name):
        """Store ``name``, which is interpolated into the error messages."""
        self.name = name
        super().__init__()

    def vocab_size(self):
        """Placeholder for the vocabulary size; returns ``None`` here."""
        return None

    def vocab(self):
        """Dictionary from vocab text token to id token."""
        return None

    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        return None

    def tokenize(self, text):
        """Placeholder for turning ``text`` into token ids; returns ``None``."""
        return None

    def detokenize(self, token_ids):
        """Turn ``token_ids`` back into text.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        message = 'detokenizer is not implemented for {} tokenizer'.format(
            self.name)
        raise NotImplementedError(message)

    def cls(self):
        """Id of the CLS token.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        message = 'CLS is not provided for {} tokenizer'.format(self.name)
        raise NotImplementedError(message)

    def sep(self):
        """Id of the SEP token.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        message = 'SEP is not provided for {} tokenizer'.format(self.name)
        raise NotImplementedError(message)

    def pad(self):
        """Id of the PAD token.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        message = 'PAD is not provided for {} tokenizer'.format(self.name)
        raise NotImplementedError(message)

    def eod(self):
        """Id of the EOD token.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        message = 'EOD is not provided for {} tokenizer'.format(self.name)
        raise NotImplementedError(message)

    def mask(self):
        """Id of the MASK token.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        message = 'MASK is not provided for {} tokenizer'.format(self.name)
        raise NotImplementedError(message)
class _SentencePieceTokenizer(AbstractTokenizer):
    """SentencePieceTokenizer-Megatron wrapper.

    The SentencePiece vocabulary is mirrored into ``_vocab`` (piece -> id) and
    ``_inv_vocab`` (id -> piece). Special tokens that the model does not
    already contain are appended with the next free id, and every special
    token is tracked in ``_special_tokens`` / ``_inv_special_tokens`` so that
    ``tokenize`` and ``detokenize`` can treat them atomically.
    """

    def __init__(self, model_file, vocab_extra_ids=0):
        """Load the SentencePiece model and register the special tokens.

        Args:
            model_file: Forwarded to ``sentencepiece.SentencePieceProcessor``
                as its ``model_file`` argument.
            vocab_extra_ids: Number of ``<extra_id_N>`` tokens to register.
        """
        name = 'SentencePieceTokenizer'
        super().__init__(name)

        # Imported here rather than at module level.
        import sentencepiece
        self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file)
        self._initalize(vocab_extra_ids)

    def _populate_vocab(self):
        """Copy every piece of the SentencePiece model into the lookup dicts."""
        self._vocab = {}
        self._inv_vocab = {}

        for i in range(len(self.tokenizer)):
            t = self.tokenizer.id_to_piece(i)
            self._inv_vocab[i] = t
            self._vocab[t] = i

    def _add_special_token(self, t):
        """Register ``t`` as a special token and return its id.

        A token missing from ``_vocab`` is appended with id ``len(_vocab)``;
        an existing one keeps its id and is only marked as special.
        """
        if t not in self._vocab:
            next_id = len(self._vocab)
            self._vocab[t] = next_id
            self._inv_vocab[next_id] = t
        token_id = self._vocab[t]
        self._special_tokens[t] = token_id
        self._inv_special_tokens[token_id] = t
        return token_id

    def _piece_or_default(self, piece_id, default):
        """Return the model's piece for ``piece_id``, or ``default`` on IndexError."""
        try:
            return self.tokenizer.id_to_piece(piece_id)
        except IndexError:
            return default

    # NOTE: the misspelt name is kept on purpose; subclasses override it.
    def _initalize(self, vocab_extra_ids):
        """Build the vocab tables and register all special tokens.

        Registration order determines the ids of newly appended tokens, so it
        must not be changed.
        """
        self._populate_vocab()
        self._special_tokens = {}
        self._inv_special_tokens = {}
        self._t5_tokens = []

        self._cls_id = self._add_special_token('<CLS>')
        self._sep_id = self._add_special_token('<SEP>')
        self._eod_id = self._add_special_token('<EOD>')
        self._mask_id = self._add_special_token('<MASK>')

        # PAD/BOS/EOS come from the model when it can name them; otherwise a
        # placeholder token is appended.
        pad_token = self._piece_or_default(self.tokenizer.pad_id(), '<PAD>')
        self._pad_id = self._add_special_token(pad_token)

        bos_token = self._piece_or_default(self.tokenizer.bos_id(), '<BOS>')
        self._bos_id = self._add_special_token(bos_token)

        eos_token = self._piece_or_default(self.tokenizer.eos_id(), '<EOS>')
        self._eos_id = self._add_special_token(eos_token)

        for i in range(vocab_extra_ids):
            t = "<extra_id_{}>".format(i)
            self._add_special_token(t)
            self._t5_tokens.append(t)

    def vocab_size(self):
        """Return the number of entries in ``_vocab`` (model pieces plus added tokens)."""
        return len(self._vocab)

    def vocab(self):
        """Dictionary from vocab text token to id token."""
        return self._vocab

    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        return self._inv_vocab

    def decoder(self):
        """Return the id -> token dictionary (same object as ``inv_vocab()``)."""
        return self._inv_vocab

    def encoder(self):
        """Return the token -> id dictionary (same object as ``vocab()``)."""
        return self._vocab

    # From:
    # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89
    def tokenize(self, text):
        """Encode ``text`` into ids, mapping special tokens to their own ids.

        The text is scanned left to right for the earliest occurrence of any
        registered special token. The span before it is encoded with
        SentencePiece, the special token contributes its single id, and
        scanning resumes right after it.

        Args:
            text: String to encode.

        Returns:
            List of token ids.
        """
        ids = []
        idx = 0

        while True:
            next_token = None
            next_idx = -1
            for token in self._special_tokens:
                # Search from ``idx`` in place instead of slicing a copy of
                # the tail for every special token.
                pos = text.find(token, idx)
                # Strict ``<`` keeps the first-registered token on ties.
                if pos != -1 and (next_token is None or pos < next_idx):
                    next_token = token
                    next_idx = pos
            if next_token is None:
                break

            ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx]))
            ids.append(self._special_tokens[next_token])
            idx = next_idx + len(next_token)

        ids.extend(self.tokenizer.encode_as_ids(text[idx:]))
        return ids

    # From:
    # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L125
    def detokenize(self, ids):
        """Decode ``ids`` back to text.

        Runs of ordinary ids are decoded with SentencePiece. Each special id
        is emitted as its token string, with a single space after the
        preceding decoded run and another after the token itself.

        Args:
            ids: Sliceable sequence of token ids.

        Returns:
            The decoded string.
        """
        parts = []
        last_i = 0

        for i, token_id in enumerate(ids):
            if token_id in self._inv_special_tokens:
                parts.append(self.tokenizer.decode_ids(ids[last_i:i]))
                parts.append(" ")
                parts.append(self._inv_special_tokens[token_id])
                parts.append(" ")
                last_i = i + 1

        parts.append(self.tokenizer.decode_ids(ids[last_i:]))
        return "".join(parts)

    def cls(self):
        """Return the id registered for ``<CLS>``."""
        return self._cls_id

    def sep(self):
        """Return the id registered for ``<SEP>``."""
        return self._sep_id

    def pad(self):
        """Return the id of the padding token."""
        return self._pad_id

    def bos_token_id(self):
        """Return the id of the beginning-of-sequence token."""
        return self._bos_id

    def bos(self):
        """Return the id of the beginning-of-sequence token."""
        return self._bos_id

    def eod(self):
        """Return the id registered for ``<EOD>``."""
        return self._eod_id

    def eos_token_id(self):
        """Return the id of the end-of-sequence token."""
        return self._eos_id

    def eos(self):
        """Return the id of the end-of-sequence token."""
        return self._eos_id

    def mask(self):
        """Return the id registered for ``<MASK>``."""
        return self._mask_id

    def additional_special_tokens_ids(self):
        """Return the ids of the ``<extra_id_N>`` tokens, in registration order.

        Reads ``_vocab`` directly: ``vocab`` is a method on this class, so
        subscripting it would raise ``TypeError``.
        """
        return [self._vocab[k] for k in self._t5_tokens]
class _MMSentencePieceTokenizer(_SentencePieceTokenizer):
    """SentencePiece wrapper that registers additional marker tokens.

    On top of the base special tokens it registers ``<SOA>``/``<EOA>``,
    ``<SOV>``/``<EOV>``, ``<SOI>``/``<EOI>``, ``<s_local>``/``<e_local>``,
    ``<s_global>``/``<e_global>`` and ``<stage_1>``/``<stage_2>``, and exposes
    one accessor per marker.
    """

    def __init__(self, model_file, vocab_extra_ids=0):
        """Delegate construction to the parent class unchanged."""
        super().__init__(model_file, vocab_extra_ids)

    def _initalize(self, vocab_extra_ids):
        """Build the vocab tables and register every special token.

        Tokens are registered in a fixed order because a token that is not
        already in the vocabulary receives the next free id.
        """
        self._populate_vocab()
        self._special_tokens = {}
        self._inv_special_tokens = {}
        self._t5_tokens = []

        def register(token):
            # Append unknown tokens at the end of the vocab, then mark as special.
            token_id = self._vocab.get(token)
            if token_id is None:
                token_id = len(self._vocab)
                self._vocab[token] = token_id
                self._inv_vocab[token_id] = token
            self._special_tokens[token] = token_id
            self._inv_special_tokens[token_id] = token
            return token_id

        def piece_or(piece_id, fallback):
            # Use the model's own piece when it has one for this id.
            try:
                return self.tokenizer.id_to_piece(piece_id)
            except IndexError:
                return fallback

        self._cls_id = register('<CLS>')
        self._sep_id = register('<SEP>')
        self._eod_id = register('<EOD>')
        self._mask_id = register('<MASK>')

        self._soa_id = register('<SOA>')
        self._eoa_id = register('<EOA>')
        self._sov_id = register('<SOV>')
        self._eov_id = register('<EOV>')
        self._soi_id = register('<SOI>')
        self._eoi_id = register('<EOI>')

        self._s_local_id = register('<s_local>')
        self._e_local_id = register('<e_local>')
        self._s_global_id = register('<s_global>')
        self._e_global_id = register('<e_global>')

        self._stage_1_id = register('<stage_1>')
        self._stage_2_id = register('<stage_2>')

        self._pad_id = register(piece_or(self.tokenizer.pad_id(), '<PAD>'))
        self._bos_id = register(piece_or(self.tokenizer.bos_id(), '<BOS>'))
        self._eos_id = register(piece_or(self.tokenizer.eos_id(), '<EOS>'))

        for n in range(vocab_extra_ids):
            extra = "<extra_id_{}>".format(n)
            register(extra)
            self._t5_tokens.append(extra)

    def soa(self):
        """Return the id registered for ``<SOA>``."""
        return self._soa_id

    def eoa(self):
        """Return the id registered for ``<EOA>``."""
        return self._eoa_id

    def sov(self):
        """Return the id registered for ``<SOV>``."""
        return self._sov_id

    def eov(self):
        """Return the id registered for ``<EOV>``."""
        return self._eov_id

    def soi(self):
        """Return the id registered for ``<SOI>``."""
        return self._soi_id

    def eoi(self):
        """Return the id registered for ``<EOI>``."""
        return self._eoi_id

    def s_local(self):
        """Return the id registered for ``<s_local>``."""
        return self._s_local_id

    def e_local(self):
        """Return the id registered for ``<e_local>``."""
        return self._e_local_id

    def s_global(self):
        """Return the id registered for ``<s_global>``."""
        return self._s_global_id

    def e_global(self):
        """Return the id registered for ``<e_global>``."""
        return self._e_global_id

    def stage_1(self):
        """Return the id registered for ``<stage_1>``."""
        return self._stage_1_id

    def stage_2(self):
        """Return the id registered for ``<stage_2>``."""
        return self._stage_2_id