from typing import Dict, List

from transformers import PreTrainedTokenizerFast


class NigerianLanguageTokenizer:
    """Thin wrapper around a Hugging Face fast tokenizer for batch tokenization."""

    def __init__(self, base_tokenizer: PreTrainedTokenizerFast):
        self.tokenizer = base_tokenizer

    def tokenize_batch(self, texts: List[str]) -> Dict:
        # Pad to the longest sequence in the batch, truncate at the model's
        # maximum length, and return PyTorch tensors.
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
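

# Usage sketch (not part of the original file): the checkpoint name below is
# an assumption for illustration; any fast tokenizer checkpoint would work.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    base = AutoTokenizer.from_pretrained("xlm-roberta-base")  # assumed checkpoint
    nl_tokenizer = NigerianLanguageTokenizer(base)
    batch = nl_tokenizer.tokenize_batch(["Bawo ni?", "Kedu ka ị mere?"])
    print(batch["input_ids"].shape)  # (batch_size, padded_seq_len) tensor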