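# Train a 131,072-entry byte-level BPE tokenizer on the combined base and instruct
# datasets, then export it as a Hugging Face fast tokenizer with a ChatML-style
# chat template.
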
import os
import shutil

from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

from utils import batch_dataset_iterator
from core_base_datasets import core_base_datasets
from core_instruct_datasets import core_instruct_datasets
|
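# Rebuild the tokenizer output directory from scratch so artifacts from earlier runs do not linger.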
tokenizer_path = '../tokenizer'

if os.path.exists(tokenizer_path):
    shutil.rmtree(tokenizer_path)

os.makedirs(tokenizer_path, exist_ok=True)
|
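# Core control tokens: BOS reuses '<|endoftext|>', EOS is the ChatML end-of-turn marker '<|im_end|>'.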
bos_token = '<|endoftext|>'
eos_token = '<|im_end|>'
pad_token = '<|pad|>'
unk_token = '<|unk|>'
|
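# ChatML turn markers, role names, tool-calling tags, and reasoning (<think>) tags
# are all registered as dedicated special tokens so they are never split by BPE.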
special_tokens = [
    bos_token,
    eos_token,
    pad_token,
    unk_token,
    '<|im_start|>',
    '<|im_sep|>',
    'system',
    'user',
    'assistant',
    '<tools>',
    '</tools>',
    '<tool>',
    '</tool>',
    '<tool_call>',
    '</tool_call>',
    '<tool_response>',
    '</tool_response>',
    '<think>',
    '</think>',
]
|
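# Pad the special-token list out to 64 entries with reserved placeholders.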
for i in range(64 - len(special_tokens)):
    special_tokens.append(f'<|reserved_{i}|>')
|
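# Byte-level BPE model: no unk token is defined, and byte_fallback lets stray symbols
# fall back to raw byte tokens when such tokens exist in the vocabulary.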
bpe = BPE(unk_token=None, byte_fallback=True)
tokenizer = Tokenizer(bpe)
|
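# No normalization: text reaches the model byte-for-byte (no lowercasing or unicode normalization).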
tokenizer.normalizer = None
|
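# GPT-2-style byte-level pre-tokenization without forcing a prefix space onto the input.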
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
|
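# Matching byte-level post-processor; offsets are left untrimmed (trim_offsets=False).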
tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)
|
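# Byte-level decoder so token ids decode back to the original text.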
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
|
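# Trainer settings: a 131,072-entry (2**17) vocabulary, pairs merged only when seen
# at least 3 times, and merged tokens capped at 16 characters.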
trainer = BpeTrainer(
    vocab_size=131072,
    min_frequency=3,
    special_tokens=special_tokens,
    max_token_length=16,
)
|
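# The tokenizer is trained on the combined base and instruct dataset lists.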
tokenizer_datasets = core_base_datasets + core_instruct_datasets
|
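# Stream every dataset through batch_dataset_iterator and train on the combined batches.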
tokenizer.train_from_iterator(
    (batch_dataset_iterator(n) for n in tokenizer_datasets),
    trainer,
)
|
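# Save the raw `tokenizers` artifacts: tokenizer.json plus the BPE model files (vocab.json / merges.txt).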
tokenizer.save(os.path.join(tokenizer_path, 'tokenizer.json'))
tokenizer.model.save(tokenizer_path)
|
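# Minimal ChatML-style chat template: each message is rendered as
# '<|im_start|>{role}<|im_sep|>{content}<|im_end|>', with a trailing assistant header
# when add_generation_prompt is set.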
CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '<|im_sep|>' + message['content'] + '<|im_end|>'}}"
    "{% endfor %}"

    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant<|im_sep|>' }}"
    "{% endif %}"
)
|
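# Wrap the trained tokenizer in a PreTrainedTokenizerFast so the chat template and
# special-token mapping ship together with the saved tokenizer files.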
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    chat_template=CHAT_TEMPLATE,
    bos_token=bos_token,
    eos_token=eos_token,
    pad_token=pad_token,
    unk_token=unk_token,
    clean_up_tokenization_spaces=False,
)
|
fast_tokenizer.save_pretrained(tokenizer_path)
|
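# Optional sanity check (an addition, not part of the original training flow): render a
# tiny conversation through the chat template and round-trip it through the freshly
# trained tokenizer to eyeball how the special tokens are handled.
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Hello!'},
]

rendered = fast_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(rendered)
print(fast_tokenizer.decode(fast_tokenizer(rendered)['input_ids']))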