# Simple BPE 16K Tokenizer
This repository contains a Byte-Pair Encoding (BPE) tokenizer with a 16,384 token vocabulary.
- Type: Byte-Pair Encoding (BPE)
- Vocabulary Size: 16,384
- Training Data: A sample from the EssentialAI/essential-web-v1.0 dataset.
- Pre-tokenization: Regex-based (from the GPT-4 tokenizer).
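If you are curious what that pattern looks like, the GPT-4 (cl100k_base) split regex published in tiktoken is reproduced below for reference. Treat it as background only: the exact `pat_str` used by this tokenizer is stored inside the pickle and loaded for you.

```python
# GPT-4 (cl100k_base) split pattern as published in tiktoken, for reference.
# The authoritative pattern for this tokenizer is the pat_str inside the .pkl.
# Note the possessive quantifiers (?+, ++), which need the `regex` module, not `re`.
GPT4_SPLIT_PATTERN = (
    r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}"""
    r"""| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
)
```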
This tokenizer was trained using a custom Python script and is saved as a Python pickle file (`.pkl`). Because it is a custom implementation, you must use the provided class code to load it, rather than `transformers.AutoTokenizer`.
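On disk, the pickle holds a plain dictionary with two entries. The key names in the sketch below are taken from the `from_hub` loader shown in the Usage section; the concrete values are only illustrative placeholders.

```python
# Sketch of the saved tokenizer state. The two keys match what
# SimpleBytePairEncoding.from_hub (below) reads; the values shown
# here are illustrative placeholders, not the real contents.
tokenizer_data = {
    "pat_str": r"'(?i:[sdmt]|ll|ve|re)|\p{L}+|\p{N}{1,3}",  # pre-tokenization regex (illustrative)
    "mergeable_ranks": {b"a": 97, b"th": 256},  # token bytes -> id; 16,384 entries in the real file
}
```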
## Usage
To use this tokenizer, you need the Python code for the tokenizer class itself and the saved state from the `.pkl` file.
### Step 1: Install Prerequisites
```bash
pip install huggingface_hub regex
```
### Step 2: Load and Use the Tokenizer
The `SimpleBytePairEncoding` class includes a handy `from_hub` static method. Copy the class definition below into your project, then use the method to load the tokenizer directly from the Hub.
```python
# Copy the entire code block below into your Python script
# --------------------------------------------------------------------------
from __future__ import annotations

import pickle

import regex
from huggingface_hub import hf_hub_download


# Note: The bpe_train function is not needed for inference, only bpe_encode.
def bpe_encode(
    mergeable_ranks: dict[bytes, int], input: bytes, demo: bool = False
) -> list[int]:
    """Encode raw bytes by greedily merging the lowest-ranked adjacent pair."""
    parts = [bytes([b]) for b in input]
    while True:
        # Find the adjacent pair with the lowest (best) merge rank.
        min_idx, min_rank = None, None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx, min_rank = i, rank
        if min_rank is None:
            break  # no more merges apply
        assert min_idx is not None
        # Merge the winning pair into a single token.
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :]
    return [mergeable_ranks[part] for part in parts]


class SimpleBytePairEncoding:
    def __init__(self, *, pat_str: str, mergeable_ranks: dict[bytes, int]) -> None:
        """Creates an Encoding object."""
        self.pat_str = pat_str
        self.mergeable_ranks = mergeable_ranks
        self._decoder = {token: token_bytes for token_bytes, token in mergeable_ranks.items()}
        self._pat = regex.compile(pat_str)

    def encode(self, text: str, demo: bool = False) -> list[int]:
        """Split text with the pre-tokenization regex, then BPE-encode each piece."""
        words = self._pat.findall(text)
        tokens = []
        for word in words:
            word_bytes = word.encode("utf-8")
            word_tokens = bpe_encode(self.mergeable_ranks, word_bytes, demo=demo)
            tokens.extend(word_tokens)
        return tokens

    def decode_bytes(self, tokens: list[int]) -> bytes:
        return b"".join(self._decoder[token] for token in tokens)

    def decode(self, tokens: list[int]) -> str:
        return self.decode_bytes(tokens).decode("utf-8", errors="replace")

    def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
        return [self._decoder[token] for token in tokens]

    @property
    def vocab_size(self) -> int:
        """Return the vocabulary size."""
        return len(self.mergeable_ranks)

    @staticmethod
    def from_hub(repo_id: str, filename: str = "tokenizer.pkl"):
        """Loads the tokenizer from the Hugging Face Hub."""
        local_path = hf_hub_download(repo_id=repo_id, filename=filename)
        with open(local_path, "rb") as f:
            tokenizer_data = pickle.load(f)
        return SimpleBytePairEncoding(
            pat_str=tokenizer_data["pat_str"],
            mergeable_ranks=tokenizer_data["mergeable_ranks"],
        )
# --------------------------------------------------------------------------


# --- Now, you can load and use the tokenizer ---
repo_id = "vukrosic/essential-web-16k-tokenizer"
file_name = "bpe_tokenizer_16k_n1000000.pkl"  # The name of the .pkl file in the repo

# Load the tokenizer directly from the Hub
enc = SimpleBytePairEncoding.from_hub(repo_id, filename=file_name)

# --- Test the tokenizer ---
text = "Hello, world! This is a test of the 16K BPE tokenizer."
tokens = enc.encode(text)
decoded_text = enc.decode(tokens)

print(f"Vocabulary size: {enc.vocab_size:,}")
print(f"Original text: '{text}'")
print(f"Tokens: {tokens}")
print(f"Number of tokens: {len(tokens)}")
print(f"Decoded text: '{decoded_text}'")

assert text == decoded_text
print("✅ Roundtrip successful!")
```
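Beyond the roundtrip check, the `decode_tokens_bytes` method defined above is handy for inspecting how the text was segmented. A minimal sketch, continuing from the script above; the exact boundaries depend on the learned merges:

```python
# Print each token id alongside the raw bytes it decodes to.
# Exact segmentation depends on the learned merges.
for token_id, token_bytes in zip(tokens, enc.decode_tokens_bytes(tokens)):
    print(token_id, token_bytes)
```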