Simple BPE 16K Tokenizer

This repository contains a Byte-Pair Encoding (BPE) tokenizer with a 16,384 token vocabulary.

  • Type: Byte-Pair Encoding (BPE)
  • Vocabulary Size: 16,384
  • Training Data: A sample from the EssentialAI/essential-web-v1.0 dataset.
  • Pre-tokenization: Regex-based (from the GPT-4 tokenizer).

This tokenizer was trained using a custom Python script and is saved as a Python pickle file (.pkl). Because it is a custom implementation, you must use the provided class code to load it, rather than transformers.AutoTokenizer.

Usage

To use this tokenizer, you need the Python code for the tokenizer class itself, and the saved state from the .pkl file.

Step 1: Install the prerequisites

pip install huggingface_hub regex

Step 2: Load and Use the Tokenizer

The SimpleBytePairEncoding class includes a handy from_hub static method. Copy the class definition below into your project, then use the method to load the tokenizer directly from the Hub.

# Copy the entire code block below into your Python script
# --------------------------------------------------------------------------

from __future__ import annotations
import collections
import regex
import pickle
from huggingface_hub import hf_hub_download

# Note: The bpe_train function is not needed for inference, only bpe_encode.
def bpe_encode(
    mergeable_ranks: dict[bytes, int], input: bytes, demo: bool = False
) -> list[int]:
    """Tokenize *input* by repeatedly applying the lowest-ranked BPE merge.

    Starts from individual bytes and, on each pass, merges the adjacent
    pair whose concatenation has the smallest rank in ``mergeable_ranks``,
    stopping when no adjacent pair is present in the table.  Returns the
    final token ids.  (``demo`` is accepted for API compatibility but
    unused here.)
    """
    pieces = [input[i : i + 1] for i in range(len(input))]
    while True:
        best_pos = None
        best_rank = None
        # Scan all adjacent pairs for the merge with the lowest rank.
        for pos in range(len(pieces) - 1):
            candidate = mergeable_ranks.get(pieces[pos] + pieces[pos + 1])
            if candidate is None:
                continue
            if best_rank is None or candidate < best_rank:
                best_pos, best_rank = pos, candidate
        if best_rank is None:
            break  # nothing left to merge
        assert best_pos is not None
        merged = pieces[best_pos] + pieces[best_pos + 1]
        pieces[best_pos : best_pos + 2] = [merged]
    return [mergeable_ranks[piece] for piece in pieces]

class SimpleBytePairEncoding:
    """Minimal BPE tokenizer: a regex pre-tokenizer plus a merge-rank table."""

    def __init__(self, *, pat_str: str, mergeable_ranks: dict[bytes, int]) -> None:
        """Creates an Encoding object from a pattern string and merge ranks."""
        self.pat_str = pat_str
        self.mergeable_ranks = mergeable_ranks
        # Inverse mapping (token id -> raw bytes), used when decoding.
        self._decoder = {rank: token for token, rank in mergeable_ranks.items()}
        self._pat = regex.compile(pat_str)

    def encode(self, text: str, demo: bool = False) -> list[int]:
        """Split *text* with the pre-tokenization regex, then BPE-encode each piece."""
        out: list[int] = []
        for piece in self._pat.findall(text):
            out += bpe_encode(self.mergeable_ranks, piece.encode("utf-8"), demo=demo)
        return out

    def decode_bytes(self, tokens: list[int]) -> bytes:
        """Concatenate the raw byte sequences behind each token id."""
        return b"".join(map(self._decoder.__getitem__, tokens))

    def decode(self, tokens: list[int]) -> str:
        """Decode ids to text; byte sequences that are invalid UTF-8 become U+FFFD."""
        raw = self.decode_bytes(tokens)
        return raw.decode("utf-8", errors="replace")

    def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
        """Return each token's raw bytes individually (handy for inspection)."""
        return [self._decoder[t] for t in tokens]

    @property
    def vocab_size(self) -> int:
        """Number of entries in the merge table, i.e. the vocabulary size."""
        return len(self.mergeable_ranks)

    @staticmethod
    def from_hub(repo_id: str, filename: str = "tokenizer.pkl"):
        """Download *filename* from *repo_id* on the Hugging Face Hub and load it.

        NOTE(security): ``pickle.load`` can execute arbitrary code; only load
        tokenizer files from repositories you trust.
        """
        path = hf_hub_download(repo_id=repo_id, filename=filename)
        with open(path, "rb") as fh:
            state = pickle.load(fh)
        return SimpleBytePairEncoding(
            pat_str=state["pat_str"], mergeable_ranks=state["mergeable_ranks"]
        )

# --------------------------------------------------------------------------


# --- Now, you can load and use the tokenizer ---
repo_id = "vukrosic/essential-web-16k-tokenizer"
file_name = "bpe_tokenizer_16k_n1000000.pkl" # The name of the .pkl file in the repo

# Load the tokenizer directly from the Hub
enc = SimpleBytePairEncoding.from_hub(repo_id, filename=file_name)

# --- Test the tokenizer ---
text = "Hello, world! This is a test of the 16K BPE tokenizer."
tokens = enc.encode(text)
decoded_text = enc.decode(tokens)

print(f"Vocabulary size: {enc.vocab_size:,}")
print(f"Original text: '{text}'")
print(f"Tokens: {tokens}")
print(f"Number of tokens: {len(tokens)}")
print(f"Decoded text: '{decoded_text}'")

assert text == decoded_text
print("โœ… Roundtrip successful!")
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support