from minbpe import RegexTokenizer

# Initialize the tokenizer
tokenizer = RegexTokenizer()

# Read text from a file
file_path = "/Users/mohammad.ibrahim/Desktop/TSAI/combined_text.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Train the tokenizer
tokenizer.train(text, 256 + 5)  # 256 byte tokens plus 5 merges, for a vocab size of 261

# Encode the text
encoded_text = tokenizer.encode(text)
print("Encoded:", encoded_text)

# Decode the text
decoded_text = tokenizer.decode(encoded_text)
print("Decoded:", decoded_text)

# Save the trained tokenizer model
tokenizer.save("first")  # Writes two files: first.model (for loading) and first.vocab (for human inspection)
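
# A minimal sketch of reloading the saved model in a later session
# (minbpe's Tokenizer.load reads the .model file written by save):
loaded_tokenizer = RegexTokenizer()
loaded_tokenizer.load("first.model")
print("Reloaded encode:", loaded_tokenizer.encode("hello world"))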