New tokenizer
Changed files:
- tokenizer.json (+0 -0)
- tokens.py (+2 -2)

tokenizer.json
CHANGED
The diff for this file is too large to render; see the raw diff.
tokens.py
CHANGED

@@ -3,11 +3,11 @@ from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 
 # Load dataset
-dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
+dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train[:5000000]")
 
 # Instantiate tokenizer
 tokenizer = ByteLevelBPETokenizer()
-def batch_iterator(batch_size=
+def batch_iterator(batch_size=100_000):
     for i in range(0, len(dataset), batch_size):
         yield dataset["text"][i: i + batch_size]
 
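For context, a batch iterator like this is normally passed to the tokenizer's train_from_iterator method, which is how the updated tokenizer.json in this commit would be regenerated. A minimal sketch of that final step is shown below; the vocabulary size, special tokens, and output path are assumptions for illustration, not part of this commit.

# Sketch of how the updated script is typically completed (assumed values below).
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50_265,        # assumed; choose to match the target model
    min_frequency=2,          # assumed default-style setting
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],  # assumed
)
tokenizer.save("tokenizer.json")  # writes the single-file tokenizer, e.g. the tokenizer.json changed here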