Adding correct models 10k steps

Files changed:
- flax_model.msgpack +2 -2
- pytorch_model.bin +3 -0
- tokens.py +2 -2
flax_model.msgpack CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5ff31ebb2460dbc41a160cc755d0555bb8c84672563808b968a2a121c1b2414a
+size 711587941
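The tracked file is not the weights themselves but a git-lfs pointer (version, oid, size); the ~712 MB Flax checkpoint lives in LFS storage and is materialized with "git lfs pull". A minimal loading sketch with transformers, assuming the repo also ships a config.json; the local path "./" is a placeholder for a checkout of this repo:

from transformers import FlaxAutoModel

# Loads flax_model.msgpack from the current directory; the file must
# have been fetched by git-lfs first (a raw pointer file will fail).
model = FlaxAutoModel.from_pretrained("./")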
pytorch_model.bin ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4265b625a915f8a622926c9be27d6b1f3f2bc44481f81ab5d53eace54a0bc06
+size 1421780139
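This new pointer adds the PyTorch port (~1.4 GB) of the same 10k-step checkpoint. The commit does not say how it was produced; one common route in transformers is cross-loading the Flax weights with from_flax=True and re-saving, sketched here with the same placeholder local path:

from transformers import AutoModel

# Cross-load the Flax checkpoint into a PyTorch model, then write
# pytorch_model.bin alongside flax_model.msgpack.
model = AutoModel.from_pretrained("./", from_flax=True)
model.save_pretrained("./")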
tokens.py CHANGED

@@ -3,11 +3,11 @@ from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 
 # Load dataset
-dataset = load_dataset("oscar", "unshuffled_deduplicated_es")
+dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
 
 # Instantiate tokenizer
 tokenizer = ByteLevelBPETokenizer()
-def batch_iterator(batch_size=
+def batch_iterator(batch_size=1_000_000):
     for i in range(0, len(dataset), batch_size):
         yield dataset["text"][i: i + batch_size]
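Two fixes land here: passing split="train" makes load_dataset return a single Dataset rather than a DatasetDict, so len(dataset) and dataset["text"] inside batch_iterator actually work, and the batch_size default becomes 1,000,000 examples per yielded batch. The diff ends at the iterator, so the training step itself is not shown; a sketch of how such an iterator typically feeds ByteLevelBPETokenizer, where vocab_size, min_frequency, and the special-token list are assumptions rather than values from this commit:

# Train the byte-level BPE tokenizer on batches of OSCAR text.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50_265,  # assumed RoBERTa-style vocab size, not in the diff
    min_frequency=2,    # assumption, not in the diff
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Persist the trained tokenizer for later use.
tokenizer.save("tokenizer.json")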