Adding correct models 10k steps

- flax_model.msgpack +2 -2
- pytorch_model.bin +3 -0
- tokens.py +2 -2

flax_model.msgpack
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5ff31ebb2460dbc41a160cc755d0555bb8c84672563808b968a2a121c1b2414a
+size 711587941
    	
pytorch_model.bin
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4265b625a915f8a622926c9be27d6b1f3f2bc44481f81ab5d53eace54a0bc06
+size 1421780139
    	
tokens.py
CHANGED

@@ -3,11 +3,11 @@ from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 
 # Load dataset
-dataset = load_dataset("oscar", "unshuffled_deduplicated_es")
+dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
 
 # Instantiate tokenizer
 tokenizer = ByteLevelBPETokenizer()
-def batch_iterator(batch_size=
+def batch_iterator(batch_size=1_000_000):
     for i in range(0, len(dataset), batch_size):
         yield dataset["text"][i: i + batch_size]
 
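The batch_iterator generator in tokens.py is the kind of iterable that the tokenizers library's train_from_iterator consumes. A minimal sketch of how the rest of such a script would typically train and save the tokenizer; the vocab size, special tokens, and output path below are assumptions, not part of this diff:

# Hypothetical continuation of tokens.py: train the ByteLevelBPE tokenizer on
# the OSCAR split via batch_iterator. Vocab size, special tokens, and the
# output path are assumptions.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50_265,  # assumed RoBERTa-style vocabulary size
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Persist the trained tokenizer as a single tokenizer.json file.
tokenizer.save("./tokenizer.json")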

