fixes.
- .gitignore +1 -0
- README.md +5 -2
- test.py +1 -0
.gitignore
CHANGED
@@ -1,3 +1,4 @@
 ckpt/
 *.tar.gz
 *.swp
+pya0
README.md
CHANGED
@@ -26,8 +26,11 @@ Download your tokenizer, model checkpoints, and optionally the training logs (`e
 
 Optionally, test model using the MLM task:
 ```sh
-pip install pya0
-
+pip install pya0 # for math token preprocessing
+# testing local checkpoints:
+python test.py ./ckpt/math-tokenizer ./ckpt/2-2-0/encoder.ckpt
+# testing Model Hub checkpoints:
+python test.py approach0/coco-mae-220 approach0/coco-mae-220
 ```
 > **Note**
 > Modify the test examples in `test.txt` to play with it.
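For context, the two new `python test.py ...` invocations pass the same positional pair that `test(tokenizer_name_or_path, model_name_or_path, ...)` accepts: a tokenizer path and a checkpoint path, which for the Model Hub case are one and the same repo id. A minimal sketch of the equivalent direct usage, assuming `approach0/coco-mae-220` is a BERT-style masked-LM checkpoint loadable through Hugging Face `transformers` (an assumption; test.py may wrap this differently):

```python
# Hypothetical sketch -- not the repository's actual test.py logic.
# Assumes approach0/coco-mae-220 is a BERT-style MLM checkpoint (assumption).
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('approach0/coco-mae-220')
model = AutoModelForMaskedLM.from_pretrained('approach0/coco-mae-220')

sentence = 'the quadratic formula solves [MASK] equations'
inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    logits = model(**inputs).logits

# Report the top prediction at each [MASK] position.
mask_positions = (inputs['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
for pos in mask_positions:
    top_id = logits[0, pos].argmax().item()
    print(pos.item(), tokenizer.decode([top_id]))
```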
test.py
CHANGED
@@ -43,6 +43,7 @@ def test(tokenizer_name_or_path, model_name_or_path, test_file='test.txt'):
     for pos in filter(lambda x: x!=0, maskpos):
         tokens[pos-1] = '[MASK]'
     sentence = ' '.join(tokens)
+    sentence = sentence.replace('[mask]', '[MASK]')
     tokens = tokenizer(sentence,
         padding=True, truncation=True, return_tensors="pt")
     #print(tokenizer.decode(tokens['input_ids'][0]))
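The added `replace('[mask]', '[MASK]')` guards against lowercased mask markers in the test sentences: tokenizers match special tokens case-sensitively, so a lowercase `[mask]` is split into ordinary word pieces instead of being mapped to the mask token. A minimal sketch of the failure mode, using `bert-base-uncased` as a stand-in tokenizer (an assumption; the repository ships its own `math-tokenizer`):

```python
# Sketch of why the normalization matters; bert-base-uncased is a stand-in
# for the repository's math-tokenizer (assumption).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

lowered = 'the integral of [mask] dx'
normalized = lowered.replace('[mask]', '[MASK]')  # the fix applied in test.py

# Lowercase variant is split into plain word pieces, e.g. '[', 'mask', ']'
print(tokenizer.tokenize(lowered))
# Normalized variant keeps the special token '[MASK]'
print(tokenizer.tokenize(normalized))
```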