saicharan2804 committed on
Commit
d235aee
·
1 Parent(s): f23bcf0
Files changed (2) hide show
  1. SmilesPeTokenizer.py +8 -12
  2. app.py +1 -1
SmilesPeTokenizer.py CHANGED
@@ -1,16 +1,12 @@
1
- from tokenizers import Tokenizer
 
2
 
3
- def bpe_tokenizer(smiles_string):
4
- # Load the tokenizer from the saved file
5
- tokenizer = Tokenizer.from_file("chembl_bpe_tokenizer.json")
6
 
7
- # Tokenize the SMILES string
8
- encoded_output = tokenizer.encode(smiles_string)
9
 
10
- # To get the tokenized output as text
11
- tokens_text = encoded_output.tokens
 
12
 
13
- # To get the corresponding token IDs
14
- token_ids = encoded_output.ids
15
-
16
- return tokens_text, token_ids
 
1
+ import codecs
2
+ from SmilesPE.tokenizer import *
3
 
4
+ def smilespe_tokenizer(smiles_string):
 
 
5
 
6
+ spe_vob = codecs.open('chembl_smiles_tokenizer30000.txt')
7
+ spe = SPE_Tokenizer(spe_vob)
8
 
9
+ tokenized = spe.tokenize(smiles_string)
10
+
11
+ return tokenized
12
 
 
 
 
 
app.py CHANGED
@@ -6,7 +6,7 @@ iface = gr.Interface(
6
  inputs=[
7
  gr.Textbox(label="SMILES"),
8
  ],
9
- outputs=["text", "text"]
10
  )
11
 
12
  iface.launch()
 
6
  inputs=[
7
  gr.Textbox(label="SMILES"),
8
  ],
9
+ outputs="text"
10
  )
11
 
12
  iface.launch()