fxtentacle committed on
Commit c2cd532 · 1 Parent(s): 0788726

Upload text_tokenizer.py

Files changed (1)
  1. text_tokenizer.py +51 -0
text_tokenizer.py ADDED
@@ -0,0 +1,51 @@
+ from typing import Any, Dict, List, Optional, Union
+ import json
+
+ class HajoTextTokenizer:
+     def __init__(self, config_file: str):
+         with open(config_file,'rt') as f:
+             self.all_tokens = json.load(f)
+         self.unk = 1000 + len(self.all_tokens)-1
+         self.all_tokens[self.unk-1000] = '?'
+         self.valid_tokens = self.all_tokens[:-1]
+
+     def encode(self, sentence):
+         sentence = sentence.replace('ß','ss').replace('-',' ').replace('  ',' ').replace('  ',' ').lower()
+         sentence = list(sentence)
+         for tokid,tok in enumerate(self.valid_tokens):
+             tlen = len(tok)
+             ltok = list(tok)
+             for off in range(len(sentence)-tlen+1):
+                 # print(sentence[off:off+tlen], ltok)
+                 if sentence[off:off+tlen] == ltok:
+                     prefix = sentence[:off]
+                     suffix = sentence[off+tlen:]
+                     # print('MATCH', [prefix, tok, suffix])
+                     #print('MATCH', tok)
+                     sentence = prefix + [1000+tokid] + suffix
+                     #break
+         out = []
+         last_id = 0
+         for t in sentence:
+             if isinstance(t, str):
+                 t = self.unk
+             if t == last_id:
+                 if t == self.unk:
+                     continue
+                 out.append(0)
+             last_id = t
+             out.append(t-1000)
+         return out
+
+     def decode(self, label_ids):
+         out = ''
+         last_id = 0
+         for i in label_ids:
+             if i == 0 or i == -100:
+                 last_id = i
+                 continue
+             if i == 1: break
+             if i != last_id:
+                 out += self.all_tokens[i]
+                 last_id = i
+         return out
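
For reference, a minimal usage sketch of the uploaded class. The token list, the placeholder entries ("<blank>", "<stop>", "<unk>"), and the file name tokens_example.json are made up for illustration; the actual token file shipped with the model will differ. Two behaviours worth noting from the code: __init__ overwrites the last entry with '?' and uses it for unknown characters, and encode() substitutes tokens greedily in list order, so multi-character tokens only ever match if they appear in the list before their constituent single characters.

import json
from text_tokenizer import HajoTextTokenizer  # assuming the file is importable as a module

# Hypothetical toy token list: index 0 acts as the blank that encode() inserts
# between repeated tokens, index 1 is treated as a stop marker by decode(),
# and the last slot is overwritten with '?' for unknown characters.
tokens = ["<blank>", "<stop>", "sch", "ein", " ", "c", "e", "h", "i", "n", "s", "t", "<unk>"]
with open("tokens_example.json", "wt") as f:
    json.dump(tokens, f)

tok = HajoTextTokenizer("tokens_example.json")
ids = tok.encode("ein schnitt")
print(ids)              # [3, 4, 2, 9, 8, 11, 0, 11] with this toy list: ein, " ", sch, n, i, t, blank, t
print(tok.decode(ids))  # "ein schnitt"

Because matching is plain positional substring substitution, the ordering of the token list effectively encodes the merge priority.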
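decode() also tolerates training-style label sequences: ids 0 and -100 are skipped (presumably because -100 is the usual ignore index for padded labels), an id of 1 stops decoding, and consecutive identical ids are collapsed unless separated by a 0. A small illustration, continuing the hypothetical setup above:

# Padded / terminated label sequence: everything from the 1 onward is ignored,
# and the 0 between the two 11s keeps the repeated "t" from being collapsed.
labels = [3, 4, 2, 9, 8, 11, 0, 11, 1, -100, -100]
print(tok.decode(labels))   # "ein schnitt"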