import sys

import LangSegment

from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer

from text import symbols as symbols_v1
from text.chinese import (
    replace_consecutive_punctuation,
    replace_punctuation,
    replace_punctuation_with_en,
)
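# NOTE (assumptions): the `tn` package is expected to come from the
# WeTextProcessing toolkit (used by the "wenet" backend), `LangSegment`
# provides language-tagged text segmentation, and `text` is this repo's own
# rule-based frontend used by the "baidu" backend.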
|
|
# Supported text-normalization backends.
TN_MODULES = {'baidu', 'wenet'}

# Map CJK, full-width, and small-form punctuation (plus a few ASCII symbols)
# to the small ASCII punctuation set the downstream frontend expects.
PUNCT_NORMALIZE = {
    '，': ',', '。': '.', '、': ',', '；': ',', '‘': ',', '【': ',', '】': ',',
    '·': ',', '《': ',', '》': ',', '？': '?', '：': ',', '“': ',', '”': ',',
    '！': '!', '…': ',', '）': ',', '（': ',', '〃': ',', '〈': ',', '〉': ',',
    '「': ',', '」': ',', '『': ',', '』': ',', '〖': ',', '〔': ',', '〕': ',',
    '〗': ',', '〞': ',', '〝': ',', '﹚': ',', '﹙': ',', '﹛': ',', '﹜': ',',
    '﹝': ',', '﹞': ',', '﹗': '!', '＂': ',', '＇': ',', '︐': ',', '︑': ',',
    '︒': ',', '︔': ',', '︓': ',', '︕': '!', '︖': '?', '︗': ',', '︘': ',',
    '︙': ',', '︰': ',', '︱': ',', '︳': ',', '︵': ',', '︶': ',', '︷': ',',
    '︸': ',', '︹': ',', '︺': ',', '︻': ',', '︼': ',', '︽': ',', '︾': ',',
    '︿': ',', '﹀': ',', '﹁': ',', '﹂': ',', '﹃': ',', '﹄': ',', ';': ',',
    '[': ',', ']': ',', '`': ',', ':': ',', '"': ',', '{': ',', '}': ',',
    '~': ',', ')': ',', '(': ',', '_': '"', '’': "'", '^': ','}

# Map full-width Latin letters (ａ-ｚ, Ａ-Ｚ) to their ASCII equivalents.
ALPHABET_NORM = {'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
                 'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
                 'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
                 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
                 'ｙ': 'y', 'ｚ': 'z', 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D',
                 'Ｅ': 'E', 'Ｆ': 'F', 'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J',
                 'Ｋ': 'K', 'Ｌ': 'L', 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P',
                 'Ｑ': 'Q', 'Ｒ': 'R', 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V',
                 'Ｗ': 'W', 'Ｘ': 'X', 'Ｙ': 'Y', 'Ｚ': 'Z'}
|
|
def punct_normalization(sent):
    """Replace every punctuation mark found in PUNCT_NORMALIZE with its
    ASCII counterpart; all other characters pass through unchanged."""
    return ''.join(PUNCT_NORMALIZE.get(ch, ch) for ch in sent)
|
|
def alphabet_normalization(sent):
    """Replace full-width Latin letters with their ASCII equivalents; all
    other characters pass through unchanged."""
    return ''.join(ALPHABET_NORM.get(ch, ch) for ch in sent)
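# Quick sanity check (hypothetical inputs, assuming the tables above):
#   punct_normalization('你好。世界！')   -> '你好.世界!'
#   alphabet_normalization('ＴＴＳ demo') -> 'TTS demo'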
|
|
class MultilingualTN:
    """Multilingual text normalizer with two selectable backends: "wenet"
    (WeTextProcessing normalizers) or "baidu" (this repo's rule-based
    text.chinese / text.english modules)."""

    def __init__(self, module="wenet", remove_interjections=False, remove_erhua=True):
        self.tn_module = module
        self.language_module_map = {"zh": "chinese", "en": "english"}
        self.tn_implements = dict()
        if self.tn_module in TN_MODULES:
            if self.tn_module == "baidu":
                # Import the rule-based frontend module for each language,
                # e.g. text.chinese and text.english.
                for lang, mod in self.language_module_map.items():
                    self.tn_implements[lang] = __import__("text." + mod, fromlist=[mod])
            else:
                # "wenet": build one WeTextProcessing normalizer per language.
                for lang in self.language_module_map:
                    if lang == "en":
                        self.tn_implements[lang] = EnNormalizer(overwrite_cache=True)
                    else:
                        self.tn_implements[lang] = ZhNormalizer(
                            remove_erhua=remove_erhua,
                            remove_interjections=remove_interjections,
                            overwrite_cache=True)
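    # Construction sketch (assuming WeTextProcessing's cached FSTs are
    # available): MultilingualTN() uses the wenet backend, while
    # MultilingualTN(module="baidu") reuses the rule-based text.* modules.
    # Any other `module` value leaves tn_implements empty, so _do_tn would
    # raise a KeyError; callers should stick to TN_MODULES.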
|
|
    def _do_tn(self, text, language="zh"):
        # Unknown language codes fall back to the Chinese normalizer.
        if language in self.language_module_map:
            module = self.tn_implements[language]
        else:
            module = self.tn_implements["zh"]

        if self.tn_module == "baidu" and hasattr(module, "text_normalize"):
            norm_text = module.text_normalize(text)
        elif self.tn_module == "wenet" and hasattr(module, "normalize"):
            norm_text = module.normalize(text)
        else:
            norm_text = text

        if language == "zh":
            norm_text = replace_punctuation_with_en(norm_text)
            norm_text = replace_consecutive_punctuation(norm_text)
        # Collapse runs of spaces introduced by normalization.
        while "  " in norm_text:
            norm_text = norm_text.replace("  ", " ")
        return norm_text
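    # Illustrative behaviour only (actual output depends on the installed
    # normalizer version): with the wenet backend, _do_tn("售价100元", "zh")
    # is expected to verbalize the digits, roughly "售价一百元".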
|
|
    def normalize_segment(self, text, language, normalize_punct=False):
        """Normalize a single-language segment without language splitting."""
        if normalize_punct:
            text = punct_normalization(text)
        text = alphabet_normalization(text)
        text = text.lower()
        return self._do_tn(text, language)
|
|
    def normalize(self, text, language, normalize_punct=False):
        """Normalize `text`, splitting it into per-language segments first.

        With language="auto" each segment keeps the language detected by
        LangSegment; otherwise detected English segments stay English and
        every other segment is forced to the caller's `language`."""
        if normalize_punct:
            text = punct_normalization(text)
        text = alphabet_normalization(text)
        text = text.lower()

        # Split the input into language-tagged segments.
        textlist = []
        langlist = []
        LangSegment.setfilters(["zh", "ja", "en", "ko"])
        if language == "auto":
            for tmp in LangSegment.getTexts(text):
                langlist.append(tmp["lang"])
                textlist.append(tmp["text"])
        else:
            for tmp in LangSegment.getTexts(text):
                if tmp["lang"] == "en":
                    langlist.append(tmp["lang"])
                else:
                    # Trust the caller-specified language for non-English segments.
                    langlist.append(language)
                textlist.append(tmp["text"])

        # Normalize each segment with its language-specific backend and rejoin.
        norm_text_list = []
        for segment, lang in zip(textlist, langlist):
            norm_text_list.append(self._do_tn(segment, lang))
        return ''.join(norm_text_list)
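    # Usage sketch (assumes the LangSegment and WeTextProcessing packages are
    # installed and their models/FSTs are available):
    #   tn = MultilingualTN(module="wenet")
    #   tn.normalize("我有2个苹果 and 3 oranges.", language="auto",
    #                normalize_punct=True)
    # Each detected segment ("我有2个苹果" as zh, "and 3 oranges." as en) is
    # normalized by its own backend and the results are concatenated.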
|
|
if __name__ == '__main__':
    '''
    Interactive test: type one sentence per line; enter "exit()" to quit.
    '''
    language = 'zh'
    TN = MultilingualTN()
    sys.stderr.write("Input: ")
    for line in sys.stdin:
        if line.strip() == "exit()":
            sys.exit(0)
        if len(line.strip()) <= 0:
            sys.stderr.write("Input: ")
            continue
        sys.stdout.write("{}\n".format(TN.normalize(
            line.strip(), language=language, normalize_punct=True)))
        sys.stderr.write("Input: ")