# -*- coding: utf-8 -*-
# This project combines the TN and G2P functions of https://github.com/RVC-Boss/GPT-SoVITS and https://github.com/wenet-e2e/WeTextProcessing
# Huawei Technologies Co., Ltd. (authors: Xiao Chen)
import sys

import LangSegment
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer

from text.chinese import replace_consecutive_punctuation, replace_punctuation_with_en

TN_MODULES = {'baidu', 'wenet'}
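# CJK and fullwidth punctuation mapped onto a small ASCII set; most marks
# collapse to ',' while sentence-final marks keep their ASCII equivalents.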
PUNCT_NORMALIZE = {',': ',', '。': '.', '、': ',', ';': ',', '‘': ',', '【': ',', '】': ',', '·': ',', '《': ',', '》': ',', '?': '?',
':': ',', '“': ',', '”': ',', '!': '!', '…': ',', ')': ',', '(': ',', '〃': ',', '〈': ',', '〉': ',',
'「': ',', '」': ',', '『': ',', '』': ',', '〖': ',', '〔': ',', '〕': ',', '〗': ',', '〞': ',', '〝': ',', '﹚': ',',
'﹙': ',', '﹛': ',', '﹜': ',', '﹝': ',', '﹞': ',', '!': '!', '"': ',', ''': ',', '︐': ',', '︑': ',', '︒': ',',
'︔': ',', '︓': ',', '︕': '!', '︖': '?', '︗': ',', '︘': ',', '︙': ',', '︰': ',', '︱': ',', '︳': ',', '︵': ',',
'︶': ',', '︷': ',', '︸': ',', '︹': ',', '︺': ',', '︻': ',', '︼': ',', '︽': ',', '︾': ',', '︿': ',', '﹀': ',',
'﹁': ',', '﹂': ',', '﹃': ',', '﹄': ',', ';': ',', '[': ',', ']': ',', '`': ',', ':': ',', '"': ',',
'{': ',', '}': ',', '~': ',', ')': ',', '(': ',', '_': '"', '’': '\'', '^': ','}
# Fullwidth Latin letters (U+FF21-U+FF3A, U+FF41-U+FF5A) mapped to their
# ASCII equivalents, e.g. 'ABC' -> 'ABC', applied before lowercasing.
ALPHABET_NORM = {chr(0xFF21 + i): chr(ord('A') + i) for i in range(26)}
ALPHABET_NORM.update({chr(0xFF41 + i): chr(ord('a') + i) for i in range(26)})

def punct_normalization(sent):
    """Replace CJK/fullwidth punctuation with its ASCII equivalent."""
    return ''.join(PUNCT_NORMALIZE.get(ch, ch) for ch in sent)


def alphabet_normalization(sent):
    """Replace fullwidth Latin letters with their ASCII equivalents."""
    return ''.join(ALPHABET_NORM.get(ch, ch) for ch in sent)
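
# Examples (per the mapping tables above):
#   punct_normalization('你好!「世界」')  ->  '你好!,世界,'
#   alphabet_normalization('ABCdef')  ->  'ABCdef'
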
class MultilingualTN:
    """Multilingual text normalization front-end.

    module="wenet" uses WeTextProcessing's rule-based normalizers;
    module="baidu" falls back to the text_normalize() functions in the
    local text.chinese / text.english modules.
    """

    def __init__(self, module="wenet", remove_interjections=False, remove_erhua=True):
        self.tn_module = module
        self.language_module_map = {"zh": "chinese", "en": "english"}
        self.tn_implements = dict()
        if self.tn_module in TN_MODULES:
            if self.tn_module == "baidu":
                for lang, mod in self.language_module_map.items():
                    self.tn_implements[lang] = __import__("text." + mod, fromlist=[mod])
            else:
                for lang in self.language_module_map:
                    if lang == "en":
                        self.tn_implements[lang] = EnNormalizer(overwrite_cache=True)
                    else:
                        self.tn_implements[lang] = ZhNormalizer(
                            remove_erhua=remove_erhua,
                            remove_interjections=remove_interjections,
                            overwrite_cache=True)
    def _do_tn(self, text, language="zh"):
        """Normalize one single-language segment with the selected backend."""
        if language in self.language_module_map:
            module = self.tn_implements[language]
        else:
            # Unknown languages fall back to the Chinese normalizer.
            module = self.tn_implements["zh"]
        if self.tn_module == "baidu" and hasattr(module, "text_normalize"):
            norm_text = module.text_normalize(text)
        elif self.tn_module == "wenet" and hasattr(module, "normalize"):
            norm_text = module.normalize(text)
        else:
            norm_text = text
        if language == "zh":
            norm_text = replace_punctuation_with_en(norm_text)
            norm_text = replace_consecutive_punctuation(norm_text)
        # Collapse runs of spaces left behind by normalization.
        while "  " in norm_text:
            norm_text = norm_text.replace("  ", " ")
        return norm_text
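
    # Illustrative only; exact output depends on the installed WeTextProcessing
    # rule version. With module="wenet" one would expect roughly:
    #   _do_tn("1983年2月", "zh")  ->  "一九八三年二月"
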
    def normalize_segment(self, text, language, normalize_punct=False):
        """Normalize text already known to be in a single language."""
        if normalize_punct:
            text = punct_normalization(text)
            text = alphabet_normalization(text)
            text = text.lower()
        return self._do_tn(text, language)
    def normalize(self, text, language, normalize_punct=False):
        """Split text into language runs with LangSegment, normalize each
        run, and join the results."""
        if normalize_punct:
            text = punct_normalization(text)
            text = alphabet_normalization(text)
            text = text.lower()
        textlist = []
        langlist = []
        LangSegment.setfilters(["zh", "ja", "en", "ko"])
        if language == "auto":
            for tmp in LangSegment.getTexts(text):
                langlist.append(tmp["lang"])
                textlist.append(tmp["text"])
        else:
            for tmp in LangSegment.getTexts(text):
                if tmp["lang"] == "en":
                    langlist.append(tmp["lang"])
                else:
                    # Han characters are shared across Chinese, Japanese and
                    # Korean, so trust the caller's language for non-English runs.
                    langlist.append(language)
                textlist.append(tmp["text"])
        norm_text_list = []
        for seg_text, seg_lang in zip(textlist, langlist):
            norm_text_list.append(self._do_tn(seg_text, seg_lang))
        return ''.join(norm_text_list)
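
# Minimal usage sketch (assumes the WeTextProcessing and LangSegment packages
# are installed and the local text.* helper modules are importable):
#   tn = MultilingualTN(module="wenet")
#   tn.normalize("GPT-3于2020年发布!", language="zh", normalize_punct=True)
# LangSegment splits the input into zh/en runs, each run goes through the
# matching normalizer, and the normalized pieces are joined in order.
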
if __name__ == '__main__':
    '''
    Testing functions
    '''
    # text = '1983年2月,旅行了2天的儿童和长翅膀的女孩儿:“︘菜单修订后有鱼香肉丝儿、『王道椒香鸡腿〕和川蜀鸡翅?……”it\'s a test.王会计会计算机。which had been in force since 1760.调查员决定调节调查的难度。Article VI, Qing government would be charged an annual interest rate of 5% for the money'
    # text = 'Just Do It系列广告是哪个品牌的?从以下生物中选择出属于“植物”类的生物:\n\nA. 人 \nB. 杨树 \nC. 猫 \nD. 月季花 \nE. 细菌\nF. 真菌\nG. 灌木\n80/20法则是什么?NHTSA将自动驾驶分为多少个级别?√2和π是不是无理数?'
    language = 'zh'
    TN = MultilingualTN()
    sys.stderr.write("Input: ")
    for line in sys.stdin:
        line = line.strip()
        if line == "exit()":
            break
        if not line:
            sys.stderr.write("Input: ")
            continue
        sys.stdout.write("{}\n".format(
            TN.normalize(line, language=language, normalize_punct=True)))
        sys.stderr.write("Input: ")