import sys

import LangSegment

from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer

from text import symbols as symbols_v1
from text.chinese import (
    replace_consecutive_punctuation,
    replace_punctuation,
    replace_punctuation_with_en,
)
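# NOTE (assumptions): the `tn` package is expected to come from the
# WeTextProcessing toolkit (used by the "wenet" backend), `LangSegment`
# provides language-tagged text segmentation, and `text` is this repo's own
# rule-based frontend used by the "baidu" backend.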
|
|
# Supported text-normalization backends.
TN_MODULES = {'baidu', 'wenet'}

# Map CJK, full-width, and small-form punctuation (plus a few ASCII symbols)
# to the small ASCII punctuation set the downstream frontend expects.
PUNCT_NORMALIZE = {
    '，': ',', '。': '.', '、': ',', '；': ',', '‘': ',', '【': ',', '】': ',',
    '·': ',', '《': ',', '》': ',', '？': '?', '：': ',', '“': ',', '”': ',',
    '！': '!', '…': ',', '）': ',', '（': ',', '〃': ',', '〈': ',', '〉': ',',
    '「': ',', '」': ',', '『': ',', '』': ',', '〖': ',', '〔': ',', '〕': ',',
    '〗': ',', '〞': ',', '〝': ',', '﹚': ',', '﹙': ',', '﹛': ',', '﹜': ',',
    '﹝': ',', '﹞': ',', '﹗': '!', '＂': ',', '＇': ',', '︐': ',', '︑': ',',
    '︒': ',', '︔': ',', '︓': ',', '︕': '!', '︖': '?', '︗': ',', '︘': ',',
    '︙': ',', '︰': ',', '︱': ',', '︳': ',', '︵': ',', '︶': ',', '︷': ',',
    '︸': ',', '︹': ',', '︺': ',', '︻': ',', '︼': ',', '︽': ',', '︾': ',',
    '︿': ',', '﹀': ',', '﹁': ',', '﹂': ',', '﹃': ',', '﹄': ',', ';': ',',
    '[': ',', ']': ',', '`': ',', ':': ',', '"': ',', '{': ',', '}': ',',
    '~': ',', ')': ',', '(': ',', '_': '"', '’': "'", '^': ','}

# Map full-width Latin letters (ａ-ｚ, Ａ-Ｚ) to their ASCII equivalents.
ALPHABET_NORM = {'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
                 'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
                 'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
                 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
                 'ｙ': 'y', 'ｚ': 'z', 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D',
                 'Ｅ': 'E', 'Ｆ': 'F', 'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J',
                 'Ｋ': 'K', 'Ｌ': 'L', 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P',
                 'Ｑ': 'Q', 'Ｒ': 'R', 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V',
                 'Ｗ': 'W', 'Ｘ': 'X', 'Ｙ': 'Y', 'Ｚ': 'Z'}
|
|
def punct_normalization(sent):
    """Replace every punctuation mark found in PUNCT_NORMALIZE with its
    ASCII counterpart; all other characters pass through unchanged."""
    return ''.join(PUNCT_NORMALIZE.get(ch, ch) for ch in sent)
|
|
def alphabet_normalization(sent):
    """Replace full-width Latin letters with their ASCII equivalents; all
    other characters pass through unchanged."""
    return ''.join(ALPHABET_NORM.get(ch, ch) for ch in sent)
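# Quick sanity check (hypothetical inputs, assuming the tables above):
#   punct_normalization('你好。世界！')   -> '你好.世界!'
#   alphabet_normalization('ＴＴＳ demo') -> 'TTS demo'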
|
|
class MultilingualTN:
    """Multilingual text normalizer with two selectable backends: "wenet"
    (WeTextProcessing normalizers) or "baidu" (this repo's rule-based
    text.chinese / text.english modules)."""

    def __init__(self, module="wenet", remove_interjections=False, remove_erhua=True):
        self.tn_module = module
        self.language_module_map = {"zh": "chinese", "en": "english"}
        self.tn_implements = dict()
        if self.tn_module in TN_MODULES:
            if self.tn_module == "baidu":
                # Import the rule-based frontend module for each language,
                # e.g. text.chinese and text.english.
                for lang, mod in self.language_module_map.items():
                    self.tn_implements[lang] = __import__("text." + mod, fromlist=[mod])
            else:
                # "wenet": build one WeTextProcessing normalizer per language.
                for lang in self.language_module_map:
                    if lang == "en":
                        self.tn_implements[lang] = EnNormalizer(overwrite_cache=True)
                    else:
                        self.tn_implements[lang] = ZhNormalizer(
                            remove_erhua=remove_erhua,
                            remove_interjections=remove_interjections,
                            overwrite_cache=True)
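    # Construction sketch (assuming WeTextProcessing's cached FSTs are
    # available): MultilingualTN() uses the wenet backend, while
    # MultilingualTN(module="baidu") reuses the rule-based text.* modules.
    # Any other `module` value leaves tn_implements empty, so _do_tn would
    # raise a KeyError; callers should stick to TN_MODULES.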
|
|
    def _do_tn(self, text, language="zh"):
        # Unknown language codes fall back to the Chinese normalizer.
        if language in self.language_module_map:
            module = self.tn_implements[language]
        else:
            module = self.tn_implements["zh"]

        if self.tn_module == "baidu" and hasattr(module, "text_normalize"):
            norm_text = module.text_normalize(text)
        elif self.tn_module == "wenet" and hasattr(module, "normalize"):
            norm_text = module.normalize(text)
        else:
            norm_text = text

        if language == "zh":
            norm_text = replace_punctuation_with_en(norm_text)
            norm_text = replace_consecutive_punctuation(norm_text)
        # Collapse runs of spaces introduced by normalization.
        while "  " in norm_text:
            norm_text = norm_text.replace("  ", " ")
        return norm_text
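    # Illustrative behaviour only (actual output depends on the installed
    # normalizer version): with the wenet backend, _do_tn("售价100元", "zh")
    # is expected to verbalize the digits, roughly "售价一百元".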
|
|
    def normalize_segment(self, text, language, normalize_punct=False):
        """Normalize a single-language segment without language splitting."""
        if normalize_punct:
            text = punct_normalization(text)
        text = alphabet_normalization(text)
        text = text.lower()
        return self._do_tn(text, language)
|
|
    def normalize(self, text, language, normalize_punct=False):
        """Normalize `text`, splitting it into per-language segments first.

        With language="auto" each segment keeps the language detected by
        LangSegment; otherwise detected English segments stay English and
        every other segment is forced to the caller's `language`."""
        if normalize_punct:
            text = punct_normalization(text)
        text = alphabet_normalization(text)
        text = text.lower()

        # Split the input into language-tagged segments.
        textlist = []
        langlist = []
        LangSegment.setfilters(["zh", "ja", "en", "ko"])
        if language == "auto":
            for tmp in LangSegment.getTexts(text):
                langlist.append(tmp["lang"])
                textlist.append(tmp["text"])
        else:
            for tmp in LangSegment.getTexts(text):
                if tmp["lang"] == "en":
                    langlist.append(tmp["lang"])
                else:
                    # Trust the caller-specified language for non-English segments.
                    langlist.append(language)
                textlist.append(tmp["text"])

        # Normalize each segment with its language-specific backend and rejoin.
        norm_text_list = []
        for segment, lang in zip(textlist, langlist):
            norm_text_list.append(self._do_tn(segment, lang))
        return ''.join(norm_text_list)
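    # Usage sketch (assumes the LangSegment and WeTextProcessing packages are
    # installed and their models/FSTs are available):
    #   tn = MultilingualTN(module="wenet")
    #   tn.normalize("我有2个苹果 and 3 oranges.", language="auto",
    #                normalize_punct=True)
    # Each detected segment ("我有2个苹果" as zh, "and 3 oranges." as en) is
    # normalized by its own backend and the results are concatenated.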
|
|
if __name__ == '__main__':
    '''
    Interactive test: type one sentence per line; enter "exit()" to quit.
    '''
    language = 'zh'
    TN = MultilingualTN()
    sys.stderr.write("Input: ")
    for line in sys.stdin:
        if line.strip() == "exit()":
            sys.exit(0)
        if len(line.strip()) <= 0:
            sys.stderr.write("Input: ")
            continue
        sys.stdout.write("{}\n".format(TN.normalize(
            line.strip(), language=language, normalize_punct=True)))
        sys.stderr.write("Input: ")