DSTK

File size: 7,366 Bytes

cd8454d

# -*- coding: utf-8 -*-

# This project combines the TN and G2P functions of https://github.com/RVC-Boss/GPT-SoVITS and https://github.com/wenet-e2e/WeTextProcessing
# Huawei Technologies Co., Ltd. (authors: Xiao Chen)

from re import M
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer
import LangSegment
from text import symbols as symbols_v1
from text.chinese import replace_consecutive_punctuation, replace_punctuation_with_en, replace_punctuation
import sys


TN_MODULES = {'baidu', 'wenet'}

PUNCT_NORMALIZE = {'，': ',', '。': '.', '、': ',', '；': ',', '‘': ',', '【': ',', '】': ',', '·': ',', '《': ',', '》': ',', '？': '?',
                   '：': ',', '“': ',', '”': ',', '！': '!', '…': ',', '）': ',', '（': ',', '〃': ',', '〈': ',', '〉': ',',
                   '「': ',', '」': ',', '『': ',', '』': ',', '〖': ',', '〔': ',', '〕': ',', '〗': ',', '〞': ',', '〝': ',', '﹚': ',',
                   '﹙': ',', '﹛': ',', '﹜': ',', '﹝': ',', '﹞': ',', '！': '!', '＂': ',', '＇': ',', '︐': ',', '︑': ',', '︒': ',',
                   '︔': ',', '︓': ',', '︕': '!', '︖': '?', '︗': ',', '︘': ',', '︙': ',', '︰': ',', '︱': ',', '︳': ',', '︵': ',',
                   '︶': ',', '︷': ',', '︸': ',', '︹': ',', '︺': ',', '︻': ',', '︼': ',', '︽': ',', '︾': ',', '︿': ',', '﹀': ',',
                   '﹁': ',', '﹂': ',', '﹃': ',', '﹄': ',', ';': ',', '[': ',', ']': ',', '`': ',', ':': ',', '"': ',',
                   '{': ',', '}': ',', '~': ',', ')': ',', '(': ',', '_': '"', '’': '\'', '^': ','}

ALPHABET_NORM = {'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f', 'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l', 'ｍ': 'm',
                 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r', 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x', 'ｙ': 'y', 'ｚ': 'z',
                 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F', 'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L', 'Ｍ': 'M',
                 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R', 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X', 'Ｙ': 'Y', 'Ｚ': 'Z'}


def punct_normalization(sent):
    output_sent = ''
    for idx in range(len(sent)):
        if sent[idx] in PUNCT_NORMALIZE:
            output_sent += PUNCT_NORMALIZE[sent[idx]]
        else:
            output_sent += sent[idx]
    return output_sent


def alphabet_normalization(sent):
    output_sent = ''
    for idx in range(len(sent)):
        if sent[idx] in ALPHABET_NORM:
            output_sent += ALPHABET_NORM[sent[idx]]
        else:
            output_sent += sent[idx]
    return output_sent


class MultilingualTN():

    def __init__(self, module="wenet", remove_interjections=False, remove_erhua=True):
        self.tn_module = module
        self.language_module_map = {"zh": "chinese", "en": "english"}
        self.tn_implements = dict()
        if self.tn_module in TN_MODULES:
            if self.tn_module == "baidu":
                for l, m in self.language_module_map.items():
                    self.tn_implements[l] = __import__("text."+m, fromlist=[m])
            else:
                for l, m in self.language_module_map.items():
                    if l == "en":
                        self.tn_implements[l] = EnNormalizer(
                            overwrite_cache=True)
                    else:
                        self.tn_implements[l] = ZhNormalizer(
                            remove_erhua=remove_erhua, remove_interjections=remove_interjections, overwrite_cache=True)
        pass

    def _do_tn(self, text, language="zh"):
        norm_text = ""
        if language in self.language_module_map:
            module = self.tn_implements[language]
        else:
            module = self.tn_implements["zh"]
        if self.tn_module == "baidu":
            if hasattr(module, "text_normalize"):
                norm_text = module.text_normalize(text)
            else:
                norm_text = text
        elif self.tn_module == "wenet":
            if hasattr(module, "normalize"):
                norm_text = module.normalize(text)
            else:
                norm_text = text
        else:
            norm_text = text

        if language == "zh":
            norm_text = replace_punctuation_with_en(norm_text)
        norm_text = replace_consecutive_punctuation(norm_text)
        while "  " in norm_text:
            norm_text = norm_text.replace("  ", " ")
        return norm_text

    def normalize_segment(self, text, language, normalize_punct=False):
        if normalize_punct:
            text = punct_normalization(text)
        
        text = alphabet_normalization(text)
        text = text.lower()

        norm_text = self._do_tn(text, language)
        return norm_text

    def normalize(self, text, language, normalize_punct=False):
        if normalize_punct:
            text = punct_normalization(text)
        
        text = alphabet_normalization(text)
        text = text.lower()

        textlist = []
        langlist = []
        LangSegment.setfilters(["zh", "ja", "en", "ko"])
        if language == "auto":
            for tmp in LangSegment.getTexts(text):
                langlist.append(tmp["lang"])
                textlist.append(tmp["text"])
        else:
            for tmp in LangSegment.getTexts(text):
                if tmp["lang"] == "en":
                    langlist.append(tmp["lang"])
                else:
                    # 因无法区别中日韩文汉字,以用户输入为准
                    langlist.append(language)
                textlist.append(tmp["text"])
        # print(textlist)
        # print(langlist)
        phones_list = []
        norm_text_list = []
        for i in range(len(textlist)):
            lang = langlist[i]
            norm_text = self._do_tn(textlist[i], lang)
            norm_text_list.append(norm_text)
        norm_text = ''.join(norm_text_list)
        return norm_text


if __name__ == '__main__':
    '''
    Testing functions
    '''
    # text = '1983年2月，旅行了2天的儿童和长翅膀的女孩儿：“︘菜单修订后有鱼香肉丝儿、『王道椒香鸡腿〕和川蜀鸡翅？……”it\'s a test.王会计会计算机。which had been in force since 1760.调查员决定调节调查的难度。Article VI, Qing government would be charged an annual interest rate of 5% for the money'
    # text = 'Just Do It系列广告是哪个品牌的？从以下生物中选择出属于“植物”类的生物：\n\nA. 人 \nB. 杨树 \nC. 猫 \nD. 月季花 \nE. 细菌\nF. 真菌\nG. 灌木\n80/20法则是什么?NHTSA将自动驾驶分为多少个级别？√2和π是不是无理数？'
    language = 'zh'
    TN = MultilingualTN()
    sys.stderr.write("Input: ")
    for line in sys.stdin:
        if line.strip() == "exit()":
            exit()
        if len(line.strip()) <= 0:
            sys.stderr.write("Input: ")
            continue
        sys.stdout.write("{}\n".format(TN.normalize(
            line.strip(), language="zh", normalize_punct=True)))
        sys.stderr.write("Input: ")