|
from Utils.phonemize.cotlet_utils import *
|
|
import cutlet
|
|
|
|
# Module-level cutlet converter shared by process_japanese_text(); it is built
# once at import time.
# NOTE(review): ensure_ascii=False presumably keeps non-ASCII characters in the
# romanized output instead of substituting them -- confirm against cutlet docs.
katsu = cutlet.Cutlet(ensure_ascii=False)
|
|
# Turn off cutlet's foreign-spelling option on the shared converter.
# NOTE(review): presumably this makes loanwords come out as romanized kana
# rather than their original foreign spelling -- confirm against cutlet docs.
katsu.use_foreign_spelling = False
|
|
|
|
def process_japanese_text(ml):
    """Romanize ``ml`` and run it through the phoneme clean-up chain.

    The input is romanized with the module-level ``katsu`` converter, then
    rewritten by a fixed sequence of literal replacements, project helpers
    (``Roma2IPA``, ``hira2ipa``, ``replace_chars_2``, ``replace_tashdid_2``,
    ``replace_repeated_chars``, ``nasal_mapper``) and regex substitutions.
    Every step consumes the output of the previous one, so the order of the
    tables below must not be changed.

    Args:
        ml: Input text (expected to be a ``str``).

    Returns:
        The processed string with leading whitespace stripped.
    """
    # Small kana vowels are expanded to their full-size counterparts before
    # romanization.
    small_kana = "ぁぃぅぇぉ"
    full_kana = "あいうえお"
    if not set(small_kana).isdisjoint(ml):
        for small, full in zip(small_kana, full_kana):
            ml = ml.replace(small, full)

    prepared = apply_transformations(alphabetreading(ml))
    romaji = katsu.romaji(prepared, capitalize=False).lower()

    # Literal rewrites applied to the romaji, in order: the third and fourth
    # entries act on what the second one produced.
    romaji_rewrites = (
        ('j', "dʑ"),
        ('tt', "ʔt"),
        ('t t', "ʔt"),
        (' ʔt', "ʔt"),
        ('ssh', "ɕɕ"),
    )
    for old, new in romaji_rewrites:
        romaji = romaji.replace(old, new)

    # Project helper pipeline (helpers are defined outside this file).
    phonemes = Roma2IPA(convert_numbers_in_string(romaji))
    phonemes = hira2ipa(phonemes)
    phonemes = replace_chars_2(phonemes)
    phonemes = replace_tashdid_2(phonemes)
    phonemes = replace_repeated_chars(phonemes)
    phonemes = nasal_mapper(phonemes)

    # Literal fix-ups on the helper output: glue "ɴ" to the preceding token
    # and patch two specific word forms.
    phoneme_rewrites = (
        (" ɴ", "ɴ"),
        (' neɽitai ', "naɽitai"),
        ('harɯdʑisama', "arɯdʑisama"),
    )
    for old, new in phoneme_rewrites:
        phonemes = phonemes.replace(old, new)

    # Insert a space before these phrases when they are not already preceded
    # by whitespace (a space inserted at the very start is removed by the
    # final lstrip()).
    phonemes = re.sub(r'(?<!\s)ki ni ɕinai', r' ki ni ɕinai', phonemes)
    # NOTE(review): this substitution replaces each match with itself, so it
    # leaves the text unchanged; it is kept only to mirror the existing steps.
    phonemes = re.sub(r'(?<!\s)ʔt', r'ʔt', phonemes)
    phonemes = re.sub(r'(?<!\s)de aɽoɯ', r' de aɽoɯ', phonemes)

    return phonemes.lstrip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def replace_repeating_a(output):
    """Collapse repeated long vowels into a single vowel followed by ``~``.

    Despite the name, the rules cover ``a``, ``o`` and ``e`` sequences. Each
    rule is a ``(regex, replacement)`` pair applied to the whole string, one
    after another, so later rules see the result of earlier ones.

    Args:
        output: Phoneme string to rewrite.

    Returns:
        The string after all rules have been applied in order.
    """
    rules = (
        # --- "a" sequences ---
        (r'(aː)\s*\1+\s*', r'\1~'),
        (r'(aːa)\s*aː', r'\1~'),
        (r'aːa', r'aː~'),
        (r'naː\s*aː', r'naː~'),
        # --- "o" sequences ---
        (r'(oː)\s*\1+\s*', r'\1~'),
        (r'(oːo)\s*oː', r'\1~'),
        (r'oːo', r'oː~'),
        # --- "e" sequences ---
        (r'(eː)\s*\1+\s*', r'\1~'),
        (r'(e)\s*\1+\s*', r'\1~'),
        (r'(eːe)\s*eː', r'\1~'),
        (r'eːe', r'eː~'),
        (r'neː\s*eː', r'neː~'),
    )

    rewritten = output
    for regex, marker in rules:
        rewritten = re.compile(regex).sub(marker, rewritten)
    return rewritten
|
|
|
|
def phonemize(text):
    """Convert ``text`` to its final phoneme string.

    Runs ``process_japanese_text`` and ``post_fix``, then applies ordered
    literal replacements, the repeated-vowel rules from
    ``replace_repeating_a``, and the project's ``random_*_fix`` helpers.
    Each step works on the previous step's output, so the order is
    significant.

    Args:
        text: Input text to phonemize.

    Returns:
        The phoneme string with leading whitespace stripped.
    """
    phonemes = post_fix(process_japanese_text(text))

    # Literal rewrites applied before the repeated-vowel rules.
    before_vowel_rules = (
        (" ɴ", "ɴ"),
        ("y", "j"),
        ("ɯa", "wa"),
        ("a aː", "a~"),
        ("a a", "a~"),
    )
    for old, new in before_vowel_rules:
        phonemes = phonemes.replace(old, new)

    phonemes = replace_repeating_a(phonemes)
    # Attach "~" directly to the preceding character by dropping any
    # whitespace in front of it.
    phonemes = re.sub(r'\s+~', '~', phonemes)

    # Fix-ups for leftover vowel/tilde sequences; the longer, more specific
    # forms are listed in the order they must be tried.
    after_vowel_rules = (
        ("oː~o oː~ o", "oː~~~~~~"),
        ("aː~aː", "aː~~~"),
        ("oɴ naː", "onnaː"),
        ("aː~~ aː", "aː~~~~"),
        ("oː~o", "oː~~"),
        ("oː~~o o", "oː~~~~"),
    )
    for old, new in after_vowel_rules:
        phonemes = phonemes.replace(old, new)

    # Final clean-up passes provided by project helpers.
    phonemes = random_space_fix(phonemes)
    phonemes = random_sym_fix(phonemes)
    phonemes = random_sym_fix_no_space(phonemes)

    return phonemes.lstrip()
|
|
|
|
|
|
|