|
formal_to_informal = {
|
|
|
|
|
|
|
|
'ワタクシ': 'わたし',
|
|
'チカコ':'しゅうこ',
|
|
"タノヒト":"ほかのひと",
|
|
|
|
|
|
}
|
|
|
|
formal_to_informal2 = {
|
|
|
|
"たのひと":"ほかのひと",
|
|
"すうは": "かずは",
|
|
|
|
|
|
|
|
}
|
|
|
|
formal_to_informal3 = {
|
|
|
|
"%":"%",
|
|
"@": "あっとさいん",
|
|
"$":"どる",
|
|
"#":"はっしゅたぐ",
|
|
"$":"どる",
|
|
"#":"はっしゅたぐ",
|
|
"何が":"なにが",
|
|
"何に":"なにに",
|
|
|
|
"何も":"なにも",
|
|
"何か":"なにか",
|
|
|
|
"何は":"なにが",
|
|
"お父様":"おとうさま",
|
|
"お兄様":"おにいさま",
|
|
"何を":"なにを",
|
|
"良い":"いい",
|
|
"李衣菜":"りいな",
|
|
"志希":"しき",
|
|
"種":"たね",
|
|
"方々":"かたがた",
|
|
"颯":"はやて",
|
|
"茄子さん":"かこさん",
|
|
"茄子ちゃん":"かこちゃん",
|
|
"涼ちゃん":"りょうちゃん",
|
|
"涼さん":"りょうさん",
|
|
"紗枝":"さえ",
|
|
"文香":"ふみか",
|
|
"私":"わたし",
|
|
"周子":"しゅうこ",
|
|
"イェ":"いえ",
|
|
"可憐":"かれん",
|
|
"加蓮":"かれん",
|
|
"・":".",
|
|
|
|
|
|
"唯さん":"ゆいさん",
|
|
"唯ちゃん":"ゆいちゃん",
|
|
"聖ちゃん":"ひじりちゃん",
|
|
"他の":"ほかの",
|
|
"他に":"ほかに",
|
|
"一生懸命":"いっしょうけんめい",
|
|
"楓さん":"かえでさん",
|
|
"楓ちゃん":"かえでちゃん",
|
|
"内から":"ないから",
|
|
"の下で":"のしたで",
|
|
|
|
}
|
|
|
|
|
|
mapper = dict([
|
|
|
|
("仕方","しかた"),
|
|
("明日","あした"),
|
|
('私',"わたし"),
|
|
("従妹","いとこ"),
|
|
|
|
("1人","ひとり"),
|
|
("2人","ふたり"),
|
|
|
|
("一期","いちご"),
|
|
("一会","いちえ"),
|
|
|
|
("♪","!"),
|
|
("?","?"),
|
|
|
|
("どんな方","どんなかた"),
|
|
("ふたり暮らし","ふたりぐらし"),
|
|
|
|
("新年","しんねん"),
|
|
("来年","らいねん"),
|
|
("去年","きょねん"),
|
|
("壮年","そうねん"),
|
|
("今年","ことし"),
|
|
|
|
("昨年","さくねん"),
|
|
("本年","ほんねん"),
|
|
("平年","へいねん"),
|
|
("閏年","うるうどし"),
|
|
("初年","しょねん"),
|
|
("少年","しょうねん"),
|
|
("多年","たねん"),
|
|
("青年","せいねん"),
|
|
("中年","ちゅうねん"),
|
|
("老年","ろうねん"),
|
|
("成年","せいねん"),
|
|
("幼年","ようねん"),
|
|
("前年","ぜんねん"),
|
|
("元年","がんねん"),
|
|
("経年","けいねん"),
|
|
("当年","とうねん"),
|
|
|
|
("明年","みょうねん"),
|
|
("歳年","さいねん"),
|
|
("数年","すうねん"),
|
|
("半年","はんとし"),
|
|
("後年","こうねん"),
|
|
("実年","じつねん"),
|
|
("年年","ねんねん"),
|
|
("連年","れんねん"),
|
|
("暦年","れきねん"),
|
|
("各年","かくねん"),
|
|
("全年","ぜんねん"),
|
|
|
|
("年を","としを"),
|
|
("年が","としが"),
|
|
("年も","としも"),
|
|
("年は","としは"),
|
|
|
|
|
|
("奏ちゃん","かなでちゃん"),
|
|
("負けず嫌い","まけずぎらい"),
|
|
("貴方","あなた"),
|
|
("貴女","あなた"),
|
|
("貴男","あなた"),
|
|
|
|
("その節","そのせつ"),
|
|
|
|
("何しに","なにしに"),
|
|
("何する","なにする"),
|
|
|
|
|
|
("心さん","しんさん"),
|
|
("心ちゃん","しんちゃん"),
|
|
|
|
("乃々","のの"),
|
|
|
|
("身体の","からだの"),
|
|
("身体が","からだが"),
|
|
("身体を","からだを"),
|
|
("身体は","からだは"),
|
|
("身体に","からだに"),
|
|
("正念場","しょうねんば"),
|
|
("言う","いう"),
|
|
|
|
|
|
("一回","いっかい"),
|
|
("一曲","いっきょく"),
|
|
("一日","いちにち"),
|
|
("一言","ひとこと"),
|
|
("一杯","いっぱい"),
|
|
|
|
|
|
("方が","ほうが"),
|
|
("縦輪城","じゅうりんしろ"),
|
|
("深息","しんそく"),
|
|
("家人","かじん"),
|
|
("お返し","おかえし"),
|
|
("化物語","ばけものがたり"),
|
|
("阿良々木暦","あららぎこよみ"),
|
|
("何より","なにより")
|
|
|
|
|
|
])
|
|
|
|
|
|
|
|
all_transformations = {**formal_to_informal, **formal_to_informal2, **formal_to_informal3, **mapper}
|
|
|
|
def apply_transformations(text, transformations = all_transformations):
|
|
for key, value in transformations.items():
|
|
text = text.replace(key, value)
|
|
return text
|
|
|
|
import re
|
|
|
|
def number_to_japanese(num):
|
|
if not isinstance(num, int) or num < 0 or num > 9999:
|
|
return "Invalid input"
|
|
|
|
digits = ["", "いち", "に", "さん", "よん", "ご", "ろく", "なな", "はち", "きゅう"]
|
|
tens = ["", "じゅう", "にじゅう", "さんじゅう", "よんじゅう", "ごじゅう", "ろくじゅう", "ななじゅう", "はちじゅう", "きゅうじゅう"]
|
|
hundreds = ["", "ひゃく", "にひゃく", "さんびゃく", "よんひゃく", "ごひゃく", "ろっぴゃく", "ななひゃく", "はっぴゃく", "きゅうひゃく"]
|
|
thousands = ["", "せん", "にせん", "さんぜん", "よんせん", "ごせん", "ろくせん", "ななせん", "はっせん", "きゅうせん"]
|
|
|
|
if num == 0:
|
|
return "ゼロ"
|
|
|
|
result = ""
|
|
if num >= 1000:
|
|
result += thousands[num // 1000]
|
|
num %= 1000
|
|
if num >= 100:
|
|
result += hundreds[num // 100]
|
|
num %= 100
|
|
if num >= 10:
|
|
result += tens[num // 10]
|
|
num %= 10
|
|
if num > 0:
|
|
result += digits[num]
|
|
|
|
return result
|
|
|
|
def convert_numbers_in_string(input_string):
|
|
|
|
number_pattern = re.compile(r'\d+')
|
|
|
|
|
|
def replace_with_japanese(match):
|
|
num = int(match.group())
|
|
return number_to_japanese(num)
|
|
|
|
|
|
converted_string = number_pattern.sub(replace_with_japanese, input_string)
|
|
return converted_string
|
|
|
|
|
|
roma_mapper = dict([
|
|
|
|
|
|
|
|
("my","mʲ"),
|
|
("by","bʲ"),
|
|
("ny","nʲ"),
|
|
("ry","rʲ"),
|
|
("si","sʲ"),
|
|
("ky","kʲ"),
|
|
("gy","gʲ"),
|
|
("dy","dʲ"),
|
|
("di","dʲ"),
|
|
("fi","fʲ"),
|
|
("fy","fʲ"),
|
|
("ch","tɕ"),
|
|
("sh","ɕ"),
|
|
|
|
|
|
|
|
("a","a"),
|
|
("i","i"),
|
|
("u","ɯ"),
|
|
("e","e"),
|
|
("o","o"),
|
|
("ka","ka"),
|
|
("ki","ki"),
|
|
("ku","kɯ"),
|
|
("ke","ke"),
|
|
("ko","ko"),
|
|
("sa","sa"),
|
|
("shi","ɕi"),
|
|
("su","sɯ"),
|
|
("se","se"),
|
|
("so","so"),
|
|
("ta","ta"),
|
|
("chi","tɕi"),
|
|
("tsu","tsɯ"),
|
|
("te","te"),
|
|
("to","to"),
|
|
("na","na"),
|
|
("ni","ni"),
|
|
("nu","nɯ"),
|
|
("ne","ne"),
|
|
("no","no"),
|
|
("ha","ha"),
|
|
("hi","çi"),
|
|
("fu","ɸɯ"),
|
|
("he","he"),
|
|
("ho","ho"),
|
|
("ma","ma"),
|
|
("mi","mi"),
|
|
("mu","mɯ"),
|
|
("me","me"),
|
|
("mo","mo"),
|
|
("ra","ɽa"),
|
|
("ri","ɽi"),
|
|
("ru","ɽɯ"),
|
|
("re","ɽe"),
|
|
("ro","ɽo"),
|
|
("ga","ga"),
|
|
("gi","gi"),
|
|
("gu","gɯ"),
|
|
("ge","ge"),
|
|
("go","go"),
|
|
("za","za"),
|
|
("ji","dʑi"),
|
|
("zu","zɯ"),
|
|
("ze","ze"),
|
|
("zo","zo"),
|
|
("da","da"),
|
|
|
|
|
|
("zu","zɯ"),
|
|
("de","de"),
|
|
("do","do"),
|
|
("ba","ba"),
|
|
("bi","bi"),
|
|
("bu","bɯ"),
|
|
("be","be"),
|
|
("bo","bo"),
|
|
("pa","pa"),
|
|
("pi","pi"),
|
|
("pu","pɯ"),
|
|
("pe","pe"),
|
|
("po","po"),
|
|
("ya","ja"),
|
|
("yu","jɯ"),
|
|
("yo","jo"),
|
|
("wa","wa"),
|
|
|
|
|
|
|
|
|
|
("a","a"),
|
|
("i","i"),
|
|
("u","ɯ"),
|
|
("e","e"),
|
|
("o","o"),
|
|
("wa","wa"),
|
|
("o","o"),
|
|
|
|
|
|
("wo","o")])
|
|
|
|
nasal_sound = dict([
|
|
|
|
("ɴm","mm"),
|
|
("ɴb", "mb"),
|
|
("ɴp", "mp"),
|
|
|
|
|
|
("ɴk","ŋk"),
|
|
("ɴg", "ŋg"),
|
|
|
|
|
|
("ɴt","nt"),
|
|
("ɴd", "nd"),
|
|
("ɴn","nn"),
|
|
("ɴs", "ns"),
|
|
("ɴz","nz"),
|
|
("ɴɽ", "nɽ"),
|
|
|
|
("ɴɲ", "ɲɲ"),
|
|
|
|
])
|
|
|
|
def Roma2IPA(text):
|
|
orig = text
|
|
|
|
for k, v in roma_mapper.items():
|
|
text = text.replace(k, v)
|
|
|
|
return text
|
|
|
|
def nasal_mapper(text):
|
|
orig = text
|
|
|
|
|
|
for k, v in nasal_sound.items():
|
|
text = text.replace(k, v)
|
|
|
|
return text
|
|
|
|
def alphabetreading(text):
|
|
alphabet_dict = {"A": "エイ",
|
|
"B": "ビー",
|
|
"C": "シー",
|
|
"D": "ディー",
|
|
"E": "イー",
|
|
"F": "エフ",
|
|
"G": "ジー",
|
|
"H": "エイチ",
|
|
"I":"アイ",
|
|
"J":"ジェイ",
|
|
"K":"ケイ",
|
|
"L":"エル",
|
|
"M":"エム",
|
|
"N":"エヌ",
|
|
"O":"オー",
|
|
"P":"ピー",
|
|
"Q":"キュー",
|
|
"R":"アール",
|
|
"S":"エス",
|
|
"T":"ティー",
|
|
"U":"ユー",
|
|
"V":"ヴィー",
|
|
"W":"ダブリュー",
|
|
"X":"エックス",
|
|
"Y":"ワイ",
|
|
"Z":"ゼッド"}
|
|
text = text.upper()
|
|
text_ret = ""
|
|
for t in text:
|
|
if t in alphabet_dict:
|
|
text_ret += alphabet_dict[t]
|
|
else:
|
|
text_ret += t
|
|
return text_ret
|
|
|
|
import re
|
|
import cutlet
|
|
|
|
roma_mapper_plus_2 = {
|
|
|
|
"bjo":'bʲo',
|
|
"rjo":"rʲo",
|
|
"kjo":"kʲo",
|
|
"kyu":"kʲu",
|
|
|
|
}
|
|
|
|
def replace_repeated_chars(input_string):
|
|
result = []
|
|
i = 0
|
|
while i < len(input_string):
|
|
if i + 1 < len(input_string) and input_string[i] == input_string[i + 1] and input_string[i] in 'aiueo':
|
|
result.append(input_string[i] + 'ː')
|
|
i += 2
|
|
else:
|
|
result.append(input_string[i])
|
|
i += 1
|
|
return ''.join(result)
|
|
|
|
|
|
def replace_chars_2(text, mapping=roma_mapper_plus_2):
|
|
|
|
|
|
sorted_keys = sorted(mapping.keys(), key=len, reverse=True)
|
|
|
|
pattern = '|'.join(re.escape(key) for key in sorted_keys)
|
|
|
|
|
|
def replace(match):
|
|
key = match.group(0)
|
|
return mapping.get(key, key)
|
|
|
|
return re.sub(pattern, replace, text)
|
|
|
|
|
|
def replace_tashdid_2(s):
|
|
vowels = 'aiueoɯ0123456789.?!_。؟?!...@@##$$%%^^&&**()()_+=[「」]></\`~~―ー∺"'
|
|
result = []
|
|
|
|
i = 0
|
|
while i < len(s):
|
|
if i < len(s) - 2 and s[i].lower() == s[i + 2].lower() and s[i].lower() not in vowels and s[i + 1] == ' ':
|
|
result.append('ʔ')
|
|
result.append(s[i + 2])
|
|
i += 3
|
|
elif i < len(s) - 1 and s[i].lower() == s[i + 1].lower() and s[i].lower() not in vowels:
|
|
result.append('ʔ')
|
|
result.append(s[i + 1])
|
|
i += 2
|
|
else:
|
|
result.append(s[i])
|
|
i += 1
|
|
|
|
return ''.join(result)
|
|
|
|
def replace_tashdid(input_string):
|
|
result = []
|
|
i = 0
|
|
while i < len(input_string):
|
|
if i + 1 < len(input_string) and input_string[i] == input_string[i + 1] and input_string[i] not in 'aiueo':
|
|
result.append('ʔ')
|
|
result.append(input_string[i])
|
|
i += 2
|
|
else:
|
|
result.append(input_string[i])
|
|
i += 1
|
|
return ''.join(result)
|
|
|
|
def hira2ipa(text, roma_mapper=roma_mapper):
|
|
keys_set = set(roma_mapper.keys())
|
|
special_rule = ("n", "ɴ")
|
|
|
|
transformed_text = []
|
|
i = 0
|
|
|
|
while i < len(text):
|
|
if text[i] == special_rule[0]:
|
|
if i + 1 == len(text) or text[i + 1] not in keys_set:
|
|
transformed_text.append(special_rule[1])
|
|
else:
|
|
transformed_text.append(text[i])
|
|
else:
|
|
transformed_text.append(text[i])
|
|
|
|
i += 1
|
|
|
|
return ''.join(transformed_text)
|
|
|
|
import re
|
|
|
|
def replace_ending(text):
|
|
|
|
|
|
|
|
|
|
pattern = r'(?<!naɴ)(?<!nan)niɴ$'
|
|
return re.sub(pattern, 'dʑiɴ', text)
|
|
|
|
|
|
|
|
k_mapper = dict([
|
|
("ゔぁ","ba"),
|
|
("ゔぃ","bi"),
|
|
("ゔぇ","be"),
|
|
("ゔぉ","bo"),
|
|
("ゔゃ","bʲa"),
|
|
("ゔゅ","bʲɯ"),
|
|
("ゔゃ","bʲa"),
|
|
("ゔょ","bʲo"),
|
|
|
|
("ゔ","bɯ"),
|
|
|
|
("あぁ"," aː"),
|
|
("いぃ"," iː"),
|
|
("いぇ"," je"),
|
|
("いゃ"," ja"),
|
|
("うぅ"," ɯː"),
|
|
("えぇ"," eː"),
|
|
("おぉ"," oː"),
|
|
("かぁ"," kaː"),
|
|
("きぃ"," kiː"),
|
|
("くぅ","kɯː"),
|
|
("くゃ","ka"),
|
|
("くゅ","kʲɯ"),
|
|
("くょ","kʲo"),
|
|
("けぇ","keː"),
|
|
("こぉ","koː"),
|
|
("がぁ","gaː"),
|
|
("ぎぃ","giː"),
|
|
("ぐぅ","gɯː"),
|
|
("ぐゃ","gʲa"),
|
|
("ぐゅ","gʲɯ"),
|
|
("ぐょ","gʲo"),
|
|
("げぇ","geː"),
|
|
("ごぉ","goː"),
|
|
("さぁ","saː"),
|
|
("しぃ","ɕiː"),
|
|
("すぅ","sɯː"),
|
|
("すゃ","sʲa"),
|
|
("すゅ","sʲɯ"),
|
|
("すょ","sʲo"),
|
|
("せぇ","seː"),
|
|
("そぉ","soː"),
|
|
("ざぁ","zaː"),
|
|
("じぃ","dʑiː"),
|
|
("ずぅ","zɯː"),
|
|
("ずゃ","zʲa"),
|
|
("ずゅ","zʲɯ"),
|
|
("ずょ","zʲo"),
|
|
("ぜぇ","zeː"),
|
|
("ぞぉ","zeː"),
|
|
("たぁ","taː"),
|
|
("ちぃ","tɕiː"),
|
|
("つぁ","tsa"),
|
|
("つぃ","tsi"),
|
|
("つぅ","tsɯː"),
|
|
("つゃ","tɕa"),
|
|
("つゅ","tɕɯ"),
|
|
("つょ","tɕo"),
|
|
("つぇ","tse"),
|
|
("つぉ","tso"),
|
|
("てぇ","teː"),
|
|
("とぉ","toː"),
|
|
("だぁ","daː"),
|
|
("ぢぃ","dʑiː"),
|
|
("づぅ","dɯː"),
|
|
("づゃ","zʲa"),
|
|
("づゅ","zʲɯ"),
|
|
("づょ","zʲo"),
|
|
("でぇ","deː"),
|
|
("どぉ","doː"),
|
|
("なぁ","naː"),
|
|
("にぃ","niː"),
|
|
("ぬぅ","nɯː"),
|
|
("ぬゃ","nʲa"),
|
|
("ぬゅ","nʲɯ"),
|
|
("ぬょ","nʲo"),
|
|
("ねぇ","neː"),
|
|
("のぉ","noː"),
|
|
("はぁ","haː"),
|
|
("ひぃ","çiː"),
|
|
("ふぅ","ɸɯː"),
|
|
("ふゃ","ɸʲa"),
|
|
("ふゅ","ɸʲɯ"),
|
|
("ふょ","ɸʲo"),
|
|
("へぇ","heː"),
|
|
("ほぉ","hoː"),
|
|
("ばぁ","baː"),
|
|
("びぃ","biː"),
|
|
("ぶぅ","bɯː"),
|
|
("ふゃ","ɸʲa"),
|
|
("ぶゅ","bʲɯ"),
|
|
("ふょ","ɸʲo"),
|
|
("べぇ","beː"),
|
|
("ぼぉ","boː"),
|
|
("ぱぁ","paː"),
|
|
("ぴぃ","piː"),
|
|
("ぷぅ","pɯː"),
|
|
("ぷゃ","pʲa"),
|
|
("ぷゅ","pʲɯ"),
|
|
("ぷょ","pʲo"),
|
|
("ぺぇ","peː"),
|
|
("ぽぉ","poː"),
|
|
("まぁ","maː"),
|
|
("みぃ","miː"),
|
|
("むぅ","mɯː"),
|
|
("むゃ","mʲa"),
|
|
("むゅ","mʲɯ"),
|
|
("むょ","mʲo"),
|
|
("めぇ","meː"),
|
|
("もぉ","moː"),
|
|
("やぁ","jaː"),
|
|
("ゆぅ","jɯː"),
|
|
("ゆゃ","jaː"),
|
|
("ゆゅ","jɯː"),
|
|
("ゆょ","joː"),
|
|
("よぉ","joː"),
|
|
("らぁ","ɽaː"),
|
|
("りぃ","ɽiː"),
|
|
("るぅ","ɽɯː"),
|
|
("るゃ","ɽʲa"),
|
|
("るゅ","ɽʲɯ"),
|
|
("るょ","ɽʲo"),
|
|
("れぇ","ɽeː"),
|
|
("ろぉ","ɽoː"),
|
|
("わぁ","ɯaː"),
|
|
("をぉ","oː"),
|
|
|
|
("う゛","bɯ"),
|
|
("でぃ","di"),
|
|
("でぇ","deː"),
|
|
("でゃ","dʲa"),
|
|
("でゅ","dʲɯ"),
|
|
("でょ","dʲo"),
|
|
("てぃ","ti"),
|
|
("てぇ","teː"),
|
|
("てゃ","tʲa"),
|
|
("てゅ","tʲɯ"),
|
|
("てょ","tʲo"),
|
|
("すぃ","si"),
|
|
("ずぁ","zɯa"),
|
|
("ずぃ","zi"),
|
|
("ずぅ","zɯ"),
|
|
("ずゃ","zʲa"),
|
|
("ずゅ","zʲɯ"),
|
|
("ずょ","zʲo"),
|
|
("ずぇ","ze"),
|
|
("ずぉ","zo"),
|
|
("きゃ","kʲa"),
|
|
("きゅ","kʲɯ"),
|
|
("きょ","kʲo"),
|
|
("しゃ","ɕʲa"),
|
|
("しゅ","ɕʲɯ"),
|
|
("しぇ","ɕʲe"),
|
|
("しょ","ɕʲo"),
|
|
("ちゃ","tɕa"),
|
|
("ちゅ","tɕɯ"),
|
|
("ちぇ","tɕe"),
|
|
("ちょ","tɕo"),
|
|
("とぅ","tɯ"),
|
|
("とゃ","tʲa"),
|
|
("とゅ","tʲɯ"),
|
|
("とょ","tʲo"),
|
|
("どぁ","doa"),
|
|
("どぅ","dɯ"),
|
|
("どゃ","dʲa"),
|
|
("どゅ","dʲɯ"),
|
|
("どょ","dʲo"),
|
|
("どぉ","doː"),
|
|
("にゃ","nʲa"),
|
|
("にゅ","nʲɯ"),
|
|
("にょ","nʲo"),
|
|
("ひゃ","çʲa"),
|
|
("ひゅ","çʲɯ"),
|
|
("ひょ","çʲo"),
|
|
("みゃ","mʲa"),
|
|
("みゅ","mʲɯ"),
|
|
("みょ","mʲo"),
|
|
("りゃ","ɽʲa"),
|
|
("りぇ","ɽʲe"),
|
|
("りゅ","ɽʲɯ"),
|
|
("りょ","ɽʲo"),
|
|
("ぎゃ","gʲa"),
|
|
("ぎゅ","gʲɯ"),
|
|
("ぎょ","gʲo"),
|
|
("ぢぇ","dʑe"),
|
|
("ぢゃ","dʑa"),
|
|
("ぢゅ","dʑɯ"),
|
|
("ぢょ","dʑo"),
|
|
("じぇ","dʑe"),
|
|
("じゃ","dʑa"),
|
|
("じゅ","dʑɯ"),
|
|
("じょ","dʑo"),
|
|
("びゃ","bʲa"),
|
|
("びゅ","bʲɯ"),
|
|
("びょ","bʲo"),
|
|
("ぴゃ","pʲa"),
|
|
("ぴゅ","pʲɯ"),
|
|
("ぴょ","pʲo"),
|
|
("うぁ","ɯa"),
|
|
("うぃ","ɯi"),
|
|
("うぇ","ɯe"),
|
|
("うぉ","ɯo"),
|
|
("うゃ","ɯʲa"),
|
|
("うゅ","ɯʲɯ"),
|
|
("うょ","ɯʲo"),
|
|
("ふぁ","ɸa"),
|
|
("ふぃ","ɸi"),
|
|
("ふぅ","ɸɯ"),
|
|
("ふゃ","ɸʲa"),
|
|
("ふゅ","ɸʲɯ"),
|
|
("ふょ","ɸʲo"),
|
|
("ふぇ","ɸe"),
|
|
("ふぉ","ɸo"),
|
|
|
|
("あ"," a"),
|
|
("い"," i"),
|
|
("う","ɯ"),
|
|
("え"," e"),
|
|
("お"," o"),
|
|
("か"," ka"),
|
|
("き"," ki"),
|
|
("く"," kɯ"),
|
|
("け"," ke"),
|
|
("こ"," ko"),
|
|
("さ"," sa"),
|
|
("し"," ɕi"),
|
|
("す"," sɯ"),
|
|
("せ"," se"),
|
|
("そ"," so"),
|
|
("た"," ta"),
|
|
("ち"," tɕi"),
|
|
("つ"," tsɯ"),
|
|
("て"," te"),
|
|
("と"," to"),
|
|
("な"," na"),
|
|
("に"," ni"),
|
|
("ぬ"," nɯ"),
|
|
("ね"," ne"),
|
|
("の"," no"),
|
|
("は"," ha"),
|
|
("ひ"," çi"),
|
|
("ふ"," ɸɯ"),
|
|
("へ"," he"),
|
|
("ほ"," ho"),
|
|
("ま"," ma"),
|
|
("み"," mi"),
|
|
("む"," mɯ"),
|
|
("め"," me"),
|
|
("も"," mo"),
|
|
("ら"," ɽa"),
|
|
("り"," ɽi"),
|
|
("る"," ɽɯ"),
|
|
("れ"," ɽe"),
|
|
("ろ"," ɽo"),
|
|
("が"," ga"),
|
|
("ぎ"," gi"),
|
|
("ぐ"," gɯ"),
|
|
("げ"," ge"),
|
|
("ご"," go"),
|
|
("ざ"," za"),
|
|
("じ"," dʑi"),
|
|
("ず"," zɯ"),
|
|
("ぜ"," ze"),
|
|
("ぞ"," zo"),
|
|
("だ"," da"),
|
|
("ぢ"," dʑi"),
|
|
("づ"," zɯ"),
|
|
("で"," de"),
|
|
("ど"," do"),
|
|
("ば"," ba"),
|
|
("び"," bi"),
|
|
("ぶ"," bɯ"),
|
|
("べ"," be"),
|
|
("ぼ"," bo"),
|
|
("ぱ"," pa"),
|
|
("ぴ"," pi"),
|
|
("ぷ"," pɯ"),
|
|
("ぺ"," pe"),
|
|
("ぽ"," po"),
|
|
("や"," ja"),
|
|
("ゆ"," jɯ"),
|
|
("よ"," jo"),
|
|
("わ"," wa"),
|
|
("ゐ"," i"),
|
|
("ゑ"," e"),
|
|
("ん"," ɴ"),
|
|
("っ"," ʔ"),
|
|
("ー"," ː"),
|
|
|
|
("ぁ"," a"),
|
|
("ぃ"," i"),
|
|
("ぅ"," ɯ"),
|
|
("ぇ"," e"),
|
|
("ぉ"," o"),
|
|
("ゎ"," ɯa"),
|
|
("ぉ"," o"),
|
|
("っ","?"),
|
|
|
|
("を","o")
|
|
|
|
])
|
|
|
|
|
|
def post_fix(text):
|
|
orig = text
|
|
|
|
for k, v in k_mapper.items():
|
|
text = text.replace(k, v)
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
|
sym_ws = dict([
|
|
|
|
("$ ","dorɯ"),
|
|
("$ ","dorɯ"),
|
|
|
|
("〇 ","marɯ"),
|
|
("¥ ","eɴ"),
|
|
|
|
("# ","haʔɕɯ tagɯ"),
|
|
("# ","haʔɕɯ tagɯ"),
|
|
|
|
("& ","ando"),
|
|
("& ","ando"),
|
|
|
|
("% ","paːsento"),
|
|
("% ","paːsento"),
|
|
|
|
("@ ","aʔto saiɴ"),
|
|
("@ ","aʔto saiɴ")
|
|
|
|
|
|
|
|
])
|
|
|
|
def random_sym_fix(text):
|
|
orig = text
|
|
|
|
for k, v in sym_ws.items():
|
|
text = text.replace(k, f" {v} ")
|
|
|
|
return text
|
|
|
|
|
|
sym_ns = dict([
|
|
|
|
("$","dorɯ"),
|
|
("$","dorɯ"),
|
|
|
|
("〇","marɯ"),
|
|
("¥","eɴ"),
|
|
|
|
("#","haʔɕɯ tagɯ"),
|
|
("#","haʔɕɯ tagɯ"),
|
|
|
|
("&","ando"),
|
|
("&","ando"),
|
|
|
|
("%","paːsento"),
|
|
("%","paːsento"),
|
|
|
|
("@","aʔto saiɴ"),
|
|
("@","aʔto saiɴ"),
|
|
|
|
("~","—"),
|
|
("kʲɯɯdʑɯɯkʲɯɯ.kʲɯɯdʑɯɯ","kʲɯɯdʑɯɯ kʲɯɯ teɴ kʲɯɯdʑɯɯ")
|
|
|
|
|
|
|
|
|
|
|
|
])
|
|
|
|
def random_sym_fix_no_space(text):
|
|
orig = text
|
|
|
|
for k, v in sym_ns.items():
|
|
text = text.replace(k, f" {v} ")
|
|
|
|
return text
|
|
|
|
|
|
spaces = dict([
|
|
|
|
("ɯ ɴ","ɯɴ"),
|
|
("na ɴ ","naɴ "),
|
|
(" mina ", " miɴna "),
|
|
("ko ɴ ni tɕi ha","konnitɕiwa"),
|
|
("ha i","hai"),
|
|
("boɯtɕama","boʔtɕama"),
|
|
("i eːi","ieːi"),
|
|
("taiɕɯtsɯdʑoɯ","taiɕitsɯdʑoɯ"),
|
|
("soɴna ka ze ni","soɴna fɯɯ ni"),
|
|
(" i e ","ke "),
|
|
("�",""),
|
|
("×"," batsɯ "),
|
|
("se ka ɯndo","sekaɯndo"),
|
|
("i i","iː"),
|
|
("i tɕi","itɕi"),
|
|
("ka i","kai"),
|
|
("naɴ ga","nani ga"),
|
|
("i eː i","ieːi"),
|
|
|
|
("naɴ koɽe","nani koɽe"),
|
|
("naɴ soɽe","nani soɽe"),
|
|
(" ɕeɴ "," seɴ "),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
("en ","eɴ "),
|
|
("in ","iɴ "),
|
|
("an ","aɴ "),
|
|
("on ","oɴ "),
|
|
("ɯn ","ɯɴ "),
|
|
|
|
|
|
("koɴd o","kondo"),
|
|
("ko ɴ d o","kondo"),
|
|
("ko ɴ do","kondo"),
|
|
|
|
("oanitɕaɴ","oniːtɕaɴ"),
|
|
("oanisaɴ","oniːsaɴ"),
|
|
("oanisama","oniːsama"),
|
|
("hoːmɯrɯɴɯ","hoːmɯrɯːmɯ"),
|
|
("so ɴ na ","sonna"),
|
|
(" sonna "," sonna "),
|
|
(" konna "," konna "),
|
|
("ko ɴ na ","konna"),
|
|
(" ko to "," koto "),
|
|
("edʑdʑi","eʔtɕi"),
|
|
(" edʑdʑ "," eʔtɕi "),
|
|
(" dʑdʑ "," dʑiːdʑiː "),
|
|
("secɯnd","sekaɯndo"),
|
|
|
|
("ɴɯ","nɯ"),
|
|
("ɴe","ne"),
|
|
("ɴo","no"),
|
|
("ɴa","na"),
|
|
("ɴi","ni"),
|
|
("ɴʲ","nʲ"),
|
|
|
|
("hotond o","hotondo"),
|
|
("hakoɴd e","hakoɴde"),
|
|
("gakɯtɕi ɽi","gaʔtɕiɽi "),
|
|
|
|
(" ʔ","ʔ"),
|
|
("ʔ ","ʔ"),
|
|
|
|
("-","ː"),
|
|
("- ","ː"),
|
|
("--","~ː"),
|
|
("~","—"),
|
|
("、",","),
|
|
|
|
|
|
(" ː","ː"),
|
|
('ka nade',"kanade"),
|
|
|
|
("ohahasaɴ","okaːsaɴ"),
|
|
(" "," "),
|
|
("viː","bɯiː"),
|
|
("ːː","ː—"),
|
|
|
|
("d ʑ","dʑ"),
|
|
("d a","da"),
|
|
("d e","de"),
|
|
("d o","do"),
|
|
("d ɯ","dɯ"),
|
|
|
|
("niːɕiki","ni iɕiki"),
|
|
("anitɕaɴ","niːtɕaɴ"),
|
|
("daiːtɕi","dai itɕi"),
|
|
(" niːta"," ni ita"),
|
|
(" niːrɯ"," ni irɯ"),
|
|
("a—","aː"),
|
|
("iːki","i iki"),
|
|
(" waːis"," wa ais"),
|
|
(" waːiɕ"," wa aiɕ"),
|
|
|
|
("waːʔ", "wa aʔ"),
|
|
|
|
("naɴ sono","nani sono"),
|
|
("naɴ kono","nani kono"),
|
|
("naɴ ano","nani ano"),
|
|
(" niːtaɽa"," ni itaɽa"),
|
|
("doɽamaɕiːd","doɽama ɕiːdʲi"),
|
|
("aɴ ta","anta"),
|
|
("aɴta","anta"),
|
|
("naniːʔteɴ","nani iʔteɴ"),
|
|
("niːkite","ni ikite"),
|
|
("niːʔ","ni iʔ"),
|
|
("niːɯ","ni iɯ"),
|
|
("niːw","ni iw"),
|
|
("niːkɯ","ni ikɯ"),
|
|
("de—","de e"),
|
|
(" aːj"," aː aj"),
|
|
(" aːɽ"," a aɽ"),
|
|
(" aːr"," a ar"),
|
|
(" gaːn"," ga an"),
|
|
|
|
(" gaːɽɯ "," ga aɽɯ "),
|
|
(" waːɽɯ "," wa aɽɯ "),
|
|
(" gaːrɯ "," ga arɯ "),
|
|
("gaːrɯɕi","ga arɯɕi"),
|
|
("gaːrɯnaɽa","ga arɯ naɽa"),
|
|
("gaːrɯɴ","ga arɯɴ"),
|
|
("gaːreba","ga areba"),
|
|
("gaːrɯkedo","ga arɯ kedo"),
|
|
("gaːrinagaɽa","ga ari nagaɽa"),
|
|
("gaːrɯ wa","ga arɯ wa"),
|
|
("gaːrɯwa","ga arɯ wa"),
|
|
(" waːrɯ "," wa arɯ "),
|
|
("waːmaɽi ","wa amaɽi "),
|
|
|
|
(" gaːɽi"," ga aɽi"),
|
|
(" waːɽi"," wa aɽi"),
|
|
(" gaːri"," ga ari"),
|
|
(" waːri"," wa ari"),
|
|
|
|
|
|
(" gaːɽe"," ga aɽe"),
|
|
(" waːɽe"," wa aɽe"),
|
|
(" gaːre"," ga are"),
|
|
(" waːre"," wa are"),
|
|
|
|
|
|
(" gaːʔta"," ga aʔta"),
|
|
(" waːʔta"," wa aʔta"),
|
|
|
|
("gaːʔmaɽi","ga aʔmaɽi"),
|
|
("waːʔmaɽi","wa aʔmaɽi"),
|
|
|
|
("gaːsakaʔta","ga asakaʔta"),
|
|
("waːsakaʔta","wa asakaʔta"),
|
|
|
|
(" waːme "," wa ame "),
|
|
(" gaːme "," ga ame "),
|
|
|
|
("aɽaːɽa","aɽa aɽa"),
|
|
("waːi rʲoɯ","wa aiɽa"),
|
|
("gaːi rʲoɯ","ga aiɽa"),
|
|
("gaːi ","ga ai "),
|
|
("ai rʲoɯ", "aiɽa"),
|
|
("ai joʔte","aiɽaʔte"),
|
|
("arɯihaːi rʲoɯ","arɯi wa aiɽa"),
|
|
("rʲoɯ ɕiki","joɕiki"),
|
|
("waːkarɯi","wa akarɯi"),
|
|
("gaːkarɯi","ga akarɯi"),
|
|
|
|
("waːi o","wa ai o"),
|
|
("madaːtama","mada atama"),
|
|
("hahasaɴ","kaːsaɴ"),
|
|
("ohahasaɴ","okaːsaɴ"),
|
|
("hahatɕaɴ","kaːtɕaɴ"),
|
|
("ohahatɕaɴ","okaːtɕaɴ"),
|
|
|
|
("nani mo ka mo","nanimokamo"),
|
|
|
|
(" nihon "," niʔpon "),
|
|
("niʔponniɴ","nihondʑiɴ"),
|
|
("arɽaːfɯ","arɽaː"),
|
|
|
|
("ɕiːk ","ɕi ik"),
|
|
("ɕijoː neɴ","ɕoɯneɴ"),
|
|
("aːna","a ana"),
|
|
("naɴ ɕijoɯ","nani ɕijoɯ"),
|
|
("hana ni sɯrɯ","wa nani sɯrɯ"),
|
|
("naɴ ɕite","nani ɕite"),
|
|
|
|
("hatsɯ taimeɴ","ɕotaimeɴ"),
|
|
("soŋkei go","soŋkeigo"),
|
|
(" go ","go "),
|
|
(" keiː "," kei i "),
|
|
(" kaŋgae hoɯ ","kaŋgaekata"),
|
|
("tomete kɯdasai","jamete kɯdasai"),
|
|
(" bɯkaikedo"," fɯkaikedo"),
|
|
|
|
|
|
])
|
|
|
|
|
|
def fix_wagas(text):
|
|
|
|
pattern = r'\b(gaːɽɯ|waːɽɯ|gaːrɯ|waːrɯ|gaːɽi|waːɽi|gaːri|waːri)\b(?!\s*\w)'
|
|
|
|
def replace_match(match):
|
|
word = match.group(1)
|
|
if 'ː' in word:
|
|
return word.replace('ː', ' a')
|
|
return word
|
|
|
|
return re.sub(pattern, replace_match, text)
|
|
|
|
def random_space_fix(text):
|
|
orig = text
|
|
|
|
for k, v in spaces.items():
|
|
text = text.replace(k, v)
|
|
|
|
return fix_wagas(text)
|
|
|
|
|