tiamojames's picture
Upload folder using huggingface_hub
bcff6dc verified
import re
from typing import List
def remove_space_between_chinese(text):
    """Remove whitespace separating CJK ideographs from each other or from ASCII letters.

    A run of whitespace is dropped when it sits between two CJK ideographs
    (U+4E00-U+9FFF), or between an ASCII letter and a CJK ideograph in
    either order. Whitespace between two ASCII letters, or adjacent to
    digits or punctuation, is kept.

    Args:
        text: The string to clean up.

    Returns:
        ``text`` with the qualifying whitespace runs removed.
    """
    # Lookbehind/lookahead keep the neighbouring characters out of the match.
    # With capturing groups the right-hand character was consumed, and since
    # re.sub only replaces non-overlapping matches, "中 文 字" came out as
    # "中文 字" (every second gap survived).
    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
    text = re.sub(r'(?<=[a-zA-Z])\s+(?=[\u4e00-\u9fff])', '', text)
    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[a-zA-Z])', '', text)
    return text
def normalize_text(current_text):
    """Collapse CJK-adjacent whitespace and make sure the text ends with a full stop.

    First, whitespace runs between two CJK ideographs (U+4E00-U+9FFF), or
    between an ASCII letter and a CJK ideograph in either order, are removed.
    Then a sentence terminator is appended when the text ends in a bare
    character: "。" after a CJK ideograph, "." after an ASCII letter.
    Text ending in anything else (punctuation, a digit, or the empty string)
    gets no terminator.

    Args:
        current_text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Lookarounds match only the whitespace itself. The previous
    # capturing-group patterns consumed the right-hand character, so in a run
    # such as "你 好 吗" only every other gap was closed.
    current_text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', current_text)
    current_text = re.sub(r'(?<=[a-zA-Z])\s+(?=[\u4e00-\u9fff])', '', current_text)
    current_text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[a-zA-Z])', '', current_text)

    # No separate "already ends with punctuation" test is needed: the searches
    # below only succeed when the final character is an ideograph or a letter.
    # NOTE: `$` also matches just before a single trailing newline, in which
    # case the terminator lands after that newline (unchanged behaviour).
    if re.search(r'[\u4e00-\u9fff]$', current_text):
        current_text += "。"
    elif re.search(r'[a-zA-Z]$', current_text):
        current_text += "."
    return current_text
def check_monologue_text(text: str, prefix: str = None) -> bool:
    """Validate a single utterance, optionally requiring a leading tag.

    Surrounding whitespace is ignored. When ``prefix`` is given, the trimmed
    text must start with it, and there must be non-whitespace content left
    once the prefix is removed. Without a prefix, the text only has to
    contain something other than whitespace.

    Args:
        text: The utterance to check.
        prefix: Tag the utterance must begin with, or ``None`` to skip the
            tag check.

    Returns:
        ``True`` if the utterance passes, ``False`` otherwise.
    """
    body = text.strip()
    if prefix is not None:
        if not body.startswith(prefix):
            return False
        # Only what follows the tag counts as content.
        body = body.removeprefix(prefix)
    return len(body.strip()) > 0
def check_dialect_prompt_text(text: str, prefix: str = None) -> bool:
    """Validate a dialect prompt, optionally requiring a leading tag.

    Surrounding whitespace is ignored. The trimmed text must be non-empty
    and, when ``prefix`` is given, must start with it.

    NOTE(review): unlike ``check_monologue_text``, the prefix is not removed
    before the emptiness test, so a string made up of only a non-empty
    prefix is accepted. Confirm that this is intended.

    Args:
        text: The prompt text to check.
        prefix: Tag the prompt must begin with, or ``None`` to skip the tag
            check.

    Returns:
        ``True`` if the prompt passes, ``False`` otherwise.
    """
    trimmed = text.strip()
    has_required_prefix = prefix is None or trimmed.startswith(prefix)
    if not has_required_prefix:
        return False
    return len(trimmed) != 0
def check_dialogue_text(text_list: List[str]) -> bool:
    """Validate a multi-speaker dialogue given as a list of utterances.

    The dialogue passes when the list is non-empty and every utterance is
    accepted by ``check_monologue_text`` with one of the speaker tags
    ``[S1]`` to ``[S4]``.

    Args:
        text_list: The utterances, one string per turn.

    Returns:
        ``True`` if every utterance passes, ``False`` for an empty list or as
        soon as one utterance fails.
    """
    if len(text_list) == 0:
        return False
    speaker_tags = ("[S1]", "[S2]", "[S3]", "[S4]")
    # Tags are tried in order for each line; both levels short-circuit.
    return all(
        any(check_monologue_text(line, tag) for tag in speaker_tags)
        for line in text_list
    )