kenlm-5gram-npsc / prepare.py
versae's picture
Add kenlm 5gram
9b5a8ef
#!/usr/bin/env python
# coding=utf-8
import re
from datasets import load_dataset
# Name of the dataset column holding the transcription text.
TEXT_COLUMN_NAME = "text"
# Name of the dataset column holding the audio (not referenced in the visible
# part of this script).
AUDIO_COLUMN_NAME = "audio"
# Regex character class of punctuation and symbols that are deleted from every
# transcription before the language-model corpus is written.
CHARS_TO_IGNORE_REGEX = r"[,?.!\-;:“%‘”�—’…–+\"'#/<>\\]"
# Pre-processing dataset
def filter_dataset(batch):
    """Return True for examples that should be kept in the corpus.

    An example is kept when its transcription does not contain the word
    "inaudible" (case-insensitive) and its ``sentence_language_code`` is
    ``nb-NO`` (case-insensitive).
    """
    transcription = batch[TEXT_COLUMN_NAME].lower()
    if "inaudible" in transcription:
        return False
    language_code = batch["sentence_language_code"].lower()
    return language_code == "nb-no"
# Annotation tags found in the transcriptions and the plain-text tokens that
# replace them.
_TAG_REPLACEMENTS = {
    "<ee>": "eee",
    "<qq>": "qqq",
    "<mm>": "mmm",
    "<inaudible>": "?",
}

# Accented characters folded onto the letters used in the corpus. "ä" and "ö"
# map to "æ" and "ø"; the remaining accents are simply dropped.
_CHAR_TABLE = str.maketrans({
    **dict.fromkeys("áàâ", "a"),
    "ä": "æ",
    **dict.fromkeys("éèëê", "e"),
    **dict.fromkeys("íìïî", "i"),
    **dict.fromkeys("óòô", "o"),
    "ö": "ø",
    "ç": "c",
    **dict.fromkeys("úùüû", "u"),
    "\xa0": " ",
})


def replace_hatted_characters(batch):
    """Normalize one transcription for language-model training.

    Steps, in order:

    1. Replace the annotation tags (``<ee>``, ``<qq>``, ``<mm>``,
       ``<inaudible>``) with their plain-text tokens and delete every
       character matched by ``CHARS_TO_IGNORE_REGEX``.
    2. Lowercase the text.
    3. Fold accented characters (``ä`` -> ``æ``, ``ö`` -> ``ø``, other accents
       removed) and turn non-breaking spaces into regular spaces.
    4. Collapse whitespace runs into a single space and trim both ends.

    Args:
        batch: A single dataset example; only ``batch["text"]`` is read.

    Returns:
        A dict ``{"text": normalized_text}``.
    """
    text = batch["text"]

    # Tags and ignored characters are handled in ONE pass, with the tags listed
    # first in the alternation. Stripping "<" and ">" beforehand (as the
    # previous version did) destroyed the tags, so they were never replaced.
    tag_or_ignored = (
        "|".join(re.escape(tag) for tag in _TAG_REPLACEMENTS)
        + "|"
        + CHARS_TO_IGNORE_REGEX
    )
    text = re.sub(
        tag_or_ignored,
        lambda match: _TAG_REPLACEMENTS.get(match.group(0), ""),
        text,
    )

    # Lowercase before folding so uppercase accented letters are covered too.
    # The table maps "ö" to "ø" directly; previously "ö" was first turned into
    # "o" by the o-group, which made the "ø" rule unreachable.
    text = text.lower().translate(_CHAR_TABLE)

    text = re.sub(r"\s+", " ", text).strip()
    return {"text": text}
def _add_eos_to_arpa(source_path, target_path):
    """Copy an ARPA file, adding a ``</s>`` unigram for Hugging Face decoding.

    The unigram count in the header (``ngram 1=N``) is incremented by one, and
    the first line containing ``<s>`` is duplicated with ``<s>`` replaced by
    ``</s>``. All other lines are copied unchanged.
    """
    with open(source_path, "r", encoding="utf-8") as read_file, \
            open(target_path, "w", encoding="utf-8") as write_file:
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                # Rewrite only the value after the last "=". A plain
                # str.replace on the count would also alter the n-gram order
                # when the digits coincide (e.g. "ngram 1=1").
                prefix, _, count = line.rstrip().rpartition("=")
                write_file.write(f"{prefix}={int(count) + 1}\n")
            elif not has_added_eos and "<s>" in line:
                write_file.write(line)
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)


def main():
    """Build a 5-gram KenLM model from the NPSC transcriptions.

    Loads the ``train+validation`` split of ``NbAiLab/NPSC`` (``16K_mp3``),
    filters and normalizes the transcriptions, writes them space-joined to
    ``npsc.txt``, runs ``~/bin/lmplz -o 5`` to produce ``5gram.arpa.orig`` and
    finally writes ``5gram.arpa`` with an added ``</s>`` unigram.

    Raises:
        subprocess.CalledProcessError: If ``lmplz`` exits with a non-zero
            status.
    """
    # Local imports: this function is the script's entry point and the only
    # user of these modules.
    import os
    import subprocess

    dataset = load_dataset(
        "NbAiLab/NPSC",
        "16K_mp3",
        split="train+validation",
        use_auth_token=True,
    )
    dataset = dataset.filter(
        filter_dataset,
        desc="filtering out inaudible examples and keeping only nb-NO",
    ).map(
        replace_hatted_characters,
        desc="replacing hesitations and homophones",
    )

    # Create file with all text together. The corpus contains non-ASCII
    # letters, so the encoding is fixed instead of relying on the locale.
    text = " ".join(dataset["text"])
    with open("npsc.txt", "w", encoding="utf-8") as text_file:
        text_file.write(text)

    # Create KenLM model. The original line was an IPython "!" shell escape,
    # which is a syntax error in a plain Python script.
    lmplz = os.path.expanduser("~/bin/lmplz")
    with open("npsc.txt", "rb") as corpus, \
            open("5gram.arpa.orig", "wb") as arpa:
        subprocess.run([lmplz, "-o", "5"], stdin=corpus, stdout=arpa, check=True)

    # Adjusting for Huggingface decoding
    _add_eos_to_arpa("5gram.arpa.orig", "5gram.arpa")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()