|
|
|
|
|
import re
import subprocess
from pathlib import Path

from datasets import load_dataset
|
|
|
# Name of the dataset column that holds the transcript text.
TEXT_COLUMN_NAME = "text"

# Name of the audio column; not referenced by the code visible in this file.
AUDIO_COLUMN_NAME = "audio"

# Character class of punctuation and symbols that are deleted from transcripts.
# NOTE: the class includes "<", ">", "/" and "?", so anything that relies on
# angle-bracket tags must be handled before this pattern is applied.
CHARS_TO_IGNORE_REGEX = r"[,?.!\-;:“%‘”�—’…–+\"'#/<>\\]"
|
|
|
|
|
def filter_dataset(batch):
    """Return whether a single example should be kept.

    An example is kept when its transcript does not contain the substring
    "inaudible" (case-insensitive) and its ``sentence_language_code`` equals
    "nb-no" (case-insensitive).

    Args:
        batch: Mapping with at least the ``TEXT_COLUMN_NAME`` and
            ``"sentence_language_code"`` keys, both holding strings.

    Returns:
        ``True`` if the example passes both checks, ``False`` otherwise.
    """
    return (
        "inaudible" not in batch[TEXT_COLUMN_NAME].lower()
        and batch["sentence_language_code"].lower() == "nb-no"
    )
|
|
|
# Angle-bracket hesitation tags and the spelled-out form that replaces them.
_HESITATION_TAGS = {"<ee>": "eee", "<qq>": "qqq", "<mm>": "mmm"}

# Tag that is replaced by a literal "?" in the normalized text.
_INAUDIBLE_TAG = "<inaudible>"

# Single-character substitutions applied to the lower-cased transcript.
# "ä" and "ö" become "æ" and "ø"; the remaining accented letters are reduced
# to their base letter, and the non-breaking space becomes a plain space.
_CHARACTER_REPLACEMENTS = str.maketrans(
    {
        **dict.fromkeys("áàâ", "a"),
        "ä": "æ",
        **dict.fromkeys("éèëê", "e"),
        **dict.fromkeys("íìïî", "i"),
        # "ö" is deliberately absent here: it has its own mapping to "ø" below.
        **dict.fromkeys("óòô", "o"),
        "ö": "ø",
        "ç": "c",
        **dict.fromkeys("úùüû", "u"),
        "\xa0": " ",
    }
)


def replace_hatted_characters(batch):
    """Normalize the transcript of a single example.

    Steps, in order:

    1. Lower-case the text.
    2. Spell out the hesitation tags ``<ee>``, ``<qq>`` and ``<mm>``.
    3. Delete every character matched by ``CHARS_TO_IGNORE_REGEX`` while
       turning each ``<inaudible>`` tag into a literal ``?``.
    4. Replace accented characters and non-breaking spaces.
    5. Collapse whitespace runs to a single space and strip both ends.

    The tags are handled before the punctuation is deleted because
    ``CHARS_TO_IGNORE_REGEX`` removes ``<`` and ``>``; doing it the other way
    round means no tag can ever match.

    Args:
        batch: Mapping whose ``TEXT_COLUMN_NAME`` entry is the raw transcript.

    Returns:
        A dict with the single key ``TEXT_COLUMN_NAME`` holding the
        normalized transcript.
    """
    text = batch[TEXT_COLUMN_NAME].lower()

    for tag, spelled_out in _HESITATION_TAGS.items():
        text = text.replace(tag, spelled_out)

    # Clean the pieces around each <inaudible> tag separately and re-join them
    # with "?", so the inserted "?" is not deleted by the ignore pattern.
    text = "?".join(
        re.sub(CHARS_TO_IGNORE_REGEX, "", segment)
        for segment in text.split(_INAUDIBLE_TAG)
    )

    text = text.translate(_CHARACTER_REPLACEMENTS)
    text = re.sub(r"\s+", " ", text).strip()
    return {TEXT_COLUMN_NAME: text}
|
|
|
|
|
def _add_eos_to_arpa(source_path, target_path):
    """Copy an ARPA language model file, inserting a ``</s>`` unigram.

    The count on the ``ngram 1=`` header line is incremented by one, and the
    first line containing ``<s>`` is written twice: once unchanged and once
    with ``<s>`` replaced by ``</s>``. Every other line is copied verbatim.

    Args:
        source_path: Path of the ARPA file to read.
        target_path: Path of the corrected ARPA file to write.
    """
    with open(source_path, "r", encoding="utf-8") as read_file, open(
        target_path, "w", encoding="utf-8"
    ) as write_file:
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                # Rewrite only the value after the last "=" so the "1" in the
                # "ngram 1" label can never be altered by the substitution.
                head, _, count = line.rstrip().rpartition("=")
                write_file.write(f"{head}={int(count) + 1}\n")
            elif not has_added_eos and "<s>" in line:
                write_file.write(line)
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)


def main():
    """Build a 5-gram ARPA language model from the NPSC transcripts.

    Loads the ``train+validation`` split of ``NbAiLab/NPSC`` (``16K_mp3``
    configuration), filters and normalizes the transcripts, writes them
    space-joined to ``npsc.txt``, runs ``~/bin/lmplz -o 5`` on that file to
    produce ``5gram.arpa.orig``, and finally writes ``5gram.arpa`` with an
    added ``</s>`` unigram.

    Raises:
        subprocess.CalledProcessError: If ``lmplz`` exits with a non-zero
            status.
    """
    # NOTE(review): newer `datasets` releases appear to deprecate
    # `use_auth_token` in favour of `token` - confirm against the pinned version.
    dataset = load_dataset(
        "NbAiLab/NPSC",
        "16K_mp3",
        split="train+validation",
        use_auth_token=True,
    )
    dataset = dataset.filter(
        filter_dataset,
        desc="filtering out inaudible examples and keeping only nb-NO",
    ).map(
        replace_hatted_characters,
        desc="replacing hesitations and homophones",
    )

    # Explicit UTF-8: the transcripts contain non-ASCII letters and the
    # platform default encoding is not guaranteed to represent them.
    with open("npsc.txt", "w", encoding="utf-8") as text_file:
        text_file.write(" ".join(dataset[TEXT_COLUMN_NAME]))

    # Plain-Python equivalent of the shell line
    #   ~/bin/lmplz -o 5 <"npsc.txt" > "5gram.arpa.orig"
    lmplz = Path("~/bin/lmplz").expanduser()
    with open("npsc.txt", "rb") as corpus, open("5gram.arpa.orig", "wb") as arpa:
        subprocess.run(
            [str(lmplz), "-o", "5"],
            stdin=corpus,
            stdout=arpa,
            check=True,
        )

    _add_eos_to_arpa("5gram.arpa.orig", "5gram.arpa")
|
|
|
|
|
# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
|
|
|
|
|