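"""Load the human-translated Uhura ARC-Easy benchmark and auto-translate the
English ARC-Easy samples into additional languages via Google Translate."""
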
import asyncio
import os
import random

from datasets import Dataset, load_dataset
from langcodes import standardize_tag
from rich import print
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

from datasets_.util import _get_dataset_config_names, _load_dataset
from models import google_supported_languages, translate_google

slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
tags_uhura_arc_easy = {
    standardize_tag(a.split("_")[0], macro=True): a
    for a in _get_dataset_config_names(slug_uhura_arc_easy)
    if not a.endswith("unmatched")
}
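
# Fix the RNG seed so the shuffled id order is reproducible, then keep only
# the question ids present in every language's split, so that all languages
# are evaluated on the same items.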
random.seed(42)
id_sets_train = [
    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
    for tag in tags_uhura_arc_easy.values()
]
common_ids_train = sorted(set.intersection(*id_sets_train))
random.shuffle(common_ids_train)
id_sets_test = [
    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
    for tag in tags_uhura_arc_easy.values()
]
common_ids_test = sorted(set.intersection(*id_sets_test))
random.shuffle(common_ids_test)

slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
tags_uhura_arc_easy_translated = {
    standardize_tag(a.split("_")[0], macro=True): a
    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
}
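
# The Uhura rows store choices as {"text": [...], "label": [...]}; keep just
# the answer texts so they can be dropped straight into a prompt.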
def add_choices(row):
    row["choices"] = row["choices"]["text"]
    return row
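
# Return (dataset slug, few-shot examples, test task) for one language,
# preferring the human translations and falling back to auto-translations.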
def load_uhura_arc_easy(language_bcp_47, nr):
    if language_bcp_47 in tags_uhura_arc_easy:
        ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
        ds = ds.map(add_choices)
        ds = ds.rename_column("answerKey", "answer")
        # 3 few-shot examples per task, sliding with the task index `nr`
        train_ids = common_ids_train[nr : nr + 3]
        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
        task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
        return slug_uhura_arc_easy, examples, task
    elif language_bcp_47 in tags_uhura_arc_easy_translated:
        ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
        ds = ds.rename_column("answerKey", "answer")
        train_ids = common_ids_train[nr : nr + 3]
        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
        task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
        return slug_uhura_arc_easy_translated, examples, task
    else:
        return None, None, None
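
# Machine-translate the English ARC-Easy samples into the top languages that
# lack a human translation, then push each language as a config to the Hub.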
def translate_arc(languages):
    human_translated = tags_uhura_arc_easy.keys()
    untranslated = [
        lang
        for lang in languages["bcp_47"].values[:100]
        if lang not in human_translated and lang in google_supported_languages
    ]
    n_samples = 10
    # Each task nr in 0..n_samples-1 draws its examples from
    # common_ids_train[nr:nr+3], so the first n_samples + 3 ids cover all
    # few-shot windows.
    train_ids = common_ids_train[: n_samples + 3]
    en_train = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train")
    en_train = en_train.filter(lambda x: x["id"] in train_ids)
    test_ids = common_ids_test[:n_samples]
    en_test = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test")
    en_test = en_test.filter(lambda x: x["id"] in test_ids)
    data = {"train": en_train, "test": en_test}
    slug = "fair-forward/arc-easy-autotranslated"
    for lang in tqdm(untranslated):
        # Skip languages whose translation already exists on the Hub.
        try:
            load_dataset(slug, lang)
        except Exception:
            print(f"Translating {lang}...")
            for split, data_en in data.items():
                questions_tr = [translate_google(q, "en", lang) for q in data_en["question"]]
                questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                # Flatten all answer options into one list so they can be
                # translated as a single batch.
                choices_texts_concatenated = []
                for choice in data_en["choices"]:
                    for option in choice["text"]:
                        choices_texts_concatenated.append(option)
                choices_tr = [translate_google(c, "en", lang) for c in choices_texts_concatenated]
                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                # Regroup into chunks of 4; this assumes every sampled
                # question has exactly 4 answer options.
                choices_tr = [choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)]
                ds_lang = Dataset.from_dict(
                    {
                        "id": data_en["id"],
                        "question": questions_tr,
                        "choices": choices_tr,
                        "answerKey": data_en["answerKey"],
                    }
                )
                ds_lang.push_to_hub(
                    slug,
                    split=split,
                    config_name=lang,
                    token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                )
                ds_lang.to_json(
                    f"data/translations/arc/{lang}_{split}.json",
                    lines=False,
                    force_ascii=False,
                    indent=2,
                )
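
# Hypothetical usage sketch (an assumption, not part of the original
# pipeline): fetch the first task for Hausa, assuming "ha" is among the
# Uhura configs, and print it.
if __name__ == "__main__":
    slug, examples, task = load_uhura_arc_easy("ha", nr=0)
    if task is not None:
        print(slug, task["question"], task["choices"], task["answer"])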