Upload from GitHub Actions: Translate MMLU and evaluate

Files changed:
- evals/datasets_/mmlu.py  +98 -30
- evals/main.py            +1 -2
- evals/translate.py       +2 -0
- languages.json           +4 -4
- results.json             +0 -0

evals/datasets_/mmlu.py
CHANGED
@@ -1,10 +1,17 @@
+import asyncio
+import os
 import random
 from collections import Counter, defaultdict
 
+from datasets import Dataset, load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
+from models import google_supported_languages, translate_google
 from rich import print
+from tqdm import tqdm
+from tqdm.asyncio import tqdm_asyncio
+
 
-from datasets_.util import _get_dataset_config_names, _load_dataset
 def print_counts(slug, subjects_dev, subjects_test):
     print(
         f"{slug:<25} {len(list(set(subjects_test))):>3} test categories, {len(subjects_test):>6} samples, {len(list(set(subjects_dev))):>3} dev categories, {len(subjects_dev):>6} dev samples"
@@ -123,48 +130,109 @@ def add_choices(row):
     return row
 
 
-def load_mmlu(language_bcp_47, nr):
-    categories = sorted(
+tags_afrimmlu = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("masakhane/afrimmlu")
+}
+tags_global_mmlu = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
+}
+tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
+tags_mmlux = set(
+    a.rsplit("_", 1)[1].split("-")[0].lower()
+    for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
+)
+tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+
+categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
+
+
+def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
-    random.seed(nr)
-    i = random.randint(0, 100)
-    tags_afrimmlu = {
-        standardize_tag(a, macro=True): a
-        for a in _get_dataset_config_names("masakhane/afrimmlu")
-    }
-    tags_global_mmlu = {
-        standardize_tag(a, macro=True): a
-        for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
-    }
-    tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
-    tags_mmlux = set(
-        a.rsplit("_", 1)[1].split("-")[0].lower()
-        for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
-    )
     if language_bcp_47 in tags_afrimmlu.keys():
         ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
         ds = ds.map(parse_choices)
         examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[i]
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
         return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
         ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
         ds = ds.map(add_choices)
         examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[i]
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
         return "CohereForAI/Global-MMLU", examples, task
-    elif language_bcp_47 in tags_okapi:
-
-        ds = _load_dataset(
-            "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
-        )
+    elif language_bcp_47 in tags_mmlu_autotranslated:
+        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
         examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[i]
-        return "lighteval/okapi_mmlu", examples, task
-    elif language_bcp_47 in tags_mmlux:
-        # loading this is more complicated, todo
-        return None, None, None
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
+
+
+def translate_mmlu(languages):
+    human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
+    untranslated = [
+        lang
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
+    ]
+    n_samples = 10
+
+    slug = "fair-forward/mmlu-autotranslated"
+    for lang in tqdm(untranslated):
+        # check if already exists on hub
+        try:
+            ds_lang = load_dataset(slug, lang)
+        except (ValueError, Exception):
+            print(f"Translating {lang}...")
+            for split in ["dev", "test"]:
+                ds = _load_dataset("masakhane/afrimmlu", "eng", split=split)
+                samples = []
+                for category in categories:
+                    if split == "dev":
+                        samples.extend(ds.filter(lambda x: x["subject"] == category))
+                    else:
+                        for i in range(n_samples):
+                            task = ds.filter(lambda x: x["subject"] == category)[i]
+                            samples.append(task)
+                questions_tr = [
+                    translate_google(s["question"], "en", lang) for s in samples
+                ]
+                questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
+                choices_texts_concatenated = []
+                for s in samples:
+                    for choice in eval(s["choices"]):
+                        choices_texts_concatenated.append(choice)
+                choices_tr = [
+                    translate_google(c, "en", lang) for c in choices_texts_concatenated
+                ]
+                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+                # group into chunks of 4
+                choices_tr = [
+                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+                ]
+
+                ds_lang = Dataset.from_dict(
+                    {
+                        "subject": [s["subject"] for s in samples],
+                        "question": questions_tr,
+                        "choices": choices_tr,
+                        "answer": [s["answer"] for s in samples],
+                    }
+                )
+                ds_lang.push_to_hub(
+                    slug,
+                    split=split,
+                    config_name=lang,
+                    token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
+                )
+                ds_lang.to_json(
+                    f"data/translations/mmlu/{lang}_{split}.json",
+                    lines=False,
+                    force_ascii=False,
+                    indent=2,
+                )

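Taken together, load_mmlu now resolves a language through three sources in order (human-translated AfriMMLU, human-translated Global-MMLU, then the machine-translated fair-forward/mmlu-autotranslated configs that translate_mmlu produces). A minimal usage sketch of the loader, not part of this commit, assuming the evals modules are importable and the Hub datasets are reachable; the language tag "et" is purely illustrative:

# Sketch only: how the updated loader is meant to be called.
from datasets_.mmlu import load_mmlu

# nr selects the category (nr % number of categories) and the test item within it.
slug, examples, task = load_mmlu("et", nr=3)

if slug is None:
    print("no human- or machine-translated MMLU available for this language")
else:
    print(slug)              # "masakhane/afrimmlu", "CohereForAI/Global-MMLU",
                             # or "fair-forward/mmlu-autotranslated"
    print(task["question"])  # the selected test question
    print(task["choices"])   # its four answer options; examples holds the dev few-shots
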
evals/main.py
CHANGED
@@ -15,8 +15,7 @@ n_sentences = 10
 
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
-
-    for n_languages in range(20, 21, 1):
+    for n_languages in range(20, 100, 20):
         print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
         old_models = pd.read_json("models.json")

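The only functional change here is the ramp schedule: instead of a single pass over 20 languages, evaluate() now steps through progressively larger language sets. A quick check of what the new range yields:

print(list(range(20, 100, 20)))  # [20, 40, 60, 80]
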
evals/translate.py
CHANGED
@@ -1,7 +1,9 @@
 from languages import languages
 from datasets_.mgsm import translate_mgsm
 from datasets_.arc import translate_arc
+from datasets_.mmlu import translate_mmlu
 
 if __name__ == "__main__":
+    translate_mmlu(languages)
     translate_mgsm(languages)
     translate_arc(languages)

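With this hook in place, running evals/translate.py also triggers the MMLU translation pass. A hedged sketch of invoking just that step on its own, assuming HUGGINGFACE_ACCESS_TOKEN is set (push_to_hub needs it) and that languages behaves like a pandas DataFrame with a bcp_47 column, which is how translate_mmlu reads it; the language codes and token below are placeholders:

import os

import pandas as pd

from datasets_.mmlu import translate_mmlu

os.environ.setdefault("HUGGINGFACE_ACCESS_TOKEN", "hf_xxx")  # placeholder token
languages = pd.DataFrame({"bcp_47": ["da", "et"]})           # illustrative subset
translate_mmlu(languages)  # skips configs already on the Hub, translates and pushes the rest
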
languages.json
CHANGED
@@ -7,7 +7,7 @@
     "family":"Indo-European",
     "flores_path":"eng_Latn",
     "fleurs_tag":"en_us",
-    "commonvoice_hours":
+    "commonvoice_hours":2674.0,
     "commonvoice_locale":"en",
     "in_benchmark":true
   },
@@ -1183,7 +1183,7 @@
     "family":"Indo-European",
     "flores_path":"bel_Cyrl",
     "fleurs_tag":"be_by",
-    "commonvoice_hours":
+    "commonvoice_hours":1810.0,
     "commonvoice_locale":"be",
     "in_benchmark":true
   },
@@ -3331,7 +3331,7 @@
     "family":"Indo-European",
     "flores_path":"gle_Latn",
     "fleurs_tag":"ga_ie",
-    "commonvoice_hours":8.
+    "commonvoice_hours":8.3,
     "commonvoice_locale":"ga-IE",
     "in_benchmark":true
   },
@@ -3535,7 +3535,7 @@
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":
+    "commonvoice_hours":440.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },

results.json
CHANGED
The diff for this file is too large to render. See raw diff.