davidpomerenke committed (verified)
Commit 4c5c136 · 1 Parent(s): b0aa389

Upload from GitHub Actions: Translate MMLU and evaluate

evals/datasets_/mmlu.py CHANGED
@@ -1,10 +1,17 @@
+import asyncio
+import os
 import random
 from collections import Counter, defaultdict
 
+from datasets import Dataset, load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
+from models import google_supported_languages, translate_google
 from rich import print
+from tqdm import tqdm
+from tqdm.asyncio import tqdm_asyncio
+
 
-from datasets_.util import _get_dataset_config_names, _load_dataset
 def print_counts(slug, subjects_dev, subjects_test):
     print(
         f"{slug:<25} {len(list(set(subjects_test))):>3} test categories, {len(subjects_test):>6} samples, {len(list(set(subjects_dev))):>3} dev categories, {len(subjects_dev):>6} dev samples"
@@ -123,48 +130,109 @@ def add_choices(row):
     return row
 
 
-def load_mmlu(language_bcp_47, nr):
-    categories = sorted(
+tags_afrimmlu = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("masakhane/afrimmlu")
+}
+tags_global_mmlu = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
+}
+tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
+tags_mmlux = set(
+    a.rsplit("_", 1)[1].split("-")[0].lower()
+    for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
+)
+tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+
+categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
+
+
+def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
-    random.seed(nr)
-    i = random.randint(0, 100)
-    tags_afrimmlu = {
-        standardize_tag(a, macro=True): a
-        for a in _get_dataset_config_names("masakhane/afrimmlu")
-    }
-    tags_global_mmlu = {
-        standardize_tag(a, macro=True): a
-        for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
-    }
-    tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
-    tags_mmlux = set(
-        a.rsplit("_", 1)[1].split("-")[0].lower()
-        for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
-    )
     if language_bcp_47 in tags_afrimmlu.keys():
         ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
         ds = ds.map(parse_choices)
         examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[i]
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
         return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
         ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
         ds = ds.map(add_choices)
         examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[i]
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
         return "CohereForAI/Global-MMLU", examples, task
-    elif language_bcp_47 in tags_okapi:
-        return None, None, None  # FIXME
-        ds = _load_dataset(
-            "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
-        )
+    elif language_bcp_47 in tags_mmlu_autotranslated:
+        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
         examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["id"] == f"{category}/test/{i}")[0]
-        return "lighteval/okapi_mmlu", examples, task
-    elif language_bcp_47 in tags_mmlux:
-        # loading this is more complicated, todo
-        return None, None, None
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
+
+
+def translate_mmlu(languages):
+    human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
+    untranslated = [
+        lang
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
+    ]
+    n_samples = 10
+
+    slug = "fair-forward/mmlu-autotranslated"
+    for lang in tqdm(untranslated):
+        # check if already exists on hub
+        try:
+            ds_lang = load_dataset(slug, lang)
+        except (ValueError, Exception):
+            print(f"Translating {lang}...")
+            for split in ["dev", "test"]:
+                ds = _load_dataset("masakhane/afrimmlu", "eng", split=split)
+                samples = []
+                for category in categories:
+                    if split == "dev":
+                        samples.extend(ds.filter(lambda x: x["subject"] == category))
+                    else:
+                        for i in range(n_samples):
+                            task = ds.filter(lambda x: x["subject"] == category)[i]
+                            samples.append(task)
+                questions_tr = [
+                    translate_google(s["question"], "en", lang) for s in samples
+                ]
+                questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
+                choices_texts_concatenated = []
+                for s in samples:
+                    for choice in eval(s["choices"]):
+                        choices_texts_concatenated.append(choice)
+                choices_tr = [
+                    translate_google(c, "en", lang) for c in choices_texts_concatenated
+                ]
+                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+                # group into chunks of 4
+                choices_tr = [
+                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+                ]
+
+                ds_lang = Dataset.from_dict(
+                    {
+                        "subject": [s["subject"] for s in samples],
+                        "question": questions_tr,
+                        "choices": choices_tr,
+                        "answer": [s["answer"] for s in samples],
+                    }
+                )
+                ds_lang.push_to_hub(
+                    slug,
+                    split=split,
+                    config_name=lang,
+                    token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
+                )
+                ds_lang.to_json(
+                    f"data/translations/mmlu/{lang}_{split}.json",
+                    lines=False,
+                    force_ascii=False,
+                    indent=2,
+                )
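For context, the reworked load_mmlu returns a (dataset_slug, examples, task) triple, with nr selecting both the category and the row within it. A minimal usage sketch, not part of the commit; it assumes the code is run from the evals/ directory so that datasets_.mmlu is importable, and uses German ("de") as an example tag that one of the tag sets is expected to cover:

# Hedged sketch: sample a few MMLU tasks for one language.
from datasets_.mmlu import load_mmlu

for nr in range(3):
    slug, examples, task = load_mmlu("de", nr)
    if slug is None:
        print(f"no MMLU source found for task {nr}")
        continue
    # `examples` are the few-shot dev rows for the selected category;
    # `task` is a single test row with subject/question/choices/answer columns.
    print(slug, task["subject"], task["question"][:60])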
evals/main.py CHANGED
@@ -15,8 +15,7 @@ n_sentences = 10
 
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
-    # for n_languages in range(20, 101, 20):
-    for n_languages in range(20, 21, 1):
+    for n_languages in range(20, 100, 20):
         print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
         old_models = pd.read_json("models.json")
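For reference, the replaced loop now sweeps several language counts instead of a single one; a quick check of the values each range yields:

# Old loop: range(20, 21, 1) runs only once, at 20 languages.
# New loop: range(20, 100, 20) runs at 20, 40, 60, and 80 languages.
print(list(range(20, 21, 1)))    # [20]
print(list(range(20, 100, 20)))  # [20, 40, 60, 80]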
evals/translate.py CHANGED
@@ -1,7 +1,9 @@
 from languages import languages
 from datasets_.mgsm import translate_mgsm
 from datasets_.arc import translate_arc
+from datasets_.mmlu import translate_mmlu
 
 if __name__ == "__main__":
+    translate_mmlu(languages)
     translate_mgsm(languages)
     translate_arc(languages)
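translate_mmlu(languages) indexes its argument as languages["bcp_47"].values[:100], i.e. a table with a bcp_47 column (presumably the pandas DataFrame built by languages.py). A hypothetical stand-in to illustrate the expected shape; the real object comes from the languages module:

# Illustrative stub only, not the project's actual data.
import pandas as pd

languages_stub = pd.DataFrame({"bcp_47": ["en", "be", "ga", "eu"]})
# translate_mmlu iterates over the first 100 BCP-47 tags and skips languages that
# already have human-translated MMLU data or that Google Translate does not support.
print(languages_stub["bcp_47"].values[:100])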
languages.json CHANGED
@@ -7,7 +7,7 @@
     "family":"Indo-European",
     "flores_path":"eng_Latn",
     "fleurs_tag":"en_us",
-    "commonvoice_hours":2673.0,
+    "commonvoice_hours":2674.0,
     "commonvoice_locale":"en",
     "in_benchmark":true
   },
@@ -1183,7 +1183,7 @@
     "family":"Indo-European",
     "flores_path":"bel_Cyrl",
     "fleurs_tag":"be_by",
-    "commonvoice_hours":1809.0,
+    "commonvoice_hours":1810.0,
     "commonvoice_locale":"be",
     "in_benchmark":true
   },
@@ -3331,7 +3331,7 @@
     "family":"Indo-European",
     "flores_path":"gle_Latn",
     "fleurs_tag":"ga_ie",
-    "commonvoice_hours":8.2,
+    "commonvoice_hours":8.3,
     "commonvoice_locale":"ga-IE",
     "in_benchmark":true
   },
@@ -3535,7 +3535,7 @@
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":438.0,
+    "commonvoice_hours":440.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
results.json CHANGED
The diff for this file is too large to render. See raw diff