davidpomerenke committed f3a09a2 (verified) · Parent(s): 338dc9b

Upload from GitHub Actions: Evaluate on autotranslated GSM dataset
datasets.json CHANGED
@@ -300,7 +300,22 @@
     "parallel": true,
     "translation": "machine",
     "base": "MGSM",
-    "implemented": true,
+    "implemented": false,
+    "group": "Grade School Math"
+  },
+  {
+    "name": "GSM Auto-Translated",
+    "author": null,
+    "author_url": null,
+    "url": null,
+    "n_languages": 52,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "translation": "machine",
+    "base": "MGSM",
+    "implemented": false,
     "group": "Grade School Math"
   },
   {
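The new "GSM Auto-Translated" entry is registered here with "implemented": false and wired up in the loader below. For reference, a minimal sketch of pulling one of the new configs from the Hub, mirroring the load_dataset call in translate_mgsm (the "da" tag is an illustrative assumption; configs are keyed by BCP-47 tags):

    from datasets import load_dataset

    # Each config of fair-forward/gsm-autotranslated holds one machine-translated
    # language, named by its BCP-47 tag; "da" is only an example, not confirmed here.
    ds = load_dataset("fair-forward/gsm-autotranslated", "da", split="test")
    print(ds[0]["question"])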
evals/datasets_/mgsm.py CHANGED
@@ -1,5 +1,12 @@
+import asyncio
+import os
+
+from datasets import Dataset, load_dataset
 from datasets_.util import _get_dataset_config_names, _load_dataset
-from langcodes import Language, standardize_tag
+from langcodes import standardize_tag
+from models import google_supported_languages, translate_google
+from tqdm import tqdm
+from tqdm.asyncio import tqdm_asyncio
 
 slug_mgsm = "juletxara/mgsm"
 tags_mgsm = {
@@ -14,6 +21,12 @@ tags_gsm8kx = {
     standardize_tag(a, macro=True): a
     for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
 }
+slug_gsm_autotranslated = "fair-forward/gsm-autotranslated"
+tags_gsm_autotranslated = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names(slug_gsm_autotranslated)
+}
+
 
 def parse_number(i):
     if isinstance(i, int):
@@ -23,6 +36,7 @@ def parse_number(i):
     except ValueError:
         return None
 
+
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
         ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
@@ -32,6 +46,11 @@ def load_mgsm(language_bcp_47, nr):
             slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
         )
         return slug_afrimgsm, ds[nr]
+    elif language_bcp_47 in tags_gsm_autotranslated.keys():
+        ds = _load_dataset(
+            slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
+        )
+        return slug_gsm_autotranslated, ds[nr]
     elif language_bcp_47 in tags_gsm8kx.keys():
         row = _load_dataset(
             slug_gsm8kx,
@@ -43,3 +62,39 @@ def load_mgsm(language_bcp_47, nr):
         return slug_gsm8kx, row
     else:
         return None, None
+
+
+def translate_mgsm(languages):
+    human_translated = [*tags_mgsm.keys(), *tags_afrimgsm.keys()]
+    untranslated = [
+        lang
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
+    ]
+    en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
+    slug = "fair-forward/gsm-autotranslated"
+    for lang in tqdm(untranslated):
+        # check if already exists on hub
+        try:
+            ds_lang = load_dataset(slug, lang, split="test")
+        except ValueError:
+            print(f"Translating {lang}...")
+            questions_tr = [translate_google(q, "en", lang) for q in en["question"]]
+            questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
+            ds_lang = Dataset.from_dict(
+                {
+                    "question": questions_tr,
+                    "answer": en["answer"],
+                    "answer_number": en["answer_number"],
+                    "equation_solution": en["equation_solution"],
+                }
+            )
+            ds_lang.push_to_hub(
+                slug,
+                split="test",
+                config_name=lang,
+                token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
+            )
+            ds_lang.to_json(
+                f"data/mgsm/{lang}.json", lines=False, force_ascii=False, indent=2
+            )
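With this change, load_mgsm resolves a language in order: MGSM, AfriMGSM, the new auto-translated set, then GSM8K-X, returning (None, None) when nothing matches. A hedged sketch of what a caller sees (the example tags are assumptions, not taken from this diff):

    from datasets_.mgsm import load_mgsm

    slug, row = load_mgsm("de", 0)  # human-translated -> ("juletxara/mgsm", {...})
    slug, row = load_mgsm("da", 0)  # machine-translated only -> ("fair-forward/gsm-autotranslated", {...})
    slug, row = load_mgsm("xx", 0)  # not covered anywhere -> (None, None)

Note that translate_mgsm builds the per-question translate_google coroutines first and then gathers them with tqdm_asyncio.gather, so all requests for one language run concurrently under the shared rate limit.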
evals/main.py CHANGED
@@ -1,5 +1,4 @@
 import asyncio
-from time import time
 
 import pandas as pd
 from languages import languages
@@ -16,7 +15,7 @@ n_sentences = 10
 
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
-    for n_languages in range(100, 101):
+    for n_languages in range(90, 101, 3):
         print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
         old_models = pd.read_json("models.json")
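The loop previously covered only the single value 100; the new step of 3 sweeps four language counts per run:

    >>> list(range(100, 101))
    [100]
    >>> list(range(90, 101, 3))
    [90, 93, 96, 99]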
evals/models.py CHANGED
@@ -34,7 +34,7 @@ important_models = [
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
     "mistralai/mistral-saba",  # 0.6$
     "mistralai/mistral-nemo",  # 0.08$
-    "google/gemini-2.5-flash-preview",  # 0.6$
+    "google/gemini-2.5-flash",  # 0.6$
     "google/gemini-2.0-flash-lite-001",  # 0.3$
     "google/gemma-3-27b-it",  # 0.2$
     # "qwen/qwen-turbo",  # 0.2$; recognizes "inappropriate content"
@@ -50,8 +50,14 @@ important_models = [
 
 blocklist = [
     "microsoft/wizardlm-2-8x22b",  # temporarily rate-limited
-    "google/gemini-2.5-pro",  # something wrong FIXME
-    "google/gemini-2.5-pro-preview",  # something wrong FIXME
+    "google/gemini-2.5-pro-preview",
+    "google/gemini-2.5-flash-preview",
+    "google/gemini-2.5-flash-lite-preview",
+    "google/gemini-2.5-flash-preview-04-17",
+    "google/gemini-2.5-flash-preview-05-20",
+    "google/gemini-2.5-flash-lite-preview-06-17",
+    "google/gemini-2.5-pro-preview-06-05",
+    "google/gemini-2.5-pro-preview-05-06",
 ]
 
 transcription_models = [
@@ -153,17 +159,11 @@ async def complete(**kwargs) -> str | None:
 
 
 translate_client = translate.Client()
-supported_languages = [l["language"] for l in translate_client.get_languages()]
+google_supported_languages = [l["language"] for l in translate_client.get_languages()]
 
 
 @cache
 async def translate_google(text, source_language, target_language):
-    source_language = closest_supported_match(source_language, supported_languages)
-    target_language = closest_supported_match(target_language, supported_languages)
-    if source_language == target_language:
-        return text
-    if source_language is None or target_language is None:
-        return None
     async with google_rate_limit:
         response = translate_client.translate(
             text, source_language=source_language, target_language=target_language
@@ -284,7 +284,7 @@ def load_models(date: date):
         ["translation_from", "translation_to", "classification", "mmlu", "mgsm"]
     ] * len(models)
     models = pd.concat([models, get_translation_models()])
-    models = models[ # temporary fix FIXME
+    models = models[  # temporary fix FIXME
         (models["id"] != "google/gemini-2.5-pro")
         & (models["id"] != "google/gemini-2.5-pro-preview")
     ]
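After this simplification, translate_google assumes its arguments are already normalized and supported; the list formerly called supported_languages is exported as google_supported_languages so callers can filter up front. A minimal consumer sketch (the candidate tags are illustrative, not from this diff):

    from models import google_supported_languages, translate_google

    # Filter before calling the API: translate_google no longer short-circuits
    # unsupported tags or identical source/target pairs itself.
    candidates = ["de", "da", "ab"]
    translatable = [t for t in candidates if t in google_supported_languages]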
evals/tasks.py CHANGED
@@ -1,12 +1,15 @@
 import random
 from functools import partial
 from textwrap import dedent
+
 import evaluate
 import pandas as pd
 import sentencepiece as spm
 from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
+from google.cloud import translate_v2 as translate
+from langcodes import closest_supported_match
 from languages import languages, script_name
 from models import complete, transcribe, translate_google
 
@@ -22,6 +25,9 @@ target_languages = languages[languages["in_benchmark"]].sample(
     frac=1, weights="speakers", replace=True, random_state=42
 )
 
+translate_client = translate.Client()
+supported_languages = [l["language"] for l in translate_client.get_languages()]
+
 
 async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
@@ -40,9 +46,18 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
     if model == "google/translate-v2":
-        prediction = await translate_google(
-            original_sentence, original_language.bcp_47, target_language.bcp_47
+        original_language = closest_supported_match(
+            original_language, supported_languages
         )
+        target_language = closest_supported_match(target_language, supported_languages)
+        if original_language == target_language:
+            prediction = original_sentence
+        elif original_language is None or target_language is None:
+            prediction = None
+        else:
+            prediction = await translate_google(
+                original_sentence, original_language.bcp_47, target_language.bcp_47
+            )
     else:
         prediction = await complete(
             model=model,
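The matching that used to live inside translate_google now happens at the call site: langcodes.closest_supported_match returns the nearest supported tag or None, and the new branches map "same language" to an identity prediction and "no match" to None. An illustrative sketch (the supported list is an assumption):

    from langcodes import closest_supported_match

    supported = ["en", "pt", "es"]
    closest_supported_match("pt-BR", supported)  # -> "pt" (within the default max_distance)
    closest_supported_match("ja", supported)     # -> None (no sufficiently close match)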
evals/translate.py ADDED
@@ -0,0 +1,5 @@
+from languages import languages
+from datasets_.mgsm import translate_mgsm
+
+if __name__ == "__main__":
+    translate_mgsm(languages)
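This entry point drives the one-off translation pass: since translate_mgsm first attempts load_dataset for each language and only translates on a ValueError, re-running the script skips configs that already exist on the Hub.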
results.json CHANGED
The diff for this file is too large to render. See raw diff