Upload from GitHub Actions: Evaluate on autotranslated GSM dataset
Browse files
- datasets.json +16 -1
- evals/datasets_/mgsm.py +56 -1
- evals/main.py +1 -2
- evals/models.py +11 -11
- evals/tasks.py +17 -2
- evals/translate.py +5 -0
- results.json +0 -0
datasets.json
CHANGED
@@ -300,7 +300,22 @@
|
|
300 |
"parallel": true,
|
301 |
"translation": "machine",
|
302 |
"base": "MGSM",
|
303 |
-
"implemented":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
304 |
"group": "Grade School Math"
|
305 |
},
|
306 |
{
|
|
|
300 |
"parallel": true,
|
301 |
"translation": "machine",
|
302 |
"base": "MGSM",
|
303 |
+
"implemented": false,
|
304 |
+
"group": "Grade School Math"
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"name": "GSM Auto-Translated",
|
308 |
+
"author": null,
|
309 |
+
"author_url": null,
|
310 |
+
"url": null,
|
311 |
+
"n_languages": 52,
|
312 |
+
"tasks": [
|
313 |
+
"math"
|
314 |
+
],
|
315 |
+
"parallel": true,
|
316 |
+
"translation": "machine",
|
317 |
+
"base": "MGSM",
|
318 |
+
"implemented": false,
|
319 |
"group": "Grade School Math"
|
320 |
},
|
321 |
{
|
evals/datasets_/mgsm.py
CHANGED
@@ -1,5 +1,12 @@
|
|
|
|
|
|
|
|
|
|
1 |
from datasets_.util import _get_dataset_config_names, _load_dataset
|
2 |
-
from langcodes import
|
|
|
|
|
|
|
3 |
|
4 |
slug_mgsm = "juletxara/mgsm"
|
5 |
tags_mgsm = {
|
@@ -14,6 +21,12 @@ tags_gsm8kx = {
|
|
14 |
standardize_tag(a, macro=True): a
|
15 |
for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
|
16 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
def parse_number(i):
|
19 |
if isinstance(i, int):
|
@@ -23,6 +36,7 @@ def parse_number(i):
|
|
23 |
except ValueError:
|
24 |
return None
|
25 |
|
|
|
26 |
def load_mgsm(language_bcp_47, nr):
|
27 |
if language_bcp_47 in tags_mgsm.keys():
|
28 |
ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
|
@@ -32,6 +46,11 @@ def load_mgsm(language_bcp_47, nr):
|
|
32 |
slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
|
33 |
)
|
34 |
return slug_afrimgsm, ds[nr]
|
|
|
|
|
|
|
|
|
|
|
35 |
elif language_bcp_47 in tags_gsm8kx.keys():
|
36 |
row = _load_dataset(
|
37 |
slug_gsm8kx,
|
@@ -43,3 +62,39 @@ def load_mgsm(language_bcp_47, nr):
|
|
43 |
return slug_gsm8kx, row
|
44 |
else:
|
45 |
return None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import os
|
3 |
+
|
4 |
+
from datasets import Dataset, load_dataset
|
5 |
from datasets_.util import _get_dataset_config_names, _load_dataset
|
6 |
+
from langcodes import standardize_tag
|
7 |
+
from models import google_supported_languages, translate_google
|
8 |
+
from tqdm import tqdm
|
9 |
+
from tqdm.asyncio import tqdm_asyncio
|
10 |
|
11 |
slug_mgsm = "juletxara/mgsm"
|
12 |
tags_mgsm = {
|
|
|
21 |
standardize_tag(a, macro=True): a
|
22 |
for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
|
23 |
}
|
24 |
+
# Hub slug of the machine-translated GSM dataset maintained by this project.
slug_gsm_autotranslated = "fair-forward/gsm-autotranslated"
# Map standardized BCP-47 (macro-language) tags to the dataset's config names,
# mirroring how tags_mgsm / tags_gsm8kx are built above.
tags_gsm_autotranslated = {
    standardize_tag(a, macro=True): a
    for a in _get_dataset_config_names(slug_gsm_autotranslated)
}
|
29 |
+
|
30 |
|
31 |
def parse_number(i):
|
32 |
if isinstance(i, int):
|
|
|
36 |
except ValueError:
|
37 |
return None
|
38 |
|
39 |
+
|
40 |
def load_mgsm(language_bcp_47, nr):
|
41 |
if language_bcp_47 in tags_mgsm.keys():
|
42 |
ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
|
|
|
46 |
slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
|
47 |
)
|
48 |
return slug_afrimgsm, ds[nr]
|
49 |
+
elif language_bcp_47 in tags_gsm_autotranslated.keys():
|
50 |
+
ds = _load_dataset(
|
51 |
+
slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
|
52 |
+
)
|
53 |
+
return slug_gsm_autotranslated, ds[nr]
|
54 |
elif language_bcp_47 in tags_gsm8kx.keys():
|
55 |
row = _load_dataset(
|
56 |
slug_gsm8kx,
|
|
|
62 |
return slug_gsm8kx, row
|
63 |
else:
|
64 |
return None, None
|
65 |
+
|
66 |
+
|
67 |
+
def translate_mgsm(languages):
    """Machine-translate the English MGSM test split and publish the results.

    For each of the first 100 languages in ``languages["bcp_47"]`` that has no
    human translation (not in MGSM or AfriMGSM) but is supported by Google
    Translate, translate every question and push the resulting split to the
    ``fair-forward/gsm-autotranslated`` Hub dataset, plus a local JSON copy.

    Args:
        languages: table with a ``bcp_47`` column (pandas-style ``.values``
            access), ordered by priority.

    Side effects: network calls (Google Translate, Hugging Face Hub) and files
    written under ``data/mgsm/``. Reads HUGGINGFACE_ACCESS_TOKEN from the
    environment for the Hub push.
    """
    human_translated = [*tags_mgsm.keys(), *tags_afrimgsm.keys()]
    untranslated = [
        lang
        for lang in languages["bcp_47"].values[:100]
        if lang not in human_translated and lang in google_supported_languages
    ]
    en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
    for lang in tqdm(untranslated):
        try:
            # Probe the Hub: if this config already exists, skip the language.
            # (Previously the slug was re-hard-coded here; use the module
            # constant so the target dataset is defined in one place.)
            load_dataset(slug_gsm_autotranslated, lang, split="test")
        except ValueError:
            print(f"Translating {lang}...")
            # Kick off one async translation per question, then gather them
            # with a progress bar.
            questions_tr = [translate_google(q, "en", lang) for q in en["question"]]
            questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
            ds_lang = Dataset.from_dict(
                {
                    "question": questions_tr,
                    "answer": en["answer"],
                    "answer_number": en["answer_number"],
                    "equation_solution": en["equation_solution"],
                }
            )
            ds_lang.push_to_hub(
                slug_gsm_autotranslated,
                split="test",
                config_name=lang,
                token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
            )
            ds_lang.to_json(
                f"data/mgsm/{lang}.json", lines=False, force_ascii=False, indent=2
            )
|
evals/main.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import asyncio
|
2 |
-
from time import time
|
3 |
|
4 |
import pandas as pd
|
5 |
from languages import languages
|
@@ -16,7 +15,7 @@ n_sentences = 10
|
|
16 |
|
17 |
async def evaluate():
|
18 |
# FIXME we should not need this for-loop, but it helps
|
19 |
-
for n_languages in range(
|
20 |
print(f"running evaluations for {n_languages} languages")
|
21 |
old_results = pd.read_json("results.json")
|
22 |
old_models = pd.read_json("models.json")
|
|
|
1 |
import asyncio
|
|
|
2 |
|
3 |
import pandas as pd
|
4 |
from languages import languages
|
|
|
15 |
|
16 |
async def evaluate():
|
17 |
# FIXME we should not need this for-loop, but it helps
|
18 |
+
for n_languages in range(90, 101, 3):
|
19 |
print(f"running evaluations for {n_languages} languages")
|
20 |
old_results = pd.read_json("results.json")
|
21 |
old_models = pd.read_json("models.json")
|
evals/models.py
CHANGED
@@ -34,7 +34,7 @@ important_models = [
|
|
34 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
35 |
"mistralai/mistral-saba", # 0.6$
|
36 |
"mistralai/mistral-nemo", # 0.08$
|
37 |
-
"google/gemini-2.5-flash
|
38 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
39 |
"google/gemma-3-27b-it", # 0.2$
|
40 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
@@ -50,8 +50,14 @@ important_models = [
|
|
50 |
|
51 |
blocklist = [
|
52 |
"microsoft/wizardlm-2-8x22b", # temporarily rate-limited
|
53 |
-
"google/gemini-2.5-pro",
|
54 |
-
"google/gemini-2.5-
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
]
|
56 |
|
57 |
transcription_models = [
|
@@ -153,17 +159,11 @@ async def complete(**kwargs) -> str | None:
|
|
153 |
|
154 |
|
155 |
translate_client = translate.Client()
|
156 |
-
|
157 |
|
158 |
|
159 |
@cache
|
160 |
async def translate_google(text, source_language, target_language):
|
161 |
-
source_language = closest_supported_match(source_language, supported_languages)
|
162 |
-
target_language = closest_supported_match(target_language, supported_languages)
|
163 |
-
if source_language == target_language:
|
164 |
-
return text
|
165 |
-
if source_language is None or target_language is None:
|
166 |
-
return None
|
167 |
async with google_rate_limit:
|
168 |
response = translate_client.translate(
|
169 |
text, source_language=source_language, target_language=target_language
|
@@ -284,7 +284,7 @@ def load_models(date: date):
|
|
284 |
["translation_from", "translation_to", "classification", "mmlu", "mgsm"]
|
285 |
] * len(models)
|
286 |
models = pd.concat([models, get_translation_models()])
|
287 |
-
models = models[
|
288 |
(models["id"] != "google/gemini-2.5-pro")
|
289 |
& (models["id"] != "google/gemini-2.5-pro-preview")
|
290 |
]
|
|
|
34 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
35 |
"mistralai/mistral-saba", # 0.6$
|
36 |
"mistralai/mistral-nemo", # 0.08$
|
37 |
+
"google/gemini-2.5-flash", # 0.6$
|
38 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
39 |
"google/gemma-3-27b-it", # 0.2$
|
40 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
|
|
50 |
|
51 |
# Model IDs excluded from evaluation runs.
blocklist = [
    "microsoft/wizardlm-2-8x22b", # temporarily rate-limited
    # Gemini 2.5 preview checkpoints: only the stable releases are evaluated.
    "google/gemini-2.5-pro-preview",
    "google/gemini-2.5-flash-preview",
    "google/gemini-2.5-flash-lite-preview",
    "google/gemini-2.5-flash-preview-04-17",
    "google/gemini-2.5-flash-preview-05-20",
    "google/gemini-2.5-flash-lite-preview-06-17",
    "google/gemini-2.5-pro-preview-06-05",
    "google/gemini-2.5-pro-preview-05-06",
]
|
62 |
|
63 |
transcription_models = [
|
|
|
159 |
|
160 |
|
161 |
translate_client = translate.Client()
|
162 |
+
# Language codes the Google Translate v2 API accepts, fetched once at import
# time from the shared client.
google_supported_languages = [
    entry["language"] for entry in translate_client.get_languages()
]
|
163 |
|
164 |
|
165 |
@cache
|
166 |
async def translate_google(text, source_language, target_language):
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
async with google_rate_limit:
|
168 |
response = translate_client.translate(
|
169 |
text, source_language=source_language, target_language=target_language
|
|
|
284 |
["translation_from", "translation_to", "classification", "mmlu", "mgsm"]
|
285 |
] * len(models)
|
286 |
models = pd.concat([models, get_translation_models()])
|
287 |
+
models = models[ # temporary fix FIXME
|
288 |
(models["id"] != "google/gemini-2.5-pro")
|
289 |
& (models["id"] != "google/gemini-2.5-pro-preview")
|
290 |
]
|
evals/tasks.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1 |
import random
|
2 |
from functools import partial
|
3 |
from textwrap import dedent
|
|
|
4 |
import evaluate
|
5 |
import pandas as pd
|
6 |
import sentencepiece as spm
|
7 |
from datasets_.flores import flores_sentences
|
8 |
from datasets_.mgsm import load_mgsm, parse_number
|
9 |
from datasets_.mmlu import load_mmlu
|
|
|
|
|
10 |
from languages import languages, script_name
|
11 |
from models import complete, transcribe, translate_google
|
12 |
|
@@ -22,6 +25,9 @@ target_languages = languages[languages["in_benchmark"]].sample(
|
|
22 |
frac=1, weights="speakers", replace=True, random_state=42
|
23 |
)
|
24 |
|
|
|
|
|
|
|
25 |
|
26 |
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
27 |
original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
@@ -40,9 +46,18 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
|
40 |
target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
|
41 |
script = script_name(target_language.flores_path.split("_")[1])
|
42 |
if model == "google/translate-v2":
|
43 |
-
|
44 |
-
|
45 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
else:
|
47 |
prediction = await complete(
|
48 |
model=model,
|
|
|
1 |
import random
|
2 |
from functools import partial
|
3 |
from textwrap import dedent
|
4 |
+
|
5 |
import evaluate
|
6 |
import pandas as pd
|
7 |
import sentencepiece as spm
|
8 |
from datasets_.flores import flores_sentences
|
9 |
from datasets_.mgsm import load_mgsm, parse_number
|
10 |
from datasets_.mmlu import load_mmlu
|
11 |
+
from google.cloud import translate_v2 as translate
|
12 |
+
from langcodes import closest_supported_match
|
13 |
from languages import languages, script_name
|
14 |
from models import complete, transcribe, translate_google
|
15 |
|
|
|
25 |
frac=1, weights="speakers", replace=True, random_state=42
|
26 |
)
|
27 |
|
28 |
+
# Google Cloud Translate v2 client used by translate_and_evaluate below.
# NOTE(review): models.py builds the same client and language list — consider
# importing google_supported_languages from there instead; verify with callers.
translate_client = translate.Client()
# Language codes the Translate API accepts, fetched once at import time.
supported_languages = [entry["language"] for entry in translate_client.get_languages()]
|
30 |
+
|
31 |
|
32 |
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
33 |
original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
|
|
46 |
target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
|
47 |
script = script_name(target_language.flores_path.split("_")[1])
|
48 |
if model == "google/translate-v2":
|
49 |
+
original_language = closest_supported_match(
|
50 |
+
original_language, supported_languages
|
51 |
)
|
52 |
+
target_language = closest_supported_match(target_language, supported_languages)
|
53 |
+
if original_language == target_language:
|
54 |
+
prediction = original_sentence
|
55 |
+
elif original_language is None or target_language is None:
|
56 |
+
prediction = None
|
57 |
+
else:
|
58 |
+
prediction = await translate_google(
|
59 |
+
original_sentence, original_language.bcp_47, target_language.bcp_47
|
60 |
+
)
|
61 |
else:
|
62 |
prediction = await complete(
|
63 |
model=model,
|
evals/translate.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets_.mgsm import translate_mgsm
from languages import languages


def main():
    """Run the MGSM auto-translation pipeline over the project's languages."""
    translate_mgsm(languages)


if __name__ == "__main__":
    main()
|
results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|