Upload from GitHub Actions: Get more results, compute average based on all tasks
Browse files- datasets.json +8 -8
- evals/backend.py +3 -4
- evals/datasets_/truthfulqa.py +30 -0
- evals/main.py +1 -1
- evals/models.py +11 -13
- evals/tasks.py +79 -9
- frontend/src/components/ScoreColumns.js +1 -1
- languages.json +6 -6
- models.json +121 -4
- results.json +0 -0
datasets.json
CHANGED
@@ -256,7 +256,7 @@
|
|
256 |
"parallel": true,
|
257 |
"translation": "machine",
|
258 |
"base": "MMLU",
|
259 |
-
"implemented":
|
260 |
"group": "Multitask Language Understanding"
|
261 |
},
|
262 |
{
|
@@ -300,7 +300,7 @@
|
|
300 |
"parallel": true,
|
301 |
"translation": "machine",
|
302 |
"base": "MGSM",
|
303 |
-
"implemented":
|
304 |
"group": "Grade School Math"
|
305 |
},
|
306 |
{
|
@@ -315,7 +315,7 @@
|
|
315 |
"parallel": true,
|
316 |
"translation": "machine",
|
317 |
"base": "MGSM",
|
318 |
-
"implemented":
|
319 |
"group": "Grade School Math"
|
320 |
},
|
321 |
{
|
@@ -345,7 +345,7 @@
|
|
345 |
"parallel": true,
|
346 |
"translation": "machine",
|
347 |
"base": "AI2 ARC",
|
348 |
-
"implemented":
|
349 |
"group": "Abstract Reasoning"
|
350 |
},
|
351 |
{
|
@@ -360,7 +360,7 @@
|
|
360 |
"parallel": true,
|
361 |
"translation": "machine",
|
362 |
"base": "AI2 ARC",
|
363 |
-
"implemented":
|
364 |
"group": "Abstract Reasoning"
|
365 |
},
|
366 |
{
|
@@ -375,7 +375,7 @@
|
|
375 |
"parallel": true,
|
376 |
"translation": "human",
|
377 |
"base": "TruthfulQA",
|
378 |
-
"implemented":
|
379 |
"group": "Truthfulness"
|
380 |
},
|
381 |
{
|
@@ -390,7 +390,7 @@
|
|
390 |
"parallel": true,
|
391 |
"translation": "machine",
|
392 |
"base": "TruthfulQA",
|
393 |
-
"implemented":
|
394 |
"group": "Truthfulness"
|
395 |
},
|
396 |
{
|
@@ -405,7 +405,7 @@
|
|
405 |
"parallel": true,
|
406 |
"translation": "machine",
|
407 |
"base": "TruthfulQA",
|
408 |
-
"implemented":
|
409 |
"group": "Truthfulness"
|
410 |
},
|
411 |
{
|
|
|
256 |
"parallel": true,
|
257 |
"translation": "machine",
|
258 |
"base": "MMLU",
|
259 |
+
"implemented": true,
|
260 |
"group": "Multitask Language Understanding"
|
261 |
},
|
262 |
{
|
|
|
300 |
"parallel": true,
|
301 |
"translation": "machine",
|
302 |
"base": "MGSM",
|
303 |
+
"implemented": true,
|
304 |
"group": "Grade School Math"
|
305 |
},
|
306 |
{
|
|
|
315 |
"parallel": true,
|
316 |
"translation": "machine",
|
317 |
"base": "MGSM",
|
318 |
+
"implemented": true,
|
319 |
"group": "Grade School Math"
|
320 |
},
|
321 |
{
|
|
|
345 |
"parallel": true,
|
346 |
"translation": "machine",
|
347 |
"base": "AI2 ARC",
|
348 |
+
"implemented": true,
|
349 |
"group": "Abstract Reasoning"
|
350 |
},
|
351 |
{
|
|
|
360 |
"parallel": true,
|
361 |
"translation": "machine",
|
362 |
"base": "AI2 ARC",
|
363 |
+
"implemented": true,
|
364 |
"group": "Abstract Reasoning"
|
365 |
},
|
366 |
{
|
|
|
375 |
"parallel": true,
|
376 |
"translation": "human",
|
377 |
"base": "TruthfulQA",
|
378 |
+
"implemented": true,
|
379 |
"group": "Truthfulness"
|
380 |
},
|
381 |
{
|
|
|
390 |
"parallel": true,
|
391 |
"translation": "machine",
|
392 |
"base": "TruthfulQA",
|
393 |
+
"implemented": true,
|
394 |
"group": "Truthfulness"
|
395 |
},
|
396 |
{
|
|
|
405 |
"parallel": true,
|
406 |
"translation": "machine",
|
407 |
"base": "TruthfulQA",
|
408 |
+
"implemented": true,
|
409 |
"group": "Truthfulness"
|
410 |
},
|
411 |
{
|
evals/backend.py
CHANGED
@@ -26,11 +26,10 @@ task_metrics = [
|
|
26 |
"classification_accuracy",
|
27 |
"mmlu_accuracy",
|
28 |
"arc_accuracy",
|
|
|
29 |
"mgsm_accuracy",
|
30 |
]
|
31 |
|
32 |
-
task_metrics_basic = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
|
33 |
-
|
34 |
|
35 |
def compute_normalized_average(df, metrics):
|
36 |
"""Compute average of min-max normalized metric columns."""
|
@@ -58,7 +57,7 @@ def make_model_table(df, models):
|
|
58 |
for metric in task_metrics:
|
59 |
if metric not in df.columns:
|
60 |
df[metric] = np.nan
|
61 |
-
df["average"] = compute_normalized_average(df,
|
62 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
63 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
64 |
df["rank"] = df.index + 1
|
@@ -93,7 +92,7 @@ def make_language_table(df, languages):
|
|
93 |
for metric in task_metrics:
|
94 |
if metric not in df.columns:
|
95 |
df[metric] = np.nan
|
96 |
-
df["average"] = compute_normalized_average(df,
|
97 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
98 |
df = df.sort_values(by="speakers", ascending=False)
|
99 |
df = df[
|
|
|
26 |
"classification_accuracy",
|
27 |
"mmlu_accuracy",
|
28 |
"arc_accuracy",
|
29 |
+
"truthfulqa_accuracy",
|
30 |
"mgsm_accuracy",
|
31 |
]
|
32 |
|
|
|
|
|
33 |
|
34 |
def compute_normalized_average(df, metrics):
|
35 |
"""Compute average of min-max normalized metric columns."""
|
|
|
57 |
for metric in task_metrics:
|
58 |
if metric not in df.columns:
|
59 |
df[metric] = np.nan
|
60 |
+
df["average"] = compute_normalized_average(df, task_metrics)
|
61 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
62 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
63 |
df["rank"] = df.index + 1
|
|
|
92 |
for metric in task_metrics:
|
93 |
if metric not in df.columns:
|
94 |
df[metric] = np.nan
|
95 |
+
df["average"] = compute_normalized_average(df, task_metrics)
|
96 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
97 |
df = df.sort_values(by="speakers", ascending=False)
|
98 |
df = df[
|
evals/datasets_/truthfulqa.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
from collections import Counter, defaultdict
|
3 |
+
|
4 |
+
from langcodes import Language, standardize_tag
|
5 |
+
from rich import print
|
6 |
+
|
7 |
+
from datasets_.util import _get_dataset_config_names, _load_dataset
|
8 |
+
|
9 |
+
slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
|
10 |
+
tags_uhura_truthfulqa = {
|
11 |
+
standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
|
12 |
+
if a.endswith("multiple_choice")
|
13 |
+
}
|
14 |
+
|
15 |
+
|
16 |
+
def add_choices(row):
|
17 |
+
row["choices"] = row["mc1_targets"]["choices"]
|
18 |
+
row["labels"] = row["mc1_targets"]["labels"]
|
19 |
+
return row
|
20 |
+
|
21 |
+
|
22 |
+
def load_truthfulqa(language_bcp_47, nr):
|
23 |
+
if language_bcp_47 in tags_uhura_truthfulqa.keys():
|
24 |
+
ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
|
25 |
+
ds = ds.map(add_choices)
|
26 |
+
examples = ds["train"]
|
27 |
+
task = ds["test"][nr]
|
28 |
+
return "masakhane/uhura-truthfulqa", examples, task
|
29 |
+
else:
|
30 |
+
return None, None, None
|
evals/main.py
CHANGED
@@ -15,7 +15,7 @@ n_sentences = 10
|
|
15 |
|
16 |
async def evaluate():
|
17 |
# FIXME we should not need this for-loop, but it helps
|
18 |
-
for n_languages in range(
|
19 |
print(f"running evaluations for {n_languages} languages")
|
20 |
old_results = pd.read_json("results.json")
|
21 |
old_models = pd.read_json("models.json")
|
|
|
15 |
|
16 |
async def evaluate():
|
17 |
# FIXME we should not need this for-loop, but it helps
|
18 |
+
for n_languages in range(10, 101, 10):
|
19 |
print(f"running evaluations for {n_languages} languages")
|
20 |
old_results = pd.read_json("results.json")
|
21 |
old_models = pd.read_json("models.json")
|
evals/models.py
CHANGED
@@ -11,8 +11,7 @@ from elevenlabs import AsyncElevenLabs
|
|
11 |
from google.cloud import translate_v2 as translate
|
12 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
13 |
from joblib.memory import Memory
|
14 |
-
from
|
15 |
-
from openai import AsyncOpenAI, PermissionDeniedError
|
16 |
from requests import HTTPError, get
|
17 |
|
18 |
# for development purposes, all languages will be evaluated on the fast models
|
@@ -23,12 +22,12 @@ important_models = [
|
|
23 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
24 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
25 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
26 |
-
|
27 |
"openai/gpt-4.1-mini", # 1.6$
|
28 |
"openai/gpt-4.1-nano", # 0.4$
|
29 |
"openai/gpt-4o-mini", # 0.6$
|
30 |
# "openai/gpt-4o-2024-11-20", # 10$
|
31 |
-
|
32 |
# "openai/gpt-3.5-turbo", # 1.5$
|
33 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
34 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
@@ -37,6 +36,9 @@ important_models = [
|
|
37 |
"google/gemini-2.5-flash", # 0.6$
|
38 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
39 |
"google/gemma-3-27b-it", # 0.2$
|
|
|
|
|
|
|
40 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
41 |
# "qwen/qwq-32b", # 0.2$
|
42 |
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
@@ -49,7 +51,6 @@ important_models = [
|
|
49 |
]
|
50 |
|
51 |
blocklist = [
|
52 |
-
"microsoft/wizardlm-2-8x22b", # temporarily rate-limited
|
53 |
"google/gemini-2.5-pro-preview",
|
54 |
"google/gemini-2.5-flash-preview",
|
55 |
"google/gemini-2.5-flash-lite-preview",
|
@@ -150,9 +151,10 @@ async def complete(**kwargs) -> str | None:
|
|
150 |
async with openrouter_rate_limit:
|
151 |
try:
|
152 |
response = await client.chat.completions.create(**kwargs)
|
153 |
-
except
|
154 |
-
|
155 |
-
|
|
|
156 |
if not response.choices:
|
157 |
raise Exception(response)
|
158 |
return response.choices[0].message.content.strip()
|
@@ -281,13 +283,9 @@ def load_models(date: date):
|
|
281 |
)
|
282 |
# models = models[models["cost"] <= 2.0].reset_index(drop=True)
|
283 |
models["tasks"] = [
|
284 |
-
["translation_from", "translation_to", "classification", "mmlu", "arc", "mgsm"]
|
285 |
] * len(models)
|
286 |
models = pd.concat([models, get_translation_models()])
|
287 |
-
models = models[ # temporary fix FIXME
|
288 |
-
(models["id"] != "google/gemini-2.5-pro")
|
289 |
-
& (models["id"] != "google/gemini-2.5-pro-preview")
|
290 |
-
]
|
291 |
return models
|
292 |
|
293 |
|
|
|
11 |
from google.cloud import translate_v2 as translate
|
12 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
13 |
from joblib.memory import Memory
|
14 |
+
from openai import AsyncOpenAI, BadRequestError
|
|
|
15 |
from requests import HTTPError, get
|
16 |
|
17 |
# for development purposes, all languages will be evaluated on the fast models
|
|
|
22 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
23 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
24 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
25 |
+
"openai/gpt-4.1", # 8$
|
26 |
"openai/gpt-4.1-mini", # 1.6$
|
27 |
"openai/gpt-4.1-nano", # 0.4$
|
28 |
"openai/gpt-4o-mini", # 0.6$
|
29 |
# "openai/gpt-4o-2024-11-20", # 10$
|
30 |
+
"openai/gpt-3.5-turbo-0613", # 2$
|
31 |
# "openai/gpt-3.5-turbo", # 1.5$
|
32 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
33 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
|
|
36 |
"google/gemini-2.5-flash", # 0.6$
|
37 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
38 |
"google/gemma-3-27b-it", # 0.2$
|
39 |
+
"qwen/qwen3-32b",
|
40 |
+
"qwen/qwen3-235b-a22b",
|
41 |
+
"qwen/qwen3-30b-a3b", # 0.29$
|
42 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
43 |
# "qwen/qwq-32b", # 0.2$
|
44 |
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
|
|
51 |
]
|
52 |
|
53 |
blocklist = [
|
|
|
54 |
"google/gemini-2.5-pro-preview",
|
55 |
"google/gemini-2.5-flash-preview",
|
56 |
"google/gemini-2.5-flash-lite-preview",
|
|
|
151 |
async with openrouter_rate_limit:
|
152 |
try:
|
153 |
response = await client.chat.completions.create(**kwargs)
|
154 |
+
except BadRequestError as e:
|
155 |
+
if "filtered" in e.message:
|
156 |
+
return None
|
157 |
+
raise e
|
158 |
if not response.choices:
|
159 |
raise Exception(response)
|
160 |
return response.choices[0].message.content.strip()
|
|
|
283 |
)
|
284 |
# models = models[models["cost"] <= 2.0].reset_index(drop=True)
|
285 |
models["tasks"] = [
|
286 |
+
["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
|
287 |
] * len(models)
|
288 |
models = pd.concat([models, get_translation_models()])
|
|
|
|
|
|
|
|
|
289 |
return models
|
290 |
|
291 |
|
evals/tasks.py
CHANGED
@@ -9,6 +9,7 @@ from datasets_.flores import flores_sentences
|
|
9 |
from datasets_.mgsm import load_mgsm, parse_number
|
10 |
from datasets_.mmlu import load_mmlu
|
11 |
from datasets_.arc import load_uhura_arc_easy
|
|
|
12 |
from google.cloud import translate_v2 as translate
|
13 |
from langcodes import closest_supported_match
|
14 |
from languages import languages, script_name
|
@@ -224,6 +225,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
|
|
224 |
}
|
225 |
]
|
226 |
|
|
|
227 |
def format_multiple_choice(item):
|
228 |
return f"""{item["question"]}
|
229 |
|
@@ -234,6 +236,7 @@ def format_multiple_choice(item):
|
|
234 |
|
235 |
A|B|C|D?"""
|
236 |
|
|
|
237 |
async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
238 |
ds_name, examples, task = load_mmlu(language_bcp_47, nr)
|
239 |
if not task:
|
@@ -253,7 +256,10 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
253 |
temperature=0,
|
254 |
max_tokens=1,
|
255 |
)
|
256 |
-
|
|
|
|
|
|
|
257 |
except Exception as e:
|
258 |
if "ResponsibleAIPolicyViolation" in str(e):
|
259 |
acc = 0
|
@@ -270,11 +276,12 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
270 |
}
|
271 |
]
|
272 |
|
|
|
273 |
async def arc_and_evaluate(model, language_bcp_47, nr):
|
274 |
ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
|
275 |
if not task:
|
276 |
return []
|
277 |
-
|
278 |
messages = []
|
279 |
for example in examples:
|
280 |
messages += [
|
@@ -289,7 +296,10 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
|
|
289 |
temperature=0,
|
290 |
max_tokens=1,
|
291 |
)
|
292 |
-
|
|
|
|
|
|
|
293 |
except Exception as e:
|
294 |
if "ResponsibleAIPolicyViolation" in str(e):
|
295 |
acc = 0
|
@@ -305,7 +315,68 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
|
|
305 |
"sentence_nr": nr,
|
306 |
}
|
307 |
]
|
308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
|
310 |
async def mgsm_and_evaluate(model, language_bcp_47, nr):
|
311 |
system_prompt = """
|
@@ -325,11 +396,9 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
|
|
325 |
temperature=0,
|
326 |
max_tokens=1024,
|
327 |
)
|
328 |
-
|
329 |
-
|
330 |
-
accuracy = int(
|
331 |
-
parse_number(number[1].strip()) == parse_number(question["answer_number"])
|
332 |
-
)
|
333 |
else:
|
334 |
accuracy = 0
|
335 |
|
@@ -383,6 +452,7 @@ tasks = {
|
|
383 |
# "mlm": mlm_and_evaluate,
|
384 |
"mmlu": mmlu_and_evaluate,
|
385 |
"arc": arc_and_evaluate,
|
|
|
386 |
"mgsm": mgsm_and_evaluate,
|
387 |
# "asr": transcribe_and_evaluate,
|
388 |
}
|
|
|
9 |
from datasets_.mgsm import load_mgsm, parse_number
|
10 |
from datasets_.mmlu import load_mmlu
|
11 |
from datasets_.arc import load_uhura_arc_easy
|
12 |
+
from datasets_.truthfulqa import load_truthfulqa
|
13 |
from google.cloud import translate_v2 as translate
|
14 |
from langcodes import closest_supported_match
|
15 |
from languages import languages, script_name
|
|
|
225 |
}
|
226 |
]
|
227 |
|
228 |
+
|
229 |
def format_multiple_choice(item):
|
230 |
return f"""{item["question"]}
|
231 |
|
|
|
236 |
|
237 |
A|B|C|D?"""
|
238 |
|
239 |
+
|
240 |
async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
241 |
ds_name, examples, task = load_mmlu(language_bcp_47, nr)
|
242 |
if not task:
|
|
|
256 |
temperature=0,
|
257 |
max_tokens=1,
|
258 |
)
|
259 |
+
if response:
|
260 |
+
acc = int(response[:1].strip() == task["answer"])
|
261 |
+
else:
|
262 |
+
acc = 0
|
263 |
except Exception as e:
|
264 |
if "ResponsibleAIPolicyViolation" in str(e):
|
265 |
acc = 0
|
|
|
276 |
}
|
277 |
]
|
278 |
|
279 |
+
|
280 |
async def arc_and_evaluate(model, language_bcp_47, nr):
|
281 |
ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
|
282 |
if not task:
|
283 |
return []
|
284 |
+
|
285 |
messages = []
|
286 |
for example in examples:
|
287 |
messages += [
|
|
|
296 |
temperature=0,
|
297 |
max_tokens=1,
|
298 |
)
|
299 |
+
if response:
|
300 |
+
acc = int(response[:1].strip() == task["answer"])
|
301 |
+
else:
|
302 |
+
acc = 0
|
303 |
except Exception as e:
|
304 |
if "ResponsibleAIPolicyViolation" in str(e):
|
305 |
acc = 0
|
|
|
315 |
"sentence_nr": nr,
|
316 |
}
|
317 |
]
|
318 |
+
|
319 |
+
|
320 |
+
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
321 |
+
|
322 |
+
|
323 |
+
def shuffle_choices_and_labels(item):
|
324 |
+
indices = list(range(len(item["choices"])))
|
325 |
+
random.shuffle(indices)
|
326 |
+
item["choices"] = [item["choices"][i] for i in indices]
|
327 |
+
item["labels"] = [item["labels"][i] for i in indices]
|
328 |
+
return item
|
329 |
+
|
330 |
+
|
331 |
+
def format_multiple_choice_truthfulqa(item):
|
332 |
+
text = item["question"] + "\n\n"
|
333 |
+
for i, choice in enumerate(item["choices"]):
|
334 |
+
text += f"{letters[i]}: {choice}\n"
|
335 |
+
text += "|".join(letters[: len(item["choices"])]) + "?"
|
336 |
+
return text
|
337 |
+
|
338 |
+
|
339 |
+
async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
|
340 |
+
ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
|
341 |
+
if not task:
|
342 |
+
return []
|
343 |
+
task = shuffle_choices_and_labels(task)
|
344 |
+
answer = letters[task["labels"].index(1)]
|
345 |
+
messages = []
|
346 |
+
for example in examples:
|
347 |
+
example = shuffle_choices_and_labels(example)
|
348 |
+
messages += [
|
349 |
+
{"role": "user", "content": format_multiple_choice_truthfulqa(example)},
|
350 |
+
{"role": "assistant", "content": letters[example["labels"].index(1)]},
|
351 |
+
]
|
352 |
+
messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
|
353 |
+
try:
|
354 |
+
response = await complete(
|
355 |
+
model=model,
|
356 |
+
messages=messages,
|
357 |
+
temperature=0,
|
358 |
+
max_tokens=1,
|
359 |
+
)
|
360 |
+
if response:
|
361 |
+
acc = int(response[:1].strip() == answer)
|
362 |
+
else:
|
363 |
+
acc = 0
|
364 |
+
except Exception as e:
|
365 |
+
if "ResponsibleAIPolicyViolation" in str(e):
|
366 |
+
acc = 0
|
367 |
+
else:
|
368 |
+
raise e
|
369 |
+
return [
|
370 |
+
{
|
371 |
+
"model": model,
|
372 |
+
"bcp_47": language_bcp_47,
|
373 |
+
"task": "truthfulqa",
|
374 |
+
"metric": "accuracy",
|
375 |
+
"score": acc,
|
376 |
+
"sentence_nr": nr,
|
377 |
+
}
|
378 |
+
]
|
379 |
+
|
380 |
|
381 |
async def mgsm_and_evaluate(model, language_bcp_47, nr):
|
382 |
system_prompt = """
|
|
|
396 |
temperature=0,
|
397 |
max_tokens=1024,
|
398 |
)
|
399 |
+
if response and len(response.split("####")) == 2:
|
400 |
+
number = response.split("####")[1].strip()
|
401 |
+
accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
|
|
|
|
|
402 |
else:
|
403 |
accuracy = 0
|
404 |
|
|
|
452 |
# "mlm": mlm_and_evaluate,
|
453 |
"mmlu": mmlu_and_evaluate,
|
454 |
"arc": arc_and_evaluate,
|
455 |
+
"truthfulqa": truthfulqa_and_evaluate,
|
456 |
"mgsm": mgsm_and_evaluate,
|
457 |
# "asr": transcribe_and_evaluate,
|
458 |
}
|
frontend/src/components/ScoreColumns.js
CHANGED
@@ -14,7 +14,7 @@ const ScoreColumns = [
|
|
14 |
<Column
|
15 |
field='average'
|
16 |
header='Proficiency'
|
17 |
-
headerTooltip='Language Proficiency Score (average
|
18 |
sortable
|
19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
|
14 |
<Column
|
15 |
field='average'
|
16 |
header='Proficiency'
|
17 |
+
headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
|
18 |
sortable
|
19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
languages.json
CHANGED
@@ -79,7 +79,7 @@
|
|
79 |
"family":"Indo-European",
|
80 |
"flores_path":"fra_Latn",
|
81 |
"fleurs_tag":"fr_fr",
|
82 |
-
"commonvoice_hours":
|
83 |
"commonvoice_locale":"fr",
|
84 |
"in_benchmark":true
|
85 |
},
|
@@ -1375,7 +1375,7 @@
|
|
1375 |
"family":"Turkic",
|
1376 |
"flores_path":"uig_Arab",
|
1377 |
"fleurs_tag":null,
|
1378 |
-
"commonvoice_hours":
|
1379 |
"commonvoice_locale":"ug",
|
1380 |
"in_benchmark":true
|
1381 |
},
|
@@ -1747,7 +1747,7 @@
|
|
1747 |
"family":"Indo-European",
|
1748 |
"flores_path":"nob_Latn",
|
1749 |
"fleurs_tag":"nb_no",
|
1750 |
-
"commonvoice_hours":0.
|
1751 |
"commonvoice_locale":"nb-NO",
|
1752 |
"in_benchmark":true
|
1753 |
},
|
@@ -2323,7 +2323,7 @@
|
|
2323 |
"family":"Dravidian",
|
2324 |
"flores_path":null,
|
2325 |
"fleurs_tag":null,
|
2326 |
-
"commonvoice_hours":
|
2327 |
"commonvoice_locale":"brh",
|
2328 |
"in_benchmark":false
|
2329 |
},
|
@@ -2623,7 +2623,7 @@
|
|
2623 |
"family":"Indo-European",
|
2624 |
"flores_path":null,
|
2625 |
"fleurs_tag":null,
|
2626 |
-
"commonvoice_hours":0.
|
2627 |
"commonvoice_locale":"haz",
|
2628 |
"in_benchmark":false
|
2629 |
},
|
@@ -4651,7 +4651,7 @@
|
|
4651 |
"family":"Abkhaz-Adyge",
|
4652 |
"flores_path":null,
|
4653 |
"fleurs_tag":null,
|
4654 |
-
"commonvoice_hours":
|
4655 |
"commonvoice_locale":"ady",
|
4656 |
"in_benchmark":false
|
4657 |
},
|
|
|
79 |
"family":"Indo-European",
|
80 |
"flores_path":"fra_Latn",
|
81 |
"fleurs_tag":"fr_fr",
|
82 |
+
"commonvoice_hours":1065.0,
|
83 |
"commonvoice_locale":"fr",
|
84 |
"in_benchmark":true
|
85 |
},
|
|
|
1375 |
"family":"Turkic",
|
1376 |
"flores_path":"uig_Arab",
|
1377 |
"fleurs_tag":null,
|
1378 |
+
"commonvoice_hours":411.0,
|
1379 |
"commonvoice_locale":"ug",
|
1380 |
"in_benchmark":true
|
1381 |
},
|
|
|
1747 |
"family":"Indo-European",
|
1748 |
"flores_path":"nob_Latn",
|
1749 |
"fleurs_tag":"nb_no",
|
1750 |
+
"commonvoice_hours":0.5,
|
1751 |
"commonvoice_locale":"nb-NO",
|
1752 |
"in_benchmark":true
|
1753 |
},
|
|
|
2323 |
"family":"Dravidian",
|
2324 |
"flores_path":null,
|
2325 |
"fleurs_tag":null,
|
2326 |
+
"commonvoice_hours":1.2,
|
2327 |
"commonvoice_locale":"brh",
|
2328 |
"in_benchmark":false
|
2329 |
},
|
|
|
2623 |
"family":"Indo-European",
|
2624 |
"flores_path":null,
|
2625 |
"fleurs_tag":null,
|
2626 |
+
"commonvoice_hours":0.9,
|
2627 |
"commonvoice_locale":"haz",
|
2628 |
"in_benchmark":false
|
2629 |
},
|
|
|
4651 |
"family":"Abkhaz-Adyge",
|
4652 |
"flores_path":null,
|
4653 |
"fleurs_tag":null,
|
4654 |
+
"commonvoice_hours":30.0,
|
4655 |
"commonvoice_locale":"ady",
|
4656 |
"in_benchmark":false
|
4657 |
},
|
models.json
CHANGED
@@ -15,6 +15,7 @@
|
|
15 |
"classification",
|
16 |
"mmlu",
|
17 |
"arc",
|
|
|
18 |
"mgsm"
|
19 |
]
|
20 |
},
|
@@ -34,6 +35,7 @@
|
|
34 |
"classification",
|
35 |
"mmlu",
|
36 |
"arc",
|
|
|
37 |
"mgsm"
|
38 |
]
|
39 |
},
|
@@ -53,6 +55,7 @@
|
|
53 |
"classification",
|
54 |
"mmlu",
|
55 |
"arc",
|
|
|
56 |
"mgsm"
|
57 |
]
|
58 |
},
|
@@ -72,6 +75,7 @@
|
|
72 |
"classification",
|
73 |
"mmlu",
|
74 |
"arc",
|
|
|
75 |
"mgsm"
|
76 |
]
|
77 |
},
|
@@ -91,6 +95,7 @@
|
|
91 |
"classification",
|
92 |
"mmlu",
|
93 |
"arc",
|
|
|
94 |
"mgsm"
|
95 |
]
|
96 |
},
|
@@ -110,6 +115,7 @@
|
|
110 |
"classification",
|
111 |
"mmlu",
|
112 |
"arc",
|
|
|
113 |
"mgsm"
|
114 |
]
|
115 |
},
|
@@ -129,6 +135,7 @@
|
|
129 |
"classification",
|
130 |
"mmlu",
|
131 |
"arc",
|
|
|
132 |
"mgsm"
|
133 |
]
|
134 |
},
|
@@ -141,13 +148,14 @@
|
|
141 |
"size":684531386000.0,
|
142 |
"type":"open-source",
|
143 |
"license":"Mit",
|
144 |
-
"creation_date":1748390400000,
|
145 |
"tasks":[
|
146 |
"translation_from",
|
147 |
"translation_to",
|
148 |
"classification",
|
149 |
"mmlu",
|
150 |
"arc",
|
|
|
151 |
"mgsm"
|
152 |
]
|
153 |
},
|
@@ -167,6 +175,7 @@
|
|
167 |
"classification",
|
168 |
"mmlu",
|
169 |
"arc",
|
|
|
170 |
"mgsm"
|
171 |
]
|
172 |
},
|
@@ -186,6 +195,7 @@
|
|
186 |
"classification",
|
187 |
"mmlu",
|
188 |
"arc",
|
|
|
189 |
"mgsm"
|
190 |
]
|
191 |
},
|
@@ -205,6 +215,7 @@
|
|
205 |
"classification",
|
206 |
"mmlu",
|
207 |
"arc",
|
|
|
208 |
"mgsm"
|
209 |
]
|
210 |
},
|
@@ -271,12 +282,14 @@
|
|
271 |
"size":null,
|
272 |
"type":"closed-source",
|
273 |
"license":null,
|
274 |
-
"creation_date":1750118400000
|
275 |
"tasks":[
|
276 |
"translation_from",
|
277 |
"translation_to",
|
278 |
"classification",
|
279 |
"mmlu",
|
|
|
|
|
280 |
"mgsm"
|
281 |
]
|
282 |
},
|
@@ -332,6 +345,7 @@
|
|
332 |
"classification",
|
333 |
"mmlu",
|
334 |
"arc",
|
|
|
335 |
"mgsm"
|
336 |
]
|
337 |
},
|
@@ -351,6 +365,7 @@
|
|
351 |
"classification",
|
352 |
"mmlu",
|
353 |
"arc",
|
|
|
354 |
"mgsm"
|
355 |
]
|
356 |
},
|
@@ -370,6 +385,7 @@
|
|
370 |
"classification",
|
371 |
"mmlu",
|
372 |
"arc",
|
|
|
373 |
"mgsm"
|
374 |
]
|
375 |
},
|
@@ -404,6 +420,7 @@
|
|
404 |
"classification",
|
405 |
"mmlu",
|
406 |
"arc",
|
|
|
407 |
"mgsm"
|
408 |
]
|
409 |
},
|
@@ -423,6 +440,7 @@
|
|
423 |
"classification",
|
424 |
"mmlu",
|
425 |
"arc",
|
|
|
426 |
"mgsm"
|
427 |
]
|
428 |
},
|
@@ -442,6 +460,7 @@
|
|
442 |
"classification",
|
443 |
"mmlu",
|
444 |
"arc",
|
|
|
445 |
"mgsm"
|
446 |
]
|
447 |
},
|
@@ -485,6 +504,7 @@
|
|
485 |
"classification",
|
486 |
"mmlu",
|
487 |
"arc",
|
|
|
488 |
"mgsm"
|
489 |
]
|
490 |
},
|
@@ -504,6 +524,7 @@
|
|
504 |
"classification",
|
505 |
"mmlu",
|
506 |
"arc",
|
|
|
507 |
"mgsm"
|
508 |
]
|
509 |
},
|
@@ -523,6 +544,7 @@
|
|
523 |
"classification",
|
524 |
"mmlu",
|
525 |
"arc",
|
|
|
526 |
"mgsm"
|
527 |
]
|
528 |
},
|
@@ -542,6 +564,7 @@
|
|
542 |
"classification",
|
543 |
"mmlu",
|
544 |
"arc",
|
|
|
545 |
"mgsm"
|
546 |
]
|
547 |
},
|
@@ -561,6 +584,7 @@
|
|
561 |
"classification",
|
562 |
"mmlu",
|
563 |
"arc",
|
|
|
564 |
"mgsm"
|
565 |
]
|
566 |
},
|
@@ -580,6 +604,7 @@
|
|
580 |
"classification",
|
581 |
"mmlu",
|
582 |
"arc",
|
|
|
583 |
"mgsm"
|
584 |
]
|
585 |
},
|
@@ -599,6 +624,27 @@
|
|
599 |
"classification",
|
600 |
"mmlu",
|
601 |
"arc",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
602 |
"mgsm"
|
603 |
]
|
604 |
},
|
@@ -611,8 +657,16 @@
|
|
611 |
"size":null,
|
612 |
"type":"closed-source",
|
613 |
"license":null,
|
614 |
-
"creation_date":1744588800000
|
615 |
-
"tasks":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
616 |
},
|
617 |
{
|
618 |
"id":"openai\/gpt-4.1-mini",
|
@@ -630,6 +684,7 @@
|
|
630 |
"classification",
|
631 |
"mmlu",
|
632 |
"arc",
|
|
|
633 |
"mgsm"
|
634 |
]
|
635 |
},
|
@@ -649,6 +704,7 @@
|
|
649 |
"classification",
|
650 |
"mmlu",
|
651 |
"arc",
|
|
|
652 |
"mgsm"
|
653 |
]
|
654 |
},
|
@@ -668,6 +724,67 @@
|
|
668 |
"classification",
|
669 |
"mmlu",
|
670 |
"arc",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
671 |
"mgsm"
|
672 |
]
|
673 |
}
|
|
|
15 |
"classification",
|
16 |
"mmlu",
|
17 |
"arc",
|
18 |
+
"truthfulqa",
|
19 |
"mgsm"
|
20 |
]
|
21 |
},
|
|
|
35 |
"classification",
|
36 |
"mmlu",
|
37 |
"arc",
|
38 |
+
"truthfulqa",
|
39 |
"mgsm"
|
40 |
]
|
41 |
},
|
|
|
55 |
"classification",
|
56 |
"mmlu",
|
57 |
"arc",
|
58 |
+
"truthfulqa",
|
59 |
"mgsm"
|
60 |
]
|
61 |
},
|
|
|
75 |
"classification",
|
76 |
"mmlu",
|
77 |
"arc",
|
78 |
+
"truthfulqa",
|
79 |
"mgsm"
|
80 |
]
|
81 |
},
|
|
|
95 |
"classification",
|
96 |
"mmlu",
|
97 |
"arc",
|
98 |
+
"truthfulqa",
|
99 |
"mgsm"
|
100 |
]
|
101 |
},
|
|
|
115 |
"classification",
|
116 |
"mmlu",
|
117 |
"arc",
|
118 |
+
"truthfulqa",
|
119 |
"mgsm"
|
120 |
]
|
121 |
},
|
|
|
135 |
"classification",
|
136 |
"mmlu",
|
137 |
"arc",
|
138 |
+
"truthfulqa",
|
139 |
"mgsm"
|
140 |
]
|
141 |
},
|
|
|
148 |
"size":684531386000.0,
|
149 |
"type":"open-source",
|
150 |
"license":"Mit",
|
151 |
+
"creation_date":1748390400000.0,
|
152 |
"tasks":[
|
153 |
"translation_from",
|
154 |
"translation_to",
|
155 |
"classification",
|
156 |
"mmlu",
|
157 |
"arc",
|
158 |
+
"truthfulqa",
|
159 |
"mgsm"
|
160 |
]
|
161 |
},
|
|
|
175 |
"classification",
|
176 |
"mmlu",
|
177 |
"arc",
|
178 |
+
"truthfulqa",
|
179 |
"mgsm"
|
180 |
]
|
181 |
},
|
|
|
195 |
"classification",
|
196 |
"mmlu",
|
197 |
"arc",
|
198 |
+
"truthfulqa",
|
199 |
"mgsm"
|
200 |
]
|
201 |
},
|
|
|
215 |
"classification",
|
216 |
"mmlu",
|
217 |
"arc",
|
218 |
+
"truthfulqa",
|
219 |
"mgsm"
|
220 |
]
|
221 |
},
|
|
|
282 |
"size":null,
|
283 |
"type":"closed-source",
|
284 |
"license":null,
|
285 |
+
"creation_date":1750118400000,
|
286 |
"tasks":[
|
287 |
"translation_from",
|
288 |
"translation_to",
|
289 |
"classification",
|
290 |
"mmlu",
|
291 |
+
"arc",
|
292 |
+
"truthfulqa",
|
293 |
"mgsm"
|
294 |
]
|
295 |
},
|
|
|
345 |
"classification",
|
346 |
"mmlu",
|
347 |
"arc",
|
348 |
+
"truthfulqa",
|
349 |
"mgsm"
|
350 |
]
|
351 |
},
|
|
|
365 |
"classification",
|
366 |
"mmlu",
|
367 |
"arc",
|
368 |
+
"truthfulqa",
|
369 |
"mgsm"
|
370 |
]
|
371 |
},
|
|
|
385 |
"classification",
|
386 |
"mmlu",
|
387 |
"arc",
|
388 |
+
"truthfulqa",
|
389 |
"mgsm"
|
390 |
]
|
391 |
},
|
|
|
420 |
"classification",
|
421 |
"mmlu",
|
422 |
"arc",
|
423 |
+
"truthfulqa",
|
424 |
"mgsm"
|
425 |
]
|
426 |
},
|
|
|
440 |
"classification",
|
441 |
"mmlu",
|
442 |
"arc",
|
443 |
+
"truthfulqa",
|
444 |
"mgsm"
|
445 |
]
|
446 |
},
|
|
|
460 |
"classification",
|
461 |
"mmlu",
|
462 |
"arc",
|
463 |
+
"truthfulqa",
|
464 |
"mgsm"
|
465 |
]
|
466 |
},
|
|
|
504 |
"classification",
|
505 |
"mmlu",
|
506 |
"arc",
|
507 |
+
"truthfulqa",
|
508 |
"mgsm"
|
509 |
]
|
510 |
},
|
|
|
524 |
"classification",
|
525 |
"mmlu",
|
526 |
"arc",
|
527 |
+
"truthfulqa",
|
528 |
"mgsm"
|
529 |
]
|
530 |
},
|
|
|
544 |
"classification",
|
545 |
"mmlu",
|
546 |
"arc",
|
547 |
+
"truthfulqa",
|
548 |
"mgsm"
|
549 |
]
|
550 |
},
|
|
|
564 |
"classification",
|
565 |
"mmlu",
|
566 |
"arc",
|
567 |
+
"truthfulqa",
|
568 |
"mgsm"
|
569 |
]
|
570 |
},
|
|
|
584 |
"classification",
|
585 |
"mmlu",
|
586 |
"arc",
|
587 |
+
"truthfulqa",
|
588 |
"mgsm"
|
589 |
]
|
590 |
},
|
|
|
604 |
"classification",
|
605 |
"mmlu",
|
606 |
"arc",
|
607 |
+
"truthfulqa",
|
608 |
"mgsm"
|
609 |
]
|
610 |
},
|
|
|
624 |
"classification",
|
625 |
"mmlu",
|
626 |
"arc",
|
627 |
+
"truthfulqa",
|
628 |
+
"mgsm"
|
629 |
+
]
|
630 |
+
},
|
631 |
+
{
|
632 |
+
"id":"openai\/gpt-3.5-turbo-0613",
|
633 |
+
"name":"GPT-3.5 Turbo (older v0613)",
|
634 |
+
"provider_name":"OpenAI",
|
635 |
+
"cost":2.0,
|
636 |
+
"hf_id":null,
|
637 |
+
"size":null,
|
638 |
+
"type":"closed-source",
|
639 |
+
"license":null,
|
640 |
+
"creation_date":1706140800000,
|
641 |
+
"tasks":[
|
642 |
+
"translation_from",
|
643 |
+
"translation_to",
|
644 |
+
"classification",
|
645 |
+
"mmlu",
|
646 |
+
"arc",
|
647 |
+
"truthfulqa",
|
648 |
"mgsm"
|
649 |
]
|
650 |
},
|
|
|
657 |
"size":null,
|
658 |
"type":"closed-source",
|
659 |
"license":null,
|
660 |
+
"creation_date":1744588800000,
|
661 |
+
"tasks":[
|
662 |
+
"translation_from",
|
663 |
+
"translation_to",
|
664 |
+
"classification",
|
665 |
+
"mmlu",
|
666 |
+
"arc",
|
667 |
+
"truthfulqa",
|
668 |
+
"mgsm"
|
669 |
+
]
|
670 |
},
|
671 |
{
|
672 |
"id":"openai\/gpt-4.1-mini",
|
|
|
684 |
"classification",
|
685 |
"mmlu",
|
686 |
"arc",
|
687 |
+
"truthfulqa",
|
688 |
"mgsm"
|
689 |
]
|
690 |
},
|
|
|
704 |
"classification",
|
705 |
"mmlu",
|
706 |
"arc",
|
707 |
+
"truthfulqa",
|
708 |
"mgsm"
|
709 |
]
|
710 |
},
|
|
|
724 |
"classification",
|
725 |
"mmlu",
|
726 |
"arc",
|
727 |
+
"truthfulqa",
|
728 |
+
"mgsm"
|
729 |
+
]
|
730 |
+
},
|
731 |
+
{
|
732 |
+
"id":"qwen\/qwen3-235b-a22b",
|
733 |
+
"name":"Qwen3 235B A22B",
|
734 |
+
"provider_name":"Qwen",
|
735 |
+
"cost":0.0,
|
736 |
+
"hf_id":"Qwen\/Qwen3-235B-A22B",
|
737 |
+
"size":235093634560.0,
|
738 |
+
"type":"open-source",
|
739 |
+
"license":"Apache 2.0",
|
740 |
+
"creation_date":1745712000000,
|
741 |
+
"tasks":[
|
742 |
+
"translation_from",
|
743 |
+
"translation_to",
|
744 |
+
"classification",
|
745 |
+
"mmlu",
|
746 |
+
"arc",
|
747 |
+
"truthfulqa",
|
748 |
+
"mgsm"
|
749 |
+
]
|
750 |
+
},
|
751 |
+
{
|
752 |
+
"id":"qwen\/qwen3-30b-a3b",
|
753 |
+
"name":"Qwen3 30B A3B",
|
754 |
+
"provider_name":"Qwen",
|
755 |
+
"cost":0.0,
|
756 |
+
"hf_id":"Qwen\/Qwen3-30B-A3B",
|
757 |
+
"size":30532122624.0,
|
758 |
+
"type":"open-source",
|
759 |
+
"license":"Apache 2.0",
|
760 |
+
"creation_date":1745712000000,
|
761 |
+
"tasks":[
|
762 |
+
"translation_from",
|
763 |
+
"translation_to",
|
764 |
+
"classification",
|
765 |
+
"mmlu",
|
766 |
+
"arc",
|
767 |
+
"truthfulqa",
|
768 |
+
"mgsm"
|
769 |
+
]
|
770 |
+
},
|
771 |
+
{
|
772 |
+
"id":"qwen\/qwen3-32b",
|
773 |
+
"name":"Qwen3 32B",
|
774 |
+
"provider_name":"Qwen",
|
775 |
+
"cost":0.0,
|
776 |
+
"hf_id":"Qwen\/Qwen3-32B",
|
777 |
+
"size":32762123264.0,
|
778 |
+
"type":"open-source",
|
779 |
+
"license":"Apache 2.0",
|
780 |
+
"creation_date":1745712000000,
|
781 |
+
"tasks":[
|
782 |
+
"translation_from",
|
783 |
+
"translation_to",
|
784 |
+
"classification",
|
785 |
+
"mmlu",
|
786 |
+
"arc",
|
787 |
+
"truthfulqa",
|
788 |
"mgsm"
|
789 |
]
|
790 |
}
|
results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|