davidpomerenke committed
Commit 98c6811 · verified · 1 Parent(s): 4c5c136

Upload from GitHub Actions: Get more results, compute average based on all tasks
datasets.json CHANGED
@@ -256,7 +256,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented": false,
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -300,7 +300,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MGSM",
-    "implemented": false,
+    "implemented": true,
     "group": "Grade School Math"
   },
   {
@@ -315,7 +315,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MGSM",
-    "implemented": false,
+    "implemented": true,
     "group": "Grade School Math"
   },
   {
@@ -345,7 +345,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": false,
+    "implemented": true,
     "group": "Abstract Reasoning"
   },
   {
@@ -360,7 +360,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": false,
+    "implemented": true,
     "group": "Abstract Reasoning"
   },
   {
@@ -375,7 +375,7 @@
     "parallel": true,
     "translation": "human",
     "base": "TruthfulQA",
-    "implemented": false,
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
@@ -390,7 +390,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": false,
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
@@ -405,7 +405,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": false,
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
evals/backend.py CHANGED
@@ -26,11 +26,10 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
+    "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
 
-task_metrics_basic = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
-
 
 def compute_normalized_average(df, metrics):
     """Compute average of min-max normalized metric columns."""
@@ -58,7 +57,7 @@ def make_model_table(df, models):
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-    df["average"] = compute_normalized_average(df, task_metrics_basic)
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
@@ -93,7 +92,7 @@ def make_language_table(df, languages):
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-    df["average"] = compute_normalized_average(df, task_metrics_basic)
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
     df = df[
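Note: the body of compute_normalized_average is not part of this diff — only its docstring and call sites are. A minimal sketch of what min-max-normalized averaging over the metric columns plausibly looks like (the implementation details here are an assumption, not the repository's actual code):

import numpy as np
import pandas as pd

def compute_normalized_average(df, metrics):
    """Compute average of min-max normalized metric columns."""
    # Sketch reconstructed from the docstring, not the repo's actual body.
    normalized = pd.DataFrame(index=df.index)
    for metric in metrics:
        col = df[metric]
        span = col.max() - col.min()
        # A constant (or all-NaN) column carries no ranking signal; leave it NaN.
        normalized[metric] = (col - col.min()) / span if span else np.nan
    return normalized.mean(axis=1)  # row-wise mean; NaN columns are skipped

Since both call sites now pass task_metrics instead of task_metrics_basic, the "average" column reflects all task metrics — including the new truthfulqa_accuracy — which is what the commit message announces.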
evals/datasets_/truthfulqa.py ADDED
@@ -0,0 +1,30 @@
+import random
+from collections import Counter, defaultdict
+
+from langcodes import Language, standardize_tag
+from rich import print
+
+from datasets_.util import _get_dataset_config_names, _load_dataset
+
+slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
+tags_uhura_truthfulqa = {
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
+    if a.endswith("multiple_choice")
+}
+
+
+def add_choices(row):
+    row["choices"] = row["mc1_targets"]["choices"]
+    row["labels"] = row["mc1_targets"]["labels"]
+    return row
+
+
+def load_truthfulqa(language_bcp_47, nr):
+    if language_bcp_47 in tags_uhura_truthfulqa.keys():
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
+        ds = ds.map(add_choices)
+        examples = ds["train"]
+        task = ds["test"][nr]
+        return "masakhane/uhura-truthfulqa", examples, task
+    else:
+        return None, None, None
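For context, a hypothetical call to the new loader (whether a given language is available depends on which uhura-truthfulqa config names end in "multiple_choice"; "am" for Amharic is an illustrative guess):

# Hypothetical usage: fetch the first test item plus its few-shot examples.
ds_name, examples, task = load_truthfulqa("am", 0)
if task:
    # add_choices has flattened mc1_targets into parallel "choices"/"labels" lists.
    print(ds_name, task["question"], task["choices"], task["labels"])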
evals/main.py CHANGED
@@ -15,7 +15,7 @@ n_sentences = 10
 
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
-    for n_languages in range(20, 100, 20):
+    for n_languages in range(10, 101, 10):
         print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
         old_models = pd.read_json("models.json")
evals/models.py CHANGED
@@ -11,8 +11,7 @@ from elevenlabs import AsyncElevenLabs
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
-from langcodes import closest_supported_match
-from openai import AsyncOpenAI, PermissionDeniedError
+from openai import AsyncOpenAI, BadRequestError
 from requests import HTTPError, get
 
 # for development purposes, all languages will be evaluated on the fast models
@@ -23,12 +22,12 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct", # 0.3$
     "meta-llama/llama-3-70b-instruct", # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
-    # "openai/gpt-4.1", # 8$
+    "openai/gpt-4.1", # 8$
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
     # "openai/gpt-4o-2024-11-20", # 10$
-    # "openai/gpt-3.5-turbo-0613", # 2$
+    "openai/gpt-3.5-turbo-0613", # 2$
     # "openai/gpt-3.5-turbo", # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
@@ -37,6 +36,9 @@ important_models = [
     "google/gemini-2.5-flash", # 0.6$
     "google/gemini-2.0-flash-lite-001", # 0.3$
     "google/gemma-3-27b-it", # 0.2$
+    "qwen/qwen3-32b",
+    "qwen/qwen3-235b-a22b",
+    "qwen/qwen3-30b-a3b", # 0.29$
     # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
     # "qwen/qwq-32b", # 0.2$
     # "qwen/qwen-2.5-72b-instruct", # 0.39$
@@ -49,7 +51,6 @@ important_models = [
 ]
 
 blocklist = [
-    "microsoft/wizardlm-2-8x22b", # temporarily rate-limited
     "google/gemini-2.5-pro-preview",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
@@ -150,9 +151,10 @@ async def complete(**kwargs) -> str | None:
     async with openrouter_rate_limit:
         try:
             response = await client.chat.completions.create(**kwargs)
-        except PermissionDeniedError as e:
-            print(e)
-            return None
+        except BadRequestError as e:
+            if "filtered" in e.message:
+                return None
+            raise e
         if not response.choices:
             raise Exception(response)
         return response.choices[0].message.content.strip()
@@ -281,13 +283,9 @@ def load_models(date: date):
     )
     # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
-        ["translation_from", "translation_to", "classification", "mmlu", "arc", "mgsm"]
+        ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
     ] * len(models)
     models = pd.concat([models, get_translation_models()])
-    models = models[ # temporary fix FIXME
-        (models["id"] != "google/gemini-2.5-pro")
-        & (models["id"] != "google/gemini-2.5-pro-preview")
-    ]
     return models
 
 
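Together with the evaluator changes in evals/tasks.py below, this gives content-filtered prompts a graceful path: complete() now returns None instead of raising, and the callers score the item as 0. A condensed sketch of that contract (score_item is an illustrative name, not a function in the repo):

async def score_item(model, messages, answer):
    # complete() returns None when the provider's content filter fires
    # ("filtered" appears in the BadRequestError message); count that as a
    # wrong answer rather than crashing the evaluation run.
    response = await complete(model=model, messages=messages, temperature=0, max_tokens=1)
    return int(response[:1].strip() == answer) if response else 0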
evals/tasks.py CHANGED
@@ -9,6 +9,7 @@ from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
+from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
 from languages import languages, script_name
@@ -224,6 +225,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
+
 def format_multiple_choice(item):
     return f"""{item["question"]}
 
@@ -234,6 +236,7 @@ def format_multiple_choice(item):
 
 A|B|C|D?"""
 
+
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
     ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
@@ -253,7 +256,10 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             temperature=0,
             max_tokens=1,
         )
-        acc = int(response[:1].strip() == task["answer"])
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
@@ -270,11 +276,12 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
+
 async def arc_and_evaluate(model, language_bcp_47, nr):
     ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
-
+
     messages = []
     for example in examples:
         messages += [
@@ -289,7 +296,10 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             temperature=0,
             max_tokens=1,
         )
-        acc = int(response[:1].strip() == task["answer"])
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
@@ -305,7 +315,68 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
         }
     ]
-
+
+
+letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+
+def shuffle_choices_and_labels(item):
+    indices = list(range(len(item["choices"])))
+    random.shuffle(indices)
+    item["choices"] = [item["choices"][i] for i in indices]
+    item["labels"] = [item["labels"][i] for i in indices]
+    return item
+
+
+def format_multiple_choice_truthfulqa(item):
+    text = item["question"] + "\n\n"
+    for i, choice in enumerate(item["choices"]):
+        text += f"{letters[i]}: {choice}\n"
+    text += "|".join(letters[: len(item["choices"])]) + "?"
+    return text
+
+
+async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
+    ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
+    if not task:
+        return []
+    task = shuffle_choices_and_labels(task)
+    answer = letters[task["labels"].index(1)]
+    messages = []
+    for example in examples:
+        example = shuffle_choices_and_labels(example)
+        messages += [
+            {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
+            {"role": "assistant", "content": letters[example["labels"].index(1)]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == answer)
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "truthfulqa",
+            "metric": "accuracy",
+            "score": acc,
+            "sentence_nr": nr,
+        }
+    ]
+
 
 async def mgsm_and_evaluate(model, language_bcp_47, nr):
     system_prompt = """
@@ -325,11 +396,9 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
             temperature=0,
             max_tokens=1024,
         )
-        number = response.split("####")
-        if len(number) == 2:
-            accuracy = int(
-                parse_number(number[1].strip()) == parse_number(question["answer_number"])
-            )
+        if response and len(response.split("####")) == 2:
+            number = response.split("####")[1].strip()
+            accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
         else:
             accuracy = 0
 
@@ -383,6 +452,7 @@ tasks = {
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
+    "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
     # "asr": transcribe_and_evaluate,
 }
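To make the new prompt format concrete: for a hypothetical two-choice item (after add_choices and shuffling, "choices" and "labels" are parallel lists with exactly one label equal to 1), format_multiple_choice_truthfulqa produces:

item = {
    "question": "What happens if you crack your knuckles a lot?",
    "choices": ["Nothing in particular happens.", "You will get arthritis."],
    "labels": [1, 0],
}
print(format_multiple_choice_truthfulqa(item))
# What happens if you crack your knuckles a lot?
#
# A: Nothing in particular happens.
# B: You will get arthritis.
# A|B?

The expected completion is a single letter ("A" here), which is why the evaluator calls complete() with max_tokens=1 and compares only response[:1].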
frontend/src/components/ScoreColumns.js CHANGED
@@ -14,7 +14,7 @@ const ScoreColumns = [
   <Column
     field='average'
     header='Proficiency'
-    headerTooltip='Language Proficiency Score (average translation and classification scores, after min-max normalization)'
+    headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
     body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
languages.json CHANGED
@@ -79,7 +79,7 @@
     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
-    "commonvoice_hours":1064.0,
+    "commonvoice_hours":1065.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
@@ -1375,7 +1375,7 @@
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
-    "commonvoice_hours":410.0,
+    "commonvoice_hours":411.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
@@ -1747,7 +1747,7 @@
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
-    "commonvoice_hours":0.4,
+    "commonvoice_hours":0.5,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
@@ -2323,7 +2323,7 @@
     "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
+    "commonvoice_hours":1.2,
     "commonvoice_locale":"brh",
     "in_benchmark":false
   },
@@ -2623,7 +2623,7 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
+    "commonvoice_hours":0.9,
     "commonvoice_locale":"haz",
     "in_benchmark":false
   },
@@ -4651,7 +4651,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":29.0,
+    "commonvoice_hours":30.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },
models.json CHANGED
@@ -15,6 +15,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -34,6 +35,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -53,6 +55,7 @@
       "classification",
       "mmlu",
      "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -72,6 +75,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -91,6 +95,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -110,6 +115,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -129,6 +135,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -141,13 +148,14 @@
     "size":684531386000.0,
     "type":"open-source",
     "license":"Mit",
-    "creation_date":1748390400000,
+    "creation_date":1748390400000.0,
     "tasks":[
       "translation_from",
       "translation_to",
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -167,6 +175,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -186,6 +195,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -205,6 +215,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -271,12 +282,14 @@
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1750118400000.0,
+    "creation_date":1750118400000,
     "tasks":[
       "translation_from",
       "translation_to",
       "classification",
       "mmlu",
+      "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -332,6 +345,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -351,6 +365,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -370,6 +385,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -404,6 +420,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -423,6 +440,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -442,6 +460,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -485,6 +504,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -504,6 +524,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -523,6 +544,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -542,6 +564,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -561,6 +584,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -580,6 +604,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -599,6 +624,27 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"openai\/gpt-3.5-turbo-0613",
+    "name":"GPT-3.5 Turbo (older v0613)",
+    "provider_name":"OpenAI",
+    "cost":2.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1706140800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -611,8 +657,16 @@
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1744588800000.0,
-    "tasks":null
+    "creation_date":1744588800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
   },
   {
     "id":"openai\/gpt-4.1-mini",
@@ -630,6 +684,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -649,6 +704,7 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
@@ -668,6 +724,67 @@
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"qwen\/qwen3-235b-a22b",
+    "name":"Qwen3 235B A22B",
+    "provider_name":"Qwen",
+    "cost":0.0,
+    "hf_id":"Qwen\/Qwen3-235B-A22B",
+    "size":235093634560.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1745712000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"qwen\/qwen3-30b-a3b",
+    "name":"Qwen3 30B A3B",
+    "provider_name":"Qwen",
+    "cost":0.0,
+    "hf_id":"Qwen\/Qwen3-30B-A3B",
+    "size":30532122624.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1745712000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"qwen\/qwen3-32b",
+    "name":"Qwen3 32B",
+    "provider_name":"Qwen",
+    "cost":0.0,
+    "hf_id":"Qwen\/Qwen3-32B",
+    "size":32762123264.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1745712000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
       "mgsm"
     ]
   }
results.json CHANGED
The diff for this file is too large to render. See raw diff