Spaces:

fair-forward
/

evals-for-every-language

Running

App Files Files Community

davidpomerenke commited on 7 days ago

Commit

98c6811

verified ·

1 Parent(s): 4c5c136

Upload from GitHub Actions: Get more results, compute average based on all tasks

Browse files

Files changed (10) hide show

datasets.json +8 -8
evals/backend.py +3 -4
evals/datasets_/truthfulqa.py +30 -0
evals/main.py +1 -1
evals/models.py +11 -13
evals/tasks.py +79 -9
frontend/src/components/ScoreColumns.js +1 -1
languages.json +6 -6
models.json +121 -4
results.json +0 -0

datasets.json CHANGED Viewed

@@ -256,7 +256,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "MMLU",
-        "implemented": false,
         "group": "Multitask Language Understanding"
     },
     {
@@ -300,7 +300,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "MGSM",
-        "implemented": false,
         "group": "Grade School Math"
     },
     {
@@ -315,7 +315,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "MGSM",
-        "implemented": false,
         "group": "Grade School Math"
     },
     {
@@ -345,7 +345,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
-        "implemented": false,
         "group": "Abstract Reasoning"
     },
     {
@@ -360,7 +360,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
-        "implemented": false,
         "group": "Abstract Reasoning"
     },
     {
@@ -375,7 +375,7 @@
         "parallel": true,
         "translation": "human",
         "base": "TruthfulQA",
-        "implemented": false,
         "group": "Truthfulness"
     },
     {
@@ -390,7 +390,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
-        "implemented": false,
         "group": "Truthfulness"
     },
     {
@@ -405,7 +405,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
-        "implemented": false,
         "group": "Truthfulness"
     },
     {

         "parallel": true,
         "translation": "machine",
         "base": "MMLU",
+        "implemented": true,
         "group": "Multitask Language Understanding"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "MGSM",
+        "implemented": true,
         "group": "Grade School Math"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "MGSM",
+        "implemented": true,
         "group": "Grade School Math"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
+        "implemented": true,
         "group": "Abstract Reasoning"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
+        "implemented": true,
         "group": "Abstract Reasoning"
     },
     {
         "parallel": true,
         "translation": "human",
         "base": "TruthfulQA",
+        "implemented": true,
         "group": "Truthfulness"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
+        "implemented": true,
         "group": "Truthfulness"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
+        "implemented": true,
         "group": "Truthfulness"
     },
     {

evals/backend.py CHANGED Viewed

@@ -26,11 +26,10 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
     "mgsm_accuracy",
 ]
-task_metrics_basic = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
 def compute_normalized_average(df, metrics):
     """Compute average of min-max normalized metric columns."""
@@ -58,7 +57,7 @@ def make_model_table(df, models):
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-    df["average"] = compute_normalized_average(df, task_metrics_basic)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
@@ -93,7 +92,7 @@ def make_language_table(df, languages):
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-    df["average"] = compute_normalized_average(df, task_metrics_basic)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
     df = df[

     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
+    "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
 def compute_normalized_average(df, metrics):
     """Compute average of min-max normalized metric columns."""
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
     df = df[

evals/datasets_/truthfulqa.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import random
+from collections import Counter, defaultdict
+from langcodes import Language, standardize_tag
+from rich import print
+from datasets_.util import _get_dataset_config_names, _load_dataset
+slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
+tags_uhura_truthfulqa = {
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
+    if a.endswith("multiple_choice")
+}
+def add_choices(row):
+    row["choices"] = row["mc1_targets"]["choices"]
+    row["labels"] = row["mc1_targets"]["labels"]
+    return row
+def load_truthfulqa(language_bcp_47, nr):
+    if language_bcp_47 in tags_uhura_truthfulqa.keys():
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
+        ds = ds.map(add_choices)
+        examples = ds["train"]
+        task = ds["test"][nr]
+        return "masakhane/uhura-truthfulqa", examples, task
+    else:
+        return None, None, None

evals/main.py CHANGED Viewed

@@ -15,7 +15,7 @@ n_sentences = 10
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
-    for n_languages in range(20, 100, 20):
         print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
         old_models = pd.read_json("models.json")

 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
+    for n_languages in range(10, 101, 10):
         print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
         old_models = pd.read_json("models.json")

evals/models.py CHANGED Viewed

@@ -11,8 +11,7 @@ from elevenlabs import AsyncElevenLabs
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
-from langcodes import closest_supported_match
-from openai import AsyncOpenAI, PermissionDeniedError
 from requests import HTTPError, get
 # for development purposes, all languages will be evaluated on the fast models
@@ -23,12 +22,12 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
-    # "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
     # "openai/gpt-4o-2024-11-20", # 10$
-    # "openai/gpt-3.5-turbo-0613",  # 2$
     # "openai/gpt-3.5-turbo",  # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
@@ -37,6 +36,9 @@ important_models = [
     "google/gemini-2.5-flash",  # 0.6$
     "google/gemini-2.0-flash-lite-001",  # 0.3$
     "google/gemma-3-27b-it",  # 0.2$
     # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
     # "qwen/qwq-32b",  # 0.2$
     # "qwen/qwen-2.5-72b-instruct",  # 0.39$
@@ -49,7 +51,6 @@ important_models = [
 ]
 blocklist = [
-    "microsoft/wizardlm-2-8x22b",  # temporarily rate-limited
     "google/gemini-2.5-pro-preview",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
@@ -150,9 +151,10 @@ async def complete(**kwargs) -> str | None:
     async with openrouter_rate_limit:
         try:
             response = await client.chat.completions.create(**kwargs)
-        except PermissionDeniedError as e:
-            print(e)
-            return None
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()
@@ -281,13 +283,9 @@ def load_models(date: date):
     )
     # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
-        ["translation_from", "translation_to", "classification", "mmlu", "arc", "mgsm"]
     ] * len(models)
     models = pd.concat([models, get_translation_models()])
-    models = models[  # temporary fix FIXME
-        (models["id"] != "google/gemini-2.5-pro")
-        & (models["id"] != "google/gemini-2.5-pro-preview")
-    ]
     return models

 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
+from openai import AsyncOpenAI, BadRequestError
 from requests import HTTPError, get
 # for development purposes, all languages will be evaluated on the fast models
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
+    "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
     # "openai/gpt-4o-2024-11-20", # 10$
+    "openai/gpt-3.5-turbo-0613",  # 2$
     # "openai/gpt-3.5-turbo",  # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
     "google/gemini-2.5-flash",  # 0.6$
     "google/gemini-2.0-flash-lite-001",  # 0.3$
     "google/gemma-3-27b-it",  # 0.2$
+    "qwen/qwen3-32b",
+    "qwen/qwen3-235b-a22b",
+    "qwen/qwen3-30b-a3b",  # 0.29$
     # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
     # "qwen/qwq-32b",  # 0.2$
     # "qwen/qwen-2.5-72b-instruct",  # 0.39$
 ]
 blocklist = [
     "google/gemini-2.5-pro-preview",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
     async with openrouter_rate_limit:
         try:
             response = await client.chat.completions.create(**kwargs)
+        except BadRequestError as e:
+            if "filtered" in e.message:
+                return None
+            raise e
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()
     )
     # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
+        ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
     ] * len(models)
     models = pd.concat([models, get_translation_models()])
     return models

evals/tasks.py CHANGED Viewed

@@ -9,6 +9,7 @@ from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
 from languages import languages, script_name
@@ -224,6 +225,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 def format_multiple_choice(item):
     return f"""{item["question"]}
@@ -234,6 +236,7 @@ def format_multiple_choice(item):
     A|B|C|D?"""
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
     ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
@@ -253,7 +256,10 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             temperature=0,
             max_tokens=1,
         )
-        acc = int(response[:1].strip() == task["answer"])
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
@@ -270,11 +276,12 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 async def arc_and_evaluate(model, language_bcp_47, nr):
     ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
     messages = []
     for example in examples:
         messages += [
@@ -289,7 +296,10 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             temperature=0,
             max_tokens=1,
         )
-        acc = int(response[:1].strip() == task["answer"])
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
@@ -305,7 +315,68 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
         }
     ]
 async def mgsm_and_evaluate(model, language_bcp_47, nr):
     system_prompt = """
@@ -325,11 +396,9 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
         temperature=0,
         max_tokens=1024,
     )
-    number = response.split("####")
-    if len(number) == 2:
-        accuracy = int(
-            parse_number(number[1].strip()) == parse_number(question["answer_number"])
-        )
     else:
         accuracy = 0
@@ -383,6 +452,7 @@ tasks = {
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "mgsm": mgsm_and_evaluate,
     # "asr": transcribe_and_evaluate,
 }

 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
+from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
 from languages import languages, script_name
         }
     ]
 def format_multiple_choice(item):
     return f"""{item["question"]}
     A|B|C|D?"""
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
     ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
             temperature=0,
             max_tokens=1,
         )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
         }
     ]
 async def arc_and_evaluate(model, language_bcp_47, nr):
     ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
     messages = []
     for example in examples:
         messages += [
             temperature=0,
             max_tokens=1,
         )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
             "sentence_nr": nr,
         }
     ]
+letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+def shuffle_choices_and_labels(item):
+    indices = list(range(len(item["choices"])))
+    random.shuffle(indices)
+    item["choices"] = [item["choices"][i] for i in indices]
+    item["labels"] = [item["labels"][i] for i in indices]
+    return item
+def format_multiple_choice_truthfulqa(item):
+    text = item["question"] + "\n\n"
+    for i, choice in enumerate(item["choices"]):
+        text += f"{letters[i]}: {choice}\n"
+    text += "|".join(letters[: len(item["choices"])]) + "?"
+    return text
+async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
+    ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
+    if not task:
+        return []
+    task = shuffle_choices_and_labels(task)
+    answer = letters[task["labels"].index(1)]
+    messages = []
+    for example in examples:
+        example = shuffle_choices_and_labels(example)
+        messages += [
+            {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
+            {"role": "assistant", "content": letters[example["labels"].index(1)]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == answer)
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "truthfulqa",
+            "metric": "accuracy",
+            "score": acc,
+            "sentence_nr": nr,
+        }
+    ]
 async def mgsm_and_evaluate(model, language_bcp_47, nr):
     system_prompt = """
         temperature=0,
         max_tokens=1024,
     )
+    if response and len(response.split("####")) == 2:
+        number = response.split("####")[1].strip()
+        accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
         accuracy = 0
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
+    "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
     # "asr": transcribe_and_evaluate,
 }

frontend/src/components/ScoreColumns.js CHANGED Viewed

@@ -14,7 +14,7 @@ const ScoreColumns = [
   <Column
     field='average'
     header='Proficiency'
-    headerTooltip='Language Proficiency Score (average translation and classification scores, after min-max normalization)'
     sortable
     body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}

   <Column
     field='average'
     header='Proficiency'
+    headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
     body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}

languages.json CHANGED Viewed

@@ -79,7 +79,7 @@
     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
-    "commonvoice_hours":1064.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
@@ -1375,7 +1375,7 @@
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
-    "commonvoice_hours":410.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
@@ -1747,7 +1747,7 @@
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
-    "commonvoice_hours":0.4,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
@@ -2323,7 +2323,7 @@
     "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
     "commonvoice_locale":"brh",
     "in_benchmark":false
   },
@@ -2623,7 +2623,7 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
     "commonvoice_locale":"haz",
     "in_benchmark":false
   },
@@ -4651,7 +4651,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":29.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },

     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
+    "commonvoice_hours":1065.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
+    "commonvoice_hours":411.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
+    "commonvoice_hours":0.5,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
     "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":1.2,
     "commonvoice_locale":"brh",
     "in_benchmark":false
   },
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.9,
     "commonvoice_locale":"haz",
     "in_benchmark":false
   },
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":30.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },

models.json CHANGED Viewed

@@ -15,6 +15,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -34,6 +35,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -53,6 +55,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -72,6 +75,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -91,6 +95,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -110,6 +115,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -129,6 +135,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -141,13 +148,14 @@
     "size":684531386000.0,
     "type":"open-source",
     "license":"Mit",
-    "creation_date":1748390400000,
     "tasks":[
       "translation_from",
       "translation_to",
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -167,6 +175,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -186,6 +195,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -205,6 +215,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -271,12 +282,14 @@
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1750118400000.0,
     "tasks":[
       "translation_from",
       "translation_to",
       "classification",
       "mmlu",
       "mgsm"
     ]
   },
@@ -332,6 +345,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -351,6 +365,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -370,6 +385,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -404,6 +420,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -423,6 +440,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -442,6 +460,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -485,6 +504,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -504,6 +524,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -523,6 +544,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -542,6 +564,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -561,6 +584,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -580,6 +604,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -599,6 +624,27 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -611,8 +657,16 @@
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1744588800000.0,
-    "tasks":null
   },
   {
     "id":"openai\/gpt-4.1-mini",
@@ -630,6 +684,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -649,6 +704,7 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   },
@@ -668,6 +724,67 @@
       "classification",
       "mmlu",
       "arc",
       "mgsm"
     ]
   }

       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
     "size":684531386000.0,
     "type":"open-source",
     "license":"Mit",
+    "creation_date":1748390400000.0,
     "tasks":[
       "translation_from",
       "translation_to",
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
     "size":null,
     "type":"closed-source",
     "license":null,
+    "creation_date":1750118400000,
     "tasks":[
       "translation_from",
       "translation_to",
       "classification",
       "mmlu",
+      "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"openai\/gpt-3.5-turbo-0613",
+    "name":"GPT-3.5 Turbo (older v0613)",
+    "provider_name":"OpenAI",
+    "cost":2.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1706140800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
     "size":null,
     "type":"closed-source",
     "license":null,
+    "creation_date":1744588800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
   },
   {
     "id":"openai\/gpt-4.1-mini",
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
       "mgsm"
     ]
   },
       "classification",
       "mmlu",
       "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"qwen\/qwen3-235b-a22b",
+    "name":"Qwen3 235B A22B",
+    "provider_name":"Qwen",
+    "cost":0.0,
+    "hf_id":"Qwen\/Qwen3-235B-A22B",
+    "size":235093634560.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1745712000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"qwen\/qwen3-30b-a3b",
+    "name":"Qwen3 30B A3B",
+    "provider_name":"Qwen",
+    "cost":0.0,
+    "hf_id":"Qwen\/Qwen3-30B-A3B",
+    "size":30532122624.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1745712000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"qwen\/qwen3-32b",
+    "name":"Qwen3 32B",
+    "provider_name":"Qwen",
+    "cost":0.0,
+    "hf_id":"Qwen\/Qwen3-32B",
+    "size":32762123264.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1745712000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
       "mgsm"
     ]
   }

results.json CHANGED Viewed

The diff for this file is too large to render. See raw diff