{
  "results": {
    "b4b": {
      "acc,none": 0.7536991368680641,
      "acc_stderr,none": 0.09728135187806679,
      "acc_norm,none": 0.7536991368680641,
      "acc_norm_stderr,none": 0.09728135187806679,
      "alias": "b4b"
    },
    "b4bqa": {
      "acc,none": 0.8515625,
      "acc_stderr,none": 0.008401025189152976,
      "acc_norm,none": 0.8515625,
      "acc_norm_stderr,none": 0.008401025189152976,
      "alias": " - b4bqa"
    },
    "medmcqa_g2b": {
      "acc,none": 0.5977011494252874,
      "acc_stderr,none": 0.026323989201783506,
      "acc_norm,none": 0.5977011494252874,
      "acc_norm_stderr,none": 0.026323989201783506,
      "alias": " - medmcqa_g2b"
    },
    "medmcqa_orig_filtered": {
      "acc,none": 0.6925287356321839,
      "acc_stderr,none": 0.024771735192072118,
      "acc_norm,none": 0.6925287356321839,
      "acc_norm_stderr,none": 0.024771735192072118,
      "alias": " - medmcqa_orig_filtered"
    },
    "medqa_4options_g2b": {
      "acc,none": 0.5978835978835979,
      "acc_stderr,none": 0.025253032554997695,
      "acc_norm,none": 0.5978835978835979,
      "acc_norm_stderr,none": 0.025253032554997695,
      "alias": " - medqa_4options_g2b"
    },
    "medqa_4options_orig_filtered": {
      "acc,none": 0.6455026455026455,
      "acc_stderr,none": 0.024636830602842,
      "acc_norm,none": 0.6455026455026455,
      "acc_norm_stderr,none": 0.024636830602842,
      "alias": " - medqa_4options_orig_filtered"
    }
  },
  "groups": {
    "b4b": {
      "acc,none": 0.7536991368680641,
      "acc_stderr,none": 0.09728135187806679,
      "acc_norm,none": 0.7536991368680641,
      "acc_norm_stderr,none": 0.09728135187806679,
      "alias": "b4b"
    }
  },
  "configs": {
    "b4bqa": {
      "task": "b4bqa",
      "dataset_path": "AIM-Harvard/b4b_drug_qa",
      "test_split": "test",
      "doc_to_text": "<function process_cd at 0x7f7ab0e88700>",
      "doc_to_target": "correct_choice",
      "doc_to_choice": [
        "A",
        "B",
        "C",
        "D"
      ],
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false
    },
    "medmcqa_g2b": {
      "task": "medmcqa_g2b",
      "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "validation",
      "doc_to_text": "<function doc_to_text at 0x7f7ab0ed1f30>",
      "doc_to_target": "cop",
      "doc_to_choice": [
        "A",
        "B",
        "C",
        "D"
      ],
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "{{question}}"
    },
    "medmcqa_orig_filtered": {
      "task": "medmcqa_orig_filtered",
      "dataset_path": "AIM-Harvard/medmcqa_original",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "validation",
      "doc_to_text": "<function doc_to_text at 0x7f7aa1ac9120>",
      "doc_to_target": "cop",
      "doc_to_choice": [
        "A",
        "B",
        "C",
        "D"
      ],
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "{{question}}"
    },
    "medqa_4options_g2b": {
      "task": "medqa_4options_g2b",
      "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "test",
      "doc_to_text": "<function doc_to_text at 0x7f7ab0d90700>",
      "doc_to_target": "<function doc_to_target at 0x7f7ab0d90a60>",
      "doc_to_choice": [
        "A",
        "B",
        "C",
        "D"
      ],
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false
    },
    "medqa_4options_orig_filtered": {
      "task": "medqa_4options_orig_filtered",
      "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "test",
      "doc_to_text": "<function doc_to_text at 0x7f7ab289e560>",
      "doc_to_target": "<function doc_to_target at 0x7f7ab0e6d000>",
      "doc_to_choice": [
        "A",
        "B",
        "C",
        "D"
      ],
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false
    }
  },
  "versions": {
    "b4b": "N/A",
    "b4bqa": "Yaml",
    "medmcqa_g2b": "Yaml",
    "medmcqa_orig_filtered": "Yaml",
    "medqa_4options_g2b": "Yaml",
    "medqa_4options_orig_filtered": "Yaml"
  },
  "n-shot": {
    "b4b": 0,
    "b4bqa": 0,
    "medmcqa_g2b": 0,
    "medmcqa_orig_filtered": 0,
    "medqa_4options_g2b": 0,
    "medqa_4options_orig_filtered": 0
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=01-ai/Yi-1.5-34B,parallelize=True,load_in_4bit=True",
    "batch_size": "auto",
    "batch_sizes": [
      64
    ],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": "928c7657"
}