Spaces:

Tzktz
/

Dit-document-layout-analysis

Running

App Files Files Community

Dit-document-layout-analysis / unilm /adalm /finetune /utils_for_glue.py

Tzktz

Upload 7664 files

6fc683c verified over 1 year ago

raw

history blame contribute delete

31.1 kB

	# coding=utf-8
	# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
	# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" GLUE processors and helpers """

	import logging
	import os
	import csv
	import sys
	import copy
	import json
	from scipy.stats import pearsonr, spearmanr
	from sklearn.metrics import matthews_corrcoef, f1_score
	from sklearn.preprocessing import MultiLabelBinarizer
	logger = logging.getLogger(__name__)


	class InputExample(object):
	    """
	    One raw (untokenized) example for sequence classification.

	    Args:
	        guid: Unique id for the example.
	        text_a: string. Untokenized text of the first sequence; the only
	            text needed for single-sequence tasks.
	        text_b: (Optional) string. Untokenized text of the second sequence,
	            needed only for sequence-pair tasks.
	        label: (Optional) string. Label of the example; expected for train
	            and dev examples, not for test examples.
	    """

	    def __init__(self, guid, text_a, text_b=None, label=None):
	        self.guid, self.text_a = guid, text_a
	        self.text_b, self.label = text_b, label

	    def __repr__(self):
	        # The printable form is simply the JSON dump of the attributes.
	        return str(self.to_json_string())

	    def to_dict(self):
	        """Return a deep copy of the instance attributes as a dictionary."""
	        return copy.deepcopy(vars(self))

	    def to_json_string(self):
	        """Return the attributes as indented, key-sorted JSON plus a newline."""
	        payload = self.to_dict()
	        return "%s\n" % json.dumps(payload, indent=2, sort_keys=True)


	class InputFeatures(object):
	    """
	    The model-ready features for a single example.

	    Args:
	        input_ids: Indices of input sequence tokens in the vocabulary.
	        attention_mask: Mask to avoid performing attention on padding token
	            indices. Values are in ``[0, 1]``: usually ``1`` for tokens that
	            are NOT MASKED and ``0`` for MASKED (padded) tokens.
	        token_type_ids: Segment token indices marking the first and second
	            portions of the inputs.
	        label: Label corresponding to the input.
	    """

	    def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
	        self.input_ids, self.attention_mask = input_ids, attention_mask
	        self.token_type_ids, self.label = token_type_ids, label

	    def __repr__(self):
	        # The printable form is simply the JSON dump of the attributes.
	        return str(self.to_json_string())

	    def to_dict(self):
	        """Return a deep copy of the instance attributes as a dictionary."""
	        return copy.deepcopy(vars(self))

	    def to_json_string(self):
	        """Return the attributes as indented, key-sorted JSON plus a newline."""
	        payload = self.to_dict()
	        return "%s\n" % json.dumps(payload, indent=2, sort_keys=True)


	class DataProcessor(object):
	    """Base class for data converters for sequence classification data sets.

	    Subclasses implement the ``get_*`` methods; the ``_read_*`` class
	    methods are shared file-loading helpers.
	    """

	    def get_train_examples(self, data_dir):
	        """Gets a collection of `InputExample`s for the train set."""
	        raise NotImplementedError()

	    def get_dev_examples(self, data_dir):
	        """Gets a collection of `InputExample`s for the dev set."""
	        raise NotImplementedError()

	    def get_labels(self):
	        """Gets the list of labels for this data set."""
	        raise NotImplementedError()

	    @classmethod
	    def _read_tsv(cls, input_file, quotechar=None):
	        """Reads a tab separated value file.

	        Args:
	            input_file: Path of the file to read.
	            quotechar: Quote character forwarded to ``csv.reader``.

	        Returns:
	            A list of rows, each row being a list of string cells.
	        """
	        # "utf-8-sig" drops a leading byte-order mark if the file has one.
	        # The former Python 2 ``unicode`` conversion was unreachable, since
	        # ``open(..., encoding=...)`` is only valid on Python 3.
	        with open(input_file, "r", encoding="utf-8-sig") as f:
	            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

	    @classmethod
	    def _read_json(cls, input_file):
	        """Reads a file holding one JSON document and returns the parsed value."""
	        with open(input_file, "r", encoding="utf-8-sig") as f:
	            return json.load(f)

	    @classmethod
	    def _read_jsonl(cls, input_file):
	        """Reads a file and returns its raw lines (line endings kept, not parsed)."""
	        with open(input_file, "r", encoding="utf-8-sig") as f:
	            return f.readlines()

	def glue_convert_examples_to_features(examples, tokenizer,
	                                      max_length=512,
	                                      task=None,
	                                      label_list=None,
	                                      output_mode=None,
	                                      pad_on_left=False,
	                                      pad_token=0,
	                                      pad_token_segment_id=0,
	                                      mask_padding_with_zero=True):
	    """
	    Converts a collection of ``InputExample`` into a list of ``InputFeatures``

	    Args:
	        examples: Iterable of ``InputExample`` objects.
	        tokenizer: Tokenizer exposing ``encode_plus`` and ``convert_ids_to_tokens``
	        max_length: Length passed to the tokenizer and to which every feature is padded
	        task: GLUE task name; when given, missing ``label_list`` / ``output_mode``
	            are looked up from ``glue_processors`` / ``glue_output_modes``
	        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
	            May be omitted for regression.
	        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
	        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
	        pad_token: Padding token
	        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
	        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
	            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
	            actual values)

	    Returns:
	        A list of ``InputFeatures``, one per example, in input order.

	    Raises:
	        ValueError: ``output_mode`` is ``classification`` but no label list is available.
	        KeyError: ``task`` is unknown, ``output_mode`` is neither ``classification`` nor
	            ``regression``, or an example label is not in the label list.
	        AssertionError: a padded sequence does not end up exactly ``max_length`` long.
	    """
	    if task is not None:
	        processor = glue_processors[task]()
	        if label_list is None:
	            label_list = processor.get_labels()
	            logger.info("Using label list %s for task %s", label_list, task)
	        if output_mode is None:
	            output_mode = glue_output_modes[task]
	            logger.info("Using output mode %s for task %s", output_mode, task)

	    if output_mode == "classification" and label_list is None:
	        raise ValueError("label_list (or a task providing one) is required for classification")

	    # Regression never consults the label map, so a missing label_list is fine there.
	    label_map = {label: i for i, label in enumerate([] if label_list is None else label_list)}

	    real_mask_value = 1 if mask_padding_with_zero else 0
	    pad_mask_value = 0 if mask_padding_with_zero else 1

	    features = []
	    for ex_index, example in enumerate(examples):
	        if ex_index % 10000 == 0:
	            logger.info("Writing example %d", ex_index)

	        inputs = tokenizer.encode_plus(
	            example.text_a,
	            example.text_b,
	            add_special_tokens=True,
	            max_length=max_length,
	        )
	        input_ids = inputs["input_ids"]
	        token_type_ids = inputs["token_type_ids"] if "token_type_ids" in inputs else []

	        # The mask marks real tokens; only real tokens are attended to.
	        attention_mask = [real_mask_value] * len(input_ids)

	        # Pad up to the sequence length.
	        padding_length = max_length - len(input_ids)
	        ids_padding = [pad_token] * padding_length
	        mask_padding = [pad_mask_value] * padding_length
	        # A tokenizer that returned no token_type_ids gets a full row of the
	        # padding segment id instead of just the missing tail.
	        segment_padding_length = max_length if len(token_type_ids) == 0 else padding_length
	        segment_padding = [pad_token_segment_id] * segment_padding_length

	        if pad_on_left:
	            input_ids = ids_padding + input_ids
	            attention_mask = mask_padding + attention_mask
	            token_type_ids = segment_padding + token_type_ids
	        else:
	            input_ids = input_ids + ids_padding
	            attention_mask = attention_mask + mask_padding
	            token_type_ids = token_type_ids + segment_padding

	        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
	        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
	        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

	        if output_mode == "classification":
	            label = label_map[example.label]
	        elif output_mode == "regression":
	            label = float(example.label)
	        else:
	            raise KeyError(output_mode)

	        if ex_index < 5:
	            logger.info("* Example *")
	            logger.info("guid: %s", example.guid)
	            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
	            logger.info("input_tokens: %s", " ".join(tokenizer.convert_ids_to_tokens(input_ids)))
	            logger.info("attention_mask: %s", " ".join([str(x) for x in attention_mask]))
	            logger.info("token_type_ids: %s", " ".join([str(x) for x in token_type_ids]))
	            # %s rather than %d so float regression targets are not truncated.
	            logger.info("label: %s (id = %s)", example.label, label)

	        features.append(
	            InputFeatures(input_ids=input_ids,
	                          attention_mask=attention_mask,
	                          token_type_ids=token_type_ids,
	                          label=label))

	    return features


	class MrpcProcessor(DataProcessor):
	    """Processor for the MRPC data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build an example from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the two sentences are decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['sentence1'].numpy().decode('utf-8'),
	            text_b=tensor_dict['sentence2'].numpy().decode('utf-8'),
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``, logging the path first."""
	        train_path = os.path.join(data_dir, "train.tsv")
	        logger.info("LOOKING AT {}".format(train_path))
	        rows = self._read_tsv(train_path)
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_labels(self):
	        """Return the two class labels as strings."""
	        return ["0", "1"]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 is the label and columns 3 and 4 are the two sentences;
	        the guid is ``<set_type>-<row index>``.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, idx),
	                         text_a=row[3], text_b=row[4], label=row[0])
	            for idx, row in enumerate(lines)
	            if idx != 0
	        ]


	class MnliProcessor(DataProcessor):
	    """Processor for the MultiNLI data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build an example from the ``idx``, ``premise``, ``hypothesis`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the two texts are decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['premise'].numpy().decode('utf-8'),
	            text_b=tensor_dict['hypothesis'].numpy().decode('utf-8'),
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load the matched dev split, ``dev_matched.tsv``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev_matched.tsv"))
	        return self._create_examples(rows, "dev_matched")

	    def get_test_examples(self, data_dir):
	        """Load the matched test split, ``test_matched.tsv``."""
	        rows = self._read_tsv(os.path.join(data_dir, "test_matched.tsv"))
	        return self._create_examples(rows, "test_matched")

	    def get_labels(self):
	        """Return the three NLI class labels."""
	        return ["contradiction", "entailment", "neutral"]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 feeds the guid, columns 8 and 9 are the text pair and
	        the last column is the label.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, row[0]),
	                         text_a=row[8], text_b=row[9], label=row[-1])
	            for idx, row in enumerate(lines)
	            if idx != 0
	        ]


	class MnliMismatchedProcessor(MnliProcessor):
	    """Processor for the MultiNLI Mismatched data set (GLUE version)."""

	    def get_dev_examples(self, data_dir):
	        """Load the mismatched dev split, ``dev_mismatched.tsv``."""
	        dev_path = os.path.join(data_dir, "dev_mismatched.tsv")
	        rows = self._read_tsv(dev_path)
	        return self._create_examples(rows, "dev_mismatched")

	    def get_test_examples(self, data_dir):
	        """Load the mismatched test split, ``test_mismatched.tsv``."""
	        test_path = os.path.join(data_dir, "test_mismatched.tsv")
	        rows = self._read_tsv(test_path)
	        return self._create_examples(rows, "test_mismatched")


	class ColaProcessor(DataProcessor):
	    """Processor for the CoLA data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build a single-sentence example from the ``idx``, ``sentence`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the sentence is decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['sentence'].numpy().decode('utf-8'),
	            text_b=None,
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_labels(self):
	        """Return the two class labels as strings."""
	        return ["0", "1"]

	    def _create_examples(self, lines, set_type):
	        """Turn every TSV row into an example (no row is skipped).

	        Column 1 is the label and column 3 the sentence; the guid is
	        ``<set_type>-<row index>``.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, idx),
	                         text_a=row[3], text_b=None, label=row[1])
	            for idx, row in enumerate(lines)
	        ]


	class Sst2Processor(DataProcessor):
	    """Processor for the SST-2 data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build a single-sentence example from the ``idx``, ``sentence`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the sentence is decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['sentence'].numpy().decode('utf-8'),
	            text_b=None,
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_labels(self):
	        """Return the two class labels as strings."""
	        return ["0", "1"]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 is the sentence and column 1 the label; the guid is
	        ``<set_type>-<row index>``.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, idx),
	                         text_a=row[0], text_b=None, label=row[1])
	            for idx, row in enumerate(lines)
	            if idx != 0
	        ]


	class StsbProcessor(DataProcessor):
	    """Processor for the STS-B data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build an example from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the two sentences are decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['sentence1'].numpy().decode('utf-8'),
	            text_b=tensor_dict['sentence2'].numpy().decode('utf-8'),
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_test_examples(self, data_dir):
	        """Load ``test.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
	        return self._create_examples(rows, "test")

	    def get_labels(self):
	        """Return ``[None]``: the task is registered as regression, so there are no classes."""
	        return [None]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 feeds the guid, columns 1 and 2 are the sentence pair and
	        the last column is the label.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, row[0]),
	                         text_a=row[1], text_b=row[2], label=row[-1])
	            for idx, row in enumerate(lines)
	            if idx != 0
	        ]


	class QqpProcessor(DataProcessor):
	    """Processor for the QQP data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build an example from the ``idx``, ``question1``, ``question2`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the two questions are decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['question1'].numpy().decode('utf-8'),
	            text_b=tensor_dict['question2'].numpy().decode('utf-8'),
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_labels(self):
	        """Return the two class labels as strings."""
	        return ["0", "1"]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 feeds the guid, columns 3 and 4 are the question pair and
	        column 5 the label. Rows too short to supply those three columns
	        are dropped silently.
	        """
	        collected = []
	        for idx, row in enumerate(lines):
	            if idx == 0:
	                continue
	            # Built outside the try: a row with no columns at all still raises.
	            guid = "%s-%s" % (set_type, row[0])
	            try:
	                text_a, text_b, label = row[3], row[4], row[5]
	            except IndexError:
	                continue
	            collected.append(
	                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
	        return collected


	class QnliProcessor(DataProcessor):
	    """Processor for the QNLI data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build an example from the ``idx``, ``question``, ``sentence`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the two texts are decoded as UTF-8.
	        """
	        return InputExample(tensor_dict['idx'].numpy(),
	                            tensor_dict['question'].numpy().decode('utf-8'),
	                            tensor_dict['sentence'].numpy().decode('utf-8'),
	                            str(tensor_dict['label'].numpy()))

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        return self._create_examples(
	            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        # QNLI has a single dev split, so guids are tagged "dev"; the former
	        # "dev_matched" tag was an MNLI split name.
	        return self._create_examples(
	            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

	    def get_labels(self):
	        """Return the two class labels."""
	        return ["entailment", "not_entailment"]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 feeds the guid, columns 1 and 2 are the text pair and
	        the last column is the label.
	        """
	        examples = []
	        for (i, line) in enumerate(lines):
	            if i == 0:
	                continue
	            guid = "%s-%s" % (set_type, line[0])
	            text_a = line[1]
	            text_b = line[2]
	            label = line[-1]
	            examples.append(
	                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
	        return examples


	class RteProcessor(DataProcessor):
	    """Processor for the RTE data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build an example from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the two sentences are decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['sentence1'].numpy().decode('utf-8'),
	            text_b=tensor_dict['sentence2'].numpy().decode('utf-8'),
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_labels(self):
	        """Return the two class labels."""
	        return ["entailment", "not_entailment"]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 feeds the guid, columns 1 and 2 are the sentence pair and
	        the last column is the label.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, row[0]),
	                         text_a=row[1], text_b=row[2], label=row[-1])
	            for idx, row in enumerate(lines)
	            if idx != 0
	        ]


	class WnliProcessor(DataProcessor):
	    """Processor for the WNLI data set (GLUE version)."""

	    def get_example_from_tensor_dict(self, tensor_dict):
	        """Build an example from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

	        Each entry must expose ``.numpy()``; the two sentences are decoded as UTF-8.
	        """
	        return InputExample(
	            guid=tensor_dict['idx'].numpy(),
	            text_a=tensor_dict['sentence1'].numpy().decode('utf-8'),
	            text_b=tensor_dict['sentence2'].numpy().decode('utf-8'),
	            label=str(tensor_dict['label'].numpy()),
	        )

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_labels(self):
	        """Return the two class labels as strings."""
	        return ["0", "1"]

	    def _create_examples(self, lines, set_type):
	        """Turn TSV rows into examples, skipping the first (header) row.

	        Column 0 feeds the guid, columns 1 and 2 are the sentence pair and
	        the last column is the label.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, row[0]),
	                         text_a=row[1], text_b=row[2], label=row[-1])
	            for idx, row in enumerate(lines)
	            if idx != 0
	        ]


	class ChemProcessor(DataProcessor):
	    """Processor for the task registered as ``chemprot`` (single-text TSV data)."""

	    def get_train_examples(self, data_dir):
	        """Load ``train.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
	        return self._create_examples(rows, "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
	        return self._create_examples(rows, "dev")

	    def get_test_examples(self, data_dir):
	        """Load ``test.tsv`` from ``data_dir``."""
	        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
	        return self._create_examples(rows, "test")

	    def get_labels(self):
	        """Return the six class labels, ``"false"`` first."""
	        return ["false","CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"]

	    def _create_examples(self, lines, set_type):
	        """Turn every TSV row into an example (no row is skipped).

	        Column 0 feeds the guid, column 1 is the text and the last column
	        is the label.
	        """
	        return [
	            InputExample(guid="%s-%s" % (set_type, row[0]),
	                         text_a=row[1], label=row[-1])
	            for row in lines
	        ]

	class ARCProcessor(DataProcessor):
	    """Processor for the task registered as ``arc`` (JSON-lines data with ``text`` and ``label`` keys)."""

	    def get_train_examples(self, data_dir):
	        """Load ``train.jsonl`` from ``data_dir``."""
	        return self._create_examples(
	            self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.jsonl`` from ``data_dir``."""
	        return self._create_examples(
	            self._read_jsonl(os.path.join(data_dir, "dev.jsonl")), "dev")

	    def get_test_examples(self, data_dir):
	        """Load ``test.jsonl`` from ``data_dir``."""
	        return self._create_examples(
	            self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test")

	    def get_labels(self):
	        """Return the six class labels."""
	        return ["CompareOrContrast", "Background", "Uses", "Motivation", "Extends", "Future"]

	    def _create_examples(self, lines, set_type):
	        """Parse each JSON line into an example.

	        The guid is ``<set_type>-<line index>``; whitespace-only lines are
	        skipped but still count towards the index.
	        """
	        examples = []
	        for (i, line) in enumerate(lines):
	            # A blank line (e.g. a stray trailing newline) is not valid JSON.
	            if not line.strip():
	                continue
	            record = json.loads(line)
	            guid = "%s-%s" % (set_type, i)
	            examples.append(
	                InputExample(guid=guid, text_a=record["text"], label=record["label"]))
	        return examples

	class SCIProcessor(DataProcessor):
	    """Processor for the task registered as ``sci`` (JSON-lines data with ``text`` and ``label`` keys)."""

	    def get_train_examples(self, data_dir):
	        """Load ``train.jsonl`` from ``data_dir``."""
	        return self._create_examples(
	            self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train")

	    def get_dev_examples(self, data_dir):
	        """Load ``dev.jsonl`` from ``data_dir``."""
	        return self._create_examples(
	            self._read_jsonl(os.path.join(data_dir, "dev.jsonl")), "dev")

	    def get_test_examples(self, data_dir):
	        """Load ``test.jsonl`` from ``data_dir``."""
	        return self._create_examples(
	            self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test")

	    def get_labels(self):
	        """Return the seven class labels."""
	        return ["COMPARE","CONJUNCTION","FEATURE-OF","HYPONYM-OF","USED-FOR","EVALUATE-FOR","PART-OF"]

	    def _create_examples(self, lines, set_type):
	        """Parse each JSON line into an example.

	        The guid is ``<set_type>-<line index>``; whitespace-only lines are
	        skipped but still count towards the index.
	        """
	        examples = []
	        for (i, line) in enumerate(lines):
	            # A blank line (e.g. a stray trailing newline) is not valid JSON.
	            if not line.strip():
	                continue
	            record = json.loads(line)
	            guid = "%s-%s" % (set_type, i)
	            examples.append(
	                InputExample(guid=guid, text_a=record["text"], label=record["label"]))
	        return examples

	# Number of output labels per task name. "sts-b" is a regression task and
	# therefore has a single output. "mnli-mm" is listed alongside "mnli" so
	# every task that has a processor and an output mode also has a label count.
	glue_tasks_num_labels = {
	    "cola": 2,
	    "mnli": 3,
	    "mnli-mm": 3,
	    "mrpc": 2,
	    "sst-2": 2,
	    "sts-b": 1,
	    "qqp": 2,
	    "qnli": 2,
	    "rte": 2,
	    "wnli": 2,
	    "chemprot": 6,
	    "arc": 6,
	    "sci": 7,
	}

	# Registry mapping each supported task name to the processor class that
	# loads its data. glue_convert_examples_to_features instantiates the entry
	# for its ``task`` argument to obtain the default label list.
	glue_processors = {
	"cola": ColaProcessor,
	"mnli": MnliProcessor,
	"mnli-mm": MnliMismatchedProcessor,
	"mrpc": MrpcProcessor,
	"sst-2": Sst2Processor,
	"sts-b": StsbProcessor,
	"qqp": QqpProcessor,
	"qnli": QnliProcessor,
	"rte": RteProcessor,
	"wnli": WnliProcessor,
	"chemprot": ChemProcessor,
	"arc": ARCProcessor,
	"sci": SCIProcessor,
	}

	# Output mode per task name: every task is "classification" except
	# "sts-b", which is "regression". The tuple fixes the key order.
	glue_output_modes = {
	    task_name: ("regression" if task_name == "sts-b" else "classification")
	    for task_name in (
	        "cola", "mnli", "mnli-mm", "mrpc", "sst-2", "sts-b", "qqp",
	        "qnli", "rte", "wnli", "chemprot", "arc", "sci",
	    )
	}

	def simple_accuracy(preds, labels):
	    """Return the mean of the element-wise comparison ``preds == labels``.

	    The comparison result must expose ``.mean()``, so both arguments are
	    presumably NumPy arrays of equal shape -- confirm against callers.
	    """
	    matches = preds == labels
	    return matches.mean()


	def acc_and_f1(preds, labels):
	    """Return accuracy, F1 (``f1_score`` defaults) and their average.

	    Returns:
	        Dict with keys ``acc``, ``f1`` and ``acc_and_f1``.
	    """
	    scores = {"acc": simple_accuracy(preds, labels)}
	    scores["f1"] = f1_score(y_true=labels, y_pred=preds)
	    scores["acc_and_f1"] = (scores["acc"] + scores["f1"]) / 2
	    return scores

	def acc_and_macro_f1(preds, labels):
	    """Return macro-averaged F1, accuracy and their average.

	    Returns:
	        Dict with keys ``f1``, ``acc`` and ``acc_and_f1``.
	    """
	    accuracy = simple_accuracy(preds, labels)
	    macro_f1 = f1_score(y_true=labels, y_pred=preds, average="macro")
	    combined = (accuracy + macro_f1) / 2
	    return dict(f1=macro_f1, acc=accuracy, acc_and_f1=combined)

	def acc_and_micro_f1(preds, labels, label_list):
	    """Return accuracy plus micro- and macro-F1 over binarised label ids.

	    Only the length of ``label_list`` is used: the binariser classes are
	    the strings "1" .. "len(label_list)". ``preds`` and ``labels`` must
	    expose ``.tolist()``, and every id is stringified before binarising.
	    NOTE(review): ids outside those classes (notably 0) are presumably
	    ignored by ``MultiLabelBinarizer``, and multi-digit ids would be split
	    into characters -- verify against scikit-learn before relying on it.
	    Several debugging ``print`` calls write to stdout.

	    Returns:
	        Dict with keys ``f1`` (micro), ``acc``, ``f1_macro`` and
	        ``acc_and_f1`` (mean of accuracy and micro-F1).
	    """
	    accuracy = simple_accuracy(preds, labels)

	    print(label_list)
	    label_list = [str(idx) for idx in range(1, len(label_list) + 1)]
	    print(label_list)
	    binarizer = MultiLabelBinarizer(classes = label_list)

	    gold_strings = list(map(str, labels.tolist()))
	    print(gold_strings[:20])
	    gold = binarizer.fit_transform(gold_strings)

	    pred_strings = list(map(str, preds.tolist()))
	    print(pred_strings[:20])
	    predicted = binarizer.fit_transform(pred_strings)

	    micro_f1 = f1_score(y_true=gold, y_pred=predicted, average="micro")
	    macro_f1 = f1_score(y_true=gold, y_pred=predicted, average="macro")
	    return dict(
	        f1=micro_f1,
	        acc=accuracy,
	        f1_macro=macro_f1,
	        acc_and_f1=(accuracy + micro_f1) / 2,
	    )

	def pearson_and_spearman(preds, labels):
	    """Return the Pearson and Spearman correlations and their average.

	    Returns:
	        Dict with keys ``pearson``, ``spearmanr`` and ``corr``.
	    """
	    scores = {
	        "pearson": pearsonr(preds, labels)[0],
	        "spearmanr": spearmanr(preds, labels)[0],
	    }
	    scores["corr"] = (scores["pearson"] + scores["spearmanr"]) / 2
	    return scores


	def glue_compute_metrics(task_name, preds, labels, label_list):
	    """Compute the evaluation metrics associated with ``task_name``.

	    Args:
	        task_name: Task identifier selecting the metric.
	        preds: Predictions; must have the same length as ``labels``.
	        labels: Gold labels.
	        label_list: Label names; only forwarded for the ``chemprot`` task.

	    Returns:
	        A dict of metric name to value.

	    Raises:
	        AssertionError: ``preds`` and ``labels`` differ in length.
	        KeyError: ``task_name`` is not a known task.
	    """
	    assert len(preds) == len(labels)

	    if task_name == "cola":
	        return {"mcc": matthews_corrcoef(labels, preds)}
	    # Tasks scored by plain accuracy.
	    if task_name in ("sst-2", "mnli", "mnli-mm", "qnli", "rte", "wnli"):
	        return {"acc": simple_accuracy(preds, labels)}
	    if task_name in ("mrpc", "qqp"):
	        return acc_and_f1(preds, labels)
	    if task_name == "sts-b":
	        return pearson_and_spearman(preds, labels)
	    if task_name == "chemprot":
	        return acc_and_micro_f1(preds, labels, label_list)
	    if task_name in ("arc", "sci"):
	        return acc_and_macro_f1(preds, labels)
	    raise KeyError(task_name)