Spaces:

abdiharyadi
/

kancilgpt

Sleeping

App Files Files Community

abdiharyadi committed on Oct 30, 2024

Commit

e18c38e

1 Parent(s): 632ca18

fix: update IndoNLGTokenizer

Browse files

Files changed (1) hide show

indobenchmark.py +3 -261

indobenchmark.py CHANGED Viewed

@@ -14,21 +14,11 @@
 # limitations under the License
 """ Tokenization classes for IndoNLG model."""
-from typing import Dict, List, Optional, Tuple, Union
-from transformers import PreTrainedTokenizer, BatchEncoding
-from collections.abc import Mapping
-from transformers.utils import (
-    PaddingStrategy,
-    TensorType,
-    is_tf_available,
-    is_torch_available,
-    logging,
-    to_py_obj,
-)
-import numpy as np
 import sentencepiece as spm
-from transformers.utils.generic import _is_tensorflow, _is_torch
 logger = logging.get_logger(__name__)
@@ -350,251 +340,3 @@ class IndoNLGTokenizer(PreTrainedTokenizer):
     def decode(self, inputs, skip_special_tokens=False, **kwargs):
         """
         Decode token ids into text and undo the SentencePiece word-boundary marker.

         The parent class' `decode` is called first. Every plain space in its result is then
         removed, and each `SPIECE_UNDERLINE` marker is turned into a single space.

         Args:
             inputs: Token ids, forwarded unchanged to the parent `decode`.
             skip_special_tokens: Forwarded unchanged to the parent `decode`.
             **kwargs: Extra keyword arguments forwarded unchanged to the parent `decode`.

         Returns:
             The post-processed string.
         """
         decoded = super().decode(inputs, skip_special_tokens=skip_special_tokens, **kwargs)
         # Order matters: plain spaces are dropped before the markers are turned into spaces.
         without_spaces = decoded.replace(' ', '')
         return without_spaces.replace(SPIECE_UNDERLINE, ' ')
-    def _pad_decoder(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "decoder_attention_mask" in self.model_input_names
-        required_input = encoded_inputs[self.model_input_names[2]]
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-        # Initialize attention mask if not present.
-        if return_attention_mask and "decoder_attention_mask" not in encoded_inputs:
-            encoded_inputs["decoder_attention_mask"] = [1] * len(required_input)
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-            if self.padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["decoder_attention_mask"] = encoded_inputs["decoder_attention_mask"] + [0] * difference
-                if "decoder_token_type_ids" in encoded_inputs:
-                    encoded_inputs["decoder_token_type_ids"] = (
-                        encoded_inputs["decoder_token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "decoder_special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["decoder_special_tokens_mask"] = encoded_inputs["decoder_special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[2]] = required_input + [self.pad_token_id] * difference
-                label_input = encoded_inputs[self.model_input_names[4]]
-                encoded_inputs[self.model_input_names[4]] = label_input + [-100] * difference
-            elif self.padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["decoder_attention_mask"] = [0] * difference + encoded_inputs["decoder_attention_mask"]
-                if "decoder_token_type_ids" in encoded_inputs:
-                    encoded_inputs["decoder_token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "decoder_token_type_ids"
-                    ]
-                if "decoder_special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["decoder_special_tokens_mask"] = [1] * difference + encoded_inputs["decoder_special_tokens_mask"]
-                encoded_inputs[self.model_input_names[2]] = [self.pad_token_id] * difference + required_input
-                label_input = encoded_inputs[self.model_input_names[4]]
-                encoded_inputs[self.model_input_names[4]] = label_input + [-100] * difference
-            else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
-        return encoded_inputs
-    def pad(self,
-        encoded_inputs: Union[
-            BatchEncoding,
-            List[BatchEncoding],
-            Dict[str, EncodedInput],
-            Dict[str, List[EncodedInput]],
-            List[Dict[str, EncodedInput]],
-        ],
-        padding: Union[bool, str, PaddingStrategy] = True,
-        max_length: Optional[int] = None,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        """
-        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
-        in the batch.
-        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
-        `self.pad_token_id` and `self.pad_token_type_id`)
-        <Tip>
-        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
-        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
-        PyTorch tensors, you will lose the specific device of your tensors however.
-        </Tip>
-        Args:
-            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
-                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
-                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
-                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
-                collate function.
-                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
-                the note above for the return type.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
-                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                 index) among:
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            return_attention_mask (`bool`, *optional*):
-                Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the `return_outputs` attribute.
-                [What are attention masks?](../glossary#attention-mask)
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
-            verbose (`bool`, *optional*, defaults to `True`):
-                Whether or not to print more information and warnings.
-        """
-        # If we have a list of dicts, let's convert it in a dict of lists
-        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
-        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
-            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
-        # The model's main input name, usually `input_ids`, has be passed for padding
-        if self.model_input_names[0] not in encoded_inputs:
-            raise ValueError(
-                "You should supply an encoding or a list of encodings to this method "
-                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
-            )
-        required_input = encoded_inputs[self.model_input_names[0]]
-        if not required_input:
-            if return_attention_mask:
-                encoded_inputs["attention_mask"] = []
-            return encoded_inputs
-        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
-        # and rebuild them afterwards if no return_tensors is specified
-        # Note that we lose the specific device the tensor may be on for PyTorch
-        first_element = required_input[0]
-        if isinstance(first_element, (list, tuple)):
-            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
-            for item in required_input:
-                if len(item) != 0:
-                    first_element = item[0]
-                    break
-        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
-        if not isinstance(first_element, (int, list, tuple)):
-            if is_tf_available() and _is_tensorflow(first_element):
-                return_tensors = "tf" if return_tensors is None else return_tensors
-            elif is_torch_available() and _is_torch(first_element):
-                return_tensors = "pt" if return_tensors is None else return_tensors
-            elif isinstance(first_element, np.ndarray):
-                return_tensors = "np" if return_tensors is None else return_tensors
-            else:
-                raise ValueError(
-                    f"type of {first_element} unknown: {type(first_element)}. "
-                    f"Should be one of a python, numpy, pytorch or tensorflow object."
-                )
-            for key, value in encoded_inputs.items():
-                encoded_inputs[key] = to_py_obj(value)
-        # Convert padding_strategy in PaddingStrategy
-        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
-            padding=padding, max_length=max_length, verbose=verbose
-        )
-        required_input = encoded_inputs[self.model_input_names[0]]
-        if required_input and not isinstance(required_input[0], (list, tuple)):
-            encoded_inputs = self._pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding_strategy=padding_strategy,
-                pad_to_multiple_of=pad_to_multiple_of,
-                return_attention_mask=return_attention_mask,
-            )
-            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
-        batch_size = len(required_input)
-        assert all(
-            len(v) == batch_size for v in encoded_inputs.values()
-        ), "Some items in the output dictionary have a different batch size than others."
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = max(len(inputs) for inputs in required_input)
-            padding_strategy = PaddingStrategy.MAX_LENGTH
-        batch_outputs = {}
-        for i in range(batch_size):
-            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
-            outputs = self._pad(
-                inputs,
-                max_length=max_length,
-                padding_strategy=padding_strategy,
-                pad_to_multiple_of=pad_to_multiple_of,
-                return_attention_mask=return_attention_mask,
-            )
-            # Handle decoder_input_ids
-            if self.model_input_names[2] in outputs:
-                max_decoder_length = max(len(inputs) for inputs in encoded_inputs[self.model_input_names[2]])
-                outputs = self._pad_decoder(
-                    outputs,
-                    max_length=max_decoder_length,
-                    padding_strategy=padding_strategy,
-                    pad_to_multiple_of=pad_to_multiple_of,
-                    return_attention_mask=return_attention_mask,
-                )
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-        return BatchEncoding(batch_outputs, tensor_type=return_tensors)

 # limitations under the License
 """ Tokenization classes for IndoNLG model."""
+from typing import List, Optional, Tuple, Union
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging
 import sentencepiece as spm
 logger = logging.get_logger(__name__)
     def decode(self, inputs, skip_special_tokens=False, **kwargs):
         """
         Decode token ids into text and undo the SentencePiece word-boundary marker.

         The parent class' `decode` is called first. Every plain space in its result is then
         removed, and each `SPIECE_UNDERLINE` marker is turned into a single space.

         Args:
             inputs: Token ids, forwarded unchanged to the parent `decode`.
             skip_special_tokens: Forwarded unchanged to the parent `decode`.
             **kwargs: Extra keyword arguments forwarded unchanged to the parent `decode`.

         Returns:
             The post-processed string.
         """
         decoded = super().decode(inputs, skip_special_tokens=skip_special_tokens, **kwargs)
         # Order matters: plain spaces are dropped before the markers are turned into spaces.
         without_spaces = decoded.replace(' ', '')
         return without_spaces.replace(SPIECE_UNDERLINE, ' ')