# Source code for paddlenlp.transformers.xlnet.tokenizer

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for XLNet model."""

import os
import unicodedata
from shutil import copyfile
from typing import List, Optional

from paddle.utils import try_import
from .. import PretrainedTokenizer

__all__ = ['XLNetTokenizer']

# SentencePiece meta symbol; the tokenizer turns it back into a space when
# pieces are joined into text. ``SPIECE_UNDERLINE`` is the older, shorter
# alias and is kept for backward compatibility.
SENTENCEPIECE_UNDERLINE = SPIECE_UNDERLINE = "▁"

# Segment ids (not really needed): A=0, B=1, CLS=2, SEP=3, PAD=4.
(SEG_ID_A, SEG_ID_B, SEG_ID_CLS, SEG_ID_SEP, SEG_ID_PAD) = range(5)


class XLNetTokenizer(PretrainedTokenizer):
    """
    Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

    Args:
        vocab_file (`str`):
            ``SentencePiece`` model file (e.g. ``spiece.model``) that contains the
            vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, optional):
            Whether to lowercase the input when tokenizing. Defaults to ``False`` and
            we do not lowercase the input.
        remove_space (`bool`, optional):
            Whether to strip the text when tokenizing. Defaults to ``True`` and
            we remove excess spaces before and after the string (runs of
            whitespace inside the string are collapsed to a single space too).
        keep_accents (`bool`, optional):
            Whether to keep accents when tokenizing.
            Defaults to ``False`` and we don't keep accents.
        bos_token (`str`, optional):
            The beginning of sequence token that was used during pretraining. Defaults to ``"<s>"``.
        eos_token (`str`, optional):
            The end of sequence token. Defaults to ``"</s>"``.
        unk_token (`str`, optional):
            The unknown token. A token that is not in the vocabulary is set to be unk_token
            in order to be converted to an ID. Defaults to ``"<unk>"``.
        sep_token (`str`, optional):
            The separator token. Defaults to ``"<sep>"``.
        pad_token (`str`, optional):
            The token used for padding. Defaults to ``"<pad>"``.
        cls_token (`str`, optional):
            The classifier token which is used when doing sequence classification.
            It is the last token of the sequence when built with special tokens. Defaults to ``"<cls>"``.
        mask_token (`str`, optional):
            The token used for masking values. In the masked language modeling task,
            this is the token used and which the model will try to predict. Defaults to ``"<mask>"``.
        additional_special_tokens (`List[str]`, optional):
            Additional special tokens used by the tokenizer. Defaults to ``["<eop>", "<eod>"]``.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The ``SentencePiece`` processor that is used for every conversion (string, tokens and IDs).
    """

    resource_files_names = {"vocab_file": "spiece.model"}
    pretrained_resource_files_map = {
        "vocab_file": {
            "xlnet-base-cased":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/xlnet-base-cased-spiece.model",
            "xlnet-large-cased":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/xlnet-large-cased-spiece.model",
            "chinese-xlnet-base":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-base-spiece.model",
            "chinese-xlnet-mid":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-mid-spiece.model",
            "chinese-xlnet-large":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-large-spiece.model",
        }
    }
    pretrained_init_configuration = {
        "xlnet-base-cased": {
            "do_lower_case": False
        },
        "xlnet-large-cased": {
            "do_lower_case": False
        },
        "chinese-xlnet-base": {
            "do_lower_case": False
        },
        "chinese-xlnet-mid": {
            "do_lower_case": False
        },
        "chinese-xlnet-large": {
            "do_lower_case": False
        },
    }
    pretrained_positional_embedding_sizes = {
        "xlnet-base-cased": None,
        "xlnet-large-cased": None,
        "chinese-xlnet-base": None,
        "chinese-xlnet-mid": None,
        "chinese-xlnet-large": None,
    }
    # Same dict object as ``pretrained_positional_embedding_sizes`` (alias).
    max_model_input_sizes = pretrained_positional_embedding_sizes
    padding_side = "left"
    pad_token_type_id = 3

    def __init__(
            self,
            vocab_file,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            bos_token="<s>",
            eos_token="</s>",
            unk_token="<unk>",
            sep_token="<sep>",
            pad_token="<pad>",
            cls_token="<cls>",
            mask_token="<mask>",
            additional_special_tokens=["<eop>", "<eod>"], ):
        # NOTE(review): the ``*_token`` arguments and
        # ``additional_special_tokens`` are not used in this body; presumably
        # the base class records the constructor arguments to expose
        # ``sep_token_id``, ``cls_token_id`` etc. -- confirm before changing
        # any default here (including the mutable list default).
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        # Kept so the model can be reloaded after unpickling and copied by
        # ``save_resources``.
        self.vocab_file = vocab_file
        spm = try_import("sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        """Number of pieces in the loaded SentencePiece model."""
        return len(self.sp_model)

    def get_vocab(self):
        """
        Returns the vocabulary as a token-to-id dict.

        The SentencePiece pieces come first; entries of
        ``self.added_tokens_encoder`` are merged on top and win on conflict.
        """
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        """Returns a copy of the instance state with ``sp_model`` set to ``None``."""
        state = self.__dict__.copy()
        # The processor is dropped from the pickled state and rebuilt from
        # ``vocab_file`` in ``__setstate__``.
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restores the instance state and reloads ``sp_model`` from ``vocab_file``."""
        self.__dict__ = d
        spm = try_import("sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        """
        Normalizes raw text before it is handed to SentencePiece.

        Args:
            inputs (`str`):
                The raw text.

        Returns:
            `str`: The text with whitespace collapsed (if ``remove_space``),
            the quote pairs `````` and ``''`` replaced by ``"``, combining
            marks removed after NFKD normalization (unless ``keep_accents``)
            and lowercased (if ``do_lower_case``).
        """
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            # Decompose accented characters, then drop the combining marks.
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join(
                c for c in outputs if not unicodedata.combining(c))
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, sample=False):
        """
        Tokenize a string.

        Args:
            text (`str`):
                The text to be tokenized.
            sample (`bool`, optional):
                If ``True``, use ``SampleEncodeAsPieces(text, 64, 0.1)``
                instead of the deterministic ``EncodeAsPieces``.
                Defaults to ``False``.

        Returns:
            `List[str]`: The SentencePiece pieces, with any piece of the form
            ``<...digit>,`` re-split so that the trailing comma is its own piece.
        """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)

        new_pieces = []
        for piece in pieces:
            ends_with_digit_comma = (len(piece) > 1 and piece[-1] == "," and
                                     piece[-2].isdigit())
            if not ends_with_digit_comma:
                new_pieces.append(piece)
                continue

            # Re-encode the part before the comma without any underline marker.
            cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(
                SPIECE_UNDERLINE, ""))
            # The re-encoding may introduce a leading underline that the
            # original piece did not have; strip it again in that case.
            if (piece[0] != SPIECE_UNDERLINE and
                    cur_pieces[0][0] == SPIECE_UNDERLINE):
                if len(cur_pieces[0]) == 1:
                    cur_pieces = cur_pieces[1:]
                else:
                    cur_pieces[0] = cur_pieces[0][1:]
            cur_pieces.append(piece[-1])
            new_pieces.extend(cur_pieces)

        return new_pieces

    def tokenize(self, text: str) -> List[str]:
        """
        End-to-end tokenization for XLNet models.

        Args:
            text (`str`):
                The text to be tokenized.
        Returns:
            `List[str]`: A list of string representing converted tokens.
        """
        return self._tokenize(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_ids(self, tokens):
        """
        Converts a token (or a sequence of tokens) to a single integer id (or a sequence of ids),
        using the vocabulary.

        Args:
            tokens (`str` or `List[str]` or `tuple(str)`):
                One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id for a single token, or a list of
            token ids when a list or tuple of tokens is given.
        """
        if isinstance(tokens, (list, tuple)):
            return [self._convert_token_to_id(token) for token in tokens]
        return self._convert_token_to_id(tokens)

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """
        Converts a single index or a sequence of indices to a token or
        a sequence of tokens, using the vocabulary and added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to be converted to token(s).
            skip_special_tokens (`bool`, optional):
                Whether or not to remove special tokens in the decoding.
                Only applied when ``ids`` is a list or tuple.
                Defaults to ``False`` and we do not remove special tokens.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if not isinstance(ids, (list, tuple)):
            return self._convert_id_to_token(ids)
        tokens = [self._convert_id_to_token(_id) for _id in ids]
        if skip_special_tokens:
            # Look the special tokens up once instead of once per token.
            special_tokens = self.all_special_tokens
            return [token for token in tokens if token not in special_tokens]
        return tokens

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def num_special_tokens_to_add(self, pair=False):
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        Note:
            The count is obtained by calling
            :meth:`build_inputs_with_special_tokens` on empty sequences and
            measuring the length of the result.

        Args:
            pair (`bool`, optional):
                Whether the sequence is a sequence pair or a single sequence.
                Defaults to ``False`` and the input is a single sequence.

        Returns:
            `int`: Number of tokens added to sequences.
        """
        token_ids_0 = []
        token_ids_1 = [] if pair else None
        return len(
            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Builds model inputs from a sequence or a pair of sequence
        for sequence classification tasks by concatenating and
        adding special tokens. An XLNet sequence has the following format:

        - single sequence:      ``X <sep> <cls>``
        - pair of sequences:    ``A <sep> B <sep> <cls>``

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, optional):
                Optional second list of IDs for sequence pairs. Defaults to ``None``.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return token_ids_0 + sep + cls
        return token_ids_0 + sep + token_ids_1 + sep + cls

    def build_offset_mapping_with_special_tokens(self,
                                                 offset_mapping_0,
                                                 offset_mapping_1=None):
        """
        Builds offset map from a pair of offset map by concatenating
        and adding offsets of special tokens.

        An XLNet offset_mapping has the following format:

        - single sequence:      ``X (0,0) (0,0)``
        - pair of sequences:    ``A (0,0) B (0,0) (0,0)``

        Args:
            offset_mapping_0 (`List[tuple]`):
                List of char offsets to which the special tokens will be added.
            offset_mapping_1 (`List[tuple]`, optional):
                Optional second list of char offsets for offset mapping pairs.
                Defaults to ``None``.

        Returns:
            `List[tuple]`: List of char offsets with the appropriate offsets of special tokens.
        """
        if offset_mapping_1 is None:
            return offset_mapping_0 + [(0, 0)] + [(0, 0)]

        return offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + [
            (0, 0)
        ]

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Creates a special tokens mask from the input sequences.
        This method is called when adding special tokens using the tokenizer ``encode`` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, optional):
                Optional second list of IDs for sequence pairs.
                Defaults to ``None``.
            already_has_special_tokens (`bool`, optional):
                Whether or not the token list is already formatted with special tokens for the model.
                Defaults to ``False``.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If ``already_has_special_tokens`` is ``True`` and
                ``token_ids_1`` is also given.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Resolve the two special ids once rather than once per element.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if x in special_ids else 0 for x in token_ids_0]

        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)
                                                     ) + [1, 1]
        return ([0] * len(token_ids_0)) + [1, 1]

    def create_token_type_ids_from_sequences(self,
                                             token_ids_0,
                                             token_ids_1=None):
        """
        Creates a mask from the input sequences.
        An XLNet sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2
            | first sequence    | second sequence |

        - 0 stands for the segment id of **first segment tokens**,
        - 1 stands for the segment id of **second segment tokens**,
        - 2 stands for the segment id of **cls_token**.

        Each segment includes the ``<sep>`` token that follows it.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, optional):
                Optional second list of IDs for the sequence pair. Defaults to ``None``.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        cls_segment_id = [2]
        # "+ 1" accounts for the <sep> token appended after each sequence.
        first_segment = (len(token_ids_0) + 1) * [0]

        if token_ids_1 is None:
            return first_segment + cls_segment_id
        second_segment = (len(token_ids_1) + 1) * [1]
        return first_segment + second_segment + cls_segment_id

    def save_resources(self, save_directory):
        """
        Saves tokenizer related resources to files under `save_directory`.

        The file at ``self.vocab_file`` is copied to each file name listed in
        ``resource_files_names``, unless source and destination are the same path.

        Args:
            save_directory (`str`):
                Directory to save files into.
        """
        source_path = os.path.abspath(self.vocab_file)
        for file_name in self.resource_files_names.values():
            save_path = os.path.join(save_directory, file_name)
            if source_path != os.path.abspath(save_path):
                copyfile(self.vocab_file, save_path)