# Source code for paddlenlp.transformers.xlnet.tokenizer

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for XLNet model."""

import os
import unicodedata
from shutil import copyfile
from typing import List, Optional

from paddle.utils import try_import
from .. import PretrainedTokenizer

# Public API of this module: only the tokenizer class is exported.
__all__ = ['XLNetTokenizer']

# Marker character found in SentencePiece pieces. XLNetTokenizer compares
# against it while re-splitting pieces in ``_tokenize`` and replaces it with a
# plain space in ``convert_tokens_to_string``.
SENTENCEPIECE_UNDERLINE = "▁"
SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE  # Kept for backward compatibility

# Segment (token type) ids.
# XLNetTokenizer does not read these names: it hard-codes 0 / 1 / 2 for the
# first segment / second segment / cls token in
# ``create_token_type_ids_from_sequences`` and sets ``pad_token_type_id = 3``.
# NOTE(review): SEG_ID_PAD (4) differs from the class's ``pad_token_type_id``
# (3) -- confirm which value is intended before relying on these constants.
SEG_ID_A = 0
SEG_ID_B = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4


class XLNetTokenizer(PretrainedTokenizer):
    """
    Constructs an XLNet tokenizer. Based on
    `SentencePiece <https://github.com/google/sentencepiece>`__.

    Args:
        vocab_file (`str`):
            ``SentencePiece`` file (ends with .spm) that contains the
            vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, optional):
            Whether to lowercase the input when tokenizing.
            Defaults to ``False`` and we do not lowercase the input.
        remove_space (`bool`, optional):
            Whether to strip the text and collapse runs of whitespace into a
            single space when tokenizing.
            Defaults to ``True`` and we remove excess spaces.
        keep_accents (`bool`, optional):
            Whether to keep accents when tokenizing.
            Defaults to ``False`` and we don't keep accents.
        bos_token (`str`, optional):
            The beginning of sequence token that was used during pretraining.
            Defaults to ``"<s>"``.
        eos_token (`str`, optional):
            The end of sequence token.
            Defaults to ``"</s>"``.
        unk_token (`str`, optional):
            The unknown token. A token that is not in the vocabulary is set to
            be unk_token in order to be converted to an ID.
            Defaults to ``"<unk>"``.
        sep_token (`str`, optional):
            The separator token.
            Defaults to ``"<sep>"``.
        pad_token (`str`, optional):
            The token used for padding.
            Defaults to ``"<pad>"``.
        cls_token (`str`, optional):
            The classifier token which is used when doing sequence
            classification. It is the last token of the sequence when built
            with special tokens.
            Defaults to ``"<cls>"``.
        mask_token (`str`, optional):
            The token used for masking values. In the masked language modeling
            task, this is the token used and which the model will try to
            predict.
            Defaults to ``"<mask>"``.
        additional_special_tokens (`List[str]`, optional):
            Additional special tokens used by the tokenizer.
            Defaults to ``["<eop>", "<eod>"]``.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The ``SentencePiece`` processor that is used for every conversion
            (string, tokens and IDs).
    """

    resource_files_names = {"vocab_file": "spiece.model"}
    pretrained_resource_files_map = {
        "vocab_file": {
            "xlnet-base-cased":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/xlnet-base-cased-spiece.model",
            "xlnet-large-cased":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/xlnet-large-cased-spiece.model",
            "chinese-xlnet-base":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-base-spiece.model",
            "chinese-xlnet-mid":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-mid-spiece.model",
            "chinese-xlnet-large":
            "https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-large-spiece.model",
        }
    }
    pretrained_init_configuration = {
        "xlnet-base-cased": {
            "do_lower_case": False
        },
        "xlnet-large-cased": {
            "do_lower_case": False
        },
        "chinese-xlnet-base": {
            "do_lower_case": False
        },
        "chinese-xlnet-mid": {
            "do_lower_case": False
        },
        "chinese-xlnet-large": {
            "do_lower_case": False
        },
    }
    pretrained_positional_embedding_sizes = {
        "xlnet-base-cased": None,
        "xlnet-large-cased": None,
        "chinese-xlnet-base": None,
        "chinese-xlnet-mid": None,
        "chinese-xlnet-large": None,
    }
    max_model_input_sizes = pretrained_positional_embedding_sizes
    # XLNet sequences end with <sep> <cls>, so padding goes on the left.
    padding_side = "left"
    pad_token_type_id = 3

    def __init__(
            self,
            vocab_file,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            bos_token="<s>",
            eos_token="</s>",
            unk_token="<unk>",
            sep_token="<sep>",
            pad_token="<pad>",
            cls_token="<cls>",
            mask_token="<mask>",
            additional_special_tokens=["<eop>", "<eod>"], ):
        # The special-token arguments are not used in this body.
        # NOTE(review): presumably the base class picks them up from the
        # signature -- confirm before changing any name or default here. The
        # list default is never mutated, so it is left untouched to keep the
        # signature exactly as callers (and any introspection) see it.
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        spm = try_import("sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        """Number of pieces in the loaded SentencePiece model."""
        return len(self.sp_model)

    def get_vocab(self):
        """
        Returns the vocabulary as a ``token -> id`` dict: every piece of the
        SentencePiece model, updated with ``self.added_tokens_encoder``.
        """
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        # Drop the SentencePiece processor from the pickled state; it is
        # reloaded from ``vocab_file`` in ``__setstate__``.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        # Rebuild the processor that ``__getstate__`` replaced with ``None``.
        spm = try_import("sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        """
        Normalizes raw text before it is fed to SentencePiece.

        Depending on the tokenizer flags this collapses whitespace
        (``remove_space``), strips combining marks after NFKD normalization
        (``keep_accents`` is ``False``) and lowercases (``do_lower_case``).
        LaTeX-style quotes (two backticks or two apostrophes) are always
        turned into a plain double quote.

        Args:
            inputs (`str`): The raw text.

        Returns:
            `str`: The normalized text.
        """
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join(
                [c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, sample=False):
        """
        Tokenize a string.

        Args:
            text (`str`): The text to be tokenized.
            sample (`bool`, optional):
                If ``True``, use ``SampleEncodeAsPieces(text, 64, 0.1)``
                instead of the deterministic ``EncodeAsPieces``.
                Defaults to ``False``.

        Returns:
            `List[str]`: The SentencePiece pieces.
        """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)

        new_pieces = []
        for piece in pieces:
            # A piece such as "9," (digit followed by a comma) is re-encoded
            # without the comma, and the comma is emitted as its own piece.
            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(
                    SPIECE_UNDERLINE, ""))
                # Re-encoding may prepend the word-start marker; remove it
                # when the original piece did not start a word.
                if (piece[0] != SPIECE_UNDERLINE and
                        cur_pieces[0][0] == SPIECE_UNDERLINE):
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def tokenize(self, text: str) -> List[str]:
        """
        End-to-end tokenization for XLNet models.

        Args:
            text (`str`):
                The text to be tokenized.

        Returns:
            `List(str)`: A list of string representing converted tokens.
        """
        return self._tokenize(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_ids(self, tokens):
        """
        Converts a token (or a sequence of tokens) to a single integer id
        (or a list of ids), using the vocabulary.

        Args:
            tokens (`str` or `List[str]` or `tuple(str)`):
                One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id for a single token, or a list
            of token ids when a list or tuple of tokens is given.
        """
        if not isinstance(tokens, (list, tuple)):
            return self._convert_token_to_id(tokens)
        return [self._convert_token_to_id(token) for token in tokens]

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """
        Converts a single index or a sequence of indices to a token or a
        sequence of tokens, using the vocabulary and added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to be converted to token(s).
            skip_special_tokens (`bool`, optional):
                Whether or not to remove special tokens in the decoding.
                Only applied when ``ids`` is a list or tuple.
                Defaults to ``False`` and we do not remove special tokens.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if not isinstance(ids, (list, tuple)):
            return self._convert_id_to_token(ids)

        tokens = [self._convert_id_to_token(_id) for _id in ids]
        if skip_special_tokens:
            # Look the special tokens up once instead of once per token.
            special_tokens = self.all_special_tokens
            return [token for token in tokens if token not in special_tokens]
        return tokens

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def num_special_tokens_to_add(self, pair=False):
        """
        Returns the number of added tokens when encoding a sequence with
        special tokens.

        Note:
            This builds the special-token layout for empty inputs and counts
            the result. Do not put this inside your training loop.

        Args:
            pair (`bool`, optional):
                Whether the sequence is a sequence pair or a single sequence.
                Defaults to ``False`` and the input is a single sequence.

        Returns:
            `int`: Number of tokens added to sequences.
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(
            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1
                                                  if pair else None))

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Builds model inputs from a sequence or a pair of sequence for
        sequence classification tasks by concatenating and adding special
        tokens. An XLNet sequence has the following format:

        - single sequence:      ``X <sep> <cls>``
        - pair of sequences:        ``A <sep> B <sep> <cls>``

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, optional):
                Optional second list of IDs for sequence pairs.
                Defaults to ``None``.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return token_ids_0 + sep + cls
        return token_ids_0 + sep + token_ids_1 + sep + cls

    def build_offset_mapping_with_special_tokens(self,
                                                 offset_mapping_0,
                                                 offset_mapping_1=None):
        """
        Builds offset map from a pair of offset map by concatenating and
        adding offsets of special tokens.

        An XLNet offset_mapping has the following format:

        - single sequence:      ``X (0,0) (0,0)``
        - pair of sequences:        ``A (0,0) B (0,0) (0,0)``

        Args:
            offset_mapping_0 (`List[tuple]`):
                List of char offsets to which the special tokens will be added.
            offset_mapping_1 (`List[tuple]`, optional):
                Optional second list of char offsets for offset mapping pairs.
                Defaults to ``None``.

        Returns:
            `List[tuple]`: List of char offsets with the appropriate offsets
            of special tokens.
        """
        if offset_mapping_1 is None:
            return offset_mapping_0 + [(0, 0)] + [(0, 0)]

        return offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + [
            (0, 0)
        ]

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Creates a special tokens mask from the input sequences.
        This method is called when adding special tokens using the tokenizer
        ``encode`` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, optional):
                Optional second list of IDs for sequence pairs.
                Defaults to ``None``.
            already_has_special_tokens (`bool`, optional):
                Whether or not the token list is already formatted with
                special tokens for the model.
                Defaults to ``False``.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]:
            1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If ``already_has_special_tokens`` is ``True`` and a
                second sequence is supplied.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Resolve the two special ids once, not once per input id.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if x in special_ids else 0 for x in token_ids_0]

        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + (
                [0] * len(token_ids_1)) + [1, 1]
        return ([0] * len(token_ids_0)) + [1, 1]

    def create_token_type_ids_from_sequences(self,
                                             token_ids_0,
                                             token_ids_1=None):
        """
        Creates a mask from the input sequences.

        An XLNet sequence pair mask has the following format:
        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2
            | first sequence    | second sequence |

        - 0 stands for the segment id of **first segment tokens**,
        - 1 stands for the segment id of **second segment tokens**,
        - 2 stands for the segment id of **cls_token**.

        Each segment's trailing ``<sep>`` is counted as part of that segment.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, optional):
                Optional second list of IDs for the sequence pair.
                Defaults to ``None``.

        Returns:
            `List[int]`: List of token type IDs according to the given
            sequence(s).
        """
        cls_segment_id = [2]
        # "+ 1" accounts for the <sep> token appended after each segment.
        first_segment = (len(token_ids_0) + 1) * [0]

        if token_ids_1 is None:
            return first_segment + cls_segment_id
        second_segment = (len(token_ids_1) + 1) * [1]
        return first_segment + second_segment + cls_segment_id

    def save_resources(self, save_directory):
        """
        Saves tokenizer related resources to files under `save_directory`.

        Args:
            save_directory (`str`):
                Directory to save files into.
        """
        for file_name in self.resource_files_names.values():
            save_path = os.path.join(save_directory, file_name)
            # Skip the copy when the vocab file already lives at the target.
            if os.path.abspath(self.vocab_file) != os.path.abspath(save_path):
                copyfile(self.vocab_file, save_path)