# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for XLNet model."""
import os
import unicodedata
from shutil import copyfile
from typing import List, Optional
from paddle.utils import try_import
from .. import PretrainedTokenizer
__all__ = ['XLNetTokenizer']
SENTENCEPIECE_UNDERLINE = "▁"
SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE # Kept for backward compatibility
# Segments (not really needed)
SEG_ID_A = 0
SEG_ID_B = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4
[docs]class XLNetTokenizer(PretrainedTokenizer):
"""
Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
Args:
vocab_file (`str`):
``SentencePiece`` file (ends with .spm) that contains the vocabulary necessary
to instantiate a tokenizer.
do_lower_case (`bool`, optional):
Whether to lowercase the input when tokenizing. Defaults to ``False`` and
we do not lowercase the input.
remove_space (`bool`, optional):
Whether to strip the text when tokenizing. Defaults to ``True`` and
we remove excess spaces before and after the string.
keep_accents (`bool`, optional):
Whether to keep accents when tokenizing.
Defaults to ``False`` and we don't keep accents.
bos_token (`str`, optional):
The beginning of sequence token that was used during pretraining. Defaults to ``"<s>"``.
eos_token (`str`, optional):
The end of sequence token. Defaults to ``"</s>"``.
unk_token (`str`, optional):
The unknown token. A token that is not in the vocabulary is set to be unk_token
inorder to be converted to an ID. Defaults to ``"<unk>"``.
sep_token (`str`, optional):
The separator token. Defaults to ``"<sep>"``.
pad_token (`str`, optional):
The token used for padding. Defaults to ``"<pad>"``.
cls_token (`str`, optional):
The classifier token which is used when doing sequence classification.
It is the last token of the sequence when built with special tokens. Defaults to ``"<cls>"``.
mask_token (`str`, optional):
The token used for masking values. In the masked language modeling task,
this is the token used and which the model will try to predict. Defaults to ``"<mask>"``.
additional_special_tokens (`List[str]`, optional):
Additional special tokens used by the tokenizer. Defaults to ``["<eop>", "<eod>"]``.
Attributes:
sp_model (`SentencePieceProcessor`):
The ``SentencePiece`` processor that is used for every conversion (string, tokens and IDs).
"""
resource_files_names = {"vocab_file": "spiece.model"}
pretrained_resource_files_map = {
"vocab_file": {
"xlnet-base-cased":
"https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/xlnet-base-cased-spiece.model",
"xlnet-large-cased":
"https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/xlnet-large-cased-spiece.model",
"chinese-xlnet-base":
"https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-base-spiece.model",
"chinese-xlnet-mid":
"https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-mid-spiece.model",
"chinese-xlnet-large":
"https://paddlenlp.bj.bcebos.com/models/transformers/xlnet/chinese-xlnet-large-spiece.model",
}
}
pretrained_init_configuration = {
"xlnet-base-cased": {
"do_lower_case": False
},
"xlnet-large-cased": {
"do_lower_case": False
},
"chinese-xlnet-base": {
"do_lower_case": False
},
"chinese-xlnet-mid": {
"do_lower_case": False
},
"chinese-xlnet-large": {
"do_lower_case": False
},
}
pretrained_positional_embedding_sizes = {
"xlnet-base-cased": None,
"xlnet-large-cased": None,
"chinese-xlnet-base": None,
"chinese-xlnet-mid": None,
"chinese-xlnet-large": None,
}
max_model_input_sizes = pretrained_positional_embedding_sizes
padding_side = "left"
pad_token_type_id = 3
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"], ):
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
spm = try_import("sentencepiece")
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
@property
def vocab_size(self):
return len(self.sp_model)
def get_vocab(self):
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
spm = try_import("sentencepiece")
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join(
[c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def _tokenize(self, text, sample=False):
"""Tokenize a string."""
text = self.preprocess_text(text)
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(
SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][
0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
return new_pieces
[docs] def tokenize(self, text):
"""
End-to-end tokenization for XLNet models.
Args:
text (`str`):
The text to be tokenized.
Returns:
`List(str)`: A list of string representing converted tokens.
"""
return self._tokenize(text)
def _convert_token_to_id(self, token):
"""Converts a token (str) to an id using the vocab. """
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) to a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
[docs] def convert_tokens_to_ids(self, tokens):
"""
Converts a token (or a sequence of tokens) to a single integer id (or a sequence of ids),
using the vocabulary.
Args:
tokens (`str` or `List[str]`):
One or several token(s) to convert to token id(s).
Returns:
`int` or `List[int]` or `tuple(int)`: The token id or list of token ids or tuple of token ids.
"""
if not isinstance(tokens, (list, tuple)):
return self._convert_token_to_id(tokens)
else:
return [self._convert_token_to_id(token) for token in tokens]
[docs] def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""
Converts a single index or a sequence of indices to a token or
a sequence of tokens, using the vocabulary and added tokens.
Args:
ids (`int` or `List[int]`):
The token id (or token ids) to be converted to token(s).
skip_special_tokens (`bool`, optional):
Whether or not to remove special tokens in the decoding.
Defaults to ``False`` and we do not remove special tokens.
Returns:
`str` or `List[str]`: The decoded token(s).
"""
if not isinstance(ids, (list, tuple)):
return self._convert_id_to_token(ids)
tokens = [self._convert_id_to_token(_id) for _id in ids]
if skip_special_tokens:
return [
token for token in tokens
if token not in self.all_special_tokens
]
return tokens
[docs] def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
[docs] def num_special_tokens_to_add(self, pair=False):
"""
Returns the number of added tokens when encoding a sequence with special tokens.
Note:
This encodes inputs and checks the number of added tokens, and is therefore not efficient.
Do not put this inside your training loop.
Args:
pair (`bool`, optional):
Whether the sequence is a sequence pair or a single sequence.
Defaults to ``False`` and the input is a single sequence.
Returns:
`int`: Number of tokens added to sequences.
"""
token_ids_0 = []
token_ids_1 = []
return len(
self.build_inputs_with_special_tokens(token_ids_0, token_ids_1
if pair else None))
[docs] def build_offset_mapping_with_special_tokens(self,
offset_mapping_0,
offset_mapping_1=None):
"""
Builds offset map from a pair of offset map by concatenating
and adding offsets of special tokens.
An XLNet offset_mapping has the following format:
- single sequence: ``X (0,0) (0,0)``
- pair of sequences: ``A (0,0) B (0,0) (0,0)``
Args:
offset_mapping_0 (`List[tuple]`):
List of char offsets to which the special tokens will be added.
offset_mapping_1 (`List[tuple]`, optional):
Optional second list of char offsets for offset mapping pairs.
Defaults to ``None``.
Returns:
`List[tuple]`: List of char offsets with the appropriate offsets of special tokens.
"""
if offset_mapping_1 is None:
return offset_mapping_0 + [(0, 0)] + [(0, 0)]
return offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + [
(0, 0)
]
[docs] def get_special_tokens_mask(self,
token_ids_0,
token_ids_1=None,
already_has_special_tokens=False):
"""
Creates a special tokens mask from the input sequences.
This method is called when adding special tokens using the tokenizer ``encode`` method.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, optional):
Optional second list of IDs for sequence pairs.
Defaults to ``None``.
already_has_special_tokens (`bool`, optional):
Whether or not the token list is already formatted with special tokens for the model.
Defaults to ``False``.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return list(
map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
token_ids_0))
if token_ids_1 is not None:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)
) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1]
[docs] def create_token_type_ids_from_sequences(self,
token_ids_0,
token_ids_1=None):
"""
Creates a mask from the input sequences.
An XLNet sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2
| first sequence | second sequence |
- 0 stands for the segment id of **first segment tokens**,
- 1 stands for the segment id of **second segment tokens**,
- 2 stands for the segment id of **cls_token**.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, optional):
Optional second list of IDs for the sequence pair. Defaults to ``None``.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls_segment_id = [2]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
return len(token_ids_0 + sep) * [0] + len(token_ids_1 +
sep) * [1] + cls_segment_id
[docs] def save_resources(self, save_directory):
"""
Saves tokenizer related resources to files under `save_directory`.
Args:
save_directory (`str`):
Directory to save files into.
"""
for name, file_name in self.resource_files_names.items():
save_path = os.path.join(save_directory, file_name)
if os.path.abspath(self.vocab_file) != os.path.abspath(save_path):
copyfile(self.vocab_file, save_path)