Source code for paddlenlp.data.vocab

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import io
import json
import os
import warnings


class Vocab(object):
    """
    Vocab is for mapping between text tokens and ids.

    Args:
        counter (collections.Counter, optional): A Counter instance describes
            the tokens and their frequencies. Its keys will be indexed according
            to the order of frequency sorting to construct the mapping
            relationship. If None, `token_to_idx` must be provided as the
            mapping relationship. Default: None.
        max_size (int, optional): Max size of vocab, not including special
            tokens. Default: None.
        min_freq (int): Ignore tokens whose frequencies are less than
            `min_freq`. Default: 1.
        token_to_idx (dict, optional): A dict specifies the mapping
            relationship between tokens and indices to be used. If provided,
            adjust the tokens and indices mapping according to it. If None,
            counter must be provided. Default: None.
        unk_token (str): Special token for unknown token '<unk>'. If not
            needed, it can also be None. Default: None.
        pad_token (str): Special token for padding token '<pad>'. If not
            needed, it can also be None. Default: None.
        bos_token (str): Special token for bos token '<bos>'. If not needed,
            it can also be None. Default: None.
        eos_token (str): Special token for eos token '<eos>'. If not needed,
            it can also be None. Default: None.
        **kwargs (dict): Keyword arguments ending with `_token`. It can be
            used to specify further special tokens that will be exposed as
            attributes of the vocabulary and associated with an index.
    """

    def __init__(self,
                 counter=None,
                 max_size=None,
                 min_freq=1,
                 token_to_idx=None,
                 unk_token=None,
                 pad_token=None,
                 bos_token=None,
                 eos_token=None,
                 **kwargs):
        # Handle special tokens
        combs = (('unk_token', unk_token), ('pad_token', pad_token),
                 ('bos_token', bos_token), ('eos_token', eos_token))
        for name, value in combs:
            kwargs[name] = value

        special_tokens = []
        special_iter = kwargs.keys()
        # sort alphabetically
        special_iter = sorted(special_iter)
        for special_token_name in special_iter:
            # Test if kwarg specifies a special token
            if not special_token_name.endswith('_token'):
                raise ValueError('{} is invalid. Only keyword arguments '
                                 'that end in \'_token\' are supported '
                                 'to declare special tokens.'.format(
                                     special_token_name))

            special_token = kwargs[special_token_name]
            if special_token is not None and special_token not in special_tokens:
                special_tokens.append(special_token)

        if counter is None:
            # use token_to_idx as dict to import pretrained vocabulary
            assert token_to_idx, (
                'token_to_idx should not be None when counter is None')
            for special_token in special_tokens:
                assert special_token in token_to_idx, '{} is not in token_to_idx'.format(
                    special_token)

            self._token_to_idx = token_to_idx
            self._idx_to_token = sorted(
                self._token_to_idx.keys(),
                key=lambda token: self._token_to_idx[token])
            if unk_token:
                unk_index = self._token_to_idx[unk_token]
                self._token_to_idx = collections.defaultdict(lambda: unk_index)
                self._token_to_idx.update(token_to_idx)
        else:
            self._idx_to_token = list(special_tokens)
            self._token_to_idx = collections.defaultdict()
            self._token_to_idx.update(
                (token, idx) for idx, token in enumerate(self._idx_to_token))
            self._index_counter_keys(counter, special_tokens, max_size,
                                     min_freq)
            if token_to_idx:
                self._sort_index_according_to_user_specification(token_to_idx)
            if unk_token:
                self._token_to_idx.default_factory = lambda: self._token_to_idx[unk_token]

        # _expose_tokens_as_attributes
        self._identifiers_to_tokens = kwargs
        for identifier, token in kwargs.items():
            if identifier.startswith('_'):
                raise ValueError(
                    'It is not allowed to use identifiers starting with an '
                    'underscore. In Python, identifier names beginning with an '
                    'underscore are internal.')
            if hasattr(self, identifier):
                raise ValueError(
                    'vocab.{} already exists. '
                    'Please choose a different identifier for token {}'.format(
                        identifier, token))
            setattr(self, identifier, token)

    def _index_counter_keys(self, counter, special_tokens, max_size, min_freq):
        # sort by frequency, then alphabetically
        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)
        # frequencies of special tokens are not counted when building the
        # vocabulary in frequency order
        special_tokens = set(special_tokens)
        max_size = None if max_size is None else max_size + len(special_tokens)
        for token, freq in token_freqs:
            if freq < min_freq or len(self._idx_to_token) == max_size:
                break
            if token not in special_tokens:
                self._idx_to_token.append(token)
                self._token_to_idx[token] = len(self._idx_to_token) - 1

    def _sort_index_according_to_user_specification(self, token_to_idx):
        # Sanity checks
        if not set(token_to_idx.keys()).issubset(self.token_to_idx.keys()):
            raise ValueError(
                'User-specified token_to_idx mapping can only contain '
                'tokens that will be part of the vocabulary.')
        if len(set(token_to_idx.values())) != len(token_to_idx):
            raise ValueError(
                'User-specified indices must not contain duplicates.')
        if min(token_to_idx.values()) < 0 or max(token_to_idx.values()) >= len(
                self.token_to_idx):
            raise ValueError(
                'User-specified indices must not be < 0 or >= the number of tokens '
                'that will be in the vocabulary. The current vocab contains {} '
                'tokens.'.format(len(self.token_to_idx)))

        # Update index ordering
        for token, new_idx in token_to_idx.items():
            old_idx = self.token_to_idx[token]
            ousted_token = self.idx_to_token[new_idx]

            self.token_to_idx[token] = new_idx
            self.token_to_idx[ousted_token] = old_idx
            self.idx_to_token[old_idx] = ousted_token
            self.idx_to_token[new_idx] = token
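
    # A minimal usage sketch (illustrative; the toy counter below is an
    # assumption, not part of the original module). Building a Vocab from a
    # Counter places the special tokens first (sorted by keyword name), then
    # counter keys by descending frequency with alphabetical tie-breaking:
    #
    #     counter = collections.Counter(['a', 'b', 'a', 'c'])
    #     vocab = Vocab(counter, unk_token='<unk>', pad_token='<pad>')
    #     # vocab.idx_to_token -> ['<pad>', '<unk>', 'a', 'b', 'c']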

    def to_tokens(self, indices):
        """
        Map the input indices to a token list.

        Args:
            indices (list|tuple|int): Input indices for mapping.

        Returns:
            list|str: Obtained token(s).
        """
        to_reduce = False
        if not isinstance(indices, (list, tuple)):
            indices = [indices]
            to_reduce = True

        max_idx = len(self._idx_to_token) - 1

        tokens = []
        for idx in indices:
            if not isinstance(idx, int):
                warnings.warn(
                    "The type of `to_tokens()`'s input `indices` is not `int`, "
                    "so it will be forcibly converted to `int`.")
                idx = int(idx)
            if idx > max_idx:
                raise ValueError(
                    'Token index {} in the provided `indices` is invalid.'.
                    format(idx))
            tokens.append(self._idx_to_token[idx])

        return tokens[0] if to_reduce else tokens
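
    # Sketch of to_tokens(), continuing the illustrative vocab above (the
    # concrete indices are assumptions tied to that example):
    #
    #     vocab.to_tokens(2)        # -> 'a'        (scalar in, scalar out)
    #     vocab.to_tokens([0, 1])   # -> ['<pad>', '<unk>']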

    def to_indices(self, tokens):
        """
        Map the input tokens to indices.

        Args:
            tokens (list|tuple|str): Input tokens for mapping.

        Returns:
            list|int: Obtained index or indices.
        """
        return self[tokens]
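
    # Sketch of to_indices(), continuing the same illustrative vocab (the
    # fallback index is an assumption tied to that example):
    #
    #     vocab.to_indices('a')           # -> 2
    #     vocab.to_indices(['a', 'z'])    # -> [2, 1]   ('z' maps to '<unk>')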

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self._token_to_idx[tokens]
        else:
            return [self._token_to_idx[token] for token in tokens]

    def __len__(self):
        return len(self._idx_to_token)

    def __contains__(self, token):
        return token in self._token_to_idx

    def __call__(self, tokens):
        return self[tokens]

    @property
    def idx_to_token(self):
        """
        Return the index-to-token list.
        """
        return self._idx_to_token

    @property
    def token_to_idx(self):
        """
        Return the token-to-index dict.
        """
        return self._token_to_idx
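
    # The dunder methods make the vocab behave like a mapping; `to_indices`,
    # `__getitem__` and `__call__` all share the same lookup (values continue
    # the illustrative example above and are assumptions):
    #
    #     vocab['a']                 # -> 2
    #     vocab(['a', 'b'])          # __call__ delegates to __getitem__
    #     len(vocab), 'c' in vocab   # -> 5, True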

    def to_json(self, path=None):
        """
        Summarize some information of vocab as a JSON string. If path is
        given, the JSON string will be saved into a file.

        Args:
            path (str, optional): The path to save the JSON string. If None,
                the JSON will not be saved. Default: None.

        Returns:
            str: JSON string.
        """
        vocab_dict = {}
        vocab_dict['idx_to_token'] = self.idx_to_token
        vocab_dict['token_to_idx'] = dict(self.token_to_idx)
        vocab_dict['unk_token'] = self.unk_token
        vocab_dict['identifiers_to_tokens'] = self._identifiers_to_tokens
        json_str = json.dumps(vocab_dict)
        if path:
            with io.open(path, 'w', encoding='utf-8') as f:
                f.write(json_str)
        return json_str
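
    # Serialization sketch (the file name below is an assumption):
    #
    #     json_str = vocab.to_json()            # JSON string only
    #     vocab.to_json(path='vocab.json')      # also writes the file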

    @classmethod
    def from_json(cls, json_str):
        """
        Load vocab from a JSON string or a JSON file.

        Args:
            json_str (str): JSON string or file path of a JSON string.

        Returns:
            Vocab: Vocab generated from information contained in the JSON
                string.
        """
        if os.path.isfile(json_str):
            # Read the JSON file; opening it in write mode would truncate it.
            with io.open(json_str, 'r', encoding='utf-8') as f:
                vocab_dict = json.load(f)
        else:
            vocab_dict = json.loads(json_str)
        token_to_idx = vocab_dict.get('token_to_idx')
        unk_token = vocab_dict.get('unk_token')
        identifiers_to_tokens = vocab_dict.get('identifiers_to_tokens', dict())
        if 'unk_token' in identifiers_to_tokens:
            del identifiers_to_tokens['unk_token']
        vocab = cls(counter=None,
                    token_to_idx=token_to_idx,
                    unk_token=unk_token,
                    **identifiers_to_tokens)
        return vocab
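
    # Round-trip sketch (the file name is an assumption): from_json() accepts
    # either the JSON string returned by to_json() or a path to a saved file.
    #
    #     restored = Vocab.from_json(json_str)
    #     restored = Vocab.from_json('vocab.json')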

    @classmethod
    def from_dict(cls,
                  token_to_idx,
                  unk_token=None,
                  pad_token=None,
                  bos_token=None,
                  eos_token=None,
                  **kwargs):
        """
        Generate a vocab from a dict.

        Args:
            token_to_idx (dict): A dict describing the mapping relationship
                between tokens and indices.
            unk_token (str): Special token for unknown token. If not needed,
                it can also be None. Default: None.
            pad_token (str): Special token for padding token. If not needed,
                it can also be None. Default: None.
            bos_token (str): Special token for bos token. If not needed, it
                can also be None. Default: None.
            eos_token (str): Special token for eos token. If not needed, it
                can also be None. Default: None.
            **kwargs (dict): Keyword arguments ending with `_token`. It can
                be used to specify further special tokens that will be exposed
                as attributes of the vocabulary and associated with an index.

        Returns:
            Vocab: Vocab generated from the given dict and special tokens.
        """
        vocab = cls(counter=None,
                    token_to_idx=token_to_idx,
                    unk_token=unk_token,
                    pad_token=pad_token,
                    bos_token=bos_token,
                    eos_token=eos_token,
                    **kwargs)
        return vocab
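
    # Sketch of from_dict() (the mapping below is an assumption): every
    # special token passed here must already appear in `token_to_idx`,
    # otherwise the constructor's assertion fails.
    #
    #     mapping = {'<unk>': 0, 'hello': 1, 'world': 2}
    #     vocab = Vocab.from_dict(mapping, unk_token='<unk>')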

    @staticmethod
    def build_vocab(iterator,
                    max_size=None,
                    min_freq=1,
                    token_to_idx=None,
                    unk_token=None,
                    pad_token=None,
                    bos_token=None,
                    eos_token=None,
                    **kwargs):
        """
        Build a vocab according to the given iterator and other information.
        Iterate over the `iterator` to construct a `Counter`, then construct
        the vocab as `__init__` does.

        Args:
            iterator (collections.Iterable): Iterator of tokens. Each element
                should be a list of tokens if a word-level vocab is needed.
            max_size (int, optional): Max size of vocab, not including special
                tokens. Default: None.
            min_freq (int): Ignore tokens whose frequencies are less than
                `min_freq`. Default: 1.
            token_to_idx (dict, optional): A dict specifies the mapping
                relationship between tokens and indices to be used. If
                provided, adjust the tokens and indices mapping according to
                it. If None, counter must be provided. Default: None.
            unk_token (str): Special token for unknown token '<unk>'. If not
                needed, it can also be None. Default: None.
            pad_token (str): Special token for padding token '<pad>'. If not
                needed, it can also be None. Default: None.
            bos_token (str): Special token for bos token '<bos>'. If not
                needed, it can also be None. Default: None.
            eos_token (str): Special token for eos token '<eos>'. If not
                needed, it can also be None. Default: None.
            **kwargs (dict): Keyword arguments ending with `_token`. It can
                be used to specify further special tokens that will be exposed
                as attributes of the vocabulary and associated with an index.

        Returns:
            Vocab: Vocab generated from the given iterator and other
                information.
        """
        counter = collections.Counter()
        for tokens in iterator:
            counter.update(tokens)
        vocab = Vocab(
            counter,
            max_size=max_size,
            min_freq=min_freq,
            token_to_idx=token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs)
        return vocab
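
    # Build sketch from an iterator of token lists (the toy corpus is an
    # assumption):
    #
    #     corpus = [['hello', 'world'], ['hello', 'paddle']]
    #     vocab = Vocab.build_vocab(corpus, unk_token='<unk>', pad_token='<pad>')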

    @staticmethod
    def load_vocabulary(filepath,
                        unk_token=None,
                        pad_token=None,
                        bos_token=None,
                        eos_token=None,
                        **kwargs):
        """
        Instantiate an instance of `Vocab` from a file, reserving all tokens
        by using `Vocab.from_dict`. The file contains one token per line, and
        the line number is the index of the corresponding token.

        Args:
            filepath (str): Path of the file used to construct the vocabulary.
            unk_token (str): Special token for unknown token. If not needed,
                it can also be None. Default: None.
            pad_token (str): Special token for padding token. If not needed,
                it can also be None. Default: None.
            bos_token (str): Special token for bos token. If not needed, it
                can also be None. Default: None.
            eos_token (str): Special token for eos token. If not needed, it
                can also be None. Default: None.
            **kwargs (dict): Keyword arguments for `Vocab.from_dict`.

        Returns:
            Vocab: An instance of `Vocab`.
        """
        token_to_idx = {}
        with io.open(filepath, 'r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                token = line.rstrip('\n')
                token_to_idx[token] = int(index)
        vocab = Vocab.from_dict(
            token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs)
        return vocab
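
    # Loading sketch (file name and contents are assumptions): with a file
    # 'vocab.txt' containing one token per line, e.g. '<unk>', 'hello',
    # 'world', the line numbers become the indices, so:
    #
    #     vocab = Vocab.load_vocabulary('vocab.txt', unk_token='<unk>')
    #     vocab['hello']   # -> 1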