Source code for paddlenlp.datasets.glue

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import json
import os

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import DATA_HOME
from . import DatasetBuilder


[docs]class Glue(DatasetBuilder):
    BUILDER_CONFIGS = {
        'cola': {
            'url': "https://dataset.bj.bcebos.com/glue/CoLA.zip",
            'md5': 'b178a7c2f397b0433c39c7caf50a3543',
            'splits': {
                'train': [
                    os.path.join('CoLA', 'train.tsv'),
                    'c79d4693b8681800338aa044bf9e797b', (3, 1), 0
                ],
                'dev': [
                    os.path.join('CoLA', 'dev.tsv'),
                    'c5475ccefc9e7ca0917294b8bbda783c', (3, 1), 0
                ],
                'test': [
                    os.path.join('CoLA', 'test.tsv'),
                    'd8721b7dedda0dcca73cebb2a9f4259f', (1, ), 1
                ]
            },
            'labels': ["0", "1"]
        },
        'sst-2': {
            'url': "https://dataset.bj.bcebos.com/glue/SST.zip",
            'md5': '9f81648d4199384278b86e315dac217c',
            'splits': {
                'train': [
                    os.path.join('SST-2', 'train.tsv'),
                    'da409a0a939379ed32a470bc0f7fe99a', (0, 1), 1
                ],
                'dev': [
                    os.path.join('SST-2', 'dev.tsv'),
                    '268856b487b2a31a28c0a93daaff7288', (0, 1), 1
                ],
                'test': [
                    os.path.join('SST-2', 'test.tsv'),
                    '3230e4efec76488b87877a56ae49675a', (1, ), 1
                ]
            },
            'labels': ["0", "1"]
        },
        'sts-b': {
            'url': 'https://dataset.bj.bcebos.com/glue/STS.zip',
            'md5': 'd573676be38f1a075a5702b90ceab3de',
            'splits': {
                'train': [
                    os.path.join('STS-B', 'train.tsv'),
                    '4f7a86dde15fe4832c18e5b970998672', (7, 8, 9), 1
                ],
                'dev': [
                    os.path.join('STS-B', 'dev.tsv'),
                    '5f4d6b0d2a5f268b1b56db773ab2f1fe', (7, 8, 9), 1
                ],
                'test': [
                    os.path.join('STS-B', 'test.tsv'),
                    '339b5817e414d19d9bb5f593dd94249c', (7, 8), 1
                ]
            },
            'labels': None
        },
        'qqp': {
            'url': 'https://dataset.bj.bcebos.com/glue/QQP.zip',
            'md5': '884bf26e39c783d757acc510a2a516ef',
            'splits': {
                'train': [
                    os.path.join('QQP', 'train.tsv'),
                    'e003db73d277d38bbd83a2ef15beb442', (3, 4, 5), 1
                ],
                'dev': [
                    os.path.join('QQP', 'dev.tsv'),
                    'cff6a448d1580132367c22fc449ec214', (3, 4, 5), 1
                ],
                'test': [
                    os.path.join('QQP', 'test.tsv'),
                    '73de726db186b1b08f071364b2bb96d0', (1, 2), 1
                ]
            },
            'labels': ["0", "1"]
        },
        'mnli': {
            'url': 'https://dataset.bj.bcebos.com/glue/MNLI.zip',
            'md5': 'e343b4bdf53f927436d0792203b9b9ff',
            'splits': {
                'train': [
                    os.path.join('MNLI', 'train.tsv'),
                    '220192295e23b6705f3545168272c740', (8, 9, 11), 1
                ],
                'dev_matched': [
                    os.path.join('MNLI', 'dev_matched.tsv'),
                    'c3fa2817007f4cdf1a03663611a8ad23', (8, 9, 15), 1
                ],
                'dev_mismatched': [
                    os.path.join('MNLI', 'dev_mismatched.tsv'),
                    'b219e6fe74e4aa779e2f417ffe713053', (8, 9, 15), 1
                ],
                'test_matched': [
                    os.path.join('MNLI', 'test_matched.tsv'),
                    '33ea0389aedda8a43dabc9b3579684d9', (8, 9), 1
                ],
                'test_mismatched': [
                    os.path.join('MNLI', 'test_mismatched.tsv'),
                    '7d2f60a73d54f30d8a65e474b615aeb6', (8, 9), 1
                ]
            },
            'labels': ["contradiction", "entailment", "neutral"]
        },
        'qnli': {
            'url': 'https://dataset.bj.bcebos.com/glue/QNLI.zip',
            'md5': 'b4efd6554440de1712e9b54e14760e82',
            'splits': {
                'train': [
                    os.path.join('QNLI', 'train.tsv'),
                    '5e6063f407b08d1f7c7074d049ace94a', (1, 2, 3), 1
                ],
                'dev': [
                    os.path.join('QNLI', 'dev.tsv'),
                    '1e81e211959605f144ba6c0ad7dc948b', (1, 2, 3), 1
                ],
                'test': [
                    os.path.join('QNLI', 'test.tsv'),
                    'f2a29f83f3fe1a9c049777822b7fa8b0', (1, 2), 1
                ]
            },
            'labels': ["entailment", "not_entailment"]
        },
        'rte': {
            'url': 'https://dataset.bj.bcebos.com/glue/RTE.zip',
            'md5': 'bef554d0cafd4ab6743488101c638539',
            'splits': {
                'train': [
                    os.path.join('RTE', 'train.tsv'),
                    'd2844f558d111a16503144bb37a8165f', (1, 2, 3), 1
                ],
                'dev': [
                    os.path.join('RTE', 'dev.tsv'),
                    '973cb4178d4534cf745a01c309d4a66c', (1, 2, 3), 1
                ],
                'test': [
                    os.path.join('RTE', 'test.tsv'),
                    '6041008f3f3e48704f57ce1b88ad2e74', (1, 2), 1
                ]
            },
            'labels': ["entailment", "not_entailment"]
        },
        'wnli': {
            'url': 'https://dataset.bj.bcebos.com/glue/WNLI.zip',
            'md5': 'a1b4bd2861017d302d29e42139657a42',
            'splits': {
                'train': [
                    os.path.join('WNLI', 'train.tsv'),
                    '5cdc5a87b7be0c87a6363fa6a5481fc1', (1, 2, 3), 1
                ],
                'dev': [
                    os.path.join('WNLI', 'dev.tsv'),
                    'a79a6dd5d71287bcad6824c892e517ee', (1, 2, 3), 1
                ],
                'test': [
                    os.path.join('WNLI', 'test.tsv'),
                    'a18789ba4f60f6fdc8cb4237e4ba24b5', (1, 2), 1
                ]
            },
            'labels': ["0", "1"]
        },
        'mrpc': {
            'url': {
                'train_data':
                'https://dataset.bj.bcebos.com/glue/mrpc/msr_paraphrase_train.txt',
                'dev_id': 'https://dataset.bj.bcebos.com/glue/mrpc/dev_ids.tsv',
                'test_data':
                'https://dataset.bj.bcebos.com/glue/mrpc/msr_paraphrase_test.txt'
            },
            'md5': {
                'train_data': '793daf7b6224281e75fe61c1f80afe35',
                'dev_id': '7ab59a1b04bd7cb773f98a0717106c9b',
                'test_data': 'e437fdddb92535b820fe8852e2df8a49'
            },
            'splits': {
                'train': [
                    os.path.join('MRPC', 'train.tsv'),
                    'dc2dac669a113866a6480a0b10cd50bf', (3, 4, 0), 1
                ],
                'dev': [
                    os.path.join('MRPC', 'dev.tsv'),
                    '185958e46ba556b38c6a7cc63f3a2135', (3, 4, 0), 1
                ],
                'test': [
                    os.path.join('MRPC', 'test.tsv'),
                    '4825dab4b4832f81455719660b608de5', (3, 4), 1
                ]
            },
            'labels': ["0", "1"]
        }
    }

    def _get_data(self, mode, **kwargs):
        builder_config = self.BUILDER_CONFIGS[self.name]
        if self.name != 'mrpc':
            default_root = os.path.join(DATA_HOME, self.__class__.__name__)
            filename, data_hash, _, _ = builder_config['splits'][mode]
            fullname = os.path.join(default_root, filename)
            if not os.path.exists(fullname) or (
                    data_hash and not md5file(fullname) == data_hash):
                get_path_from_url(builder_config['url'], default_root,
                                  builder_config['md5'])

        else:
            default_root = os.path.join(DATA_HOME, self.__class__.__name__)
            filename, data_hash, _, _ = builder_config['splits'][mode]
            fullname = os.path.join(default_root, filename)
            if not os.path.exists(fullname) or (
                    data_hash and not md5file(fullname) == data_hash):
                if mode in ('train', 'dev'):
                    dev_id_path = get_path_from_url(
                        builder_config['url']['dev_id'],
                        os.path.join(default_root, 'MRPC'),
                        builder_config['md5']['dev_id'])
                    train_data_path = get_path_from_url(
                        builder_config['url']['train_data'],
                        os.path.join(default_root, 'MRPC'),
                        builder_config['md5']['train_data'])
                    # read dev data ids
                    dev_ids = []
                    print(dev_id_path)
                    with open(dev_id_path, encoding='utf-8') as ids_fh:
                        for row in ids_fh:
                            dev_ids.append(row.strip().split('\t'))

                    # generate train and dev set
                    train_path = os.path.join(default_root, 'MRPC', 'train.tsv')
                    dev_path = os.path.join(default_root, 'MRPC', 'dev.tsv')
                    with open(train_data_path, encoding='utf-8') as data_fh:
                        with open(
                                train_path, 'w', encoding='utf-8') as train_fh:
                            with open(dev_path, 'w', encoding='utf8') as dev_fh:
                                header = data_fh.readline()
                                train_fh.write(header)
                                dev_fh.write(header)
                                for row in data_fh:
                                    label, id1, id2, s1, s2 = row.strip().split(
                                        '\t')
                                    example = '%s\t%s\t%s\t%s\t%s\n' % (
                                        label, id1, id2, s1, s2)
                                    if [id1, id2] in dev_ids:
                                        dev_fh.write(example)
                                    else:
                                        train_fh.write(example)

                else:
                    test_data_path = get_path_from_url(
                        builder_config['url']['test_data'],
                        os.path.join(default_root, 'MRPC'),
                        builder_config['md5']['test_data'])
                    test_path = os.path.join(default_root, 'MRPC', 'test.tsv')
                    with open(test_data_path, encoding='utf-8') as data_fh:
                        with open(test_path, 'w', encoding='utf-8') as test_fh:
                            header = data_fh.readline()
                            test_fh.write(
                                'index\t#1 ID\t#2 ID\t#1 String\t#2 String\n')
                            for idx, row in enumerate(data_fh):
                                label, id1, id2, s1, s2 = row.strip().split(
                                    '\t')
                                test_fh.write('%d\t%s\t%s\t%s\t%s\n' %
                                              (idx, id1, id2, s1, s2))

        return fullname

    def _read(self, filename, split):
        _, _, field_indices, num_discard_samples = self.BUILDER_CONFIGS[
            self.name]['splits'][split]
        with open(filename, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if idx < num_discard_samples:
                    continue
                line_stripped = line.strip().split('\t')
                if not line_stripped:
                    continue
                example = [line_stripped[indice] for indice in field_indices]
                if self.name in ['cola', 'sst-2']:
                    yield {
                        'sentence': example[0]
                    } if 'test' in split else {
                        'sentence': example[0],
                        'labels': example[-1]
                    }
                else:
                    yield {
                        'sentence1': example[0],
                        'sentence2': example[1]
                    } if 'test' in split else {
                        'sentence1': example[0],
                        'sentence2': example[1],
                        'labels': example[-1]
                    }

[docs]    def get_labels(self):
        """
        Return labels of the Glue task.
        """
        return self.BUILDER_CONFIGS[self.name]['labels']