Source code for textflint.generation_layer.transformation.MRC.add_sent_diverse

r"""
Add a distractor sentence to penalize MRC model
==========================================================
This transformation is based on CoreNLP, which is written in Java;
recent releases require Java 1.8+.
You need to have Java installed to run CoreNLP.
"""
import collections

from ..transformation import Transformation
from ....common.utils import FlintError
from ....input_layer.component.sample.mrc_sample import MRCSample, \
    ConstituencyParse

__all__ = ['AddSentDiverse']


class AddSentDiverse(Transformation):
    r"""
    Generate a distractor before the sentence with answer.

    Example::

        origin question: Which NFL team represented the AFC at Super Bowl 50?
        transform distractor: The UNICEF team of Kew Gardens represented
            the UNICEF at Champ Bowl 40.

    """
    def __init__(self):
        super().__init__()
        # Ordered mapping of rule name -> token alteration rule; it is handed
        # to ``sample.alter_sentence`` in ``_transform``.
        # Every key must be unique: an OrderedDict keeps only the last value
        # assigned to a repeated key, so a duplicated name silently drops the
        # earlier rule.
        self.rules = collections.OrderedDict([
            # special tokens transformation
            ('special', MRCSample.alter_special),
            # antonym words in wordnet
            ('wn_antonyms', MRCSample.alter_wordnet_antonyms),
            # num
            ('nearbyNum', MRCSample.alter_nearby(
                ['CD'], ignore_pos=True)),
            # proper nouns
            ('nearbyProperNoun', MRCSample.alter_nearby(
                ['NNP', 'NNPS'])),
            # proper nouns again, this time with ignore_pos=True
            ('nearbyProperNounIgnorePos', MRCSample.alter_nearby(
                ['NNP', 'NNPS'], ignore_pos=True)),
            # entity nouns
            ('nearbyEntityNouns', MRCSample.alter_nearby(
                ['NN', 'NNS'], is_ner=True)),
            # entity adjectives
            ('nearbyEntityJJ', MRCSample.alter_nearby(
                ['JJ', 'JJR', 'JJS'], is_ner=True)),
            # entity type
            ('entityType', MRCSample.alter_entity_type),
        ])

    def __repr__(self):
        return 'AddSentenceDiverse'

    def _transform(
            self,
            sample,
            nearby_word_dict=None,
            pos_tag_dict=None,
            **kwargs
    ):
        r"""
        Transform the question based on specific rules, replace the ground truth
        with fake answer, and then convert the question
        and fake answer to a distractor.

        :param sample: the sample to transform
        :param dict nearby_word_dict: the dict to search for nearby words
        :param dict pos_tag_dict: the dict to search for
            the most frequent pos tags
        :param kwargs: unused by this method
        :return: list holding the single transformed sample, or an empty
            list when the sample has no answer or no distractor
            could be built
        :raises FlintError: if feature extraction / parsing raises IOError,
            or if rebuilding the constituency parse raises IndexError
        """
        # filter no-answer samples
        if sample.is_impossible:
            return []
        question = sample.get_value('question')
        answers = sample.get_answers()
        if not answers:
            # Nothing to build a distractor from; treat like a no-answer
            # sample instead of failing on answers[0].
            return []
        # Only the first answer is used.
        answer_token_start = answers[0]['start']
        answer_text = answers[0]['text']
        sentences = sample.get_sentences('context')

        try:  # constituency parsing and linguistic feature generation
            question_tokens = self.processor.feature_extract(question)
            parse = self.processor.get_parser(question)
        except IOError as err:
            raise FlintError("Corenlp HTTPError, skip this sample") from err
        # Transform a sentence with AlterSentence Transformation
        alter_question, tokens, _ = sample.alter_sentence(
            question_tokens,
            nearby_word_dict=nearby_word_dict,
            pos_tag_dict=pos_tag_dict,
            rules=self.rules
        )

        # The altered tokens are substituted one-for-one into the parse
        # below, so the token count must not change.
        assert len(tokens) == len(question_tokens)
        try:  # TODO
            const_parse = sample.read_const_parse(parse)
            const_parse = ConstituencyParse.replace_words(
                const_parse, [t['word'] for t in tokens])
        except IndexError as err:
            raise FlintError(
                "Corenlp parsing mismatches spacy tokenizer") from err

        # Running count of tokens in the sentences skipped so far.
        length = 0
        # Insert
        for sent in sentences:
            # Tokenize once per sentence and reuse the count.
            sent_length = len(self.processor.tokenize(sent))
            # NOTE(review): a sentence is skipped only while its end offset
            # is strictly below the answer start; confirm whether `<=` is
            # intended when the answer begins exactly at a sentence boundary.
            if length + sent_length < answer_token_start:
                length += sent_length
                continue
            sent_tokens = self.processor.feature_extract(sent)
            new_ans = sample.convert_answer(
                answer_text, sent_tokens, alter_question)
            distractor = sample.run_conversion(
                alter_question, new_ans, tokens, const_parse)
            if distractor and new_ans:
                # Insert the distract sentence before the answer
                new_sample = sample.insert_field_before_index(
                    'context', length, self.processor.tokenize(distractor))
                return [new_sample]
            # Only the first candidate sentence is ever tried.
            return []

        # The loop ended without reaching a candidate sentence (e.g. no
        # sentences at all): return an empty list rather than None.
        return []