Source code for textflint.generation_layer.transformation.MRC.perturb_answer

r"""
Perturb Answer by altering the sentence that contains the answer
================================================================
"""

import collections

from ....common.settings import ORIGIN
from ..transformation import Transformation
from ....input_layer.component.sample.mrc_sample import MRCSample


class PerturbAnswer(Transformation):
    r"""
    Transform the sentence containing the answer with the AlterSentence
    transformation.

    Example::

        origin sentence: Denver Broncos defeated the National Football
            Conference champion Carolina Panthers 24–10 to earn their
            third Super Bowl title.
        transformed sentence: Denver Broncos defeated the National Football
            Conference champ Carolina Panthers 24–10 to earn
            their 3rd Super Bowl rubric.
    """

    def __init__(self):
        super().__init__()

        # Rules handed to ``sample.alter_sentence``; the OrderedDict keeps
        # them in insertion order.
        # NOTE(review): 'nearbyProperNoun' is listed twice. When a dict is
        # built from pairs the later value replaces the earlier one, so only
        # the ``ignore_pos=True`` variant is actually registered. Kept as-is
        # to preserve current behavior - TODO confirm whether both rules
        # were intended (they would need distinct keys).
        self.rules = collections.OrderedDict([
            ('wn_synonyms', MRCSample.alter_wordnet_synonyms),
            ('nearbyProperNoun', MRCSample.alter_nearby(['NNP', 'NNPS'])),
            ('nearbyProperNoun', MRCSample.alter_nearby(
                ['NNP', 'NNPS'], ignore_pos=True)),
            ('nearbyEntityNouns', MRCSample.alter_nearby(
                ['NN', 'NNS'], is_ner=True)),
            ('nearbyEntityJJ', MRCSample.alter_nearby(
                ['JJ', 'JJR', 'JJS'], is_ner=True)),
        ])

    def __repr__(self):
        return 'PerturbAnswer'

    def _transform(
            self,
            sample,
            nearby_word_dict=None,
            pos_tag_dict=None,
            **kwargs
    ):
        r"""
        Alter the context sentence that contains the first answer.

        The sentence is located by accumulating per-sentence token counts
        until the answer start position is reached, then rewritten with
        ``sample.alter_sentence`` using ``self.rules``. Only positions whose
        context mask is still ``ORIGIN`` are replaced.

        :param sample: the sample to transform
        :param dict nearby_word_dict: the dict to search for nearby words
        :param dict pos_tag_dict: the dict to search for
            the most frequent pos tags
        :param kwargs: unused here
        :return: list holding the single transformed sample, or an empty
            list when the sample has no answer, the answer sentence cannot
            be located, or no position can be replaced
        """

        # filter no-answer samples
        if sample.is_impossible:
            return []
        answers = sample.get_answers()
        # Guard against an empty answer list instead of raising IndexError.
        if not answers:
            return []
        first_answer = answers[0]
        answer_token_start = first_answer['start']
        answer_text = first_answer['text']

        sent_start = 0
        alter_sent = None
        indices = None
        # Pick up the sentence that contains the answer
        for sent in sample.get_sentences('context'):
            # Tokenize once per sentence; the count is used for both the
            # comparison and the running offset.
            sent_len = len(self.processor.tokenize(sent))
            if sent_start + sent_len <= answer_token_start:
                sent_start += sent_len
                continue
            # deal with sentence tokenize error
            if answer_text not in sent:
                return []
            features = self.processor.feature_extract(sent)
            # Transform a sentence with AlterSentence function
            alter_sent, _, indices = sample.alter_sentence(
                features, nearby_word_dict=nearby_word_dict,
                pos_tag_dict=pos_tag_dict, rules=self.rules)
            # Shift sentence-relative indices to context-relative ones.
            indices = [index + sent_start for index in indices]
            break

        # No sentence reached the answer position: nothing to transform.
        # Return an empty list (not None) to match the other exits.
        if alter_sent is None:
            return []

        words = self.processor.tokenize(alter_sent)
        context_mask = sample.get_mask('context')
        results = []
        replace_items = []
        for index in indices:
            # An index beyond the mask means the offsets do not line up
            # with the context; give up on this sample.
            if index >= len(context_mask):
                return []
            # Skip positions that are no longer marked as original.
            if context_mask[index] != ORIGIN:
                continue
            results.append(index)
            replace_items.append(words[index - sent_start])

        if not results:
            return []
        return [sample.replace_field_at_indices(
            'context', results, replace_items)]