Source code for textflint.generation_layer.transformation.MRC.add_sent_diverse

r"""
Add a distractor sentence to penalize MRC models
==========================================================
This transformation is based on CoreNLP, which is written in Java;
recent releases require Java 1.8+.
You need to have Java installed to run CoreNLP.
"""
import collections

from ..transformation import Transformation
from ....common.utils import FlintError
from ....input_layer.component.sample.mrc_sample import MRCSample, \
    ConstituencyParse

__all__ = ['AddSentDiverse']


class AddSentDiverse(Transformation):
    r"""
    Generate a distractor before the sentence with answer.

    Example::

        origin question: Which NFL team represented the AFC at Super Bowl 50?
        transform distractor: The UNICEF team of Kew Gardens represented
            the UNICEF at Champ Bowl 40.

    """

    def __init__(self):
        super().__init__()
        # Ordered mapping of rule name -> rule callable, handed to
        # ``sample.alter_sentence`` when rewriting the question.
        # Every key must be unique: a repeated key in the pair list makes
        # the later value silently overwrite the earlier one, which used to
        # drop the POS-matching proper-noun rule entirely.
        self.rules = collections.OrderedDict([
            # special tokens transformation
            ('special', MRCSample.alter_special),
            # antonym words in wordnet
            ('wn_antonyms', MRCSample.alter_wordnet_antonyms),
            # numbers
            ('nearbyNum', MRCSample.alter_nearby(
                ['CD'], ignore_pos=True)),
            # proper nouns, POS-matching variant first ...
            ('nearbyProperNoun', MRCSample.alter_nearby(
                ['NNP', 'NNPS'])),
            # ... then the variant that ignores the POS tag
            ('nearbyProperNounIgnorePos', MRCSample.alter_nearby(
                ['NNP', 'NNPS'], ignore_pos=True)),
            # entity nouns
            ('nearbyEntityNouns', MRCSample.alter_nearby(
                ['NN', 'NNS'], is_ner=True)),
            # entity adjectives
            ('nearbyEntityJJ', MRCSample.alter_nearby(
                ['JJ', 'JJR', 'JJS'], is_ner=True)),
            # entity type
            ('entityType', MRCSample.alter_entity_type),
        ])

    def __repr__(self):
        return 'AddSentenceDiverse'

    def _transform(
        self,
        sample,
        nearby_word_dict=None,
        pos_tag_dict=None,
        **kwargs
    ):
        r"""
        Transform the question based on specific rules, replace the ground
        truth with fake answer, and then convert the question and fake
        answer to a distractor.

        :param sample: the sample to transform
        :param dict nearby_word_dict: the dict to search for nearby words
        :param dict pos_tag_dict: the dict to search for the most
            frequent pos tags
        :param kwargs: unused by this method
        :return: list holding the single transformed sample, or an empty
            list when the sample is impossible or no distractor could be
            produced
        :raises FlintError: if feature extraction / parsing of the question
            raises ``IOError``, or if rebuilding the constituency parse
            raises ``IndexError``
        """
        # filter no-answer samples
        if sample.is_impossible:
            return []

        question = sample.get_value('question')
        # Only the first answer is used to locate the insertion point.
        first_answer = sample.get_answers()[0]
        answer_token_start = first_answer['start']
        answer_text = first_answer['text']
        sentences = sample.get_sentences('context')

        try:
            # constituency parsing and linguistic feature generation
            question_tokens = self.processor.feature_extract(question)
            parse = self.processor.get_parser(question)
        except IOError as err:
            raise FlintError("Corenlp HTTPError, skip this sample") from err

        # Transform a sentence with AlterSentence Transformation
        alter_question, tokens, _ = sample.alter_sentence(
            question_tokens,
            nearby_word_dict=nearby_word_dict,
            pos_tag_dict=pos_tag_dict,
            rules=self.rules
        )
        assert len(tokens) == len(question_tokens)

        try:
            # TODO
            const_parse = sample.read_const_parse(parse)
            const_parse = ConstituencyParse.replace_words(
                const_parse, [t['word'] for t in tokens])
        except IndexError as err:
            raise FlintError(
                "Corenlp parsing mismatches spacy tokenizer") from err

        # Running count of tokens in the sentences skipped so far; it is
        # used as the insertion index for the distractor.
        length = 0
        for sent in sentences:
            sent_length = len(self.processor.tokenize(sent))
            # NOTE(review): strict '<' means a sentence ending exactly at
            # answer_token_start is not skipped -- confirm whether 'start'
            # is a 0-based token index and whether '<=' was intended.
            if length + sent_length < answer_token_start:
                length += sent_length
                continue

            # First sentence that reaches the answer offset: build the
            # distractor from it and stop, whatever the outcome.
            sent_tokens = self.processor.feature_extract(sent)
            new_ans = sample.convert_answer(
                answer_text, sent_tokens, alter_question)
            distractor = sample.run_conversion(
                alter_question, new_ans, tokens, const_parse)

            if distractor and new_ans:
                # Insert the distract sentence before the answer
                new_sample = sample.insert_field_before_index(
                    'context', length, self.processor.tokenize(distractor))
                return [new_sample]
            return []

        # No sentence reached the answer offset (or the context has no
        # sentences): keep the "list of sample" contract instead of
        # implicitly returning None.
        return []