# Source code for textflint.generation_layer.transformation.SM.swap_word

r"""
Replace words with antonyms provided by WordNet
==========================================================
"""
__all__ = ["SwapWord"]
import re

from nltk.wsd import lesk

from ...transformation import Transformation
from ....common.settings import BLACK_LIST_WORD


class SwapWord(Transformation):
    r"""
    Transforms an input by replacing one of its words with an antonym
    provided by WordNet.
    Download nltk_data before running.

    Implementation follows `Stress Test Evaluation for Natural Language
    Inference`. For the correctness of the transformation, a word is only
    swapped when its best sense (as selected by ``lesk``) has an antonym.

    https://www.aclweb.org/anthology/C18-1198/

    Example::

        {
            sentence1: I hate this book.
            sentence2: This book is my favorite.
            y: 0
        }

    """

    def __init__(
        self,
        language="eng"
    ):
        r"""
        :param string language: language of transformation; stored on the
            instance as ``self.language``
        """
        super().__init__()
        self.language = language
        # Words that must never be swapped.
        self.blacklist_words = BLACK_LIST_WORD

    def __repr__(self):
        return 'SwapWord'

    def transform(self, sample, n=1, **kwargs):
        r"""
        Transform data sample to a list of Sample.

        :param ~SMSample sample: Data sample for augmentation
        :param int n: Default is 1. MAX number of unique augmented output.
            Not consulted here because ``_transform`` yields at most one
            sample.
        :param **kwargs: forwarded to ``_transform``
        :return: Augmented data; an empty list when nothing was produced

        """
        transform_results = self._transform(sample, **kwargs)

        if not transform_results:
            return []

        # Never hand the untouched input back as an "augmented" sample.
        return [data for data in transform_results if not data.is_origin]

    def _transform(self, sample, **kwargs):
        r"""
        Transform text string, this kind of transformation can only produce
        one sample.

        Only samples whose ``'y'`` value is ``'1'`` are transformed. The
        first word of ``sentence2`` that is not blacklisted, also occurs in
        ``sentence1`` and has a usable WordNet antonym is swapped. The
        result has the modified ``sentence2`` text as its ``sentence1``,
        the original ``sentence2`` text as its ``sentence2`` and ``'0'``
        as ``y``.

        :param ~NLISample sample: input data, a NLISample contains
            'sentence1' field, 'sentence2' field and 'y' field
        :param **kwargs: unused
        :return list trans_samples: transformed sample list that only
            contain one sample, or ``None`` when the label is not ``'1'``
            or no word could be swapped

        """
        if sample.get_value('y') != '1':
            return None

        tokens1 = sample.get_words('sentence1')
        tokens2 = sample.get_words('sentence2')
        original_text2 = sample.get_text('sentence2')

        for each_word in tokens2:
            # Both checks depend only on the word itself, so do them before
            # the comparatively expensive word-sense disambiguation.
            if each_word in self.blacklist_words \
                    or each_word not in tokens1:
                continue

            best_sense = lesk(tokens2, each_word)
            # Only WordNet senses tagged 's' or 'n' are considered.
            if best_sense is None or best_sense.pos() not in ('s', 'n'):
                continue

            for lemma in best_sense.lemmas():
                for antonym in lemma.antonyms():
                    antonym_name = antonym.name()
                    # Skip multi-word antonyms (WordNet joins them with
                    # '_') and the hard-coded exclusion "civilian".
                    if "_" in antonym_name or antonym_name == "civilian":
                        continue

                    new_s1 = self._replace_first_word(
                        original_text2, each_word, antonym_name)
                    trans_sample = sample.replace_fields(
                        ['sentence1', 'sentence2', 'y'],
                        [new_s1, original_text2, '0'])

                    return [trans_sample]

        return None

    @staticmethod
    def _replace_first_word(text, word, replacement):
        r"""
        Replace the first standalone occurrence of ``word`` in ``text``.

        A plain ``str.replace`` would also match inside a longer word
        (e.g. "man" inside "woman"), so whole-word matching is tried first.
        If no standalone occurrence exists, fall back to replacing the
        first raw substring occurrence.

        :param str text: text to modify
        :param str word: word to look for
        :param str replacement: text inserted in place of ``word``
        :return str: ``text`` with at most one occurrence replaced

        """
        # Lookarounds instead of \b so that tokens starting or ending with
        # a non-word character are still matched correctly.
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        # A callable replacement keeps backslashes in ``replacement``
        # literal.
        new_text, count = re.subn(
            pattern, lambda _match: replacement, text, count=1)

        if count:
            return new_text

        return text.replace(word, replacement, 1)