# Source code for textflint.generation_layer.validator.max_words_perturbed

r"""
Max Perturb Words Constraints
=====================================
"""

import numpy
from .validator import Validator
__all__ = ['MaxWordsPerturbed']


class MaxWordsPerturbed(Validator):
    r"""
    A constraint on how many words a transformation may perturb.

    The score of a sample pair is the length of the longest common
    subsequence (LCS) of the original and the transformed sentence,
    divided by the length of the original sentence.

    :param ~textflint.input_layer.dataset origin_dataset:
        the dataset of origin samples
    :param ~textflint.input_layer.dataset trans_dataset:
        the dataset of transformed samples
    :param str|list fields: the name(s) of the origin field(s) to compare
    :param bool need_tokens: whether the sentences need to be tokenized

    """

    def __init__(
        self,
        origin_dataset,
        trans_dataset,
        fields,
        need_tokens=True
    ):
        super().__init__(
            origin_dataset,
            trans_dataset,
            fields,
            need_tokens=need_tokens
        )

    def __repr__(self):
        return "MaxWordsPerturbed"

    def validate(self, transformed_text, reference_text):
        r"""
        Calculate the share of the original sentence that is preserved.

        The inputs are compared element by element, so they are
        presumably token lists when ``need_tokens`` is true (plain
        strings would be compared character by character) -- confirm
        against the base class.

        :param str|list transformed_text: transformed sentence
        :param str|list reference_text: origin sentence
        :return float: LCS length divided by ``len(reference_text)``;
            ``0.0`` when ``reference_text`` is empty

        """
        reference_len = len(reference_text)
        # An empty reference has no words to preserve; returning 0.0
        # avoids a division by zero (which used to produce ``nan``).
        if reference_len == 0:
            return 0.0

        common_words = self.get_lcs(reference_text, transformed_text)
        return common_words / reference_len

    @staticmethod
    def get_lcs(token1, token2):
        r"""
        Calculate the length of the longest common subsequence.

        :param list token1: the first token list
        :param list token2: the second token list
        :return int: the length of the longest common subsequence

        """
        l1 = len(token1)
        l2 = len(token2)
        # lcs[i][j] holds the LCS length of token1[:i] and token2[:j];
        # row 0 and column 0 stay zero (empty prefix).  An integer dtype
        # keeps the result an exact count instead of a float.
        lcs = numpy.zeros((l1 + 1, l2 + 1), dtype=int)

        for i, left in enumerate(token1, start=1):
            for j, right in enumerate(token2, start=1):
                if left == right:
                    lcs[i][j] = lcs[i - 1][j - 1] + 1
                else:
                    lcs[i][j] = max(lcs[i][j - 1], lcs[i - 1][j])

        return int(lcs[l1][l2])