Source code for textflint.generation_layer.validator.max_words_perturbed

r"""
Max Perturbed Words Constraint
=====================================
"""

import numpy
from .validator import Validator
__all__ = ['MaxWordsPerturbed']


class MaxWordsPerturbed(Validator):
    r"""
    A constraint representing a maximum allowed perturbed words.

    The score is the length of the longest common subsequence (LCS) of the
    original and the transformed sentence divided by the length of the
    original sentence. A score of 1.0 therefore means that every element of
    the original sentence still appears, in order, in the transformed one.

    :param ~textflint.input_layer.dataset origin_dataset:
        the dataset of origin samples
    :param ~textflint.input_layer.dataset trans_dataset:
        the dataset of transformed samples
    :param str|list fields: the name of the origin field to compare.
    :param bool need_tokens: whether the sentences need to be tokenized

    """
    def __init__(
            self,
            origin_dataset,
            trans_dataset,
            fields,
            need_tokens=True
    ):
        # Pure pass-through to the base class; this override only changes
        # nothing but pins the default of ``need_tokens`` to True here.
        super().__init__(
            origin_dataset,
            trans_dataset,
            fields,
            need_tokens=need_tokens
        )

    def __repr__(self):
        return "MaxWordsPerturbed"

    def validate(self, transformed_text, reference_text):
        r"""
        Calculate the score of a transformed sentence against its origin.

        The score is ``LCS(reference, transformed) / len(reference)``.

        :param transformed_text: transformed sentence, as a sequence
            (presumably a token list when ``need_tokens`` is True; the
            tokenization is done outside this class -- verify in the base
            class)
        :param reference_text: origin sentence, same kind of sequence
        :return float: the fraction of the origin sentence that is preserved
            in order; for an empty origin sentence, 1.0 if the transformed
            sentence is empty as well and 0.0 otherwise

        """
        reference_length = len(reference_text)
        if reference_length == 0:
            # Avoid dividing by zero: two empty sentences are identical,
            # while anything else shares nothing with an empty reference.
            return 1.0 if len(transformed_text) == 0 else 0.0

        lcs_length = self.get_lcs(reference_text, transformed_text)

        return lcs_length / reference_length

    @staticmethod
    def get_lcs(token1, token2):
        r"""
        Calculate the length of the longest common subsequence.

        Elements are compared with ``==``, so any two sequences of
        comparable items (token lists, strings, ...) can be passed.

        :param list token1: the first token list
        :param list token2: the second token list
        :return int: the length of the longest common subsequence

        """
        # Classic LCS dynamic programme. Each row depends only on the row
        # above it, so two rows are kept instead of the full table.
        # previous[j] holds the LCS length of the already processed prefix
        # of token1 and the first j items of token2.
        previous = [0] * (len(token2) + 1)
        for item1 in token1:
            current = [0]
            for j, item2 in enumerate(token2):
                if item1 == item2:
                    # Matching items extend the LCS of both shorter prefixes.
                    current.append(previous[j] + 1)
                else:
                    # Otherwise keep the better of dropping either item.
                    current.append(max(current[j], previous[j + 1]))
            previous = current
        return previous[-1]