# Source code for textflint.generation_layer.subpopulation.UT.phrase

r"""
Extract samples matching phrases
============================================

"""
__all__ = ['PhraseSubPopulation']
from flashtext import KeywordProcessor
from ..subpopulation import SubPopulation
from ....common.settings import NEGATION, QUESTION


class PhraseSubPopulation(SubPopulation):
    r"""
    Filter samples based on a group of phrases

    for example, with phrase_name = 'question'::

        sample 1: "Who is Jack.", score: 1
        sample 2: "I am Jack.", score: 0

    :param str phrase_name: name of the phrase group to match,
        one of 'negation' or 'question'
    :raises ValueError: if ``phrase_name`` is not a supported group
    """
    def __init__(
        self,
        phrase_name='negation'
    ):
        super().__init__()
        # Map every supported group name to its phrase list so an unknown
        # name is rejected here instead of surfacing later as an
        # AttributeError on the never-assigned ``self.phrases``.
        phrase_groups = {
            'negation': NEGATION,
            'question': QUESTION,
        }
        if phrase_name not in phrase_groups:
            raise ValueError(
                "Unsupported phrase_name {!r}, expected one of {}".format(
                    phrase_name, sorted(phrase_groups)
                )
            )
        self.phrase_name = phrase_name
        self.phrases = phrase_groups[phrase_name]

        # Matching is case sensitive; all phrases of the group are
        # registered under the group name.
        self.phrase_processor = KeywordProcessor(case_sensitive=True)
        self.phrase_processor.add_keywords_from_dict(
            {self.phrase_name: self.phrases})

    def __repr__(self):
        return "PhraseSubPopulation" + "-" + self.phrase_name

    def phrase_match(self, text):
        r"""
        Check whether the text contains any phrase of the group

        :param str text: text to search
        :return bool: True if at least one phrase is found, else False

        """
        # A non-empty extraction result means at least one phrase matched.
        return bool(self.phrase_processor.extract_keywords(text))

    def _score(self, sample, fields, **kwargs):
        """
        1 or 0 indicates whether sample fields match phrase groups

        :param sample: data sample
        :param list fields: list of field str
        :param kwargs:
        :return bool: True (1) if the joined field texts match, else False (0)

        """
        # All requested fields are searched together as one
        # space-separated text.
        text = ' '.join([sample.get_text(field) for field in fields])
        return self.phrase_match(text)

    def get_slice(self, scores, dataset):
        r"""
        Save the samples that match the phrase groups

        :param scores: per-sample scores, indexed by sample position
        :param dataset: iterable of samples
        :return list: samples whose score is truthy

        """
        return [
            sample for i, sample in enumerate(dataset) if scores[i]
        ]