# Source code for textflint.generation_layer.transformation.POS.prefix_swap

r"""
SwapPrefix transformation for POS tagging
============================================
"""
__all__ = ["SwapPrefix"]

from collections import Counter
from ....input_layer.component.sample import POSSample
from ...transformation import WordSubstitute
from ....common.utils.load import load_morfessor_model
from ....common.utils.install import download_if_needed
from ....common.settings import MORPHEME_ANALYZER


class SwapPrefix(WordSubstitute):
    r"""
    Swap the prefix of a word while keeping the same POS tag.

    A word is split into morphemes with the loaded morpheme analyzer; the
    first morpheme is treated as the prefix and the rest as the "remain".
    Replacement candidates are other lemmas with the same remain and the
    same POS tag but a different prefix.

    """
    # POS tags handled by this transformation, mapped to the ``pos`` letter
    # passed to ``self.processor.get_all_lemmas``.
    _POS_TO_WORDNET = {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}

    def __init__(self,
                 trans_max=2,
                 trans_p=1,
                 **kwargs
                 ):
        r"""
        :param trans_max: forwarded positionally to ``WordSubstitute``
        :param trans_p: forwarded positionally to ``WordSubstitute``
        :param kwargs: extra keyword arguments forwarded to
            ``WordSubstitute``
        """
        super().__init__(trans_max, trans_p, **kwargs)
        self.morpheme_analyzer = load_morfessor_model(
            download_if_needed(MORPHEME_ANALYZER))
        # Built once up front: it segments every candidate lemma.
        self.remain_prefix_dict = self.get_remain_prefix_dict()
        # NOTE(review): presumably tells the base class to pass POS tags to
        # ``skip_aug`` / ``_get_candidates`` -- confirm in WordSubstitute.
        self.get_pos = True

    def __repr__(self):
        return 'SwapPrefix'

    def get_remain_prefix_dict(self):
        r"""
        Build the inverted index used to look up alternative prefixes.

        Every lemma returned by ``self.processor.get_all_lemmas`` for the
        supported POS classes is segmented; lemmas that split into more
        than one morpheme contribute an entry.

        :return: dict mapping ``remain`` (the lemma without its first
            morpheme) to a set of ``(pos_tag, prefix)`` tuples
        """
        remain_prefix_dict = {}
        for tag, wordnet_pos in self._POS_TO_WORDNET.items():
            for lemma in self.processor.get_all_lemmas(pos=wordnet_pos):
                # Lemmas containing "_" are skipped (presumably multi-word
                # expressions -- they cannot replace a single token).
                if "_" in lemma:
                    continue
                segs, _ = self.morpheme_analyzer.viterbi_segment(lemma)

                # A single segment means there is no prefix to swap.
                if len(segs) > 1:
                    remain = ''.join(segs[1:])
                    remain_prefix_dict.setdefault(
                        remain, set()).add((tag, segs[0]))
        return remain_prefix_dict

    def _get_candidates(self, word, pos, n=5):
        r"""
        Returns a list containing all possible words.

        :param word: str, the word to replace
        :param pos: str, the pos of the word to replace
        :param n: the number of returned words
        :return: a candidates list, as selected by ``self.sample_num``
        """
        assert pos is not None, "POS tag must be given!"
        segs, _ = self.morpheme_analyzer.viterbi_segment(word)
        remain = ''.join(segs[1:])

        # The index stores sets, whose iteration order depends on string
        # hash randomisation; sorting keeps the list handed to
        # ``sample_num`` stable across interpreter runs.
        candidates = sorted(
            prefix + remain
            for tag, prefix in self.remain_prefix_dict.get(remain, ())
            if tag == pos and prefix != segs[0])
        return self.sample_num(candidates, n)

    def skip_aug(self, tokens, mask, pos=None):
        r"""
        Returns the index of the replaced tokens.

        :param tokens: list, tokenized words or word with pos tag pairs
        :param mask: list, the mask symbol of the tokens
        :param pos: list, the pos tags of the tokens
        :return: list, the indices (from ``pre_skip_aug``) whose POS tag is
            one of the supported tags
        """
        assert pos is not None, "POS tag must be given!"
        return [index for index in self.pre_skip_aug(tokens, mask)
                if pos[index] in self._POS_TO_WORDNET]


if __name__ == "__main__":
    # Manual smoke demo: build one tagged sentence, request three
    # prefix-swapped variants and print the text of each.
    tokens = ['It', 'is', 'a', 'prefixed', 'string']
    tags = ['DT', 'VBZ', 'DT', 'JJ', 'NN']

    pos_sample = POSSample({'x': tokens, 'y': tags})
    transformer = SwapPrefix()

    augmented = transformer.transform(sample=pos_sample, field='x', n=3)

    for transformed in augmented:
        print(transformed.get_text('x'))

    """
    supposed output:
    It is a unfixed holding
    It is a affixed forwarding
    It is a transfixed holding
    """