Source code for textflint.generation_layer.transformation.CWS.swap_syn

r"""
Replace word with its synonym.
==========================================================
"""
__all__ = ["SwapSyn"]
from ..transformation import Transformation
from ....common.settings import SYNONYM_PATH
from ....common.utils.load import plain_lines_loader
from ....common.utils.install import download_if_needed
from ....common.utils.list_op import descartes
import random


class SwapSyn(Transformation):
    r"""
    Replace word with its synonym.

    Example::

        先生过奖了 -> 先生过誉了

    """

    def __init__(self, **kwargs):
        r"""
        Build the synonym dictionary used by the transformation.

        The dictionary is read by :meth:`make_dict` from the path returned by
        ``download_if_needed(SYNONYM_PATH)``.

        :param **kwargs: accepted but not used by this constructor

        """
        super().__init__()
        self.synonym_dict = self.make_dict(download_if_needed(SYNONYM_PATH))

    def __repr__(self):
        return 'SwapSyn'

    @staticmethod
    def make_dict(path):
        r"""
        Read the synonym data and build a word -> synonyms dictionary.

        Each line is split on single spaces: the first token is the key and
        the remaining tokens are its synonyms. Lines sharing the same key are
        merged, and duplicate synonyms are stored only once (first occurrence
        order is kept).

        :param str path: the path of data
        :return dict: mapping from a word to the list of its synonyms

        """
        dic = {}
        for line in plain_lines_loader(path):
            tokens = line.strip().split(' ')
            key = tokens[0]
            if not key:
                # Blank line: it would otherwise register '' as a word.
                continue
            synonyms = dic.setdefault(key, [])
            for word in tokens[1:]:
                # Consecutive spaces yield empty tokens; an empty "synonym"
                # would replace the word with nothing, so drop them.
                if word and word not in synonyms:
                    synonyms.append(word)
        return dic

    def _transform(self, sample, n=5, **kwargs):
        r"""
        Generate samples whose words are replaced by synonyms.

        :param ~textflint.CWSSample sample: the data which need be changed
        :param int n: passed to :meth:`_get_transformations` as the maximum
            number of synonyms drawn per word, and to ``descartes``
            (presumably the maximum number of combinations kept - confirm
            against its definition)
        :param **kwargs: not used
        :return list: one sample per candidate combination; an empty list
            when no word of the sample has a synonym

        """
        # get sentence words
        origin_words = sample.get_words()

        # collect the replaceable ranges and their candidate synonyms
        change_pos, change_word = self._get_transformations(origin_words, n)
        if not change_pos:
            return []

        # combine the per-word candidates into whole-sentence replacements
        candidates = descartes(change_word, n)
        return [
            sample.replace_at_ranges(change_pos, words)
            for words in candidates
        ]

    def _get_transformations(self, words, n):
        r"""
        Find the words that can be replaced and sample their synonyms.

        :param list words: chinese sentence words
        :param int n: the maximum number of synonyms sampled for each word
        :return: two parallel lists - the ``[start, end]`` character range of
            every replaceable word, and the synonyms randomly sampled for it

        """
        start = 0
        change_pos = []
        change_word = []

        for word in words:
            synonyms = self.synonym_dict.get(word)
            # Skip unknown words and words whose synonym list is empty: the
            # latter cannot be replaced and would only contribute an empty
            # candidate list.
            if synonyms:
                change_pos.append([start, start + len(word)])
                change_word.append(
                    random.sample(synonyms, min(n, len(synonyms))))
            # ranges are character offsets, accumulated over every word
            start += len(word)

        return change_pos, change_word