Source code for textflint.generation_layer.transformation.CWS.swap_syn

r"""
Replace a word with its synonym.
==========================================================
"""
__all__ = ["SwapSyn"]
from ..transformation import Transformation
from ....common.settings import SYNONYM_PATH
from ....common.utils.load import plain_lines_loader
from ....common.utils.install import download_if_needed
from ....common.utils.list_op import descartes
import random


class SwapSyn(Transformation):
    r"""
    Replace words with their synonyms.

    Example::

        先生过奖了 -> 先生过誉了

    """

    def __init__(self, **kwargs):
        r"""
        Build the synonym dictionary used by the transformation.

        The dictionary is produced by :meth:`make_dict` from the path
        returned by ``download_if_needed(SYNONYM_PATH)``.

        :param **kwargs: accepted for interface compatibility; none of the
            keyword arguments are read here.

        """
        super().__init__()
        self.synonym_dict = self.make_dict(download_if_needed(SYNONYM_PATH))

    def __repr__(self):
        return 'SwapSyn'

    @staticmethod
    def make_dict(path):
        r"""
        Read the synonym file and build the lookup dictionary.

        Each line is split on single spaces: the first token is the
        headword and the remaining tokens are its synonyms. Lines sharing a
        headword are merged, and duplicate synonyms are kept only once, in
        first-seen order.

        :param str path: the path of data
        :return dict: maps each headword to the list of its synonyms

        """
        dic = {}
        for line in plain_lines_loader(path):
            # Drop empty tokens so that blank lines and repeated spaces
            # cannot register '' as a headword or as a synonym (an empty
            # synonym would silently delete the word it replaces).
            tokens = [token for token in line.strip().split(' ') if token]
            if not tokens:
                continue
            head, *synonyms = tokens
            known = dic.setdefault(head, [])
            for word in synonyms:
                if word not in known:
                    known.append(word)
        return dic

    def _transform(self, sample, n=5, **kwargs):
        r"""
        Generate samples in which words are replaced by synonyms.

        :param ~textflint.CWSSample sample: the data which need be changed
        :param int n: passed to ``_get_transformations`` as the per-word
            candidate limit and to ``descartes`` as its second argument
        :param **kwargs: unused
        :return list: the transformed samples; empty when no word of the
            sentence has a synonym

        """
        # get sentence words
        origin_words = sample.get_words()
        change_pos, change_word = self._get_transformations(origin_words, n)
        if not change_pos:
            return []
        # NOTE(review): descartes presumably combines the per-position
        # candidate lists into whole-sentence replacement tuples, limited
        # by n - confirm against its definition.
        change_word = descartes(change_word, n)

        return [sample.replace_at_ranges(change_pos, words)
                for words in change_word]

    def _get_transformations(self, words, n):
        r"""
        Collect the replaceable words and their candidate synonyms.

        :param list words: chinese sentence words
        :param int n: the maximum number of synonyms sampled per word
        :return: two parallel lists: the ``[start, end]`` character ranges
            of the words to replace, and for each range the randomly
            sampled synonyms

        """
        start = 0
        change_pos = []
        change_word = []

        for word in words:
            synonyms = self.synonym_dict.get(word)
            # Skip words without any synonym (missing, or listed with an
            # empty synonym list): an empty candidate list offers nothing
            # to substitute at that position.
            if synonyms:
                change_pos.append([start, start + len(word)])
                change_word.append(
                    random.sample(synonyms, min(n, len(synonyms))))
            # Offsets are character positions in the concatenated sentence.
            start += len(word)

        return change_pos, change_word