Source code for textflint.generation_layer.transformation.CWS.swap_name

r"""
Replace the first character of a person name (the surname) so that it forms a
word with the preceding characters, and replace the last character of the name
so that it forms a word with the following characters
==========================================================
"""
__all__ = ["SwapName"]
import random

from ..transformation import Transformation
from ....common.settings import NAME_PATH, WORD_LIST_PATH
from ....common.utils.load import plain_lines_loader
from ....common.utils.install import download_if_needed
from ....common.utils.list_op import descartes


class SwapName(Transformation):
    r"""
    Replace the first character of a person name (the surname) so that it
    forms a word with the preceding characters, and replace the last
    character of the name so that it forms a word with the following
    characters.

    Example::

        我朝小明走了过去 -> 我朝向明走了过去

    """

[docs]    def __init__(self, **kwargs):
        r"""
        :param list firstname_list: family name dictionary
        :param list word_list: dictionary of words
        :param dict word_end_dict: a dictionary
        :param dict name_dict: A dictionary ending with a surname

        """
        super().__init__()
        self.firstname_list = plain_lines_loader(download_if_needed(NAME_PATH))
        self.word_list = plain_lines_loader(download_if_needed(WORD_LIST_PATH))
        self.word_end_dict, self.name_dict = self.make_dict()

    def __repr__(self):
        return 'SwapName'

[docs]    def make_dict(self):
        r"""
        :return: Last name dictionary and first name dictionary

        """
        word_end_dict = {}
        name_dict = {}

        for word in self.word_list:
            if len(word) > 1:
                if word[1:] not in word_end_dict:
                    word_end_dict[word[1:]] = [word[0]]
                elif word[0] not in word_end_dict[word[1:]]:
                    word_end_dict[word[1:]] += [word[0]]

                if word[-1:] in self.firstname_list:
                    if word[:-1] not in name_dict:
                        name_dict[word[:-1]] = [word[-1:]]
                    elif word[-1:] not in name_dict[word[:-1]]:
                        name_dict[word[:-1]].append(word[-1])

        return word_end_dict, name_dict

    def _transform(self, sample, n=5, **kwargs):
        r"""
        We randomly generated five sets of data.

        :param ~textflint.CWSSample sample: sample the data which need be changed
        :param int n: number of generated data
        :param **kwargs:
        :return: trans_sample a list of sample
        """
        # get sentence and label and ner_label
        origin_sentence = sample.get_value('x')
        origin_label = sample.get_value('y')
        ner_label, _ = sample.ner

        # change function
        change_pos, change_list = self._get_transformations(
            origin_sentence, origin_label, ner_label, n)

        if len(change_pos) == 0:
            return []

        change_list = descartes(change_list, n)
        return [sample.replace_at_ranges(change_pos, item)
                for item in change_list]

    def _get_transformations(self, sentence, label, ner_label, n):
        r"""
        transformation function

        :param str sentence: chinese sentence
        :param list label: Chinese word segmentation tag
        :param list ner_label: sentence's ner tag
        :param int n: the number of transformations
        :return list: two list include the pos which changed and the
            label which changed
        """
        assert len(sentence) == len(label)

        change_pos = []
        change_list = []
        if len(ner_label):
            for ner in ner_label:
                tag, start, end = ner
                # Determine whether it is a name based on the ner tag
                # and the word segmentation tag
                if tag != 'Nh' or label[start] != 'B' \
                        or label[end] != 'E' \
                        or label[start + 1:end] != ['M'] * (end - start - 1):
                    continue
                # Combine the last name and the previous n words into a word,
                # and get a list of replacement words
                s = ''
                change = []

                for i in range(1, 6):
                    if start < i:
                        break
                    s = sentence[start - i] + s
                    if s in self.name_dict:
                        change += self.name_dict[s]
                if len(change) > 0:
                    change_pos += [start]
                    change_list += [random.sample(change, min(len(change), n))]

                # The name and the following n letters form a word,
                # and get a list of replacement words
                s = ''
                change = []

                for j in range(1, 5):
                    if end + j >= len(label):
                        break
                    s += sentence[end + j]
                    if s in self.word_end_dict:
                        change += self.word_end_dict[s]
                if len(change) > 0:
                    change_pos += [end]
                    change_list += [random.sample(change, min(len(change), n))]

        return change_pos, change_list