Source code for textflint.generation_layer.transformation.COREF.random_repeat

r"""
Coref - Rnd repeat: Randomly choose some sentences, and each of them
    will be repeated somewhere else in the sample.
==========================================================
"""

from math import ceil
import random

from ..transformation import Transformation
from ....input_layer.component.sample import CorefSample
from ....input_layer.component.field import ListField
__all__ = ['RndRepeat']


[docs]class RndRepeat(Transformation):
    r""" 
    Randomly choose trans_p * num_sentences sentences,
    and each of them will be repeated
        somewhere else in the sample.

    Attributes:
        trans_p: proportion of repeated sentences; default 0.2
        processor: textflint.common.preprocess.TextProcessor.

    Example::

        ori: {
            'sentences': [
                ['I', 'came'], ['I', 'saw'], ['I', 'conquered'], 
                ['Anna', 'bel', 'wanna', 'sleep'],
                ['Anna', 'bel', 'is', 'happy']],
            'clusters': [
                [[1, 1], [3, 3], [5, 5]], 
                [[7, 8], [11, 12]]]}
        trans: {
            'sentences': [
                ['I', 'came'], ['I', 'saw'], ['Anna', 'bel', 'wanna', 'sleep'], 
                ['I', 'conquered'], ['Anna', 'bel', 'wanna', 'sleep'], 
                ['Anna', 'bel', 'is', 'happy']],
            'clusters': [
                [[1, 1], [3, 3], [9, 9]], 
                [[5, 6], [11, 12], [17, 18]]]}

    """

    def __init__(self, trans_p=0.2, **kwargs):
        super().__init__()
        self.trans_p = trans_p

    def __repr__(self):
        return 'RndRepeat'

    def _transform(self, sample, n=5, **kwargs):
        r"""
        :param ~textflint.CorefSample sample: a CorefSample
        :param str|list fields: Not used
        :param int n: optional; number of generated samples
        :return list: samples_tfed, transformed sample list.

        """
        if sample.num_sentences() == 0: return [sample] * n
        num_sentences = sample.num_sentences()
        samples_tfed = []

        for i in range(n):
            sample_tfed = CorefSample(sample.dump())
            # repeat times: trans_p * num_sentences; at least 1
            for j in range(ceil(num_sentences * self.trans_p)):
                # randomly choose the sentence to repeat
                ori_sen_idx = int(random.random() * (num_sentences))
                # s_pt = sample.part_conll([ori_sen_idx])
                k_sen = sample.get_kth_sen(ori_sen_idx)
                clusters_pt = sample.part_conll([ori_sen_idx])\
                    .clusters.field_value

                # randomly choose tfed_sen_idx:
                # k_sen will be inserted after position tfed_sen_idx
                # tfed_sen_idx in [0, num_sentences + j - 1):
                # tfed_sen_idx cannot be the last one
                assert sample_tfed.num_sentences() == num_sentences + j, \
                    "Assert failed in RndRepeat: " \
                    "document length does not match."
                tfed_sen_idx = int(random.random() * (num_sentences + j - 1))
                sen_map = sample_tfed.sen_map
                insert_at_idx = sum(sen_map[:tfed_sen_idx+1])
                sample_tfed = sample_tfed.insert_field_after_indices(
                    'x', [insert_at_idx-1], [k_sen])

                assert sen_map[tfed_sen_idx] + len(k_sen) \
                    == sample_tfed.sen_map[tfed_sen_idx], \
                    "Assert failed in RndRepeat: sentence lengths " \
                    "seem to be unexpected after insert. " \
                    "Original sen_map: {0}, current sen_map: {1}"\
                        .format(sen_map, sample_tfed.sen_map)
                sample_tfed.sen_map = sample_tfed.sen_map[:tfed_sen_idx] \
                    + [sen_map[tfed_sen_idx], len(k_sen)] \
                    + sample_tfed.sen_map[tfed_sen_idx+1:]

                setattr(sample_tfed, 'clusters', 
                    ListField([c1 + c2 for c1, c2 in zip(
                        [
                            [[b+insert_at_idx, e+insert_at_idx] for [b, e] in c] 
                            for c in clusters_pt], 
                        sample_tfed.clusters.field_value)]))

            # get the tfed sample and append to list
            samples_tfed.append(sample_tfed)
        return samples_tfed