Source code for textflint.generation_layer.transformation.COREF.random_repeat

r"""
Coref - Rnd repeat: Randomly choose some sentences, and each of them
    will be repeated somewhere else in the sample.
==========================================================
"""

from math import ceil
import random

from ..transformation import Transformation
from ....input_layer.component.sample import CorefSample
from ....input_layer.component.field import ListField
__all__ = ['RndRepeat']


class RndRepeat(Transformation):
    r"""
    Repeat randomly chosen sentences elsewhere in a coreference sample.

    For every generated sample, ``ceil(trans_p * num_sentences)`` sentences
    are drawn (with replacement) from the original sample, and a copy of
    each one is inserted directly after another randomly chosen sentence.
    Every mention span that lies in a copied sentence is duplicated at the
    position of the copy and prepended to its cluster.

    Attributes:
        trans_p: proportion of repeated sentences; default 0.2
        processor: not assigned in this class; presumably supplied by the
            ``Transformation`` base class
            (textflint.common.preprocess.TextProcessor) - TODO confirm.

    Example (sentence layout only; the span indexing convention of
    ``clusters`` is defined by ``CorefSample``)::

        ori sentences:
            [['I', 'came'], ['I', 'saw'], ['I', 'conquered'],
             ['Anna', 'bel', 'wanna', 'sleep'],
             ['Anna', 'bel', 'is', 'happy']]

        trans sentences (sentence 3 repeated after sentence 1):
            [['I', 'came'], ['I', 'saw'],
             ['Anna', 'bel', 'wanna', 'sleep'],
             ['I', 'conquered'],
             ['Anna', 'bel', 'wanna', 'sleep'],
             ['Anna', 'bel', 'is', 'happy']]

    """

    def __init__(self, trans_p=0.2, **kwargs):
        r"""
        :param trans_p: proportion of sentences to repeat; default 0.2
        :param kwargs: accepted but not used by this constructor (they are
            not forwarded to the base class).
        """
        super().__init__()
        self.trans_p = trans_p

    def __repr__(self):
        return 'RndRepeat'

    def _transform(self, sample, n=5, **kwargs):
        r"""
        Generate ``n`` samples in which some sentences are repeated.

        :param ~textflint.CorefSample sample: a CorefSample
        :param str|list fields: Not used
        :param int n: optional; number of generated samples
        :return list: samples_tfed, transformed sample list. If ``sample``
            has no sentences, the unchanged ``sample`` object itself is
            returned ``n`` times.
        """
        num_sentences = sample.num_sentences()
        if num_sentences == 0:
            return [sample] * n

        # repeat times: trans_p * num_sentences, rounded up. It is the same
        # for every generated sample, so compute it once.
        repeat_times = ceil(num_sentences * self.trans_p)
        samples_tfed = []

        for _ in range(n):
            # start each generated sample from a new CorefSample built from
            # the dumped original
            sample_tfed = CorefSample(sample.dump())

            for j in range(repeat_times):
                # randomly choose the sentence to repeat; it is always
                # taken from the ORIGINAL sample, never from earlier copies
                ori_sen_idx = int(random.random() * num_sentences)
                k_sen = sample.get_kth_sen(ori_sen_idx)
                clusters_pt = sample.part_conll(
                    [ori_sen_idx]).clusters.field_value

                # j sentences have been inserted so far
                assert sample_tfed.num_sentences() == num_sentences + j, \
                    "Assert failed in RndRepeat: " \
                    "document length does not match."

                # randomly choose tfed_sen_idx:
                # k_sen will be inserted after position tfed_sen_idx
                # tfed_sen_idx in [0, num_sentences + j - 1), i.e. it is
                # never the last sentence (for a one-sentence document the
                # factor is 0, so index 0 is always used)
                tfed_sen_idx = int(
                    random.random() * (num_sentences + j - 1))
                sen_map = sample_tfed.sen_map
                # token offset at which the copied sentence will start:
                # total length of sentences 0..tfed_sen_idx
                insert_at_idx = sum(sen_map[:tfed_sen_idx + 1])
                sample_tfed = sample_tfed.insert_field_after_indices(
                    'x', [insert_at_idx - 1], [k_sen])

                # the insert is expected to have lengthened sentence
                # tfed_sen_idx by len(k_sen) tokens
                assert sen_map[tfed_sen_idx] + len(k_sen) \
                    == sample_tfed.sen_map[tfed_sen_idx], \
                    "Assert failed in RndRepeat: sentence lengths " \
                    "seem to be unexpected after insert. " \
                    "Original sen_map: {0}, current sen_map: {1}"\
                    .format(sen_map, sample_tfed.sen_map)

                # split that lengthened entry back into the untouched
                # sentence followed by the inserted copy
                sample_tfed.sen_map = sample_tfed.sen_map[:tfed_sen_idx] \
                    + [sen_map[tfed_sen_idx], len(k_sen)] \
                    + sample_tfed.sen_map[tfed_sen_idx + 1:]

                # mention spans of the copied sentence, moved to where the
                # copy now starts
                shifted_clusters = [
                    [[b + insert_at_idx, e + insert_at_idx] for [b, e] in c]
                    for c in clusters_pt]
                # prepend the new mentions to the matching clusters (read
                # after the insert); pairing is positional via zip
                sample_tfed.clusters = ListField([
                    new_mentions + old_mentions
                    for new_mentions, old_mentions in zip(
                        shifted_clusters,
                        sample_tfed.clusters.field_value)])

            # get the tfed sample and append to list
            samples_tfed.append(sample_tfed)

        return samples_tfed