Source code for textflint.generation_layer.transformation.UT.typos

r"""
Typos Transformation for simulating typos in words.
==========================================================

"""

# Names exported by ``from <this module> import *``.
__all__ = ['Typos']

from ...transformation import WordSubstitute
from ....common.utils.word_op import *


class Typos(WordSubstitute):
    r"""
    Transformation that simulates typo errors to transform a sentence.

    Each candidate is produced by applying one character-level operation
    (replace, swap, insert or delete) to the target word.

    https://arxiv.org/pdf/1711.02173.pdf

    """

    # Values accepted by the ``mode`` property setter.
    _SUPPORTED_MODES = ('random', 'replace', 'swap', 'insert', 'delete')

    def __init__(
        self,
        trans_min=1,
        trans_max=10,
        trans_p=0.3,
        stop_words=None,
        mode="random",
        skip_first_char=True,
        skip_last_char=True,
        **kwargs
    ):
        r"""
        :param int trans_min: Minimum number of characters to be augmented.
        :param int trans_max: Maximum number of characters to be augmented.
            If None is passed, the number of augmentations is calculated
            via ``trans_p``. If the result calculated from ``trans_p`` is
            smaller than ``trans_max``, the calculated result is used;
            otherwise ``trans_max`` is used.
        :param float trans_p: Percentage of characters (per token) to be
            augmented.
        :param list stop_words: List of words which will be skipped from
            the augment operation.
        :param str mode: one of
            ['random', 'replace', 'swap', 'insert', 'delete'].
        :param bool skip_first_char: whether to skip the first char of
            the target word.
        :param bool skip_last_char: whether to skip the last char of
            the target word.
        :param kwargs: extra keyword arguments; accepted but not used by
            this constructor.

        NOTE(review): ``trans_min`` / ``trans_max`` / ``trans_p`` /
        ``stop_words`` are only forwarded to the parent constructor;
        confirm their exact semantics in ``WordSubstitute``.

        """
        super().__init__(
            trans_min=trans_min,
            trans_max=trans_max,
            trans_p=trans_p,
            stop_words=stop_words)
        # Assigned directly rather than through the ``mode`` setter, so the
        # value is not validated here: any mode that is not one of the four
        # concrete operations falls back to a random operation in
        # ``_get_typo_method``.
        self._mode = mode
        self.skip_first_char = skip_first_char
        self.skip_last_char = skip_last_char

    def __repr__(self):
        r"""
        Return the transformation name suffixed with the current mode.

        """
        return f'Typos_{self._mode}'

    @property
    def mode(self):
        r"""
        The typo mode currently in use.

        """
        return self._mode

    @mode.setter
    def mode(self, mode_value):
        r"""
        Set the typo mode.

        :param str mode_value: one of
            ['random', 'replace', 'swap', 'insert', 'delete'].
        :raises AssertionError: if ``mode_value`` is not a supported mode.

        """
        # Raised explicitly instead of using an ``assert`` statement so the
        # check is still enforced under ``python -O``; the exception type
        # is kept as AssertionError for existing callers.
        if mode_value not in self._SUPPORTED_MODES:
            raise AssertionError(
                f'mode must be one of {list(self._SUPPORTED_MODES)}, '
                f'got {mode_value!r}')
        self._mode = mode_value

    def skip_aug(self, tokens, mask, **kwargs):
        r"""
        Delegate to ``self.pre_skip_aug(tokens, mask)`` and return its
        result.

        :param tokens: passed through to ``pre_skip_aug``.
        :param mask: passed through to ``pre_skip_aug``.
        :param kwargs: unused.
        :return: whatever ``pre_skip_aug`` returns.

        """
        return self.pre_skip_aug(tokens, mask)

    def _get_candidates(self, word, n=5, **kwargs):
        r"""
        Returns a list of words with typo errors.

        The typo operation is applied ``n`` times; falsy results are
        discarded and duplicates are merged, so the list holds at most
        ``n`` items and may be empty.

        :param str word: token word to transform.
        :param int n: number of transformation attempts.
        :param kwargs: unused.
        :return list replaced_tokens: replaced tokens list

        """
        candidates = set()

        for _ in range(n):
            # Picked anew on every attempt so that 'random' mode can mix
            # different operations within one call.
            typo_method = self._get_typo_method()
            # default operate at most one character in a word
            result = typo_method(
                word, 1, self.skip_first_char, self.skip_last_char)
            if result:
                candidates.add(result)

        return list(candidates)

    def _get_typo_method(self):
        r"""
        Return the typo function matching the current mode.

        Any mode other than 'replace', 'swap', 'insert' or 'delete'
        (including 'random') yields a randomly chosen operation.

        :return: one of the ``replace``, ``swap``, ``insert`` or
            ``delete`` functions.

        """
        # Insertion order matters: it is the order handed to random.choice.
        operations = {
            'replace': replace,
            'swap': swap,
            'insert': insert,
            'delete': delete,
        }
        selected = operations.get(self._mode)
        if selected is not None:
            return selected
        # NOTE(review): ``random`` is not imported explicitly in this
        # module; presumably it is provided by the wildcard import from
        # ``word_op`` - confirm.
        return random.choice(list(operations.values()))