# Source code for textflint.generation_layer.transformation.UT.contraction

r"""
Contract or expand sentences using common abbreviations
==========================================================
"""

__all__ = ['Contraction']

from ...transformation import Transformation
from ....common.settings import CONTRACTION_PHRASES


class Contraction(Transformation):
    r"""
    Transforms input by common abbreviations.

    Contractions found in the text are replaced by their expanded phrases
    and expandable phrases are replaced by their contractions. Each sample
    generates one transformed sample at most.

    Example::

        "we're playing ping pang ball, you are so lazy. She's so beautiful!"
        >> "we are playing ping pang ball, you're so lazy. She is so beautiful!"

    """
    def __init__(
        self,
        **kwargs
    ):
        super().__init__()
        # Looked up with two adjacent tokens joined by a space
        # (see _get_expanded_phrases); values are the replacements.
        self.phrases = CONTRACTION_PHRASES
        # Inverse mapping, looked up with two adjacent tokens concatenated
        # without a separator (see _get_contractions).
        self.contractions = {v: k for k, v in CONTRACTION_PHRASES.items()}

    def __repr__(self):
        return 'Contraction'

    def _transform(self, sample, field='x', n=1, **kwargs):
        r"""
        Transform the text of the given field.

        :param ~Sample sample: input data, normally one data component.
        :param str field: indicate which field to transform.
        :param int n: number of generated samples; not used here, at most
            one sample is produced.
        :param kwargs: unused.
        :return list trans_samples: list holding the single transformed
            sample, or an empty list when nothing can be replaced.

        """
        tokens = sample.get_words(field)
        contractions_indices, phrases = self._get_contractions(tokens)
        phrases_indices, contractions = self._get_expanded_phrases(tokens)

        # Keep spans and replacements aligned: contraction spans first,
        # then expandable-phrase spans.
        indices = contractions_indices + phrases_indices
        rep_items = phrases + contractions

        # NOTE(review): matched spans may overlap (e.g. [i, i + 2] and
        # [i + 1, i + 3]); whether unequal_replace_field_at_indices accepts
        # overlapping spans is not visible here - confirm.
        if indices:
            return [
                sample.unequal_replace_field_at_indices(
                    field, indices, rep_items)]

        return []

    def _get_contractions(self, tokens):
        r"""
        Get contraction strings in the given tokens.

        Two adjacent tokens are concatenated without a separator and looked
        up in ``self.contractions``, so a contraction is found when the
        tokenizer split it into exactly two tokens.

        :param list tokens: word list
        :return list indices_list: ``[start, start + 2]`` span of each
            matched token pair
        :return list phrases_list: expanded phrases, aligned with
            ``indices_list``.

        """
        indices_list = []
        phrases_list = []

        # zip over (tokens, tokens[1:]) visits every adjacent pair,
        # including the final one, and is empty for fewer than two tokens.
        for index, (token, next_token) in enumerate(zip(tokens, tokens[1:])):
            sub_string = token + next_token
            if sub_string in self.contractions:
                indices_list.append([index, index + 2])
                phrases_list.append(self.contractions[sub_string])

        return indices_list, phrases_list

    def _get_expanded_phrases(self, tokens):
        r"""
        Get expanded phrases which can be converted to contractions.

        Two adjacent tokens are joined by a single space and looked up in
        ``self.phrases``.

        :param list tokens: word list
        :return list indices_list: ``[start, start + 2]`` span of each
            matched token pair
        :return list contractions_list: contractions, aligned with
            ``indices_list``.

        """
        indices_list = []
        contractions_list = []

        # Every adjacent pair is checked, including the last two tokens.
        for index, (token, next_token) in enumerate(zip(tokens, tokens[1:])):
            phrase_string = " ".join((token, next_token))
            if phrase_string in self.phrases:
                indices_list.append([index, index + 2])
                contractions_list.append(self.phrases[phrase_string])

        return indices_list, contractions_list