Source code for textflint.input_layer.component.field.text_field

"""
Text Field Class
=====================

A helper class that represents an input string that is to be modified.
"""

from .field import Field
from ....common.utils.list_op import *
from ....common.preprocess.en_processor import EnProcessor
from ....common.settings import ORIGIN, TASK_MASK, MODIFIED_MASK


class TextField(Field):
    """
    A helper class that represents an input string that is to be modified.

    Holds the text a Sample parsed from a data set and exposes methods the
    Sample can use to modify it. Both a raw string and a list of tokens are
    accepted; whichever representation was not supplied is derived lazily on
    first access. Modification methods work at word level and return a new
    field (via ``new_field``) instead of mutating this one.

    """
    # Shared by all instances; performs tokenization and tagging.
    text_processor = EnProcessor()

    def __init__(
        self,
        field_value,
        mask=None,
        is_one_sent=False,
        split_by_space=False,
        **kwargs
    ):
        r"""
        :param str|list field_value: Sentence string or tokenized words.
        :param list mask: list of mask values
        :param bool is_one_sent: whether input is a sentence
        :param bool split_by_space: whether tokenize sentence by split space
        :param kwargs: extra keyword arguments forwarded to the parent
            ``__init__``
        :raises ValueError: if ``field_value`` is neither str nor list

        """
        if isinstance(field_value, str):
            # lazy load: words are tokenized on first access of ``words``
            self._words = None
            self._text = field_value
        elif isinstance(field_value, list):
            # lazy load: text is rebuilt on first access of ``text``
            self._words = field_value
            self._text = None
        else:
            raise ValueError(
                'TextField supports string/token list, given {0}'
                .format(type(field_value))
            )
        super().__init__(field_value, field_type=(str, list), **kwargs)

        self._mask = None
        self.is_one_sent = is_one_sent
        self.split_by_space = split_by_space

        if mask:
            self.replace_mask(mask)

        # Process tags lazily; ``None`` means "not computed yet".
        self._sentences = None
        self._pos_tags = None
        self._ner_tags = None
        self._dp_tags = None

    def __hash__(self):
        return hash(self.text)

    def __len__(self):
        # Length is the number of words, not the number of characters.
        return len(self.words)

    @property
    def mask(self):
        r"""
        Mask values, one per word.

        :return: a copy of the mask list, so callers cannot mutate the
            internal state by accident.

        """
        if not self._mask:
            self._mask = [ORIGIN] * len(self.words)
        return self._mask[:]

    def set_mask(self, index, value):
        r"""
        Set the mask value of a single word.

        Negative indices are not range-checked here; they fall through to
        normal list indexing.

        :param int index: position of the word
        :param value: one of ORIGIN, TASK_MASK, MODIFIED_MASK
        :raises ValueError: if index is past the last word or value is not
            a supported mask value

        """
        if not self._mask:
            self._mask = [ORIGIN] * len(self.words)

        # Read the internal list directly; the ``mask`` property would
        # build a throwaway copy just to measure its length.
        last_index = len(self._mask) - 1
        if index > last_index:
            raise ValueError(
                "Index {0} out of range {1}".format(index, last_index))

        supported = [ORIGIN, TASK_MASK, MODIFIED_MASK]
        if value not in supported:
            raise ValueError(
                'Support mask value in {0}, while input mask value is {1}!'
                .format(supported, value)
            )
        self._mask[index] = value

    def replace_mask(self, values):
        r"""
        Replace the whole mask with ``values``.

        :param list values: mask values, one per word
        :raises ValueError: if values is not a list, its length differs
            from the number of words, or it holds an unsupported value

        """
        if not isinstance(values, list):
            raise ValueError(f"Cant replace mask values with {values}")
        if len(values) != len(self.words):
            raise ValueError(f"Mask values length {len(values)} "
                             f"unequal with words length {len(self.words)}")

        for index, value in enumerate(values):
            self.set_mask(index, value)

    def pos_of_word_index(self, desired_word_idx):
        r"""
        Get pos tag of given index.

        :param int desired_word_idx: desire index to get pos tag
        :return: pos tag of word of desired_word_idx.
        :raises ValueError: if desired_word_idx is not a valid word index

        """
        # Bound by the number of words: ``field_value`` may be a raw string
        # whose length is a character count, and the upper bound must be
        # exclusive so the last valid index is len(words) - 1.
        if (desired_word_idx < 0) or (
                desired_word_idx >= len(self.words)):
            raise ValueError(
                f"Cannot get POS tagging at index {desired_word_idx}")

        return self.pos_tagging[desired_word_idx]

    @staticmethod
    def _get_mirror_mask(mirror_list):
        r"""
        Get list with all values MODIFIED_MASK and the same shape of
        mirror_list

        :param list mirror_list: shape
            [[rep_0_0, ..., rep_0_i], ... , [rep_n_0, ..., rep_n_m]]
        :return: modified mask values with the same shape of mirror_list

        """
        assert isinstance(mirror_list, list)
        # NOTE(review): only ``list`` entries are expanded; any other entry
        # (including a tuple) yields a single mask value.
        return [
            [MODIFIED_MASK] * (len(item) if isinstance(item, list) else 1)
            for item in mirror_list
        ]

    def replace_at_indices(self, indices, new_items):
        r"""
        Replace words at indices and set their mask to MODIFIED_MASK.

        :param [int|list|slice] indices:
            each index can be int indicate replace single item
                or their list like [1, 2, 3].
            each index can be list like (0,3) indicate replace items
                from 0 to 3(not included) or their list like [(0, 3), (5,6)]
            each index can be slice which would be convert to list.
        :param [str|list|tuple] new_items: items corresponding indices.
        :return: Replaced TextField object.

        """
        replace_mask = self._get_mirror_mask(new_items)
        mask_value = replace_at_scopes(self.mask, indices, replace_mask)
        field_value = replace_at_scopes(self.words, indices, new_items)

        return self.new_field(field_value, mask=mask_value)

    def replace_at_index(self, index, new_items):
        r"""
        Replace words at index and set their mask to MODIFIED_MASK.

        :param int|list|slice index:
            can be int indicate replace single item
                or their list like [1, 2, 3]
            can be list like (0,3) indicate replace items
                from 0 to 3(not included) or their list like [(0, 3), (5,6)]
            can be slice which would be convert to list.
        :param str|list|tuple new_items: items corresponding index.
        :return: Replaced TextField object.

        """
        return self.replace_at_indices([index], [new_items])

    def delete_at_indices(self, indices):
        r"""
        Delete words at indices and remove their mask value.

        :param [int|list|slice] indices:
            each index can be int indicate delete single item
                or their list like [1, 2, 3].
            each index can be list like (0,3) indicate delete items
                from 0 to 3(not included) or their list like [(0, 3), (5,6)]
            each index can be slice which would be convert to list.
        :return: Modified TextField object.

        """
        mask_value = delete_at_scopes(self.mask, indices)
        field_value = delete_at_scopes(self.words, indices)

        return self.new_field(field_value, mask=mask_value)

    def delete_at_index(self, index):
        r"""
        Delete words at index and remove their mask value.

        :param int|list|slice index:
            can be int indicate delete single item
                or their list like [1, 2, 3]
            can be list like (0,3) indicate delete items
                from 0 to 3(not included) or their list like [(0, 3), (5,6)]
            can be slice which would be convert to list.
        :return: Modified TextField object.

        """
        return self.delete_at_indices([index])

    def insert_before_indices(self, indices, new_items):
        r"""
        Insert words before indices and mark them as MODIFIED_MASK.

        :param [int] indices: positions before which items are inserted.
        :param [str|list|tuple] new_items: items corresponding index.
        :return: new TextField object.

        """
        insert_mask = self._get_mirror_mask(new_items)
        # Inside a method body these names resolve to the module-level
        # list helpers, not to the methods of the same name.
        mask_value = insert_before_indices(self.mask, indices, insert_mask)
        field_value = insert_before_indices(self.words, indices, new_items)

        return self.new_field(field_value, mask=mask_value)

    def insert_before_index(self, index, new_items):
        r"""
        Insert words before index and mark them as MODIFIED_MASK.

        :param int index: position before which items are inserted.
        :param str|list|tuple new_items: items corresponding index.
        :return: new TextField object.

        """
        return self.insert_before_indices([index], [new_items])

    def insert_after_indices(self, indices, new_items):
        r"""
        Insert words after indices and mark them as MODIFIED_MASK.

        :param [int] indices: positions after which items are inserted.
        :param [str|list|tuple] new_items: items corresponding index.
        :return: new TextField object.

        """
        insert_mask = self._get_mirror_mask(new_items)
        # Inside a method body these names resolve to the module-level
        # list helpers, not to the methods of the same name.
        mask_value = insert_after_indices(self.mask, indices, insert_mask)
        field_value = insert_after_indices(self.words, indices, new_items)

        return self.new_field(field_value, mask=mask_value)

    def insert_after_index(self, index, new_items):
        r"""
        Insert words after index and mark them as MODIFIED_MASK.

        :param int index: position after which items are inserted.
        :param str|list|tuple new_items: items corresponding index.
        :return: new TextField object.

        """
        return self.insert_after_indices([index], [new_items])

    def swap_at_index(self, first_index, second_index):
        r"""
        Swap the words at first_index and second_index and set the mask of
        both positions to MODIFIED_MASK.

        :param int first_index: index of first item
        :param int second_index: index of second item
        :return: Modified TextField object.

        """
        mask_value = replace_at_scopes(
            self.mask, [first_index, second_index], [MODIFIED_MASK] * 2)
        field_value = swap_at_index(self.words, first_index, second_index)

        return self.new_field(field_value, mask=mask_value)

    @staticmethod
    def get_word_case(word):
        r"""
        Classify the letter case of a word.

        :param str word: word to inspect
        :return: one of 'empty', 'capitalize', 'upper', 'lower', 'mixed'
            or 'unknown'

        """
        if len(word) == 0:
            return 'empty'

        # A single upper-case letter counts as capitalized, not all-upper.
        if len(word) == 1 and word.isupper():
            return 'capitalize'

        if word.isupper():
            return 'upper'
        if word.islower():
            return 'lower'

        # An upper-case letter anywhere after the first position means the
        # word is mixed case; the first character is deliberately skipped.
        if any(char.isupper() for char in word[1:]):
            return 'mixed'
        if word[0].isupper():
            return 'capitalize'
        return 'unknown'

    @property
    def words(self):
        r"""
        Tokenized words, computed from the text on first access.

        :return: list of tokens

        """
        if self._words is None:
            self._words = self.text_processor.tokenize(
                self.text,
                is_one_sent=self.is_one_sent,
                split_by_space=self.split_by_space
            )
        return self._words

    @property
    def sentences(self):
        r"""
        Sentences of the text, computed on first access.

        :return: result of the processor's sentence tokenization

        """
        # ``is None`` so that an empty result is cached as well.
        if self._sentences is None:
            self._sentences = self.text_processor.sentence_tokenize(self.text)
        return self._sentences

    @property
    def text(self):
        r"""
        Text of the field, rebuilt from the words on first access when the
        field was created from a token list.

        :return: text string

        """
        if self._text is None:
            if self.split_by_space:
                self._text = " ".join(self.words)
            else:
                self._text = self.text_processor.inverse_tokenize(self.words)
        return self._text

    @property
    def pos_tagging(self):
        r"""
        Get POS tags.

        Example::

            given sentence 'All things in their being are good for something.'

            >> ['DT', 'NNS', 'IN', 'PRP$', 'VBG', 'VBP', 'JJ', 'IN',
                'NN', '.']

        :return: POS tags, one per tokenized word.
        :raises ValueError: if the number of tags differs from the number
            of words

        """
        # ``is None`` so that an empty result is cached as well.
        if self._pos_tags is None:
            pos_tags = [
                pos for _, pos in self.text_processor.get_pos(self.words)
            ]
            if len(pos_tags) != len(self.words):
                raise ValueError(
                    "POS tagging not aligned with tokenized words")
            self._pos_tags = pos_tags

        return self._pos_tags

    @property
    def ner(self):
        r"""
        Get NER tags.

        Example::

            given sentence 'Lionel Messi is a football player from Argentina.'

            >> [('Lionel Messi', 0, 2, 'PERSON'),
                ('Argentina', 7, 8, 'LOCATION')]

        :return: A list of tuples, *(entity, start, end, label)*

        """
        # ``is None`` so that "no entities found" is cached instead of
        # re-running the tagger on every access.
        if self._ner_tags is None:
            self._ner_tags = self.text_processor.get_ner(
                self.words, return_char_idx=False)
        return self._ner_tags

    @property
    def dependency_parsing(self):
        r"""
        Dependency parsing.

        Example::

            given sentence: 'The quick brown fox jumps over the lazy dog.'

            >> The     DT   4  det
               quick   JJ   4  amod
               brown   JJ   4  amod
               fox     NN   5  nsubj
               jumps   VBZ  0  root
               over    IN   9  case
               the     DT   9  det
               lazy    JJ   9  amod
               dog     NN   5  obl

        :return: A list of tuples, *(token, pos, target, type)*

        """
        # ``is None`` so that an empty result is cached as well.
        if self._dp_tags is None:
            # NOTE(review): the raw ``field_value`` (str or token list) is
            # passed here rather than ``self.words`` -- confirm the parser
            # accepts both forms.
            self._dp_tags = self.text_processor.get_dep_parser(
                self.field_value,
                split_by_space=self.split_by_space,
                is_one_sent=self.is_one_sent)
        return self._dp_tags