# Source code for textflint.input_layer.component.sample.sample

r"""
Base Sample Abstract Class
============================================

"""
import copy
from abc import ABC, abstractmethod

from ..field import ListField, TextField
from ....common.settings import MODIFIED_MASK
from ....common.preprocess.en_processor import EnProcessor
__all__ = ['Sample']


class Sample(ABC):
    r"""
    Base Sample class to hold the necessary info and provide atomic
    operations.

    Field values are stored as attributes of the sample and are looked up
    by attribute name. Every editing method works on a clone produced by
    :meth:`clone` and returns it, so ``self`` is left untouched.

    Subclasses must implement :meth:`check_data`, :meth:`load` and
    :meth:`dump`.
    """
    # Class attribute: one processor instance shared by all samples.
    text_processor = EnProcessor()

    def __init__(
        self,
        data,
        origin=None,
        sample_id=None
    ):
        r"""
        Validate ``data`` with :meth:`check_data`, then parse it with
        :meth:`load`.

        :param dict data: The dict obj that contains data info.
        :param ~textflint.sample origin: original sample obj; when omitted
            the new sample is its own origin.
        :param int sample_id: sample index
        """
        # Compare with None explicitly so that an origin object which
        # happens to be falsy is not silently replaced by ``self``.
        self.origin = origin if origin is not None else self
        self.log = []
        self.check_data(data)
        self.load(data)
        self.sample_id = sample_id

    def __repr__(self):
        return 'Sample'

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _text_field(self, field, target):
        r"""
        Fetch the attribute ``field`` and make sure it is a TextField.

        :param str field: field name
        :param str target: what the caller wants to read, only used to
            build the assertion message
        :return: the TextField object
        """
        field_obj = getattr(self, field)
        assert isinstance(field_obj, TextField), \
            f"{field} is not a text field, get {target} failed!"
        return field_obj

    def _edit_sequence_field(self, field, edit):
        r"""
        Clone the sample, apply ``edit`` to the clone's ``field`` and store
        the result back on the clone.

        :param str field: field name, must refer to a ListField/TextField
        :param callable edit: receives the field object and returns the
            field object to store
        :return: Modified Sample
        """
        sample = self.clone(self)
        # Always edit the clone's field, never the one owned by ``self``.
        field_obj = getattr(sample, field)
        assert isinstance(field_obj, (ListField, TextField))
        setattr(sample, field, edit(field_obj))

        return sample

    @staticmethod
    def _scope_start(index):
        r"""
        Return the first position covered by ``index``.

        :param int|list|tuple index: a single position or a scope
        :return int: ``index`` itself for an int, else its first element
        """
        return index if isinstance(index, int) else index[0]

    # ------------------------------------------------------------------
    # getters
    # ------------------------------------------------------------------
    def get_value(self, field):
        r"""
        Get field value by field_str.

        :param str field: field name
        :return: a deep copy of the field value
        """
        return copy.deepcopy(getattr(self, field).field_value)

    def get_words(self, field):
        r"""
        Get tokenized words of given textfield

        :param str field: field name
        :return: tokenized words (shallow copy)
        """
        return self._text_field(field, 'words').words[:]

    def get_text(self, field):
        r"""
        Get text string of given textfield

        :param str field: field name
        :return string: text
        """
        return self._text_field(field, 'text').text

    def get_mask(self, field):
        r"""
        Get word masks of given textfield

        :param str field: field name
        :return: list of mask values (shallow copy)
        """
        return self._text_field(field, 'mask').mask[:]

    def get_sentences(self, field):
        r"""
        Get split sentences of given textfield

        :param str field: field name
        :return: list of sentences (shallow copy)
        """
        return self._text_field(field, 'sentences').sentences[:]

    def get_pos(self, field):
        r"""
        Get text field pos tags.

        :param str field: field name
        :return: pos tag list (shallow copy)
        """
        return self._text_field(field, 'pos tags').pos_tagging[:]

    def get_ner(self, field):
        r"""
        Get text field ner tags

        :param str field: field name
        :return: ner tag list (shallow copy)
        """
        return self._text_field(field, 'named entities').ner[:]

    # ------------------------------------------------------------------
    # whole-field replacement
    # ------------------------------------------------------------------
    def replace_fields(self, fields, field_values, field_masks=None):
        r"""
        Fully replace multi fields at the same time and return new sample.
        Notice: Not suggest use this API as it will set mask values of
        TextField to MODIFIED_MASK.

        :param list fields: field str list
        :param list field_values: field value list
        :param list field_masks: indicate mask values, useful for
            printable text
        :return: Modified Sample
        """
        assert len(fields) == len(field_values), \
            f"Fields length {len(fields)} unequal with " \
            f"field values {len(field_values)}"
        if field_masks:
            assert len(fields) == len(field_masks), \
                f"Fields length {len(fields)} unequal " \
                f"with field masks {len(field_masks)}"

        sample = self.clone(self)

        for index, (field, value) in enumerate(zip(fields, field_values)):
            origin_field = getattr(sample, field)
            # Report the offending *type*, which is what is being checked.
            assert isinstance(value, origin_field.field_type), \
                f"Cant replace {field} with type {type(value).__name__}"
            new_field = origin_field.new_field(value)

            if isinstance(origin_field, TextField):
                # Without explicit masks every item is marked as modified.
                mask_values = field_masks[index] if field_masks \
                    else [MODIFIED_MASK] * len(new_field)
                new_field.replace_mask(mask_values)

            setattr(sample, field, new_field)

        return sample

    def replace_field(self, field, field_value, field_mask=None):
        r"""
        Fully replace single field and return new sample.
        Notice: Not suggest use this API as it will set mask values of
        TextField to MODIFIED_MASK.

        :param str field: field str
        :param field_value: field_type
        :param list field_mask: indicate mask value of field
        :return: Modified Sample
        """
        fields_mask = [field_mask] if field_mask else None

        return self.replace_fields([field], [field_value], fields_mask)

    # ------------------------------------------------------------------
    # item-level editing
    # ------------------------------------------------------------------
    def replace_field_at_indices(self, field, indices, items):
        r"""
        Replace items of multi given scopes of field value at the same
        time. Stay away from the complex function !!!

        Be careful of your input list shape.

        :param str field: field name
        :param list of int|list|slice indices:
            each index can be an int to replace a single item,
            a list like [1, 2, 3],
            a pair like (0, 3) to replace items from 0 to 3 (not included),
            or a slice which would be converted to a list.
        :param list items: replacement for each entry of ``indices``
        :return: Modified Sample
        """
        assert isinstance(field, str) and isinstance(indices, list) \
            and isinstance(items, list), \
            "field must be a str, indices and items must be lists"
        assert len(indices) == len(items), \
            f"Unequal indices length {len(indices)} " \
            f"and items length {len(items)}"

        return self._edit_sequence_field(
            field, lambda obj: obj.replace_at_indices(indices, items))

    def replace_field_at_index(self, field, index, items):
        r"""
        Replace items of given scope of field value.

        Be careful of your input list shape.

        :param str field: field name
        :param int|list|slice index:
            can be an int to replace a single item,
            a list like [1, 2, 3],
            a pair like (0, 3) to replace items from 0 to 3 (not included),
            or a slice which would be converted to a list.
        :param str|list items: shape: indices_num, correspond to
            field_sub_items
        :return: Modified Sample
        """
        return self.replace_field_at_indices(field, [index], [items])

    def unequal_replace_field_at_indices(self, field, indices, rep_items):
        r"""
        Replace scope items of field value with rep_items which may
        not equal with scope.

        :param field: field str
        :param indices: list of int/tuple/list
        :param rep_items: list
        :return: Modified Sample
        :raises ValueError: when a scope starts beyond the end of the
            field after deletion
        """
        assert len(indices) == len(rep_items) > 0
        sample = self.clone(self)

        # Process scopes from right to left so that positions of scopes
        # still to be handled are not shifted by earlier edits.  Sorting
        # on the start position (not the raw index) keeps a mix of ints
        # and lists/tuples comparable.
        ordered = sorted(
            zip(rep_items, indices),
            key=lambda pair: self._scope_start(pair[1]),
            reverse=True)

        for rep_item, index in ordered:
            sample = sample.delete_field_at_index(field, index)
            insert_index = self._scope_start(index)
            field_obj = getattr(sample, field)

            if insert_index > len(field_obj):
                raise ValueError('Cant replace items at range {0}'
                                 .format(index))
            elif insert_index == len(field_obj):
                # The scope reached the end: append after the last item.
                sample = sample.insert_field_after_index(
                    field, insert_index - 1, rep_item)
            else:
                sample = sample.insert_field_before_index(
                    field, insert_index, rep_item)

        return sample

    def delete_field_at_indices(self, field, indices):
        r"""
        Delete items of given scopes of field value.

        :param str field: field name
        :param list of int|list|slice indices:
            shape: indices_num
            each index can be an int to delete a single item,
            a list like [1, 2, 3],
            a pair like (0, 3) to delete items from 0 to 3 (not included),
            or a slice which would be converted to a list.
        :return: Modified Sample
        """
        return self._edit_sequence_field(
            field, lambda obj: obj.delete_at_indices(indices))

    def delete_field_at_index(self, field, index):
        r"""
        Delete items of given scope of field value.

        :param str field: field name
        :param int|list|slice index:
            can be an int to delete a single item,
            a list like [1, 2, 3],
            a pair like (0, 3) to delete items from 0 to 3 (not included),
            or a slice which would be converted to a list.
        :return: Modified Sample
        """
        return self.delete_field_at_indices(field, [index])

    def insert_field_before_indices(self, field, indices, items):
        r"""
        Insert items of multi given scopes before indices of field value
        at the same time.

        Stay away from the complex function !!!
        Be careful of your input list shape.

        :param str field: field name
        :param indices: list of int, shape: indices_num, like [1, 2, 3]
        :param items: list of str/list, shape: indices_num,
            correspond to indices
        :return: Modified Sample
        """
        return self._edit_sequence_field(
            field, lambda obj: obj.insert_before_indices(indices, items))

    def insert_field_before_index(self, field, index, items):
        r"""
        Insert items before index of field value.

        :param str field: field name
        :param int index: indicate which index to insert items
        :param str|list items: items to insert
        :return: Modified Sample
        """
        return self.insert_field_before_indices(field, [index], [items])

    def insert_field_after_indices(self, field, indices, items):
        r"""
        Insert items of multi given scopes after indices of field value
        at the same time.

        Stay away from the complex function !!!
        Be careful of your input list shape.

        :param str field: field name
        :param indices: list of int, shape: indices_num, like [1, 2, 3]
        :param items: list of str/list, shape: indices_num,
            correspond to indices
        :return: Modified Sample
        """
        return self._edit_sequence_field(
            field, lambda obj: obj.insert_after_indices(indices, items))

    def insert_field_after_index(self, field, index, items):
        r"""
        Insert items after index of field value

        :param str field: field name
        :param int index: indicate where to apply insert
        :param str|list items: shape: indices_num, correspond to
            field_sub_items
        :return: Modified Sample
        """
        return self.insert_field_after_indices(field, [index], [items])

    def swap_field_at_index(self, field, first_index, second_index):
        r"""
        Swap items between first_index and second_index of field value.

        :param str field: field name
        :param int first_index: position of the first item
        :param int second_index: position of the second item
        :return: Modified Sample
        """
        return self._edit_sequence_field(
            field, lambda obj: obj.swap_at_index(first_index, second_index))

    # ------------------------------------------------------------------
    # subclass contract
    # ------------------------------------------------------------------
    @abstractmethod
    def check_data(self, data):
        r"""
        Check raw data format

        :param data: raw data input
        :return:
        """
        raise NotImplementedError

    @abstractmethod
    def load(self, data):
        r"""
        Parse data into sample field value.

        :param data: raw data input
        """
        raise NotImplementedError

    @abstractmethod
    def dump(self):
        r"""
        Convert sample info to input data json format.

        :return: dict object.
        """
        raise NotImplementedError

    # ------------------------------------------------------------------
    # copying
    # ------------------------------------------------------------------
    @classmethod
    def clone(cls, original_sample):
        r"""
        Deep copy a sample to a new sample

        :param original_sample: sample to be copied
        :return: Sample instance
        """
        sample = copy.deepcopy(original_sample)
        # The copy keeps pointing at the very same origin object as the
        # source sample instead of at a deep-copied one.
        sample.origin = original_sample.origin

        return sample

    @property
    def is_origin(self):
        r"""
        Return whether the sample is original Sample.
        """
        return self.origin is self