Source code for textflint.common.preprocess.cn_processor

r"""
CnProcessor Class
============================================
"""
import threading

__all__ = ['CnProcessor']


class CnProcessor:
    r"""
    Chinese text processor: character tokenization, NER and POS tagging.

    The class is a singleton: every ``CnProcessor()`` call returns the same
    object. The LTP model backing NER and POS tagging is created lazily on
    first use and then kept on that shared object.

    """
    _instance_lock = threading.Lock()

    # Lazily created LTP model shared by get_ner and get_pos_tag. It is a
    # class-level default so the attribute always exists without __init__
    # having to assign it.
    __ltp = None

    def __init__(self):
        # Intentionally does nothing. __new__ hands back the shared instance,
        # but Python still runs __init__ on every ``CnProcessor()`` call;
        # assigning state here would throw away an already loaded model each
        # time the processor is "constructed" again.
        pass

    # Single instance mode
    def __new__(cls, *args, **kwargs):
        # Check, take the lock, then check again so that only one instance is
        # ever created and later calls skip the lock.
        if not hasattr(CnProcessor, "_instance"):
            with CnProcessor._instance_lock:
                if not hasattr(CnProcessor, "_instance"):
                    CnProcessor._instance = object.__new__(cls)
        return CnProcessor._instance

    def _get_ltp(self):
        r"""
        Return the shared LTP model, creating it on first use.

        :return: the LTP model instance

        """
        if self.__ltp is None:
            # Imported here so that ltp is only required once a model is
            # actually needed (empty input never reaches this point).
            from ltp import LTP
            self.__ltp = LTP()
        return self.__ltp

    @staticmethod
    def _to_text(sentence):
        r"""
        Return ``sentence`` as one string, joining it if it is a token list.

        :param str|list sentence: raw sentence or list of string tokens
        :return: str

        """
        if isinstance(sentence, list):
            return ''.join(sentence)
        return sentence

    @staticmethod
    def _word_offsets(words):
        r"""
        Build the prefix sums of the word lengths.

        ``offsets[i]`` is the character index at which ``words[i]`` starts and
        ``offsets[-1]`` is the total number of characters in ``words``.

        :param list words: segmented words
        :return: list of ``len(words) + 1`` character offsets

        """
        offsets = [0]
        for word in words:
            offsets.append(offsets[-1] + len(word))
        return offsets

    @staticmethod
    def tokenize(sent):
        r"""
        Tokenize function: split the sentence into single characters.

        :param str sent: the sentence need to be tokenized
        :return: list. the characters of the sentence, in order

        """
        assert isinstance(sent, str)
        return list(sent)

    def get_ner(self, sentence):
        r"""
        NER function.

        :param str|list sentence: the sentence (or list of tokens, which are
            concatenated) to run NER on
        :return: two forms of tags.
            The first is a list of triples (tag, start, end), where start
            and end are inclusive character indexes into the sentence.
            The second is the list form, which marks the ner label of each
            character, such as
            周小明去玩 ['Nh', 'Nh', 'Nh', 'O', 'O']

        """
        assert isinstance(sentence, (list, str))
        sentence = self._to_text(sentence)

        if not sentence:
            return [], []

        ltp = self._get_ltp()
        seg, hidden = ltp.seg([sentence])
        words = seg[0]
        word_entities = ltp.ner(hidden)[0]

        # The model reports entity spans as word indexes; translate them to
        # character indexes through the prefix-sum table.
        offsets = self._word_offsets(words)
        ner_label = len(sentence) * ['O']
        entities = []

        for tag, first_word, last_word in word_entities:
            start = offsets[first_word]
            stop = offsets[last_word + 1]  # exclusive character end
            entities.append((tag, start, stop - 1))
            for idx in range(start, stop):
                ner_label[idx] = tag

        return entities, ner_label

    def get_pos_tag(self, sentence):
        r"""
        POS tag function.

        :param str|list sentence: the sentence (or list of tokens, which are
            concatenated) to be POS tagged
        :return: list of [tag, start, end] entries, one per segmented word,
            where start and end are inclusive character indexes

        """
        assert isinstance(sentence, (list, str))
        sentence = self._to_text(sentence)

        if not sentence:
            return []

        ltp = self._get_ltp()
        seg, hidden = ltp.seg([sentence])
        tags = ltp.pos(hidden)[0]
        words = seg[0]

        pos_tag = []
        cursor = 0
        # Indexed by tag position (as before), so a word missing for a tag
        # still raises IndexError instead of being silently dropped.
        for idx, tag in enumerate(tags):
            length = len(words[idx])
            pos_tag.append([tag, cursor, cursor + length - 1])
            cursor += length

        return pos_tag