Source code for textflint.generation_layer.transformation.ABSA.absa_transformation

import random
from abc import ABC
from copy import deepcopy

from ..transformation import Transformation
from ....common.settings import NEGATIVE_WORDS_LIST, DEGREE_WORD_LIST
__all__ = ['ABSATransformation']


class ABSATransformation(Transformation, ABC):
    r"""
    A class that supplies helper methods for ABSA task data transformation.

    The helpers operate on tokenized sentences (lists of word strings) and
    on opinion/term spans expressed as ``[from, to]`` token indices that are
    used as Python slice bounds.

    """

    def __init__(self):
        super().__init__()
        # Longest negation phrases first: check_negation stops at the first
        # phrase that matches, so longer phrases must be tried before the
        # shorter ones.
        self.negative_words_list = sorted(
            NEGATIVE_WORDS_LIST, key=len, reverse=True)
        # Shortcuts to the text processor (assumes the base class provides
        # ``self.processor`` -- TODO confirm against Transformation).
        self.tokenize = self.processor.tokenize
        self.untokenize = self.processor.inverse_tokenize
        self.get_antonyms = self.processor.get_antonyms

    def reverse(self, words_list, opinion_position):
        r"""
        Reverse the polarity of opinions.

        Single-token opinions are delegated to :meth:`reverse_opinion`.
        Multi-token opinions either lose their negation (when one is found)
        or get ``'not '`` prepended to their first token.

        :param list words_list: tokenized words of original sentence
        :param list opinion_position: opinion positions, each indexable as
            ``[from, to]``
        :return: transformed sentence and transformed opinion words; each
            opinion entry is ``[from, to, text]``

        """
        trans_words = deepcopy(words_list)
        trans_opinion_words = []

        for position in opinion_position:
            opinion_from = position[0]
            opinion_to = position[1]
            # Span length is measured before negation words are replaced.
            opinion_len = len(trans_words[opinion_from:opinion_to])
            trans_words, opinion_words, opinion_from, opinion_to, has_neg = \
                self.check_negation(trans_words, opinion_from, opinion_to)

            if opinion_len == 1:
                trans_words, trans_opinion_words = self.reverse_opinion(
                    trans_words, trans_opinion_words,
                    opinion_from, opinion_to, has_neg)
            elif opinion_len > 1:
                if has_neg:
                    # Dropping the negation already flips the polarity.
                    trans_opinion_words.append(
                        [opinion_from, opinion_to,
                         self.untokenize(opinion_words)])
                else:
                    # negate the closest verb
                    negated_head = 'not ' + opinion_words[0]
                    trans_opinion_words.append(
                        [opinion_from, opinion_to,
                         self.untokenize(
                             [negated_head] + opinion_words[1:])])
                    # Stored as ONE token so later indices do not shift.
                    trans_words[opinion_from:opinion_from + 1] = \
                        [negated_head]

        return trans_words, trans_opinion_words

    def exaggerate(self, words_list, opinions):
        r"""
        Exaggerate the opinion words.

        A randomly chosen degree word is merged into the first token of
        every opinion span.

        :param list words_list: tokenized words of original sentence
        :param list opinions: opinion positions, each indexable as
            ``[from, to]``
        :return: transformed sentence and opinion words; each opinion entry
            is ``[from, to, text]``

        """
        new_words = deepcopy(words_list)
        new_opi_words = []

        for opi_position in opinions:
            opi_from = opi_position[0]
            opi_to = opi_position[1]
            new_words, new_opi = self.add_degree_words(
                new_words, opi_from, opi_to)
            new_opi_words.append(
                [opi_from, opi_to, self.untokenize(new_opi)])

        return new_words, new_opi_words

    def get_postag(self, sentence, start, end):
        r"""
        Get the postag.

        :param list|str sentence: sentence
        :param int start: start index
        :param int end: end index; ``-1`` means "through the last tag"
        :return list: the slice of ``self.processor.get_pos(sentence)``
            selected by ``start`` and ``end``

        """
        tags = self.processor.get_pos(sentence)
        # -1 is a sentinel for "no upper bound", not a negative slice index.
        if end != -1:
            return tags[start:end]
        return tags[start:]

    def refine_candidate(self, trans_words, opi_from, opi_to, candidate_list):
        r"""
        Refine the candidate opinion words.

        :param list trans_words: tokenized words of transformed sentence
        :param int opi_from: start position of opinion words
        :param int opi_to: end position of opinion words
        :param set candidate_list: candidate antonyms word list
        :return list: refined candidate word list (empty when
            ``trans_words`` is empty)

        """
        if len(trans_words) == 0:
            return []

        # Element 1 of every entry is kept (presumably (word, tag) pairs --
        # verify against the processor's get_pos output).
        postag_list = [
            tagged[1] for tagged in self.get_postag(trans_words, 0, -1)]

        return self.get_candidate(
            candidate_list, trans_words, postag_list, opi_from, opi_to)

    @staticmethod
    def get_word2id(text, lower=True):
        r"""
        Get the index of words in sentence.

        Words are obtained with ``str.split`` and numbered from 1 in order
        of first appearance.

        :param str text: input text
        :param bool lower: whether to lowercase the text before splitting
        :return dict: mapping from word to its 1-based index

        """
        if lower:
            text = text.lower()

        word2idx = {}
        for word in text.split():
            if word not in word2idx:
                word2idx[word] = len(word2idx) + 1

        return word2idx

    @staticmethod
    def add_degree_words(word_list, from_idx, to_idx):
        r"""
        Add the degree words to sentence.

        A random entry of ``DEGREE_WORD_LIST`` is joined (with a space) to
        the token at ``from_idx``; the result replaces that token as a
        single list element, so the list length is unchanged.

        :param list word_list: tokenized words of original sentence
        :param int from_idx: index of start
        :param int to_idx: index of end
        :return: transformed sentence and opinion words

        """
        degree_word = random.choice(DEGREE_WORD_LIST)
        # Slicing (not indexing) keeps an out-of-range from_idx harmless.
        merged = ' '.join([degree_word] + word_list[from_idx:from_idx + 1])
        new_words = word_list[:from_idx] + [merged] + word_list[from_idx + 1:]
        opi = new_words[from_idx:to_idx]

        return new_words, opi

    @staticmethod
    def get_conjunction_idx(trans_words, aspect_term, conjunction_list):
        r"""
        Get the index of the conjunction word closest to the aspect term.

        A word counts as a conjunction when its lowercase form is in
        ``conjunction_list`` and is not a substring of the lowercased term.

        :param list trans_words: tokenized words of transformed sentence
        :param dict aspect_term: aspect term with keys ``'term'``,
            ``'from'`` and ``'to'``
        :param list conjunction_list: conjunction list
        :return: index of the selected conjunction word, or ``None`` when
            no conjunction lies strictly before ``'from'`` or strictly
            after ``'to'``

        """
        term_lower = aspect_term['term'].lower()
        term_from = aspect_term['from']
        term_to = aspect_term['to']

        conjunction_idx = [
            idx for idx, word in enumerate(trans_words)
            if word.lower() in conjunction_list
            and word.lower() not in term_lower
        ]

        trans_idx = None
        distance_to_term = float('inf')
        for idx in conjunction_idx:
            if idx > term_to and idx - term_to < distance_to_term:
                distance_to_term = idx - term_to
                trans_idx = idx
            # The original condition here was the bare, always-true
            # ``term_to - idx``; the comparison is now explicit.
            # NOTE(review): distance on this side is still measured from
            # ``term_to`` (not ``term_from``) to keep results unchanged.
            if idx < term_from and term_to - idx < distance_to_term:
                distance_to_term = term_to - idx
                trans_idx = idx

        return trans_idx

    def get_sentence(self, trans_words, sentence):
        r"""
        Untokenize and uppercase to get an output sentence.

        The first character of the result is uppercased when the first
        character of ``sentence`` is uppercase.

        :param list trans_words: tokenized words of transformed sentence
        :param str sentence: original sentence
        :return str: transformed sentence

        """
        trans_sentence = self.untokenize(trans_words)
        # Guard both strings so empty input does not raise IndexError.
        if sentence and trans_sentence and sentence[0].isupper():
            trans_sentence = trans_sentence[0].upper() + trans_sentence[1:]

        return trans_sentence

    def get_term_span(self, trans_sentence, term):
        r"""
        Get the span of term in trans_sentence.

        The term is located in the space-free concatenation of the
        sentence tokens, then that offset is mapped back to a character
        index in ``trans_sentence`` by counting non-space characters.

        :param str trans_sentence: transformed sentence
        :param str term: target term
        :return: start and end character index of target term; the start
            is ``0`` when the term is not found or is found at the very
            beginning

        """
        char_sentence = ''.join(self.tokenize(trans_sentence))
        char_term = ''.join(self.tokenize(term))
        # str.find returns -1 when absent; the original scan left 0.
        char_from = max(char_sentence.find(char_term), 0)

        span_from = 0
        if char_from != 0:
            seen = 0
            for idx, char in enumerate(trans_sentence):
                if char != ' ':
                    seen += 1
                if seen == char_from:
                    # Skip the single space (if any) preceding the term.
                    # Slicing avoids IndexError on the last character.
                    next_char = trans_sentence[idx + 1:idx + 2]
                    span_from = idx + 2 if next_char == ' ' else idx + 1
                    break

        span_to = span_from + len(term)

        return span_from, span_to

    def get_candidate(
            self, candidate_list, words_list, postag_list, opi_from, opi_to):
        r"""
        Get the candidate opinion words from words_list.

        Each candidate is case-matched to the first opinion token,
        substituted for the opinion span, and kept only if re-tagging the
        new sentence gives the same tags at the opinion positions.

        :param set candidate_list: candidate words
        :param list words_list: tokenized words of original sentence
        :param list postag_list: postag
        :param int opi_from: start index of opinion
        :param int opi_to: end index of opinion
        :return list: refined candidate words

        """
        opinion_tokens = words_list[opi_from:opi_to]
        if not candidate_list or not opinion_tokens:
            return []

        # Loop-invariant properties of the original opinion.
        opi = opinion_tokens[0]
        is_capitalized = opi[:1].isupper()
        is_all_upper = opi.isupper()
        original_tags = postag_list[opi_from:opi_to]

        refined_candi = []
        for candidate in candidate_list:
            if is_all_upper:
                candidate = candidate.upper()
            elif is_capitalized:
                candidate = candidate[:1].upper() + candidate[1:]
            if opi_from == 0:
                # Sentence-initial words are always capitalized.
                candidate = candidate[:1].upper() + candidate[1:]

            new_words = words_list[:opi_from] + \
                [candidate] + words_list[opi_to:]

            # check pos tag
            new_tags = [
                tagged[1] for tagged in self.get_postag(new_words, 0, -1)
            ][opi_from:opi_to]
            if any(old != new for old, new in zip(original_tags, new_tags)):
                continue

            refined_candi.append(candidate)

        return refined_candi

    def check_negation(self, trans_words, opinion_from, opinion_to):
        r"""
        Check the negation words in trans_words and delete them.

        The first negation phrase (longest first) found inside the opinion
        span is overwritten in ``trans_words`` -- in place -- with
        ``'DELETE'`` placeholder tokens, one per token, so indices stay
        valid. The returned opinion words have the placeholders removed.

        :param list trans_words: tokenized words of transformed sentence
        :param int opinion_from: start index of opinion
        :param int opinion_to: end index of opinion
        :return: transformed words, opinion words, position of opinion,
            and whether exist negation in transformed sentence

        """
        opinion_words = trans_words[opinion_from:opinion_to]
        has_neg = False

        for negative_word in self.negative_words_list:
            neg_tokens = self.tokenize(negative_word)
            neg_len = len(neg_tokens)
            neg_text = ' '.join(neg_tokens).lower()

            for j in range(opinion_from, opinion_to - neg_len + 1):
                window = ' '.join(trans_words[j:j + neg_len])
                if window.lower() == neg_text:
                    trans_words[j:j + neg_len] = ['DELETE'] * neg_len
                    has_neg = True
                    break

            if has_neg:
                # Drop every placeholder: list.remove would strip only the
                # first one and leave stray 'DELETE' tokens behind for
                # multi-token negations.
                opinion_words = [
                    token
                    for token in trans_words[opinion_from:opinion_to]
                    if token != 'DELETE'
                ]
                break

        return trans_words, opinion_words, opinion_from, opinion_to, has_neg

    def reverse_opinion(
            self, trans_words, trans_opinion_words, opinion_from, opinion_to,
            has_neg):
        r"""
        Reverse the polarity of original opinion and return the new
        transformed opinion words.

        With ``has_neg`` the span text is kept as is. Otherwise a random
        antonym that survives :meth:`refine_candidate` is used, falling
        back to ``'not <word>'`` when there is none. Both list arguments
        are modified in place.

        :param list trans_words: tokenized words of transformed sentence
        :param list trans_opinion_words: transformed opinion words
        :param int opinion_from: start index of opinion
        :param int opinion_to: end index of opinion
        :param bool has_neg: whether exist negation in transformed sentence
        :return: transformed sentence and transformed opinion words

        """
        opinion_words = trans_words[opinion_from:opinion_to]
        opi = opinion_words[0]

        if has_neg:
            new_text = self.untokenize(opinion_words)
        else:
            opi_pos = self.get_postag(trans_words, opinion_from, opinion_to)
            antonyms = self.get_antonyms(opi_pos)[0]

            candidate = set()
            for antonym in antonyms:
                for ant_word in antonym.lemma_names(lang='eng'):
                    # Multi-word lemmas contain '_' and are skipped.
                    if ant_word != opi and "_" not in ant_word:
                        candidate.add(ant_word)

            refined_candidate = self.refine_candidate(
                trans_words, opinion_from, opinion_to, candidate)
            if refined_candidate:
                new_text = self.untokenize(
                    [random.choice(refined_candidate)])
            else:
                new_text = self.untokenize(['not', opi])

        trans_opinion_words.append([opinion_from, opinion_to, new_text])
        # The replacement is one list element even if it has two words.
        trans_words[opinion_from:opinion_to] = [new_text]

        return trans_words, trans_opinion_words

    def update_sentence_terms(
            self, trans_words, trans_terms, trans_opinion_words,
            opinion_position):
        r"""
        Update the terms and sentence.

        For every transformed opinion, the change in token count is added
        to the positions of terms and opinions located after it.
        ``trans_terms`` is modified in place; ``trans_words`` is returned
        unchanged.

        :param list trans_words: tokenized words of transformed sentence
        :param dict trans_terms: transformed terms; each value has keys
            ``'from'``, ``'to'``, ``'opinion_words'`` and
            ``'opinion_position'``
        :param list trans_opinion_words: transformed opinion words, each
            ``[from, to, text]``
        :param list opinion_position: opinion position
        :return: transformed sentence and transformed terms

        """
        # Conditions are evaluated on a snapshot, so shifts applied to
        # ``trans_terms`` never influence later comparisons.
        terms = deepcopy(trans_terms)
        trans_opinion_position = deepcopy(opinion_position)

        for trans_id, trans_opi in enumerate(trans_opinion_words):
            # Token-count difference between new text and original span.
            offset = len(self.tokenize(
                trans_opi[2])) - (trans_opi[1] - trans_opi[0])

            for opi_id, opi in enumerate(opinion_position):
                if opi[0] > trans_opi[0]:
                    trans_opinion_position[opi_id][0] += offset
                    trans_opinion_position[opi_id][1] += offset

            for term_id in terms:
                if terms[term_id]['from'] >= trans_opi[0]:
                    trans_terms[term_id]['from'] += offset
                    trans_terms[term_id]['to'] += offset

                positions = terms[term_id]['opinion_position']
                for pos_id, position in enumerate(positions):
                    # Assumes trans_opinion_words[i] corresponds to
                    # opinion_position[i] -- verify against callers.
                    if position == opinion_position[trans_id]:
                        trans_terms[term_id]['opinion_words'][pos_id] = \
                            trans_opi[2]
                        trans_terms[term_id]['opinion_position'][pos_id] = [
                            trans_opinion_position[trans_id][0],
                            trans_opinion_position[trans_id][1] + offset]
                    elif position[0] > trans_opi[0]:
                        trans_terms[term_id]['opinion_position'][pos_id][0] \
                            += offset
                        trans_terms[term_id]['opinion_position'][pos_id][1] \
                            += offset

        return trans_words, trans_terms