Source code for textflint.generation_layer.transformation.ABSA.absa_transformation
import random
from abc import ABC
from copy import deepcopy
from ..transformation import Transformation
from ....common.settings import NEGATIVE_WORDS_LIST, DEGREE_WORD_LIST
# Public names exported by this module.
__all__ = ['ABSATransformation']
class ABSATransformation(Transformation, ABC):
    r"""
    Abstract base class that supplies helper methods for ABSA task data
    transformations.

    """

    def __init__(self):
        super().__init__()
        # Longest entries first: check_negation stops at the first entry
        # that matches, so longer entries are tried before shorter ones.
        self.negative_words_list = sorted(
            NEGATIVE_WORDS_LIST, key=len, reverse=True)
        # Shortcuts to the text processor helpers used by the methods below.
        text_processor = self.processor
        self.tokenize = text_processor.tokenize
        self.untokenize = text_processor.inverse_tokenize
        self.get_antonyms = text_processor.get_antonyms
def reverse(self, words_list, opinion_position):
    r"""
    Reverse the polarity of every opinion span of a sentence.

    Single-token opinions are delegated to ``reverse_opinion``. For
    multi-token opinions, a negation found by ``check_negation`` is left
    out of the recorded opinion text; otherwise ``not`` is prepended to the
    first opinion token.

    :param list words_list: tokenized words of original sentence
    :param list opinion_position: ``[from, to]`` positions of the opinions
    :return: transformed sentence (a modified deep copy of ``words_list``)
        and a list of ``[from, to, text]`` transformed opinion words
    """
    trans_words = deepcopy(words_list)
    trans_opinion_words = []

    for span in opinion_position:
        start, stop = span[0], span[1]
        # Width is measured before check_negation touches the words.
        span_width = len(trans_words[start:stop])
        trans_words, opinion_words, start, stop, has_neg = \
            self.check_negation(trans_words, start, stop)

        if span_width == 1:
            trans_words, trans_opinion_words = self.reverse_opinion(
                trans_words, trans_opinion_words, start, stop, has_neg)
            continue
        if span_width < 1:
            # Empty span: nothing to reverse.
            continue

        if has_neg:
            # The negation is already masked in trans_words; only record
            # the remaining opinion text.
            trans_opinion_words.append(
                [start, stop, self.untokenize(opinion_words)])
            continue

        # No negation present: negate the first opinion token.
        negated_head = 'not ' + opinion_words[0]
        trans_opinion_words.append(
            [start, stop,
             self.untokenize([negated_head] + opinion_words[1:])])
        trans_words[start:start + 1] = [negated_head]

    return trans_words, trans_opinion_words
def exaggerate(self, words_list, opinions):
    r"""
    Exaggerate the opinion words by adding a degree word to each of them.

    :param list words_list: tokenized words of original sentence
    :param list opinions: opinion positions; the first two items of each
        entry are used as the start and end index
    :return: transformed sentence (a modified deep copy of ``words_list``)
        and a list of ``[from, to, text]`` transformed opinion words
    """
    new_words = deepcopy(words_list)
    new_opi_words = []
    for position in opinions:
        opi_from, opi_to = position[0], position[1]
        new_words, boosted = self.add_degree_words(
            new_words, opi_from, opi_to)
        new_opi_words.append([opi_from, opi_to, self.untokenize(boosted)])
    return new_words, new_opi_words
def get_postag(self, sentence, start, end):
    r"""
    Get the part-of-speech tags for a slice of a sentence.

    :param list|str sentence: sentence
    :param int start: start index
    :param int end: end index (exclusive); ``-1`` means "up to the end"
    :return list: slice of the result of ``self.processor.get_pos``
    """
    tags = self.processor.get_pos(sentence)
    # -1 is a sentinel for an open-ended slice, not "all but the last".
    stop = None if end == -1 else end
    return tags[start:stop]
def refine_candidate(self, trans_words, opi_from, opi_to, candidate_list):
    r"""
    Refine the candidate opinion words using the tags of the sentence.

    The sentence is tagged once and the second item of every tag entry is
    handed to ``get_candidate`` together with the candidates.

    :param list trans_words: tokenized words of transformed sentence
    :param int opi_from: start position of opinion words
    :param int opi_to: end position of opinion words
    :param set candidate_list: candidate antonyms word list
    :return list: refined candidate word list; empty when ``trans_words``
        is empty
    """
    if not trans_words:
        return []
    tag_labels = [tagged[1] for tagged in self.get_postag(trans_words, 0, -1)]
    return self.get_candidate(
        candidate_list, trans_words, tag_labels, opi_from, opi_to)
@staticmethod
def get_word2id(text, lower=True):
    r"""
    Number the distinct whitespace-separated words of a text.

    Ids start at 1 and follow the order of first appearance.

    :param str text: input text
    :param bool lower: whether to lowercase the text before splitting
    :return dict: mapping from word to its id
    """
    source = text.lower() if lower else text
    word2idx = {}
    for token in source.split():
        # Only unseen words get a new id; ids are consecutive from 1.
        word2idx.setdefault(token, len(word2idx) + 1)
    return word2idx
@staticmethod
def add_degree_words(word_list, from_idx, to_idx):
    r"""
    Prefix the first opinion word with a randomly chosen degree word.

    The degree word and the word at ``from_idx`` are joined with a space
    into a single list item, so the list length does not change.

    :param list word_list: tokenized words of original sentence
    :param int from_idx: index of start
    :param int to_idx: index of end
    :return: transformed sentence (a new list) and the items of the new
        list between ``from_idx`` and ``to_idx``
    """
    degree_words = DEGREE_WORD_LIST
    pick = random.randint(0, len(degree_words) - 1)
    head = word_list[from_idx:from_idx + 1]
    intensified = ' '.join([degree_words[pick]] + head)
    new_words = (
        word_list[:from_idx] + [intensified] + word_list[from_idx + 1:])
    return new_words, new_words[from_idx:to_idx]
@staticmethod
def get_conjunction_idx(trans_words, aspect_term, conjunction_list):
    r"""
    Pick the index of a conjunction word relative to an aspect term.

    Words are compared in lower case; a conjunction that is a substring of
    the (lower-cased) term text is ignored.

    :param list trans_words: tokenized words of transformed sentence
    :param dict aspect_term: aspect term with keys ``term``, ``from``
        and ``to``
    :param list conjunction_list: conjunction list
    :return: index of the selected conjunction word, or ``None`` when no
        conjunction qualifies
    """
    term = aspect_term['term']
    term_from = aspect_term['from']
    term_to = aspect_term['to']

    candidates = []
    for pos, token in enumerate(trans_words):
        lowered = token.lower()
        if lowered in conjunction_list and lowered not in term.lower():
            candidates.append(pos)

    best_idx = None
    best_gap = len(trans_words)
    for pos in candidates:
        # After the term: accept only when closer than the current best.
        if pos > term_to and pos - term_to < best_gap:
            best_gap = pos - term_to
            best_idx = pos
        # Before the term: accepted whenever ``term_to - pos`` is non-zero.
        # NOTE(review): this condition has no comparison against the best
        # gap, unlike the branch above; confirm whether one was intended.
        if pos < term_from and term_to - pos:
            best_gap = term_to - pos
            best_idx = pos
    return best_idx
def get_sentence(self, trans_words, sentence):
    r"""
    Untokenize the words and restore a leading capital letter.

    The first character of the result is upper-cased when the first item
    of ``sentence`` is upper case. An empty ``sentence`` or an empty
    untokenized result is returned as is instead of raising ``IndexError``.

    :param list trans_words: tokenized words of transformed sentence
    :param sentence: original sentence
    :return str: transformed sentence
    """
    trans_sentence = self.untokenize(trans_words)
    # Both lookups index position 0, so both values must be non-empty.
    if sentence and trans_sentence and sentence[0].isupper():
        trans_sentence = trans_sentence[0].upper() + trans_sentence[1:]
    return trans_sentence
def get_term_span(self, trans_sentence, term):
    r"""
    Get the span of term in trans_sentence.

    Sentence and term are tokenized and re-joined without separators, and
    the term is located in that compact form. The number of characters in
    front of the match is then mapped back to an index in
    ``trans_sentence`` by counting its non-space characters.

    :param str trans_sentence: transformed sentence
    :param str term: target term
    :return: start and end index of target term; the start is ``0`` when
        the term is not found or is found at the very beginning
    """
    compact_sentence = ''.join(self.tokenize(trans_sentence))
    compact_term = ''.join(self.tokenize(term))
    # Characters preceding the first match; a miss counts as 0.
    chars_before = max(compact_sentence.find(compact_term), 0)

    span_from = 0
    if chars_before != 0:
        seen = 0
        for pos, char in enumerate(trans_sentence):
            if char != ' ':
                seen += 1
            if seen == chars_before:
                # The term starts right after this character, skipping
                # one separating space if there is one.
                if trans_sentence[pos + 1] != ' ':
                    span_from = pos + 1
                else:
                    span_from = pos + 2
                break
    return span_from, span_from + len(term)
def get_candidate(
        self,
        candidate_list,
        words_list,
        postag_list,
        opi_from,
        opi_to):
    r"""
    Keep the candidates that do not change the tags of the opinion span.

    Each candidate is cased like the first opinion word (all upper case or
    capitalized), capitalized when the opinion starts the sentence, and
    substituted for the span. It is kept when the tags of the new
    sentence at the opinion position equal the original ones.

    :param set candidate_list: candidate words
    :param list words_list: tokenized words of original sentence
    :param list postag_list: postag
    :param int opi_from: start index of opinion
    :param int opi_to: end index of opinion
    :return list: refined candidate words
    """
    expected_tags = postag_list[opi_from:opi_to]
    kept = []
    for candidate in candidate_list:
        opinion_head = words_list[opi_from:opi_to][0]
        starts_upper = opinion_head[0].isupper()
        # Mirror the casing of the word being replaced.
        if opinion_head.isupper():
            candidate = candidate.upper()
        elif starts_upper:
            candidate = candidate[0].upper() + candidate[1:]
        if opi_from == 0:
            candidate = candidate[0].upper() + candidate[1:]

        new_words = words_list[:opi_from] + [candidate] + words_list[opi_to:]
        new_tags = [
            tagged[1] for tagged in self.get_postag(new_words, 0, -1)]
        tags_changed = any(
            old != new
            for old, new in zip(expected_tags, new_tags[opi_from:opi_to]))
        if not tags_changed:
            kept.append(candidate)
    return kept
def check_negation(self, trans_words, opinion_from, opinion_to):
    r"""
    Check the negation words in the opinion span and mask them.

    The entries of ``self.negative_words_list`` are tried in order and the
    search stops at the first one found inside the span (compared in lower
    case). Its tokens are replaced in ``trans_words`` (in place) by
    ``'DELETE'`` placeholders, and every masked token is left out of the
    returned opinion words, including all tokens of a multi-token negation.

    :param list trans_words: tokenized words of transformed sentence
    :param int opinion_from: start index of opinion
    :param int opinion_to: end index of opinion
    :return: transformed words, opinion words, position of opinion, and
        whether exist negation in transformed sentence
    """
    opinion_words = trans_words[opinion_from:opinion_to]
    has_neg = False
    for negation in self.negative_words_list:
        neg_tokens = self.tokenize(negation)
        width = len(neg_tokens)
        if width == 0:
            # Nothing to match; an empty window would match everywhere.
            continue
        target = ' '.join(neg_tokens).lower()
        for start in range(opinion_from, opinion_to - width + 1):
            window = ' '.join(trans_words[start:start + width]).lower()
            if window != target:
                continue
            trans_words[start:start + width] = ['DELETE'] * width
            # Drop exactly the masked window, not just one placeholder.
            opinion_words = (trans_words[opinion_from:start]
                             + trans_words[start + width:opinion_to])
            has_neg = True
            break
        if has_neg:
            break
    return trans_words, opinion_words, opinion_from, opinion_to, has_neg
def reverse_opinion(
        self,
        trans_words,
        trans_opinion_words,
        opinion_from,
        opinion_to,
        has_neg):
    r"""
    Reverse the polarity of original opinion and record the new
    transformed opinion words.

    With ``has_neg`` set, the span is recorded as it currently stands.
    Otherwise a randomly chosen refined antonym replaces the opinion, or
    ``not`` is put in front of it when no antonym candidate remains.
    Both ``trans_words`` and ``trans_opinion_words`` are modified in place.

    :param list trans_words: tokenized words of transformed sentence
    :param list trans_opinion_words: transformed opinion words
    :param int opinion_from: start index of opinion
    :param int opinion_to: end index of opinion
    :param bool has_neg: whether exist negation in transformed sentence
    :return: transformed sentence and transformed opinion words
    """
    span_words = trans_words[opinion_from:opinion_to]
    opi = span_words[0]

    if has_neg:
        new_text = self.untokenize(span_words)
    else:
        tagged_opinion = self.get_postag(
            trans_words, opinion_from, opinion_to)
        antonyms = self.get_antonyms(tagged_opinion)[0]
        # Single-word antonym lemmas that differ from the opinion itself.
        candidate = {
            ant_word
            for antonym in antonyms
            for ant_word in antonym.lemma_names(lang='eng')
            if ant_word != opi and "_" not in ant_word
        }
        refined = self.refine_candidate(
            trans_words, opinion_from, opinion_to, candidate)
        if refined:
            pick = random.randint(0, len(refined) - 1)
            new_text = self.untokenize([refined[pick]])
        else:
            new_text = self.untokenize(['not', opi])

    trans_opinion_words.append([opinion_from, opinion_to, new_text])
    # The whole span collapses into one item holding the new text.
    trans_words[opinion_from:opinion_to] = [new_text]
    return trans_words, trans_opinion_words
[docs] def update_sentence_terms(
self,
trans_words,
trans_terms,
trans_opinion_words,
opinion_position):
r"""
Update the term and opinion positions after opinions were transformed.

For every transformed opinion, the difference between the token count of
its new text and the width of its original span is applied as an offset
to the recorded positions. ``trans_terms`` is updated in place;
``trans_words`` is returned without being modified here.

:param list trans_words: tokenized words of transformed sentence
:param dict trans_terms: transformed terms; every value provides the keys
    ``from``, ``to``, ``opinion_words`` and ``opinion_position``
:param list trans_opinion_words: transformed opinion words as
    ``[from, to, text]`` entries
:param list opinion_position: opinion position; entry ``i`` is looked up
    for ``trans_opinion_words[i]``
:return: transformed sentence and transformed terms
"""
# ``terms`` is only read below, while the updates are written to
# ``trans_terms``; ``trans_opinion_position`` is a working copy that
# receives the shifts.
terms = deepcopy(trans_terms)
trans_opinion_position = deepcopy(opinion_position)
for trans_id, trans_opi in enumerate(trans_opinion_words):
# Token count of the new opinion text minus the original span width.
offset = len(self.tokenize(
trans_opi[2])) - (trans_opi[1] - trans_opi[0])
# Opinions that start after the transformed one move by the offset.
for opi_id, opi in enumerate(opinion_position):
if opi[0] > trans_opi[0]:
trans_opinion_position[opi_id][0] += offset
trans_opinion_position[opi_id][1] += offset
for term_id in terms:
# Terms that start at or after the transformed opinion move by the offset.
if terms[term_id]['from'] >= trans_opi[0]:
trans_terms[term_id]['from'] += offset
trans_terms[term_id]['to'] += offset
positions = terms[term_id]['opinion_position']
for pos_id, position in enumerate(positions):
# The opinion that was transformed: store its new text and position.
if position == opinion_position[trans_id]:
trans_terms[term_id]['opinion_words'][pos_id] = \
trans_opi[2]
trans_terms[term_id]['opinion_position'][pos_id] = [
trans_opinion_position[trans_id][0],
trans_opinion_position[trans_id][1] + offset]
# Any other opinion of the term that starts later is shifted.
elif position[0] > trans_opi[0]:
trans_terms[term_id]['opinion_position'][pos_id][0] += \
offset
trans_terms[term_id]['opinion_position'][pos_id][1] += \
offset
return trans_words, trans_terms