-
Notifications
You must be signed in to change notification settings - Fork 1
/
aspect_category_polarity_feature.py
127 lines (94 loc) · 5.46 KB
/
aspect_category_polarity_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import svm
import parameters
import prepare_data
import nltk
import numpy
class AspectCategoryPolarityFeatures:
    """Compute features for the multiclass problem of detecting the sentiment
    of a category in a sentence.

    Feature vectors are emitted as sparse-vector strings in the format
    expected by SVM Multiclass / SVM HMM (see ``svm.sparseVectorToString``).
    """

    def __init__(self, train, commonFeatures, category):
        """Compute basic statistics over the training data so features can be
        calculated per sentence later on.

        :param train: training data dict; ``train['sentences']`` is iterated,
            each sentence having ``'tokens'`` and ``'categories'`` keys.
        :param commonFeatures: shared feature extractor providing word-level
            features (unigrams, w2v model, emoticons, ...) reused here.
        :param category: the aspect category this extractor handles.
        """
        # We reuse word features and other features that are also used by
        # other feature extractors, so we don't duplicate that code here.
        self.comFeatures = commonFeatures
        self.category = category

        ### w2v centroid feature: compute the centroid of each sentiment class
        [self.w2vVocab, self.w2vVocabSize] = self.comFeatures.getw2vModel()
        # Number of (sentiment, word) contributions per sentiment class;
        # polarity labels are assumed to be integers 1..4 (hence index j-1).
        numCatSentw2v = [0] * 4
        # BUG FIX: the original wrapped each centroid in a Python list
        # ([numpy.zeros(...)]); "+=" then *extended* the list instead of
        # summing vectors, and the later "/=" raised TypeError. The dot
        # product in getW2VCatSentFeatures is indexed with [0], so the
        # intended shape is a (1, vecSize) row vector.
        self.centroids = [numpy.zeros((1, parameters.w2vVecSize))
                          for _ in range(4)]

        # Accumulate word vectors per sentiment appearing in each sentence.
        for sentence in train['sentences']:
            filteredPost = sentence['tokens']
            # Extract the set of sentiments (polarity labels) appearing in
            # this sentence across all of its categories.
            sentiments = set()
            for currCat in sentence['categories']:
                if 'polarity' in sentence['categories'][currCat]:
                    sentiments.add(sentence['categories'][currCat]['polarity'])
            for j in sentiments:
                for w in filteredPost:
                    if w in self.w2vVocab:
                        # w2vVocab maps word -> [vector, index]
                        self.centroids[j - 1] += self.w2vVocab[w][0]
                        numCatSentw2v[j - 1] += 1

        # Average and L2-normalize each centroid.
        for j in range(4):
            # Guard against sentiment classes absent from the training data;
            # the original divided unconditionally (NaN centroid on 0 count).
            if numCatSentw2v[j] > 0:
                self.centroids[j] /= numCatSentw2v[j]
                norm = numpy.linalg.norm(self.centroids[j])
                if norm > 0:
                    self.centroids[j] /= norm

    def getCategory(self):
        """Return the aspect category this extractor was built for."""
        return self.category

    def getFeatures(self, sentence, index):
        """Get features for a single sample.

        :param sentence: sentence dict with ``'tokens'`` and ``'categories'``.
        :param index: id of the sentence in the dataset; ids have to be
            consecutive for SVM HMM (ignored by SVM Multiclass).
        :return: a string matching the format constraints of SVM Multiclass,
            or ``''`` for sentences that don't contain the considered category
            with a polarity.
        """
        # Ignore sentences that don't contain the considered sentiment.
        if (self.category not in sentence['categories']
                or 'polarity' not in sentence['categories'][self.category]):
            return ''
        # Feature array we will return: a list of (index, value) tuples, each
        # representing one entry of a sparse vector.
        features = []
        # Current offset in the feature vector, i.e. the size of the feature
        # vector before considering the current feature group.
        offset = 0
        # Unigram features for the sentence.
        [unigramFeatures, offset] = self.comFeatures.getUnigramFeatures(sentence, offset)
        features += unigramFeatures
        # Bigram features are currently disabled:
        # [bigramFeatures, offset] = self.comFeatures.getBigramFeatures(sentence, offset)
        # features += bigramFeatures
        # w2v category features for the sentence.
        [w2vCategoryFeatures, offset] = self.comFeatures.getW2VCategoryFeatures(sentence, offset)
        features += w2vCategoryFeatures
        [w2vCatSentFeatures, offset] = self.getW2VCatSentFeatures(sentence, offset)
        features += w2vCatSentFeatures
        [capitalizationFeature, offset] = self.comFeatures.getCapitalizationFeature(sentence, offset)
        features += capitalizationFeature
        [elongatedWordFeature, offset] = self.comFeatures.getElongatedWordFeature(sentence, offset)
        features += elongatedWordFeature
        [emoticonFeatures, offset] = self.comFeatures.getEmoticonFeatures(sentence, offset)
        features += emoticonFeatures
        [punctuationFeatures, offset] = self.comFeatures.getPunctuationFeature(sentence, offset)
        features += punctuationFeatures
        [sentimentFeature, offset] = self.comFeatures.getSentimentFeatures(sentence, offset)
        features += sentimentFeature
        [sentiWordFeatures, offset] = self.comFeatures.getSentiwordFeatures(sentence, offset)
        features += sentiWordFeatures
        if prepare_data.useGlove:
            [SentenceVectorFeatures, offset] = self.comFeatures.getSentenceVectorFeatures(sentence, offset)
            features += SentenceVectorFeatures
        # For SVM HMM, the feature indices must be in increasing order.
        features.sort(key=lambda tup: tup[0])  # sort by sparse-vector index
        return (str(sentence['categories'][self.category]['polarity'])
                + svm.sparseVectorToString(features, "categorySentFeatures")
                + '\n')

    def getW2VCatSentFeatures(self, sentence, offset):
        """Get features measuring the similarity of a sentence's words to the
        centroids of all category sentiments.

        :param sentence: sentence dict providing ``'tokens'``.
        :param offset: current size of the feature vector before this group.
        :return: ``[features, offset]`` where ``features`` is a list of
            (index, value) sparse-vector tuples and ``offset`` is advanced by
            ``4 * w2vVocabSize``.
        """
        features = []
        # Each distinct in-vocabulary word contributes one feature per
        # sentiment class, at slot offset + j*vocabSize + wordIndex.
        filteredSet = set(sentence['tokens'])
        for w in filteredSet:
            if w in self.w2vVocab:
                val = self.w2vVocab[w]  # [vector, index]
                index = val[1]
                # BUG FIX: the original "/=" normalized the vocab vector in
                # place, mutating the shared w2v model; divide out-of-place.
                wordVec = val[0] / numpy.linalg.norm(val[0])
                for j in range(4):
                    # Cosine similarity of the word to sentiment centroid j
                    # (both vectors are unit-normalized); centroids are
                    # (1, vecSize), so the dot product is a length-1 array.
                    features.append((offset + j * self.w2vVocabSize + index,
                                     numpy.dot(self.centroids[j], wordVec)[0]))
        offset += 4 * self.w2vVocabSize
        return [features, offset]