import numpy as np
from collections import defaultdict
from nltk.tokenize import wordpunct_tokenize as tokenize
from fever_io import load_split_trainset
import random
random.seed(42)


def calculate_accuracy(id2label: dict, id2prediction: dict, match_assert=True):
    """
    Takes two dictionaries: id -> label and id -> prediction.
    Labels and predictions are matched by id, and accuracy is computed.
    match_assert: boolean. If True, asserts that the ids of the two dicts overlap perfectly.
    """
if match_assert:
assert set(id2label.keys()) == set(id2prediction.keys())
# how often is the prediction equal to the label, on average?
accuracy = np.mean([id2prediction[id] == id2label[id] if id in id2prediction else False for id in id2label])
print("Accuracy:", accuracy)
return accuracy
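
# Illustrative usage (not part of the original file):
#   calculate_accuracy({1: "SUPPORTS", 2: "REFUTES"}, {1: "SUPPORTS", 2: "SUPPORTS"})
#   prints "Accuracy: 0.5" and returns 0.5.
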
def bigram_splitter(text:str):
"""
Returns list of bigrams in given text.
"""
tokens = tokenize(text)
if len(tokens) < 2:
bigrams = []
else:
bigrams = [tokens[i]+ " "+ tokens[i+1] for i in range(len(tokens)-1)]
return bigrams
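
# Illustrative example (not part of the original file):
#   bigram_splitter("the cat sat") -> ["the cat", "cat sat"]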


class NaiveBayesBow():

    def __init__(self, train_set, tokenisation_type="unigram", lowercase=True):
        """
        Defines and trains a Naive Bayes model from the given train_set, using
        bag-of-words features.
        NOTE: this assumes a balanced test set distribution.
        Inputs:
        - "tokenisation_type": which features to use ("unigram", "bigram" or "both").
        - "lowercase": whether claims are lowercased before featurisation.
        """
self.lowercase = lowercase
# stores word counts for each label.
label_to_word_freq_dict = {
"NOT ENOUGH INFO": defaultdict(float),
"SUPPORTS": defaultdict(float),
"REFUTES": defaultdict(float)
}
        # number of training examples per label (used to normalise the counts)
        label_proportions = defaultdict(int)
# determine which feature computation to use (default unigram)
self.tokenisation_type = tokenisation_type
self.token_splitter = self.get_token_splitter(self.tokenisation_type)
# loop over train set
for example in train_set:
claim = example['claim']
if self.lowercase:
claim = claim.lower()
tokenized_claim = self.token_splitter(claim)
label = example['label']
label_proportions[label] +=1
# loop over words in claim
for token in tokenized_claim:
# add token count
label_to_word_freq_dict[label][token] += 1.0
self.label_proportions = label_proportions
self.labels = sorted(self.label_proportions.keys())
# normalise by total label frequency
for label in label_to_word_freq_dict.keys():
label_proportion = label_proportions[label]
for word in label_to_word_freq_dict[label].keys():
label_to_word_freq_dict[label][word] = label_to_word_freq_dict[label][word] / label_proportion
self.label_to_word_freq_dict = label_to_word_freq_dict
        # optional: precompute per-token label distributions and entropies
        # (used by print_indicative_words for data exploration)
        self.get_token_entropies()
        print("Done fitting Naive Bayes.")

    def get_token_entropies(self):
        """
        Computes, for each token, a probability distribution over labels and its entropy.
        """
# identify all tokens in the train set
all_tokens = set()
for label in self.labels:
all_tokens.update(self.label_to_word_freq_dict[label].keys())
# compute (normalised) probabilities for each label, given each token
token2entropy = {}
token2probabilities = {}
for token in all_tokens:
# probabilities: as many values as labels
probabilities = [self.label_to_word_freq_dict[label][token] for label in self.labels]
# normalise
normaliser = sum(probabilities)
probabilities = [x/normaliser for x in probabilities]
assert np.abs(sum(probabilities) - 1.0) < 0.01
# compute entropy
entropy = -np.sum([p * np.log(p) for p in probabilities if p > 0.0])
token2entropy[token] = entropy
token2probabilities[token] = probabilities
self.token2entropy = token2entropy
self.token2probabilities = token2probabilities
return token2entropy, token2probabilities
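
    # Note (added for clarity): a token observed under only one label gets entropy 0
    # (maximally discriminative), while a token spread uniformly over the three labels
    # gets entropy ln(3) ~= 1.10 (natural log is used above).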

    def print_indicative_words(self, mode="positive"):
        """
        A function purely for data exploration.
        Purpose: print out words strongly associated with one class.
        """
print(self.labels)
if mode == "positive":
# prints the top 2000 tokens with highest probability for "SUPPORTS" (but only if seen at least 10 times)
indicative_positive_tokens = [x for x in sorted(self.token2probabilities.items(), key=lambda x: -x[1][2])][:2000]
for record in indicative_positive_tokens:
if self.label_to_word_freq_dict["SUPPORTS"][record[0]]>10/self.label_proportions["SUPPORTS"]:
print(record)
if mode == "negative":
# prints the top 2000 tokens with highest probability for "REFUTES" (but only if seen at least 10 times)
indicative_negative_tokens = [x for x in sorted(self.token2probabilities.items(), key=lambda x: -x[1][1])][:2000]
for record in indicative_negative_tokens:
if self.label_to_word_freq_dict["REFUTES"][record[0]]>10/self.label_proportions["REFUTES"]:
print(record)
if mode == "neutral":
# prints the top 2000 tokens with highest probability for "NOT ENOUGH INFO" (but only if seen at least 5 times)
indicative_neutral_tokens = [x for x in sorted(self.token2probabilities.items(), key=lambda x: -x[1][0])][:2000]
for record in indicative_neutral_tokens:
if self.label_to_word_freq_dict["NOT ENOUGH INFO"][record[0]]>5/self.label_proportions["NOT ENOUGH INFO"]:
print(record)
print(self.labels)

    def get_token_splitter(self, type="unigram"):
        """
        Returns a tokenisation function that splits a string into unigrams,
        bigrams, or both unigrams and bigrams.
        """
if type=="unigram":
return lambda s: tokenize(s)
elif type=="bigram":
return bigram_splitter
elif type=="both":
# concatenation of both unigrams and bigrams
return lambda s: tokenize(s) + bigram_splitter(s)
else:
return lambda s: []
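
    # Illustrative example (not part of the original file): with type="both",
    #   self.token_splitter("san francisco bay")
    #   -> ["san", "francisco", "bay", "san francisco", "francisco bay"]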

    def predict_single_example(self, example):
        """
        Makes a Naive Bayes prediction for a single given example.
        This assumes a balanced test set distribution.
        """
# will hold log probability for each label.
conditional_probability_per_label = defaultdict(float)
# featurise the given claim
claim = example['claim']
if self.lowercase:
claim = claim.lower()
tokenized_claim = self.token_splitter(claim)
# loop over words in the claim
for token in tokenized_claim:
# get log probability for each label
for label in self.labels:
# look up probability from previously computed counts
token_probability = self.label_to_word_freq_dict[label][token]
if token_probability: # can be zero for OOV words.
conditional_probability_per_label[label] += np.log(token_probability)
        # drop labels for which no token contributed any probability
        # (their accumulated log-probability is still exactly 0.0)
        conditional_probability_per_label = {k: v for k, v in conditional_probability_per_label.items() if v != 0.0}
if conditional_probability_per_label:
# default case: dictionary not empty
prediction = max(conditional_probability_per_label.items(), key=lambda x: x[1])[0]
else:
# if no word has been observed for any of the labels --> predict random label
prediction = random.choice(self.labels)
return prediction
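
    # Note (added for clarity): no log prior over the labels is added above, which is
    # why this method assumes a roughly balanced label distribution at test time.
    # Tokens with zero probability under a label (e.g. words unseen for that label)
    # are skipped rather than smoothed.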

    def batch_predict(self, dataset):
        """
        Generates a dictionary id -> prediction for each example in the given dataset.
        """
id2prediction = {}
for example in dataset:
prediction = self.predict_single_example(example)
id2prediction[example['id']] = prediction
return id2prediction


if __name__ == "__main__":
    # load some data (splitting off dev data from the original train set)
    train, dev = load_split_trainset(dev_size=1000)
    # train a Naive Bayes model on bag-of-words features
    nb = NaiveBayesBow(train, tokenisation_type="unigram")
    # [for fun]: print out the most frequent tokens for one of the classes
    print(sorted(nb.label_to_word_freq_dict["SUPPORTS"].items(), key=lambda x: -x[1])[:10])
    # [for fun]: print out words most strongly associated with one class, vs. the others
    #nb.print_indicative_words("positive")
    nb.print_indicative_words("negative")
    #nb.print_indicative_words("neutral")
    # generate predictions for the dev set
    dev_predictions = nb.batch_predict(dev)
    # evaluate the model on dev
    dev_labels = {x['id']: x['label'] for x in dev}
    calculate_accuracy(dev_labels, dev_predictions)
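
# To reproduce (illustrative; assumes fever_io.load_split_trainset can locate the
# FEVER training data): run `python baselines.py` to train the unigram Naive Bayes
# baseline and print its accuracy on the held-out dev split.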