-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_helper.py
71 lines (58 loc) · 2.39 KB
/
feature_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy as np
import pickle
from ProcessTweets import *
from nltk.tokenize import TweetTokenizer
def construct_features(vocab_filename, tweets, embeddings_filename, relevant_filename, nb_concat):
    """Build one relevance-weighted embedding vector per tweet.

    Each tweet is tokenized, tokens missing from the vocabulary are dropped,
    and the remaining token embeddings are combined into a single vector.

    Args:
        vocab_filename: Passed to ``extract_index``; the result is used as a
            token -> embedding-row lookup (``in`` and ``.get``).
        tweets: Sized sequence of tweets; each item is handed to
            ``extract_tokens`` together with the tokenizer.
        embeddings_filename: Passed to ``np.load``; the loaded object is
            indexed with the vocabulary index of each token.
        relevant_filename: Passed to ``extract_relevant``; the result is
            queried with ``.get(token, 0)`` and the value converted to float.
        nb_concat: End bound of the slice over the relevance-sorted tokens of
            a tweet. ``-1`` means "use every token of the tweet".

    Returns:
        A tuple ``(tweets_embeddings, tweets_embeddings_invalid)``: the list
        of combined vectors for tweets that had at least one in-vocabulary
        token, and the list of positions (in ``tweets``) of the tweets that
        had none and were therefore skipped.
    """
    vocab = extract_index(vocab_filename)
    embeddings = np.load(embeddings_filename)
    relevant = extract_relevant(relevant_filename)
    # NOTE(review): the positional False should be NLTK's preserve_case flag
    # (i.e. tokens get lower-cased) -- confirm against the installed NLTK.
    tknzr = TweetTokenizer(False)

    # -1 means "no limit". A None slice bound keeps every token; the previous
    # substitute (len(tweets)) silently truncated tweets whenever the corpus
    # had fewer tweets than a tweet had tokens.
    token_limit = None if nb_concat == -1 else nb_concat

    total_tweets = len(tweets)
    tweets_embeddings = []
    tweets_embeddings_invalid = []
    for i, tweet in enumerate(tweets):
        # Progress is driven by the position in the input so that skipped
        # (invalid) tweets no longer make the percentage lag behind.
        if i % 1000 == 1:
            print("{:.1f}".format(i / total_tweets * 100), "%", end='\r')

        # Collect (embedding, relevance) for every in-vocabulary token.
        token_embeddings = []
        for token in extract_tokens(tknzr, tweet):
            if token in vocab:
                index = vocab.get(token)
                token_embeddings.append((embeddings[index], float(relevant.get(token, 0))))

        if not token_embeddings:
            # No usable token: remember the position so callers can fill it in.
            tweets_embeddings_invalid.append(i)
            continue

        # Stable sort, ascending by relevance.
        ranked = sorted(token_embeddings, key=lambda pair: pair[1])

        # NOTE(review): the first (lowest-relevance) embedding is added as-is,
        # not multiplied by its relevance, while its relevance still counts in
        # the divisor. Kept unchanged because downstream results depend on it;
        # confirm whether a true weighted mean was intended.
        weighted_sum, relevance_sum = ranked[0]
        for vector, relevance in ranked[1:token_limit]:
            weighted_sum = weighted_sum + vector * relevance
            relevance_sum = relevance_sum + relevance

        if relevance_sum != 0:
            tweets_embeddings.append(weighted_sum / relevance_sum)
        else:
            # All selected relevances are zero: avoid dividing by zero.
            tweets_embeddings.append(weighted_sum)

    print("finished")
    return tweets_embeddings, tweets_embeddings_invalid
def policy_unpredictable():
    """Return a label picked at random from ``(1, -1)`` with ``np.random.choice``."""
    candidate_labels = (1, -1)
    return np.random.choice(candidate_labels)
def assemble(valid, indices):
    """Interleave known values with random fill-ins at the given positions.

    The output has ``len(valid) + len(indices)`` slots. Every slot whose
    position appears in ``indices`` receives ``policy_unpredictable()``; the
    remaining slots receive the items of ``valid`` in their original order.

    Args:
        valid: Indexable sequence of values for the positions not listed in
            ``indices``.
        indices: Sized collection of output positions to fill with a random
            label instead of a value from ``valid``.

    Returns:
        A ``numpy`` array built from the merged list.
    """
    total = len(valid) + len(indices)
    # Build the lookup once: testing membership against the raw list rescans
    # it for every output slot, which is quadratic for long inputs.
    fill_positions = set(indices)

    result = []
    nb_inserted = 0
    for position in range(total):
        if position in fill_positions:
            result.append(policy_unpredictable())
        else:
            result.append(valid[nb_inserted])
            nb_inserted += 1
    return np.array(result)