-
Notifications
You must be signed in to change notification settings - Fork 6
/
helper.py
82 lines (65 loc) · 2.81 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4).

    Punctuation is stripped, English stop-words are skipped, and any word
    missing from `word_to_index` keeps the default index 0 (padding/unknown).

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing each word mapped to its index
    max_len -- maximum number of words in a sentence; extra words are truncated

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    # Translation table that deletes every punctuation character in one
    # C-level pass (str.translate). Note: '"' is already part of
    # string.punctuation, so no extra entry is needed.
    table = str.maketrans({ch: None for ch in string.punctuation})
    stop_words = set(stopwords.words('english'))

    m = X.shape[0]  # number of training examples
    # Zeros everywhere so untouched slots double as padding (index 0).
    X_indices = np.zeros(shape=(m, max_len))

    for i in range(m):  # loop over training examples
        # Clean and tokenize WITHOUT writing back into X: the original
        # mutated the caller's array in place, which is a surprising
        # side effect for a pure conversion helper.
        sentence_words = X[i].translate(table).lower().split()

        j = 0
        for w in sentence_words:
            # Guard: truncate overly long sentences instead of raising
            # IndexError when non-stop words exceed max_len.
            if j >= max_len:
                break
            # Skip stop-words entirely (they consume no output slot).
            if w in stop_words:
                continue
            # Known word -> its index; unknown word -> leave the 0 already
            # in place (the removed `unknown_words_index` bookkeeping was
            # dead code that never did anything).
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
            j += 1

    return X_indices
def read_glove_vecs(glove_file):
    """
    Load a GloVe embedding file (one word followed by its vector per line).

    Arguments:
    glove_file -- path to the GloVe text file (UTF-8)

    Returns:
    words_to_index -- dict mapping word -> 1-based index (alphabetical order)
    index_to_words -- inverse dict mapping index -> word
    word_to_vec_map -- dict mapping word -> embedding vector (float64 ndarray)
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for raw in f:
            parts = raw.strip().split()
            word_to_vec_map[parts[0]] = np.array(parts[1:], dtype=np.float64)

    # Indices start at 1 so that 0 remains free for padding/unknown tokens.
    # The dict's keys already form the de-duplicated vocabulary, so no
    # separate set is needed.
    words_to_index = {}
    index_to_words = {}
    for idx, word in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[word] = idx
        index_to_words[idx] = word

    return words_to_index, index_to_words, word_to_vec_map