import os
import pickle
import re


class Vocabulary(object):
    def __init__(self):
        # id 0 is reserved for the unknown-word token '<un>'
        self.number_of_words = 1
        self.word_to_id = dict()
        self.id_to_word = dict()
        self.word_counter = dict()
        self.word_to_id['<un>'] = 0
        self.id_to_word[0] = '<un>'
        self.word_counter['<un>'] = 0

    def add_word(self, word):
        """
        Adds a word to the vocabulary, or increments its count if it is already present
        :param word: the word to add
        """
        if word not in self.word_to_id:
            self.word_to_id[word] = self.number_of_words
            self.id_to_word[self.number_of_words] = word
            self.word_counter[word] = 1
            self.number_of_words += 1
        else:
            self.word_counter[word] += 1

    def get_id_by_word(self, word):
        """
        Returns the id for an input word
        :param word: the word whose id is requested
        :return: the requested id
        """
        return self.word_to_id[word]

    def get_word_by_id(self, idx):
        """
        Returns the word for an input id
        :param idx: the id of the requested word
        :return: the requested word
        """
        return self.id_to_word[idx]

    def get_word_frequency(self, word):
        """
        Returns the frequency of a given word
        :param word: input word
        :return: frequency
        """
        return self.word_counter[word]

    def get_most_frequent_words(self, word_num, pct=False):
        """
        Returns the word_num most frequent words
        :param word_num: the number of words to return
        :param pct: if True, return frequencies as percentages of the total word count
        :return: list of size word_num
        """
        # Work on a copy so that popping the markers does not mutate the vocabulary itself
        tmp_dict = dict(self.word_counter)
        if '<sos>' in tmp_dict and '<eos>' in tmp_dict:
            tmp_dict.pop('<sos>')
            tmp_dict.pop('<eos>')
        freq_list = sorted(tmp_dict.items(), key=lambda x: x[1], reverse=True)
        if pct:
            sum_words = sum(count for _, count in freq_list)
            freq_list_pct = []
            for word, count in freq_list:
                freq_list_pct.append('{} - {}%'.format(word, round(count / sum_words * 100, 2)))
            return freq_list_pct[:word_num]
        else:
            return freq_list[:word_num]

    def save_vocabulary(self, filename_word_to_id='word_to_id.pickle',
                        filename_id_to_word='id_to_word.pickle',
                        filename_counter='word_counter.pickle'):
        """
        Saves the vocabulary dictionaries to pickle files in ./vocabulary
        :param filename_word_to_id: the filename for the word_to_id dictionary
        :param filename_id_to_word: the filename for the id_to_word dictionary
        :param filename_counter: the filename for the word_counter dictionary
        """
        if not os.path.exists('./vocabulary'):
            os.mkdir('./vocabulary')
        path_word_to_id = os.path.join('./vocabulary/', filename_word_to_id)
        path_id_to_word = os.path.join('./vocabulary/', filename_id_to_word)
        path_word_counter = os.path.join('./vocabulary/', filename_counter)
        with open(path_word_to_id, 'wb') as writer:
            pickle.dump(self.word_to_id, writer)
        with open(path_id_to_word, 'wb') as writer:
            pickle.dump(self.id_to_word, writer)
        with open(path_word_counter, 'wb') as writer:
            pickle.dump(self.word_counter, writer)

    def load_vocabulary(self, path_word_to_id, path_id_to_word, path_word_counter):
        """
        Loads the vocabulary dictionaries from pickle files
        :param path_word_to_id: the path to the file with the word_to_id dictionary
        :param path_id_to_word: the path to the file with the id_to_word dictionary
        :param path_word_counter: the path to the file with the word_counter dictionary
        """
        with open(path_word_to_id, 'rb') as reader:
            self.word_to_id = pickle.load(reader)
        with open(path_id_to_word, 'rb') as reader:
            self.id_to_word = pickle.load(reader)
        with open(path_word_counter, 'rb') as reader:
            self.word_counter = pickle.load(reader)
        self.number_of_words = len(self.word_to_id)


def preprocess_captions(all_captions):
    """
    Replaces all non-word characters with whitespace and lowercases the captions, in place
    :param all_captions: list of per-image caption lists
    """
    for captions_list in all_captions:
        for i, caption in enumerate(captions_list):
            captions_list[i] = re.sub(r'\W+', ' ', caption.lower())


def add_start_and_end_to_captions(all_captions, start_str='<SOS>', end_str='<EOS>'):
    """
    Adds start-of-caption and end-of-caption markers in place
    :param all_captions: list of per-image caption lists
    :param start_str: start-of-caption marker
    :param end_str: end-of-caption marker
    """
    for captions in all_captions:
        for i in range(len(captions)):
            captions[i] = '{} {} {}'.format(start_str, captions[i], end_str)
            # Collapse double spaces left by preprocessing and lowercase the markers
            captions[i] = captions[i].replace('  ', ' ').lower()


def tokenise_captions(list_captions, vocabulary):
    """
    Transforms the captions for all images in a dataset into lists of word ids
    :param list_captions: list of per-image caption lists
    :param vocabulary: Vocabulary instance for the dataset
    :return: list of tokenised captions (lists of word ids)
    """
    captions_tokens = []
    for captions in list_captions:
        tmp_captions_for_img = []
        for caption in captions:
            caption_words = caption.split()
            tmp = []
            for word in caption_words:
                if word in vocabulary.word_to_id:
                    tmp.append(vocabulary.get_id_by_word(word))
                else:
                    # Unknown words map to the '<un>' token (id 0)
                    tmp.append(0)
            tmp_captions_for_img.append(tmp)
        captions_tokens.append(tmp_captions_for_img)
    return captions_tokens
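

# A minimal usage sketch with hypothetical toy data, not part of the original module.
# It assumes captions arrive as a list of per-image caption lists, as the function
# signatures above suggest, and walks the preprocess -> mark -> build -> tokenise flow.
if __name__ == '__main__':
    all_captions = [
        ['A dog runs in the park.', 'The dog is running!'],
        ['A cat sleeps on the sofa.'],
    ]

    preprocess_captions(all_captions)
    add_start_and_end_to_captions(all_captions)

    # Build the vocabulary from every word in the preprocessed captions
    vocabulary = Vocabulary()
    for captions in all_captions:
        for caption in captions:
            for word in caption.split():
                vocabulary.add_word(word)

    tokens = tokenise_captions(all_captions, vocabulary)
    print(tokens)
    print(vocabulary.get_most_frequent_words(5, pct=True))

    # Persisting and restoring the vocabulary (writes into ./vocabulary/):
    # vocabulary.save_vocabulary()
    # vocabulary.load_vocabulary('./vocabulary/word_to_id.pickle',
    #                            './vocabulary/id_to_word.pickle',
    #                            './vocabulary/word_counter.pickle')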