-
Notifications
You must be signed in to change notification settings - Fork 1
/
generator.py
105 lines (82 loc) · 3.24 KB
/
generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import csv
import random
from converter import Converter
class Generator():
    """Build random sentences per language and export their feature vectors.

    Word lists are read from ``data/word_lists/<langname>.txt`` (relative to
    the current working directory) when the instance is created.

    Example usage:
        l = ['english', 'spanish']
        g = Generator(l)
        g.convert_sentence(1000, 5)
        g.write_file('csv/inputs_5.csv')

    Args:
        langnames (list(str)) : a list of language names

    Attributes:
        langnames (list(str)) : a list of language names
        no_langs (int) : number of languages
        no_features (int) : number of features (fixed at 22; not read by
            any method of this class)
        features (dict) : maps ``'<langname><sentence index>'`` to a tuple
            holding the converter result followed by the language name
        word_list{n} (list(str)) : the words of the n-th language, where n
            is the position of the language in ``langnames``
    """

    def __init__(self, langnames):
        self.langnames = langnames
        self.no_langs = len(langnames)
        self.no_features = 22
        self.features = {}
        # One dynamically named attribute per language: word_list0, ...
        for i in range(self.no_langs):
            setattr(self, f'word_list{i}', [])
        self._read_files()

    def convert_sentence(self, list_size, size):
        """Generate random sentences and store their feature tuples.

        For every language, ``list_size`` sentences of ``size`` distinct
        words are drawn at random from that language's word list. Each
        sentence is passed to ``Converter`` and the converter's ``result``
        plus the language name is stored in ``self.features`` under the key
        ``'<langname><sentence index>'``. Calling this method again
        overwrites entries that have the same key.

        Args:
            list_size (int) : the number of sentences generated per language
            size (int) : the number of distinct words in each sentence

        Raises:
            ValueError: if sentences are requested (``list_size > 0``) and
                ``size`` is negative or exceeds the number of distinct
                words available for some language. Without this check the
                sampling loop below could never terminate.
        """
        word_lists = [getattr(self, f'word_list{i}')
                      for i in range(self.no_langs)]

        # Validate every language up front so a failure leaves
        # self.features untouched.
        if list_size > 0:
            for langname, word_list in zip(self.langnames, word_lists):
                distinct = len(set(word_list))
                if size < 0 or size > distinct:
                    raise ValueError(
                        f'cannot build a sentence of {size} distinct words '
                        f'for {langname!r}: only {distinct} distinct words '
                        f'available')

        for langname, word_list in zip(self.langnames, word_lists):
            max_int = len(word_list)
            for j in range(list_size):
                sentence = []
                # Rejection sampling: redraw until `size` distinct words
                # have been collected.
                while len(sentence) != size:
                    word = word_list[random.randrange(max_int)]
                    if word not in sentence:
                        sentence.append(word)
                converter = Converter(' '.join(sentence))
                # Build the record without mutating converter.result.
                self.features[f'{langname}{j}'] = \
                    (*converter.result, langname)

    def write_file(self, filename):
        """Writes features to a csv file.

        One row is written per entry of ``self.features``, in dictionary
        order. The file is written with a space delimiter and ``|`` as the
        quote character.

        Args:
            filename (str) : the name of the file to be written to
        """
        with open(filename, 'w', newline='') as csvfile:
            langwriter = csv.writer(csvfile, delimiter=' ',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for value in self.features.values():
                row = [str(val) for val in value]
                # NOTE(review): the row is wrapped in another list, so each
                # line holds a single quoted field containing the list's
                # repr rather than one field per value. This looks
                # unintended, but the format is kept as-is because readers
                # of these files may rely on it - confirm before changing
                # to writerow(row).
                langwriter.writerow([row])

    def _read_files(self):
        """Reads word list files to store in self.word_list{n}.

        Each line of ``data/word_lists/<langname>.txt`` becomes one entry
        with its newline removed; blank lines are kept as empty strings.
        """
        # enumerate() gives each language its own slot even when a name is
        # repeated; list.index() would always resolve to the first one.
        for index, langname in enumerate(self.langnames):
            filename = f'data/word_lists/{langname}.txt'
            lang_list = getattr(self, f'word_list{index}')
            # NOTE(review): no explicit encoding is given, so the platform
            # default is used - confirm the word lists decode correctly on
            # every target platform.
            with open(filename) as f:
                lang_list.extend(line.replace('\n', '') for line in f)
# l = ['english', 'spanish', 'mandarin', 'japanese', 'arabic', 'turkish']
# g = Generator(l)
# g.convert_sentence(1000, 5)
# g.write_file('csv/inputs_5.csv')