-
Notifications
You must be signed in to change notification settings - Fork 3
/
tinto.py
424 lines (350 loc) · 17.6 KB
/
tinto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
import numpy as np
import pandas as pd
import os
import gc
# Dimensional reduction classes
from sklearn.manifold import TSNE
#from tsnecuda import TSNE
from sklearn.decomposition import PCA
#Sklearn
from sklearn.preprocessing import MinMaxScaler
# Graphic library
import matplotlib
import matplotlib.image
# Additional libraries
import math
import pickle
# Arguments Library
import argparse
##################
#Params
# Command-line interface. The options are parsed once, at module level, into
# the global ``args`` namespace that the execution section of this script reads.
parser = argparse.ArgumentParser(description="This program transform tidy data "+
"into image by dimensionality "+
"reduction algorithms (PCA o t-SNE)",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Dimensionality-reduction algorithm and side length (in pixels) of the square image.
parser.add_argument("-alg", "--algorithm", dest="algorithm", default="PCA", choices=['PCA','t-SNE'], help="dimensionality reduction algorithm (PCA o t-SNE)")
parser.add_argument("-px", "--pixels", dest="pixels", default=20, help="Image's Pixels (one side)", type=int)
# Blurring options. -B is a flag (store_true); the remaining blurring values
# are overridden by the execution section when -B is not given.
parser.add_argument("-B", "--blurr", dest="blurr_active", action='store_true', help="Active option blurring")
parser.add_argument("-aB", "--amplification_blurr", dest="amplification", default=np.pi, help="Amplification in blurring", type=float)
parser.add_argument("-dB", "--distance_blurr", dest="distance", default=2, help="Distance in blurring (number of pixels)", type=int)
parser.add_argument("-sB", "--steps_blurr", dest="steps", default=4, help="Steps in blurring", type=int)
parser.add_argument("-oB", "--option_blurr", dest="option", default='mean', choices=['mean','maximum'], help="Option in blurring (mean and maximum)")
# Optional paths used to pickle the fitted model or to reload a pickled one.
parser.add_argument("-sC", "--save", dest="save_configuration", help="Save configurations (to reuse)")
parser.add_argument("-lC", "--load", dest="load_configuration", help="Load configurations (.pkl)")
# Random seed, and the replication count that is only used by t-SNE.
parser.add_argument("-sd", "--seed", dest="seed", default=20, help="seed", type=int)
parser.add_argument("-tt", "--times_tsne", dest="times_tsne", default=4, help="Times replication in t-SNE", type=int)
# Required positional arguments: input CSV and output folder.
parser.add_argument("src_data", help="Source location (tidy data in csv without head)")
parser.add_argument("dest_folder", help="Destination location (folder)")
parser.add_argument("--show_positions", dest="show_positions", action='store_true', help="Show positions of the features in the image") # Added
parser.add_argument("-v", "--verbose", dest="verbose", action='store_true', help="Verbose: if it's true, show the compilation text")
# NOTE: parsing happens at import time, so importing this module also reads sys.argv.
args = parser.parse_args()
###########################################################
################ TINTO MAIN FUNCTIONS ####################
###########################################################
def square(coord):
    """
    Build the square frame that encloses the embedded feature points.

    Steps, in order:
        - Compute the mean point and centre the data on (0, 0).
        - Measure the distance from the centre to every point and round the
          largest one up to an integer; this is the half-side of the square.
        - Build the four vertices of the square around the origin.
        - Shift points and vertices together so that the first vertex
          (-half, +half) becomes (0, 0).

    Parameters
    ----------
    coord : array with one (x, y) row per point.

    Returns
    -------
    (coord_new, vertices) : the shifted points and the shifted vertices.
    """
    centroid = np.mean(coord, axis=0).reshape((1, 2))
    centred = coord - centroid

    # Euclidean distance of every centred point to the origin.
    radii = (centred[:, 0] ** 2 + centred[:, 1] ** 2) ** 0.5
    half_side = math.ceil(max(radii))

    corners = np.array([
        [-half_side, half_side],
        [-half_side, -half_side],
        [half_side, -half_side],
        [half_side, half_side],
    ])

    # The first corner is the reference that gets moved to (0, 0).
    anchor = corners[0]
    return centred - anchor, corners - anchor
def m_imagen(coord, vertices, filename, pixeles=24):
    """
    Map the feature coordinates onto the cells of a ``pixeles`` x ``pixeles`` grid.

    The coordinates are divided by ``vertices[2, 0]`` (the side of the square),
    scaled to the range of valid indices, made non-negative and rounded.
    Every feature then marks its cell in a zero matrix with a 1.

    Parameters
    ----------
    coord : array with one (x, y) row per feature.
    vertices : vertices of the enclosing square; only ``vertices[2, 0]`` is read.
    filename : not used by this function.
    pixeles : number of pixels per side of the grid.

    Returns
    -------
    (coord_m, matrix, overlap) : the rounded grid coordinates, the grid with
    the marked cells, and ``True`` when the number of marked cells differs
    from the number of features (several features share a cell).
    """
    grid = np.zeros((pixeles, pixeles))

    side = vertices[2, 0]
    coord_m = np.round(np.abs((coord / side) * (pixeles - 1)))

    # x (column 0) selects the grid column, y (column 1) selects the grid row.
    for x_idx, y_idx in coord_m[:, :2]:
        grid[int(y_idx), int(x_idx)] = 1

    overlap = np.count_nonzero(grid != 0) != coord.shape[0]
    return coord_m, grid, bool(overlap)
def createFilter(distance=2, steps=3, amplification=np.pi):
    """
    Build the square blurring kernel that is stamped around every feature pixel.

    The kernel has side ``2*distance*steps + 1`` and is made of ``steps``
    concentric discs of radius ``distance*(step+1)``. The discs are painted
    from the largest to the smallest so that inner discs overwrite outer
    ones; each disc has intensity ``min(amplification / (pi * r**2), 1)``.
    The centre cell is finally set to 1.

    Parameters
    ----------
    distance : radius increment, in pixels, between consecutive discs.
    steps : number of discs.
    amplification : factor applied to the ``1 / (pi * r**2)`` intensity.

    Returns
    -------
    A 2-D array holding the kernel. It is meant to be multiplied by a scalar
    feature value and placed with its centre on the feature pixel.
    """
    size_filter = int(2 * distance * steps + 1)
    center_x = distance * steps
    center_y = distance * steps
    kernel = np.zeros([size_filter, size_filter])

    for step in reversed(range(steps)):
        r_actual = int(distance * (step + 1))  # current radius, largest first

        # Intensity of this disc, capped at 1.
        intensity = min(amplification * 1 / (np.pi * r_actual ** 2), 1)

        # Bounding box of the disc, clipped to the kernel.
        lim_inf_i = max(center_x - r_actual - 1, 0)
        lim_sup_i = min(center_x + r_actual + 1, size_filter)
        lim_inf_j = max(center_y - r_actual - 1, 0)
        lim_sup_j = min(center_y + r_actual + 1, size_filter)

        # Paint every cell of the bounding box that lies inside the disc.
        for i in range(lim_inf_i, lim_sup_i):
            for j in range(lim_inf_j, lim_sup_j):
                if (center_x - i) ** 2 + (center_y - j) ** 2 <= r_actual ** 2:
                    kernel[i, j] = intensity

    # The characteristic pixel itself always keeps the full value.
    kernel[center_x, center_y] = 1
    return kernel
def blurringFilter(matrix, filter, values, coordinates, option):
    """
    Paint every feature onto the image through the blurring kernel.

    For each coordinate pair the kernel ``filter`` is multiplied by the
    corresponding entry of ``values`` and stamped on a padded canvas so that
    the centre of the kernel falls on the feature pixel. Where stamps overlap
    the canvas keeps either the average (``option == 'mean'``) or the largest
    value (``option == 'maximum'``). The padding is cropped before returning.

    Parameters
    ----------
    matrix : only its first dimension is read, as the side of the image.
    filter : square blurring kernel.
    values : one intensity per coordinate pair, consumed in order.
    coordinates : iterable of (row, column) pairs; they are truncated to int.
    option : 'mean' or 'maximum'; any other value leaves the canvas at zero.

    Returns
    -------
    The blurred image, cropped out of the padded canvas.
    """
    value_iter = iter(values)
    image_side = matrix.shape[0]
    kernel_side = filter.shape[0]
    canvas_shape = [kernel_side + image_side, kernel_side + image_side]
    canvas = np.zeros(canvas_shape)
    hits = np.zeros(canvas_shape)  # how many stamps touched each cell ('mean' only)

    half = int((kernel_side - 1) / 2)
    span = 2 * half + 1
    averaging = option == 'mean'
    taking_max = option == 'maximum'

    for row, col in coordinates:
        row = int(row)
        col = int(col)
        stamp = filter * next(value_iter)

        # Window of the padded canvas covered by this stamp.
        window = (slice(row, row + span), slice(col, col + span))

        if averaging:
            canvas[window] += stamp
            hits[window] += (stamp > 0) * 1
        elif taking_max:
            canvas[window] = np.maximum(canvas[window], stamp)

    if averaging:
        # Untouched cells get a divisor of 1 so the division below is defined.
        hits[hits == 0] = 1
        canvas = canvas / hits

    return canvas[half:-half - 1, half:-half - 1]
def imageSampleFilter(X, Y, coord, matrix, folder, amplification, distance=2, steps=3, option='maximum', train_m=False):
    """
    Create one image per sample and save it under ``folder``.

    Specifications:
        - When ``distance * steps * amplification`` is non-zero a blurring
          kernel is built once and every sample is painted through it;
          otherwise each feature value is written directly to its pixel.
        - Images are grouped by target: sample ``i`` is saved as
          ``<folder>/<label>/<i>.png``, where the label is ``int(Y[i])``
          zero-padded to 2 digits and ``i`` is zero-padded to 6 digits.
        - Images are exported as PNG with the 'binary' colour map.

    Parameters
    ----------
    X : 2-D array, one row of feature values per sample.
    Y : sequence with the target of each sample (must be convertible to int).
    coord : (row, column) pixel position of every feature.
    matrix : only its shape is read, as the shape of the images.
    folder : destination folder.
    amplification, distance, steps, option : blurring settings.
    train_m : not used by this function.

    Returns
    -------
    ``matrix``, unchanged.

    Raises
    ------
    OSError if a class subfolder cannot be created.
    """
    extension = 'png'  # eps or pdf

    # Blurring is disabled as soon as any of the three settings is zero.
    blurring = distance * steps * amplification != 0
    if blurring:
        blur_kernel = createFilter(distance, steps, amplification)

    # Generate the image of every sample.
    for i in range(X.shape[0]):
        matrix_a = np.zeros(matrix.shape)
        if blurring:
            matrix_a = blurringFilter(matrix_a, blur_kernel, X[i], coord, option)
        else:
            iter_values_X = iter(X[i])
            for eje_x, eje_y in coord:
                matrix_a[int(eje_x), int(eje_y)] = next(iter_values_X)

        subfolder = str(int(Y[i])).zfill(2)  # one subfolder per class
        name_image = str(i).zfill(6)
        route = os.path.join(folder, subfolder)
        route_complete = os.path.join(route, name_image + '.' + extension)

        # exist_ok avoids the check-then-create race; a real failure is raised
        # here instead of being printed and then failing inside imsave.
        os.makedirs(route, exist_ok=True)
        matplotlib.image.imsave(route_complete, matrix_a, cmap='binary', format=extension)

    return matrix
def saveVariable(X, filename='objs.pkl',verbose=False):
    """
    Pickle ``X`` into ``filename``.

    Used to store the fitted transformation so that an experiment can be
    reproduced or the same transformation reused on unlabelled data.

    Parameters
    ----------
    X : object to serialise.
    filename : path of the pickle file to write (overwritten if present).
    verbose : when true, print a confirmation message.
    """
    with open(filename, 'wb') as handle:
        pickle.Pickler(handle).dump(X)

    if not verbose:
        return
    print("It has been successfully saved in "+filename)
def loadVariable(filename='objs.pkl',verbose=False):
    """
    Read back an object that was pickled into ``filename``.

    Counterpart of the saving helper: it restores a stored transformation so
    that an experiment can be reproduced or reused on unlabelled data.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.

    Parameters
    ----------
    filename : path of the pickle file to read.
    verbose : when true, print a confirmation message.

    Returns
    -------
    The unpickled object.
    """
    with open(filename, 'rb') as handle:
        restored = pickle.Unpickler(handle).load()

    if verbose:
        print("It has been successfully loaded in "+filename)
    return restored
###########################################################
################ TINTO MAIN CLASS ####################
###########################################################
class DataImg:
    """
    Transformation of tidy data into images.

    The class groups the steps of the process: embedding the features in two
    dimensions, delimiting the square area, mapping the features to pixel
    positions and writing one image per sample.
    """

    def __init__(self, algorithm='PCA', pixeles=20, seed=20, times=4, amp=np.pi, distance=2, steps=4, option='maximum'):
        """
        Store the configuration of the transformation.

        Parameters
        ----------
        algorithm : 'PCA' or 't-SNE', the dimensionality-reduction algorithm.
        pixeles : number of pixels per side of the image.
        seed : random state passed to the reduction algorithm.
        times : number of times the data are doubled before t-SNE.
        amp, distance, steps, option : blurring settings.
        """
        self.algorithm = algorithm  # dimensionality-reduction algorithm
        self.p = pixeles
        self.seed = seed
        self.times = times  # only for t-SNE
        self.amp = amp  # amplification (blurring)
        self.distance = distance  # distance (blurring)
        self.steps = steps  # steps (blurring)
        self.option = option  # overlapping option (blurring)
        self.error_pos = False  # True when characteristic pixels overlap

    def obtainCoord(self, X, verbose=False):
        """
        Compute a 2-D coordinate for every feature (column) of ``X``.

        Specifications:
            - Fit a MinMaxScaler on ``X`` (kept in ``self.min_max_scaler``)
              and scale the data with it.
            - Transpose the matrix so that every feature becomes a sample.
            - Embed the features in two dimensions with PCA or t-SNE.
            - Average the embedded points per feature label and store the
              result in ``self.obtain_coord``.
        """
        self.min_max_scaler = MinMaxScaler()
        X = self.min_max_scaler.fit_transform(X)
        labels = np.arange(X.shape[1])
        X_trans = X.T

        if verbose:
            print("Selected algorithm: "+self.algorithm)

        if self.algorithm == 'PCA':
            X_embedded = PCA(n_components=2, random_state=self.seed).fit(X_trans).transform(X_trans)
        elif self.algorithm == 't-SNE':
            # Each pass doubles the data, so the features end up replicated
            # 2**times times; the copies are averaged by the groupby below.
            for i in range(self.times):
                X_trans = np.append(X_trans, X_trans, axis=0)
                labels = np.append(labels, labels, axis=0)
            X_embedded = TSNE(n_components=2, random_state=self.seed, perplexity=50).fit_transform(X_trans)
        else:
            # Fallback: random (unseeded) coordinates, one pair per feature.
            print("Error: Incorrect algorithm")
            X_embedded = np.random.rand(X.shape[1], 2)

        data_coord = {'x': X_embedded[:, 0], 'y': X_embedded[:, 1], 'Label': labels}
        dc = pd.DataFrame(data=data_coord)
        self.obtain_coord = dc.groupby('Label').mean().values

        del X_trans
        gc.collect()

    def areaDelimitation(self):
        """
        Delimit the square area around the coordinates in ``self.obtain_coord``.

        Stores the shifted coordinates in ``self.initial_coordinates`` and
        the vertices of the square in ``self.vertices``.
        """
        self.initial_coordinates, self.vertices = square(self.obtain_coord)

    def matrixPositions(self, filename='original'):
        """
        Compute the pixel position of every feature.

        Stores the positions in ``self.pos_pixel_caract``, the position
        matrix in ``self.m`` and the overlap flag in ``self.error_pos``.
        """
        self.pos_pixel_caract, self.m, self.error_pos = m_imagen(
            self.initial_coordinates, self.vertices, filename, pixeles=self.p)

    def CrearImg(self, X, Y, folder = 'prueba/', train_m=False, verbose=False):
        """
        Create the images that will be processed by the CNN.

        ``X`` is scaled with the scaler fitted in ``obtainCoord`` and one
        image per sample is written under ``folder``.
        """
        X_scaled = self.min_max_scaler.transform(X)
        Y = np.array(Y)

        # Creating the destination folder is best effort: the per-class
        # subfolders are created later with os.makedirs, which also creates
        # any missing parent. Only the real "already exists" case is
        # reported as such.
        try:
            os.mkdir(folder)
        except FileExistsError:
            if verbose:
                print("The folder "+folder+" is already created...")
        except OSError as err:
            if verbose:
                print("The folder "+folder+" could not be created ("+str(err)+")...")
        else:
            if verbose:
                print("The folder was created "+folder+"...")

        self.m = imageSampleFilter(X_scaled, Y, self.pos_pixel_caract, self.m, folder, self.amp,
                                   distance=self.distance, steps=self.steps, option=self.option, train_m=train_m)

    def trainingAlg(self, X, Y, folder = 'img_train/', verbose=False):
        """
        Fit the transformation on ``X`` and create the training images.
        """
        self.obtainCoord(X, verbose=verbose)
        self.areaDelimitation()
        self.matrixPositions()
        self.CrearImg(X, Y, folder, train_m=True, verbose=verbose)

    def testAlg(self, X, Y=None, folder = 'img_test/', verbose=False):
        """
        Create images for new data with the already fitted transformation.

        When ``Y`` is not given every sample is assigned the target 0.
        """
        if Y is None:
            Y = np.zeros(X.shape[0])
        self.CrearImg(X, Y, folder, train_m=False, verbose=verbose)

    def getPositionsPixels(self,column_names=None): # Added
        """
        Return the pixel position of every feature as a dictionary.

        The keys are taken from ``column_names`` (indexed by feature
        position) or, when it is ``None``, generated as "column1",
        "column2", ...
        """
        if column_names is None:
            base_columns_name = "column"
            return {base_columns_name + str(i + 1): coord
                    for i, coord in enumerate(self.pos_pixel_caract)}
        return {column_names[i]: coord
                for i, coord in enumerate(self.pos_pixel_caract)}
###########################################################
################ TINTO EXECUTION ####################
###########################################################
# Blurring verification
if not args.blurr_active:
    # A zero amplification and zero steps make the product
    # distance * steps * amplification equal to 0, which imageSampleFilter
    # treats as "no blurring".
    args.amplification = 0
    args.distance = 2
    args.steps = 0

# Read the CSV
# NOTE(review): read_csv takes the first row as the header by default, while
# the CLI help describes the input as "csv without head" -- confirm which of
# the two is intended.
dataset = pd.read_csv(args.src_data)
array = dataset.values

if args.load_configuration:
    # Reuse a previously saved model: every column is treated as a feature.
    X = array
    modeloIMG = loadVariable(filename=args.load_configuration, verbose=args.verbose)
    modeloIMG.testAlg(X, folder=args.dest_folder, verbose=args.verbose)
else:
    # The last column is the target, the remaining ones are the features.
    X = array[:, :-1]
    Y = array[:, -1]

    # Create the object
    modeloIMG = DataImg(algorithm=args.algorithm,
                        pixeles=args.pixels,
                        amp=args.amplification,
                        distance=args.distance,
                        steps=args.steps,
                        option=args.option,
                        seed=args.seed,
                        times=args.times_tsne
                        )

    # Training
    modeloIMG.trainingAlg(X, Y, folder=args.dest_folder, verbose=args.verbose)

    # Saves the configuration for later use
    if args.save_configuration:
        saveVariable(modeloIMG, filename=args.save_configuration, verbose=args.verbose)

# Show positions
if args.show_positions:  # Added
    print("\nPositions of the features:")
    # A DataFrame always has a ``columns`` index (never None), so the column
    # names are always passed on.
    print(modeloIMG.getPositionsPixels(dataset.columns))