-
Notifications
You must be signed in to change notification settings - Fork 46
/
utils.py
543 lines (453 loc) · 20.8 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
import os
import cv2
import copy
import seaborn as sns

# "%matplotlib inline" is an IPython magic, not valid Python syntax, so it
# made this .py module unimportable.  Keep the behavior for notebook users
# while letting plain-Python imports succeed:
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # not running under IPython/Jupyter
def parse_annotation(ann_dir, img_dir, labels=None):
    '''
    Parse PASCAL-VOC style XML annotation files.

    -- Input --
    ann_dir : directory containing the .xml annotation files
    img_dir : directory containing the image files named by the annotations
    labels  : optional list of class names to keep; objects whose name is not
              in `labels` are skipped.  Default None (== []) keeps every class.

    -- Output --
    all_imgs    : list with one dictionary per annotated image, holding
                  'filename', 'width', 'height' and the list 'object' of
                  {'name', 'xmin', 'ymin', 'xmax', 'ymax'} dictionaries
    seen_labels : dictionary mapping
                  (key, value) = (object class, number of objects found)

    Raises FileNotFoundError if an annotated image file does not exist.
    '''
    # avoid the mutable-default-argument pitfall of the original `labels=[]`
    if labels is None:
        labels = []
    all_imgs = []
    seen_labels = {}
    for ann in sorted(os.listdir(ann_dir)):
        # accept only real .xml files (the original matched any filename that
        # merely *contained* "xml")
        if not ann.endswith(".xml"):
            continue
        img = {'object': []}
        tree = ET.parse(os.path.join(ann_dir, ann))
        for elem in tree.iter():
            if 'filename' in elem.tag:
                path_to_image = os.path.join(img_dir, elem.text)
                img['filename'] = path_to_image
                # make sure that the image exists
                if not os.path.exists(path_to_image):
                    # raise instead of `assert False`: asserts disappear
                    # under `python -O`
                    raise FileNotFoundError(
                        "file does not exist!\n{}".format(path_to_image))
            if 'width' in elem.tag:
                img['width'] = int(elem.text)
            if 'height' in elem.tag:
                img['height'] = int(elem.text)
            if 'object' in elem.tag or 'part' in elem.tag:
                obj = {}
                for attr in list(elem):
                    if 'name' in attr.tag:
                        obj['name'] = attr.text
                        # skip (and stop parsing) objects of filtered classes
                        if len(labels) > 0 and obj['name'] not in labels:
                            break
                        img['object'] += [obj]
                        seen_labels[obj['name']] = seen_labels.get(obj['name'], 0) + 1
                    if 'bndbox' in attr.tag:
                        for dim in list(attr):
                            if 'xmin' in dim.tag:
                                obj['xmin'] = int(round(float(dim.text)))
                            if 'ymin' in dim.tag:
                                obj['ymin'] = int(round(float(dim.text)))
                            if 'xmax' in dim.tag:
                                obj['xmax'] = int(round(float(dim.text)))
                            if 'ymax' in dim.tag:
                                obj['ymax'] = int(round(float(dim.text)))
        # keep only images that ended up with at least one (kept) object
        if len(img['object']) > 0:
            all_imgs += [img]
    return all_imgs, seen_labels
def iou(box, clusters):
    '''
    IOU between one (w, h) box and every cluster, all anchored at the origin.

    :param box: np.array of shape (2,) containing w and h
    :param clusters: np.array of shape (N cluster, 2)
    :return: np.array of shape (N cluster,) with the IOU values
    '''
    overlap_w = np.minimum(clusters[:, 0], box[0])
    overlap_h = np.minimum(clusters[:, 1], box[1])
    intersection = overlap_w * overlap_h
    union = box[0] * box[1] + clusters[:, 0] * clusters[:, 1] - intersection
    return intersection / union
def kmeans(boxes, k, dist=np.median, seed=1):
    """
    Calculates k-means clustering with the Intersection over Union (IoU) metric.

    :param boxes: numpy array of shape (r, 2), where r is the number of rows
    :param k: number of clusters
    :param dist: statistic used as the cluster center (default np.median)
    :param seed: RNG seed for the initial center choice
    :return: (clusters of shape (k, 2), per-box nearest cluster index,
              (r, k) matrix of 1 - IOU distances)

    NOTE(review): if a cluster ever loses all of its members, np.median over
    an empty slice yields NaN -- confirm inputs make this unlikely.
    """
    n_boxes = boxes.shape[0]
    distances = np.empty((n_boxes, k))  # one 1-IOU distance per (box, cluster)
    previous_assignment = np.zeros((n_boxes,))
    np.random.seed(seed)
    # start from k distinct boxes picked at random
    clusters = boxes[np.random.choice(n_boxes, k, replace=False)]
    while True:
        # Step 1: assign every box to its nearest cluster center
        for ic in range(k):
            distances[:, ic] = 1 - iou(clusters[ic], boxes)
        nearest_clusters = distances.argmin(axis=1)
        # converged once no assignment changed
        if np.array_equal(previous_assignment, nearest_clusters):
            break
        # Step 2: recompute each center from its members
        for ic in range(k):
            clusters[ic] = dist(boxes[nearest_clusters == ic], axis=0)
        previous_assignment = nearest_clusters
    return clusters, nearest_clusters, distances
class BestAnchorBoxFinder(object):
    """Pick, for a given (w, h) box, the anchor box with the highest IOU."""

    def __init__(self, ANCHORS):
        '''
        ANCHORS: a np.array of even number length e.g.
        _ANCHORS = [4,2, ## width=4, height=2, flat large anchor box
                    2,4, ## width=2, height=4, tall large anchor box
                    1,1] ## width=1, height=1, small anchor box
        '''
        n_anchors = len(ANCHORS) // 2
        self.anchors = [BoundBox(0, 0, ANCHORS[2 * i], ANCHORS[2 * i + 1])
                        for i in range(n_anchors)]

    def _interval_overlap(self, interval_a, interval_b):
        """Length of the overlap between two 1-D intervals (0 when disjoint)."""
        x1, x2 = interval_a
        x3, x4 = interval_b
        if x3 < x1:
            return 0 if x4 < x1 else min(x2, x4) - x1
        return 0 if x2 < x3 else min(x2, x4) - x3

    def bbox_iou(self, box1, box2):
        """Intersection-over-union of two boxes with xmin/xmax/ymin/ymax."""
        w_overlap = self._interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
        h_overlap = self._interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])
        intersect = w_overlap * h_overlap
        area1 = (box1.xmax - box1.xmin) * (box1.ymax - box1.ymin)
        area2 = (box2.xmax - box2.xmin) * (box2.ymax - box2.ymin)
        return float(intersect) / (area1 + area2 - intersect)

    def find(self, center_w, center_h):
        """Return (index, IOU) of the anchor best matching a (w, h) box.

        Each anchor box is specialized to a certain shape, e.g. a flat large
        rectangle or a small square; the box is compared at the origin.
        """
        candidate = BoundBox(0, 0, center_w, center_h)
        best_anchor, max_iou = -1, -1
        for idx, anchor in enumerate(self.anchors):
            current_iou = self.bbox_iou(candidate, anchor)
            if current_iou > max_iou:
                best_anchor, max_iou = idx, current_iou
        return (best_anchor, max_iou)
class BoundBox:
    """Axis-aligned box, optionally carrying detection confidence and class scores."""

    def __init__(self, xmin, ymin, xmax, ymax, confidence=None, classes=None):
        self.xmin = xmin
        self.ymin = ymin
        self.xmax = xmax
        self.ymax = ymax
        # confidence/classes are only populated for inference-time boxes
        self.confidence = confidence
        # class probabilities [c1, c2, .. cNclass]
        self.set_class(classes)

    def set_class(self, classes):
        """Store the class-probability vector and cache its argmax as the label."""
        self.classes = classes
        self.label = np.argmax(self.classes)

    def get_label(self):
        # index of the most probable class
        return self.label

    def get_score(self):
        # probability of the most probable class
        return self.classes[self.label]
def rescale_centerxy(obj, config):
    '''
    Convert a box center from pixel coordinates to grid-cell units.

    obj: dictionary containing xmin, xmax, ymin, ymax (pixels)
    config : dictionary containing IMAGE_W, GRID_W, IMAGE_H and GRID_H
    Returns (center_x, center_y) measured in grid cells.
    '''
    cell_w = float(config['IMAGE_W']) / config['GRID_W']
    cell_h = float(config['IMAGE_H']) / config['GRID_H']
    center_x = .5 * (obj['xmin'] + obj['xmax']) / cell_w
    center_y = .5 * (obj['ymin'] + obj['ymax']) / cell_h
    return (center_x, center_y)
def rescale_cebterwh(obj, config):
    '''
    Convert a box's width and height from pixels to grid-cell units.
    (The misspelled name "cebterwh" is kept for backward compatibility.)

    obj: dictionary containing xmin, xmax, ymin, ymax (pixels)
    config : dictionary containing IMAGE_W, GRID_W, IMAGE_H and GRID_H
    Returns (width, height) measured in grid cells.
    '''
    cell_w = float(config['IMAGE_W']) / config['GRID_W']
    cell_h = float(config['IMAGE_H']) / config['GRID_H']
    width_cells = (obj['xmax'] - obj['xmin']) / cell_w
    height_cells = (obj['ymax'] - obj['ymin']) / cell_h
    return (width_cells, height_cells)
class OutputRescaler(object):
    """Convert raw YOLO head activations into [0, 1]-scaled box parameters.

    NOTE(review): ``fit`` modifies its ``netout`` argument IN PLACE and also
    returns it -- callers holding the original array see the rescaled values.
    """
    def __init__(self,ANCHORS):
        # ANCHORS: flat sequence [w0, h0, w1, h1, ...] of anchor sizes
        # (widths at even positions, heights at odd positions)
        self.ANCHORS = ANCHORS
    def _sigmoid(self, x):
        # element-wise logistic function
        return 1. / (1. + np.exp(-x))
    def _softmax(self, x, axis=-1, t=-100.):
        """Softmax along ``axis`` with a guard against exp() underflow."""
        x = x - np.max(x)
        if np.min(x) < t:
            # rescale so the most negative entry equals t before exponentiating.
            # NOTE(review): this changes the softmax result rather than being a
            # pure numerical-stability trick -- confirm this is intended.
            x = x/np.min(x)*t
        e_x = np.exp(x)
        return e_x / e_x.sum(axis, keepdims=True)
    def get_shifting_matrix(self,netout):
        """Build offset matrices shaped like netout[..., 0].

        Returns (mat_GRID_W, mat_GRID_H, mat_ANCHOR_W, mat_ANCHOR_H): the
        first two hold every cell's column/row index, the last two hold every
        anchor's width/height, broadcast over the grid.
        """
        GRID_H, GRID_W, BOX = netout.shape[:3]
        no = netout[...,0]
        ANCHORSw = self.ANCHORS[::2]   # anchor widths  (even positions)
        ANCHORSh = self.ANCHORS[1::2]  # anchor heights (odd positions)
        mat_GRID_W = np.zeros_like(no)
        for igrid_w in range(GRID_W):
            mat_GRID_W[:,igrid_w,:] = igrid_w
        mat_GRID_H = np.zeros_like(no)
        for igrid_h in range(GRID_H):
            mat_GRID_H[igrid_h,:,:] = igrid_h
        mat_ANCHOR_W = np.zeros_like(no)
        for ianchor in range(BOX):
            mat_ANCHOR_W[:,:,ianchor] = ANCHORSw[ianchor]
        mat_ANCHOR_H = np.zeros_like(no)
        for ianchor in range(BOX):
            mat_ANCHOR_H[:,:,ianchor] = ANCHORSh[ianchor]
        return(mat_GRID_W,mat_GRID_H,mat_ANCHOR_W,mat_ANCHOR_H)
    def fit(self, netout):
        '''
        netout : np.array of shape (N grid h, N grid w, N anchor, 4 + 1 + N class)
                 a single image output of model.predict()

        Rescales, IN PLACE: x/y to [0, 1] via sigmoid plus the cell offset,
        w/h to [0, 1] via exp() times the anchor size, confidence through a
        sigmoid, and class scores to confidence * softmax(logits).
        Returns the same (mutated) array.
        '''
        GRID_H, GRID_W, BOX = netout.shape[:3]
        (mat_GRID_W,
         mat_GRID_H,
         mat_ANCHOR_W,
         mat_ANCHOR_H) = self.get_shifting_matrix(netout)
        # bounding box parameters
        netout[..., 0] = (self._sigmoid(netout[..., 0]) + mat_GRID_W)/GRID_W # x unit: range between 0 and 1
        netout[..., 1] = (self._sigmoid(netout[..., 1]) + mat_GRID_H)/GRID_H # y unit: range between 0 and 1
        netout[..., 2] = (np.exp(netout[..., 2]) * mat_ANCHOR_W)/GRID_W # width unit: range between 0 and 1
        netout[..., 3] = (np.exp(netout[..., 3]) * mat_ANCHOR_H)/GRID_H # height unit: range between 0 and 1
        # rescale the confidence to range 0 and 1 (must happen BEFORE the
        # expand_dims below, which reads the already-rescaled values)
        netout[..., 4] = self._sigmoid(netout[..., 4])
        expand_conf = np.expand_dims(netout[...,4],-1) # (N grid h , N grid w, N anchor , 1)
        # rescale the class probability to range between 0 and 1
        # Pr(object class = k) = Pr(object exists) * Pr(object class = k |object exists)
        #                      = Conf * P^c
        netout[..., 5:] = expand_conf * self._softmax(netout[..., 5:])
        # ignore the class probability if it is less than obj_threshold
        return(netout)
class BoundBox:
    """Axis-aligned box with optional detection confidence and class scores.

    NOTE(review): this is an exact duplicate of the ``BoundBox`` defined
    earlier in this file (likely notebook-concatenation residue); at import
    time this later definition shadows the earlier one.  Consider removing
    one of the two.
    """
    def __init__(self, xmin, ymin, xmax, ymax, confidence=None,classes=None):
        self.xmin, self.ymin = xmin, ymin
        self.xmax, self.ymax = xmax, ymax
        ## the code below are used during inference
        # detection probability
        self.confidence = confidence
        # class probabilities [c1, c2, .. cNclass]
        self.set_class(classes)
    def set_class(self,classes):
        # store the class vector and cache its argmax as the label
        self.classes = classes
        self.label = np.argmax(self.classes)
    def get_label(self):
        # index of the most probable class
        return(self.label)
    def get_score(self):
        # probability of the most probable class
        return(self.classes[self.label])
def find_high_class_probability_bbox(netout_scale, obj_threshold):
    '''
    == Input ==
    netout_scale : y_pred[i] np.array of shape (GRID_H, GRID_W, BOX, 4 + 1 + N class)
             x, w must be a unit of image width
             y, h must be a unit of image height
             c must be in between 0 and 1
             p^c must be in between 0 and 1
    == Output ==
    boxes : list of BoundBox whose best class score exceeds obj_threshold
            (only cells with some positive class probability are considered)
    '''
    grid_h, grid_w, n_box = netout_scale.shape[:3]
    boxes = []
    # iterate every (row, col, anchor) triple in row-major order
    for row, col, b in np.ndindex(grid_h, grid_w, n_box):
        prediction = netout_scale[row, col, b]
        # from the 5th element onwards are the class probabilities
        classes = prediction[5:]
        if np.sum(classes) <= 0:
            continue
        # first 4 elements are x, y, w, and h; 5th is the confidence
        x, y, w, h = prediction[:4]
        candidate = BoundBox(x - w / 2, y - h / 2,
                             x + w / 2, y + h / 2,
                             prediction[4], classes)
        if candidate.get_score() > obj_threshold:
            boxes.append(candidate)
    return boxes
def draw_boxes(image, boxes, labels, obj_baseline=0.05,verbose=False):
    '''
    Draw every BoundBox in `boxes` onto a copy of `image` and return it.

    image : np.array of shape (N height, N width, 3)
    boxes : list of BoundBox with coordinates in [0, 1] image units
    labels : list of class names indexed by box.label
    obj_baseline : score divisor controlling rectangle line thickness
    verbose : when True, print each drawn box's label, score and pixel corners
    '''
    def adjust_minmax(c, _max):
        # clip a pixel coordinate into [0, _max]
        return min(max(c, 0), _max)
    image = copy.deepcopy(image)
    image_h, image_w, _ = image.shape
    score_rescaled = np.array([box.get_score() for box in boxes])
    score_rescaled /= obj_baseline
    # one color per box: the original requested a fixed palette of 8, which
    # silently truncated zip() and dropped every box after the eighth
    colors = sns.color_palette("husl", max(len(boxes), 1))
    for sr, box, color in zip(score_rescaled, boxes, colors):
        xmin = adjust_minmax(int(box.xmin*image_w), image_w)
        ymin = adjust_minmax(int(box.ymin*image_h), image_h)
        xmax = adjust_minmax(int(box.xmax*image_w), image_w)
        ymax = adjust_minmax(int(box.ymax*image_h), image_h)
        text = "{:10} {:4.3f}".format(labels[box.label], box.get_score())
        if verbose:
            # (original passed `text` a second, unused time)
            print("{} xmin={:4.0f},ymin={:4.0f},xmax={:4.0f},ymax={:4.0f}".format(text,xmin,ymin,xmax,ymax))
        cv2.rectangle(image,
                      pt1=(xmin,ymin),
                      pt2=(xmax,ymax),
                      color=color,
                      # cv2 requires an integer thickness >= 1; the original
                      # passed a numpy float, which raises in modern OpenCV
                      thickness=max(int(sr), 1))
        cv2.putText(img = image,
                    text = text,
                    org = (xmin+ 13, ymin + 13),
                    fontFace = cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale = 1e-3 * image_h,
                    color = (1, 0, 1),
                    thickness = 1)
    return image
def nonmax_suppression(boxes,iou_threshold,obj_threshold):
    '''
    Per-class non-maximal suppression.

    boxes : list containing "good" BoundBox of a frame
            [BoundBox(),BoundBox(),...]
    iou_threshold : a lower-scoring box overlapping a kept box by at least
                    this IOU gets its probability for that class zeroed
    obj_threshold : boxes whose best surviving class score is not above this
                    are dropped from the result

    Returns the list of surviving BoundBox objects.  Mutates the class
    vectors of suppressed boxes in place (via set_class).
    '''
    # constructed only for its bbox_iou helper; the empty anchor list is unused
    bestAnchorBoxFinder = BestAnchorBoxFinder([])
    CLASS = len(boxes[0].classes)
    index_boxes = []
    # suppress non-maximal boxes, one class at a time
    for c in range(CLASS):
        # extract class probabilities of the c^th class from multiple bbox
        class_probability_from_bbxs = [box.classes[c] for box in boxes]
        # sorted_indices[i] is the index of the box with the i^th largest
        # probability for class c
        sorted_indices = list(reversed(np.argsort( class_probability_from_bbxs)))
        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]
            # if class probability is zero then ignore
            if boxes[index_i].classes[c] == 0:
                continue
            else:
                # NOTE(review): the same box index can be appended once per
                # class, so the output may contain duplicates -- confirm intended
                index_boxes.append(index_i)
                for j in range(i+1, len(sorted_indices)):
                    index_j = sorted_indices[j]
                    # check if the selected i^th bounding box has high IOU with any of the remaining bbox
                    # if so, the remaining bbox' class probabilities are set to 0.
                    bbox_iou = bestAnchorBoxFinder.bbox_iou(boxes[index_i], boxes[index_j])
                    if bbox_iou >= iou_threshold:
                        classes = boxes[index_j].classes
                        classes[c] = 0
                        boxes[index_j].set_class(classes)
    # keep only boxes whose best remaining class score clears the threshold
    newboxes = [ boxes[i] for i in index_boxes if boxes[i].get_score() > obj_threshold ]
    return newboxes
def plot_image_with_grid_cell_partition(irow):
    """Show x_batch[irow] with the YOLO grid overlaid in yellow.

    NOTE(review): relies on the notebook globals ``x_batch``,
    ``generator_config`` and pyplot's current figure state.
    """
    img = x_batch[irow]
    plt.figure(figsize=(15, 15))
    plt.imshow(img)
    for axis in ["W", "H"]:
        n_cells = generator_config["GRID_" + axis]   ## 13
        n_pixels = generator_config["IMAGE_" + axis] ## 416
        # vertical lines + x ticks for the W axis, horizontal + y for H
        draw_line = plt.axvline if axis == "W" else plt.axhline
        set_ticks = plt.xticks if axis == "W" else plt.yticks
        for cell in range(n_cells):
            draw_line(n_pixels * cell / n_cells, color="yellow", alpha=0.3)
        # label each cell at its center
        set_ticks([(i + 0.5) * n_pixels / n_cells for i in range(n_cells)],
                  ["iGRID{}={}".format(axis, i) for i in range(n_cells)])
def plot_grid(irow):
    """Draw the ground-truth boxes of y_batch[irow] onto the current figure.

    NOTE(review): relies on the notebook globals ``y_batch``,
    ``generator_config`` and ``LABELS`` being defined.
    """
    import seaborn as sns
    color_palette = list(sns.xkcd_rgb.values())
    iobj = 0  # running object count; picks a fresh color per object
    for igrid_h in range(generator_config["GRID_H"]):
        for igrid_w in range(generator_config["GRID_W"]):
            for ianchor in range(generator_config["BOX"]):
                vec = y_batch[irow,igrid_h,igrid_w,ianchor,:]
                C = vec[4] ## ground truth confidence
                if C == 1:
                    # names of the classes whose one-hot entries are set
                    # NOTE(review): class_nm is computed but never used below
                    class_nm = np.array(LABELS)[np.where(vec[5:])]
                    x, y, w, h = vec[:4]  # box center/size in grid-cell units
                    # pixels per grid cell along each axis
                    multx = generator_config["IMAGE_W"]/generator_config["GRID_W"]
                    multy = generator_config["IMAGE_H"]/generator_config["GRID_H"]
                    c = color_palette[iobj]
                    iobj += 1
                    xmin = x - 0.5*w
                    ymin = y - 0.5*h
                    xmax = x + 0.5*w
                    ymax = y + 0.5*h
                    # mark the box center
                    plt.text(x*multx,y*multy,
                             "X",color=c,fontsize=23)
                    # draw the four box edges as separate line segments
                    plt.plot(np.array([xmin,xmin])*multx,
                             np.array([ymin,ymax])*multy,color=c,linewidth=10)
                    plt.plot(np.array([xmin,xmax])*multx,
                             np.array([ymin,ymin])*multy,color=c,linewidth=10)
                    plt.plot(np.array([xmax,xmax])*multx,
                             np.array([ymax,ymin])*multy,color=c,linewidth=10)
                    plt.plot(np.array([xmin,xmax])*multx,
                             np.array([ymax,ymax])*multy,color=c,linewidth=10)
"""
It contains some code used for encoding images so that we can make them usable for training
"""
import copy
class ImageReader(object):
    """Read an image from disk, resize it to (IMAGE_H, IMAGE_W) and rescale
    its annotation boxes accordingly."""
    def __init__(self,IMAGE_H,IMAGE_W, norm=None):
        '''
        IMAGE_H : the height of the rescaled image, e.g., 416
        IMAGE_W : the width of the rescaled image, e.g., 416
        norm    : optional callable applied to the resized image (e.g. /255.)
        '''
        self.IMAGE_H = IMAGE_H
        self.IMAGE_W = IMAGE_W
        self.norm = norm
    def encode_core(self,image, reorder_rgb=True):
        '''Resize to the standard size; optionally reorder BGR->RGB and normalize.'''
        # cv2.resize expects dsize as (width, height); the original passed
        # (IMAGE_H, IMAGE_W), which transposed the target size whenever the
        # configured output was non-square
        image = cv2.resize(image, (self.IMAGE_W, self.IMAGE_H))
        if reorder_rgb:
            image = image[:,:,::-1]  # cv2 loads BGR; flip channels to RGB
        if self.norm is not None:
            image = self.norm(image)
        return(image)
    def fit(self,train_instance):
        '''
        read in and resize the image, annotations are resized accordingly.
        -- Input --
        train_instance : dictionary containing filename, height, width and object
        {'filename': 'ObjectDetectionRCNN/VOCdevkit/VOC2012/JPEGImages/2008_000054.jpg',
         'height':   333,
         'width':    500,
         'object': [{'name': 'bird', 'xmax': 318, 'xmin': 284,
                     'ymax': 184, 'ymin': 100},
                    {'name': 'bird', 'xmax': 198, 'xmin': 112,
                     'ymax': 209, 'ymin': 146}]
        }
        A bare filename string is also accepted.
        -- Output --
        the encoded image, plus the rescaled object list when present.
        Raises FileNotFoundError when the image cannot be read.
        '''
        if not isinstance(train_instance,dict):
            train_instance = {'filename':train_instance}
        image_name = train_instance['filename']
        image = cv2.imread(image_name)
        # check BEFORE touching image.shape: the original read image.shape
        # first, so a missing file crashed with AttributeError and its
        # 'Cannot find' message was never reached
        if image is None:
            raise FileNotFoundError('Cannot find {}'.format(image_name))
        h, w, c = image.shape
        image = self.encode_core(image, reorder_rgb=True)
        if "object" in train_instance.keys():
            all_objs = copy.deepcopy(train_instance['object'])
            # rescale each object's corners to the new size and clip them
            # into the image bounds
            for obj in all_objs:
                for attr in ['xmin', 'xmax']:
                    obj[attr] = int(obj[attr] * float(self.IMAGE_W) / w)
                    obj[attr] = max(min(obj[attr], self.IMAGE_W), 0)
                for attr in ['ymin', 'ymax']:
                    obj[attr] = int(obj[attr] * float(self.IMAGE_H) / h)
                    obj[attr] = max(min(obj[attr], self.IMAGE_H), 0)
        else:
            return image
        return image, all_objs
def normalize(image):
    """Map 8-bit pixel intensities from [0, 255] onto the unit interval."""
    scaled = image / 255.
    return scaled