import os
from time import localtime, strftime, time
from pathlib import Path
import hashlib
import requests
import numpy as np
import torch
from matplotlib import cm as colormap
from matplotlib import pyplot as plt
from matplotlib.colors import Normalize
from PIL import Image, ImageDraw, ImageFont
YOLOV3_WEIGHTS_PATH = 'https://pjreddie.com/media/files/yolov3.weights'
YOLOV3_WEIGHTS_MD5 = 'c84e5b99d0e52cd466ae710cadf6d84c'
def md5_hash(path):
with open(path, "rb") as f:
content = f.read()
return hashlib.md5(content).hexdigest()
def check_if_file_exists_else_download(path, chunk_size=1024):
path = Path(path)
if not path.exists() or (md5_hash(path) != YOLOV3_WEIGHTS_MD5):
        print(path, 'does not exist or its md5 checksum is incorrect, downloading...')
path.parent.mkdir(exist_ok=True, parents=True)
with requests.get(YOLOV3_WEIGHTS_PATH, stream=True) as r:
total_size = int(r.headers.get('content-length', 0))
with open(path, 'wb') as f:
for data in r.iter_content(chunk_size=chunk_size):
if data:
f.write(data)
print('downloaded from', YOLOV3_WEIGHTS_PATH, 'md5 of the file:', md5_hash(path))
return path
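# A minimal usage sketch (the local path below is an assumption for illustration,
# adjust it to wherever the weights are kept in your setup):
#     weights_path = check_if_file_exists_else_download('./weights/yolov3.weights')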
def parse_cfg(file):
'''
Parses the original cfg file
Argument
--------
file: str
        A path to the cfg file.
Output
------
layers: list
A list of dicts with config for each layer.
        Note: the 0th element of the list contains the config for the network itself (the [net] section).
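    Example
    -------
    An illustrative cfg fragment (the values are made up) such as

        [convolutional]
        batch_normalize=1
        filters=32

    is parsed into {'name': 'convolutional', 'batch_normalize': '1', 'filters': '32'}.
    Note that the values are kept as strings.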
'''
layers = []
layer = {}
with open(file, 'r') as readf:
lines = readf.read().split('\n')
# skip commented lines
lines = [line for line in lines if not line.startswith('#')]
# skip empty lines
lines = [line for line in lines if not len(line) == 0]
# remove all whitespaces
lines = [line.replace(' ', '') for line in lines]
for line in lines:
# if the name of the layer (they are of form : [*])
if line.startswith('[') and line.endswith(']'):
            # save the prev. layer as the next lines contain info for the next layer
if len(layer) > 0:
layers.append(layer)
layer = {}
# add the layer's name/type
layer['name'] = line.replace('[', '').replace(']', '')
        # if it is not a layer name, then parse the arguments
else:
            # all arguments follow the pattern: 'key=value'
key, value = line.split('=')
# add info to the layer
layer[key] = value
# append the last layer
layers.append(layer)
return layers
def get_center_coords(bboxes): #top_left_x, top_left_y, box_w, box_h
'''
    Transforms bboxes given in top-left coordinates into bboxes with center
    coordinates (the transformation is done in place).
Argument
--------
bboxes: torch.FloatTensor
A tensor of size (P, D) where D should contain info about the coords
in the following order (top_left_x, top_left_y, width, height).
Note: D can be higher than 4.
Output
------
bboxes: torch.FloatTensor
        The same tensor as in the input but with center coordinates in the
        0th and 1st columns. Note: the input tensor is modified in place.
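    Example
    -------
    A minimal illustrative call; note that the input tensor is modified in place
    (exact tensor formatting may vary across PyTorch versions).
    >>> get_center_coords(torch.tensor([[0., 0., 4., 4.]]))
    tensor([[2., 2., 4., 4.]])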
'''
bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] // 2
bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] // 2
return bboxes
def get_corner_coords(bboxes):
'''
    Transforms the bounding box coordinates from (center_x, center_y, w, h) into
(top_left_x, top_left_y, bottom_right_x, bottom_right_y),
i.e. into corner coordinates.
Argument
--------
bboxes: torch.FloatTensor
A tensor of size (P, D) where D should contain info about the coords
in the following order (center_x, center_y, width, height). Note:
D can be higher than 4.
Outputs
-------
top_left_x, top_left_y, bottom_right_x, bottom_right_y: torch.FloatTensors
Transformed coordinates for bboxes: top-left corner coordinates for x and y
and bottom-right coordinates for x and y respectively.
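    Example
    -------
    A minimal illustrative call (exact tensor formatting may vary across PyTorch versions).
    >>> get_corner_coords(torch.tensor([[2., 2., 4., 4.]]))
    (tensor([0.]), tensor([0.]), tensor([4.]), tensor([4.]))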
'''
top_left_x = bboxes[:, 0] - bboxes[:, 2]/2
top_left_y = bboxes[:, 1] - bboxes[:, 3]/2
bottom_right_x = bboxes[:, 0] + bboxes[:, 2]/2
bottom_right_y = bboxes[:, 1] + bboxes[:, 3]/2
return top_left_x, top_left_y, bottom_right_x, bottom_right_y
def iou_vectorized(bboxes1, bboxes2, without_center_coords=False):
'''
Calculates intersection over union between every bbox in bboxes1 with
every bbox in bboxes2, i.e. Cartesian product of both sets.
Arguments
---------
bboxes1: torch.FloatTensor
        A (M, 4 + *) shaped tensor with M bboxes and 4 bbox coordinates (cx, cy, w, h, *).
    bboxes2: torch.FloatTensor
        A (N, 4 + *) shaped tensor with N bboxes and 4 bbox coordinates (cx, cy, w, h, *).
without_center_coords: bool
True: IoU is calculated only using width and height (no center coordinates).
        It is useful during training when the best bbox is selected to replace the gt bbox.
Note: bboxes1 and bboxes2 are expected to have (M, 2 + *) and (N, 2 + *), respectively.
Output
------
: torch.FloatTensor
        A (M, N) shaped tensor where element (i, j) is the IoU between the i-th bbox
        from bboxes1 and the j-th bbox from bboxes2.
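    Example
    -------
    A minimal illustrative call: the first pair of boxes is identical, the second pair
    does not overlap (exact tensor formatting may vary across PyTorch versions).
    >>> a = torch.tensor([[2., 2., 4., 4.]])
    >>> b = torch.tensor([[2., 2., 4., 4.], [20., 20., 2., 2.]])
    >>> iou_vectorized(a, b)
    tensor([[1., 0.]])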
'''
    # The pixel shift is 1 when we work with pixel coordinates and 0 when we
    # calculate IoU without center coordinates.
    # Why? Say we want the width of a box whose leftmost and rightmost pixels
    # have x coordinates 0 and 5: the side is 6 pixels wide, but 5 - 0 = 5,
    # so we add 1. We do not need this shift when only widths and heights are
    # given, i.e. when without_center_coords is True.
px_shift = 1
# add 'fake' center coordinates. You can use any value, we use zeros
if without_center_coords:
bboxes1 = torch.cat([torch.zeros_like(bboxes1[:, :2]), bboxes1], dim=1)
bboxes2 = torch.cat([torch.zeros_like(bboxes2[:, :2]), bboxes2], dim=1)
px_shift = 0
M, D = bboxes1.shape
N, D = bboxes2.shape
# Transform coords of the 1st bboxes (y=0 is at the top, and increases downwards)
top_left_x1, top_left_y1, bottom_right_x1, bottom_right_y1 = get_corner_coords(bboxes1)
# Transform coords of the 2nd bboxes
top_left_x2, top_left_y2, bottom_right_x2, bottom_right_y2 = get_corner_coords(bboxes2)
# broadcasting 1st bboxes
top_left_x1 = top_left_x1.view(M, 1)
top_left_y1 = top_left_y1.view(M, 1)
bottom_right_x1 = bottom_right_x1.view(M, 1)
bottom_right_y1 = bottom_right_y1.view(M, 1)
# broadcasting 2nd bboxes
top_left_x2 = top_left_x2.view(1, N)
top_left_y2 = top_left_y2.view(1, N)
bottom_right_x2 = bottom_right_x2.view(1, N)
bottom_right_y2 = bottom_right_y2.view(1, N)
# calculate coords for intersection
inner_top_left_x = torch.max(top_left_x1, top_left_x2)
inner_top_left_y = torch.max(top_left_y1, top_left_y2)
inner_bottom_right_x = torch.min(bottom_right_x1, bottom_right_x2)
inner_bottom_right_y = torch.min(bottom_right_y1, bottom_right_y2)
# area = side_a * side_b
# clamp(x, min=0) = max(x, 0)
    # we make sure that the area is 0 if a side length is negative,
    # which happens when inner_top_left_x > inner_bottom_right_x, i.e. the boxes do not overlap
    # Note: px_shift adds one pixel because the corner coordinates are inclusive (see above)
a = torch.clamp(inner_bottom_right_x - inner_top_left_x + px_shift, min=0)
b = torch.clamp(inner_bottom_right_y - inner_top_left_y + px_shift, min=0)
inner_area = a * b
# finally we calculate union for each pair of bboxes
out_area1 = (bottom_right_x1 - top_left_x1 + px_shift) * (bottom_right_y1 - top_left_y1 + px_shift)
out_area2 = (bottom_right_x2 - top_left_x2 + px_shift) * (bottom_right_y2 - top_left_y2 + px_shift)
out_area = out_area1 + out_area2 - inner_area
return inner_area / out_area
def objectness_filter_and_nms(predictions, classes, obj_thresh=0.8, nms_thresh=0.4):
'''
    Performs filtering according to the objectness score and non-maximum suppression on predictions.
Arguments
---------
predictions: torch.FloatTensor
A tensor of size (B, P, 5+classes) with predictions.
        B -- batch size; P -- number of predictions for an image, i.e. over
        3 scales with 3 anchor boxes per cell.
        For example: P = (13*13 + 26*26 + 52*52) * 3 = 10647;
        5 + classes -- (cx, cy, w, h, obj_score, {prob_class}).
classes: int
An integer with the number of classes to detect.
obj_thresh: float
A float that corresponds to the lowest objectness score the detector allows.
nms_thresh: float
Corresponds to the highest IoU the detector allows.
Output
------
predictions: torch.FloatTensor or None
        Predictions after objectness filtering and non-max suppression (same size
        as predictions in the arguments but with a different P). Returns None when
        there are no detections found.
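    Example
    -------
    A minimal illustrative call with one class: two of the three boxes overlap heavily,
    so only two detections survive (exact formatting may vary across PyTorch versions).
    >>> preds = torch.tensor([[[ 5.,  5., 4., 4., 0.90, 0.7],
    ...                        [ 5.,  5., 4., 4., 0.85, 0.6],
    ...                        [20., 20., 4., 4., 0.90, 0.8]]])
    >>> objectness_filter_and_nms(preds, classes=1).shape
    torch.Size([2, 6])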
'''
# iterate for images in a batch
for i, prediction in enumerate(predictions):
## objectness thresholding
        # If a prediction's (bbox's) score is higher than obj_thresh, keep the prediction.
        # The element at index 4 is the objectness score; if there are no
        # detections with an obj score higher than obj_thresh, return None
objectness_mask = (prediction[:, 4] > obj_thresh)
if len(torch.nonzero(objectness_mask)) == 0:
return None
prediction = prediction[objectness_mask]
# if no object on an image found, continue with the next image
if prediction.size(0) == 0:
continue
## non-max supression
# The idea is as follows. If a prediction "survived" objectness filtering
# then it is considered meaningful. Since we may have multiple detections of
# one object on an image we need to filter out those predictions that have
# substantial (more than nms_thresh) overlap, or IoU, with the box with highest
        # class score. Also note that one image might contain more than one object of
        # the same class. As those don't have a high IoU with the box with the highest
        # class score, they will be kept in the list of predictions
# for each prediction we save the class with the maximum class score
pred_score, pred_classes = torch.max(prediction[:, 5:5+classes], dim=-1)
# we are going to iterate through classes, so, first, we select the set of unique classes
unique_classes = pred_classes.unique().float()
# initialize the list of filtered detections
detections_after_nms = []
for cls in unique_classes:
# select only the entries for a specific class.
# pred_classes is of torch.LongTensor type but we need torch.FloatTensor
prediction_4_cls = prediction[pred_classes.float() == cls]
# then we sort predictions for a specific class by objectness score (high -> low)
sort_pred_idxs = torch.sort(prediction_4_cls[:, 4], descending=True)[1]
prediction_4_cls = prediction_4_cls[sort_pred_idxs]
            # next, we want to fill detections_after_nms only with those objects
            # that have a unique position, i.e. a low IoU with the other predictions.
            # The idea here is to append (save) the first prediction in prediction_4_cls,
            # calculate IoUs between it and the rest of the ordered prediction_4_cls list,
            # and discard the predictions that have a high IoU
            # with that first prediction.
# For the next iteration, the first prediction will be the prediction with
# the highest obj score among the ones that are left.
# exit the loop when there is no prediction left after the nms
while len(prediction_4_cls) > 0:
# we append the first prediction for a specific class to the list of predictions.
# We can do this because we ordered the prediction_4_cls beforehand.
detections_after_nms.append(prediction_4_cls[0].unsqueeze(0))
# also stop when this is the last prediction in prediction_4_cls
if len(prediction_4_cls) == 1:
break
# calculate IoUs with the first pred in prediction_4_cls and the rest of them
ious = iou_vectorized(prediction_4_cls[0, :5].unsqueeze(0), prediction_4_cls[1:, :5])
                # iou_vectorized returns a (1, K) tensor here (one bbox against K bboxes),
                # so we flatten it to one dimension before masking with [ious < nms_thresh]
ious = ious.reshape(-1)
# filter out the first prediction (1:) and the ones with high IoU with the 0th pred
prediction_4_cls = prediction_4_cls[1:][ious < nms_thresh]
# as detections_after_nms is a list, we concatenate its elements to a tensor
predictions = torch.cat(detections_after_nms)
return predictions
def scale_numbers(num1, num2, largest_num_target):
'''
Scales two numbers (for example, dimensions) keeping aspect ratio.
Arguments
---------
num1: float or int
The 1st number (dim1).
num2: float or int
The 2nd number (dim2).
largest_num_target: int
The expected size of the largest number among 1st and 2nd numbers.
Outputs
-------
(int, int, float)
Two scaled numbers such that the largest is equal to largest_num_target
        maintaining the same aspect ratio as num1 and num2 in the input. Also
        returns the scaling coefficient.
        Note: the two scaled numbers are returned as ints.
Examples
--------
scale_numbers(832, 832, 416) -> (416, 416, 0.5)
scale_numbers(223, 111, 416) -> (416, 207, 1.865...)
scale_numbers(100, 200, 416) -> (208, 416, 2.08)
scale_numbers(200, 832, 416) -> (100, 416, 0.5)
'''
# make sure the arguments are of correct types
assert isinstance(largest_num_target, int), 'largest_num_target should be "int"'
# to make the largest number to be equal largest_num_target keeping aspect ratio
# we need, first, to estimate by how much the largest number is smaller (larger)
# than largest_num_target and, second, to scale both numbers by this ratio.
# select the maximum among two numbers
max_num = max(num1, num2)
    # calculate the scaling coefficient
scale_coeff = largest_num_target / max_num
# scale both numbers
num1 = num1 * scale_coeff
num2 = num2 * scale_coeff
return round(num1), round(num2), scale_coeff
def letterbox_pad(img, color=127.5):
'''
    Adds letterbox padding to an image following the original darknet implementation.
    Specifically, it pads the image with a constant color up to a square of side max(H, W).
Arguments
---------
img: numpy.ndarray
An image to pad.
    color: float or int, in [0, 255]
The RGB intensity. The image will be padded with this color.
Output
------
img: numpy.ndarray
The padded image.
pad_sizes: (int, int, int, int)
        The padding sizes in the order (top, bottom, left, right). They are used in
        predict_and_save(), where the predictions need to be shifted back by the padding.
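    Example
    -------
    A minimal illustrative call: a blank 416x208 image is padded to a 416x416 square.
    >>> img = np.zeros((416, 208, 3), dtype=np.float32)
    >>> padded, pad_sizes = letterbox_pad(img)
    >>> padded.shape, pad_sizes
    ((416, 416, 3), (0, 0, 104, 104))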
'''
# make sure the arguments are of correct types
assert isinstance(img, np.ndarray), '"img" should have numpy.ndarray type'
# assert isinstance(net_input_size, int), '"net_input_size" should have int type'
assert isinstance(color, (int, float)), '"color" should be an int or float'
H, W, C = img.shape
max_side_len = max(H, W)
# if width is higher than height then, to make a squared-shaped image, we need
# to pad the height; else, we need to pad width.
if W > H:
# calculates how much should be padded "on top" which is a half of
# the difference between the target size and the current height
pad_top = (max_side_len - H) // 2
# another half is added to the bottom
pad_bottom = max_side_len - (H + pad_top)
pad_left = 0
pad_right = 0
else:
pad_top = 0
pad_bottom = 0
# calculates how much should be padded "on left" which is a half of
# the difference between the target size and the current width
pad_left = (max_side_len - W) // 2
pad_right = max_side_len - (W + pad_left)
# pad_widths should contain three pairs (because of 3d) of padding sizes:
# first pair adds rows [from top and bottom],
    # the second adds columns [from the left and the right],
# the third adds nothing because we pad only spatially, not channel-wise
pad_widths = [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]
# for each padding we specify a color (constant parameter)
color = [[color, color], [color, color], [0, 0]]
# perform padding
img = np.pad(img, pad_widths, 'constant', constant_values=color)
# save padding sizes
pad_sizes = (pad_top, pad_bottom, pad_left, pad_right)
return img, pad_sizes
def fix_orientation_if_needed(pil_img, orientation):
'''
Motivation: sometimes when a user uploads a photo from their phone the
photo is rotated by 90 deg even though it looks fine on the phone. This
    function fixes this problem by correcting the orientation by employing info
from EXIF. For more info regarding this issue, please see:
https://magnushoff.com/jpeg-orientation.html
Argument
--------
pil_img: PIL.Image.Image
The target image.
orientation: str
Orientation which front-end tries to extract from EXIF of an image.
Can be 'undefined' or some integer which can be used to orient the image.
Output
------
pil_img: PIL.Image.Image
        The original image with the fixed orientation, or the same image if
        no EXIF orientation info is available.
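    Example
    -------
    A minimal illustrative call: a 200x100 image with EXIF orientation '6' becomes 100x200.
    >>> fix_orientation_if_needed(Image.new('RGB', (200, 100)), '6').size
    (100, 200)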
'''
    # expand=True enlarges the output so the whole rotated image fits
    # (with expand=False the output would keep the original dimensions)
if orientation == '3':
pil_img = pil_img.rotate(180, expand=True)
elif orientation == '6':
pil_img = pil_img.rotate(270, expand=True)
elif orientation == '8':
pil_img = pil_img.rotate(90, expand=True)
return pil_img
# TODO: test for different devices
def predict_and_save(source_img, model, device, labels_path, font_path, orientation, show=False, save=True):
'''
Performs inference on an image and saves the image with bounding boxes drawn on it.
Arguments
---------
source_img: PIL.Image.Image or str
The image to perform inference on.
model: Darknet
The model which will be used for inference.
device: torch.device or str
Device for calculations.
labels_path: str
The path to the object names.
font_path: str
The path to the font which is going to be used to tag bounding boxes.
orientation: str
Orientation which front-end tries to extract from EXIF of an image.
Can be 'undefined' or some integer which can be used to orient the image.
Used in fix_orientation_if_needed().
show: bool
Whether to show the output image with bounding boxes, for example, in jupyter notebook
save: bool
Whether to save the output image with bounding boxes.
Outputs
-------
    machine_readable_preds: str or NoneType
        A markdown code block with one 'class,confidence,bx,by,bw,bh' line per detected
        object (the box coordinates are relative to the image size). It is None when no
        object has been detected on the image.
    source_img: PIL.Image.Image
        The source image with the detections drawn on it.
'''
assert isinstance(labels_path, (str, Path)), '"labels_path" should be str or Path'
assert isinstance(device, (torch.device, str)), 'device should be either torch.device or str'
assert isinstance(show, bool), 'show should be boolean'
    # parameters of the visualization: color palette, figure size to show,
    # line thickness, and detection thresholds
norm = Normalize(vmin=0, vmax=model.classes)
color_map = colormap.tab10
figsize = (15, 15)
line_thickness = 2
obj_thresh = 0.8 # 0.8
nms_thresh = 0.4 # 0.4
# make a dict: {class_number: class_name} if we have more than 1 class
if model.classes > 1:
        # replace the trailing newline with a space so that the bbox label later reads
        # like 'person 73%' (the space is stripped again for the machine-readable output):
names = [name.replace('\n', ' ') for name in open(labels_path, 'r').readlines()]
num2name = {num: name for num, name in enumerate(names)}
else:
        # we don't need class names if the number of classes is 1
num2name = {0: ''}
source_img = fix_orientation_if_needed(source_img, orientation)
W, H = source_img.size
    # resize, add letterbox padding, and save the pad sizes and the scaling coefficient
    # to use them later when drawing bboxes on the original image
H_new, W_new, scale = scale_numbers(H, W, model.model_width)
img = source_img.resize((W_new, H_new))
img = np.array(img)
img, pad_sizes = letterbox_pad(img)
    # HWC -> CHW, scale intensities to [0, 1], convert to a torch tensor, add a batch dimension
img = img.astype(np.float32)
img = img.transpose(2, 0, 1)
img = img / 255
img = torch.from_numpy(img)
img = img.unsqueeze(0)
img = img.to(device)
# make prediction
prediction, loss = model(img, device=device)
    # and apply objectness filtering and nms; if it returns None, a box stating that is drawn
prediction = objectness_filter_and_nms(prediction, model.classes, obj_thresh, nms_thresh)
# if show initialize a figure environment
if show:
plt.figure(figsize=figsize)
    ### if no objects have been detected, draw a single tag in the middle of
    # source_img with text saying that no objects were found. For comments on the
    # drawing calls in this branch, please see the for-loop below
if prediction is None:
text = "Couldn't find any objects that I was trained to detect :-("
font = ImageFont.truetype(str(font_path), 20)
text_size = font.getsize(text)
top_left_coords = ((W-text_size[0])//2, H//2)
black = (0, 0, 0)
        # create a black background tag for the message text
tag = Image.new('RGB', text_size, black)
source_img.paste(tag, top_left_coords)
        # create a drawing context for the source image
tag_draw = ImageDraw.Draw(source_img)
# adds the text
tag_draw.text(top_left_coords, text, font=font)
if show:
plt.imshow(source_img)
if save:
source_img.save('output.jpg', 'JPEG')
return None, source_img
###
# since the predictions are made for a resized and padded images,
# the bounding boxes have to be scaled and shifted back
pad_top, pad_bottom, pad_left, pad_right = pad_sizes
prediction[:, 0] = (prediction[:, 0] - pad_left) / scale
prediction[:, 1] = (prediction[:, 1] - pad_top) / scale
prediction[:, 2] = prediction[:, 2] / scale
prediction[:, 3] = prediction[:, 3] / scale
    # then, transform the coordinates (cx, cy, w, h) into corner coordinates:
# (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
top_left_x, top_left_y, bottom_right_x, bottom_right_y = get_corner_coords(prediction)
# detach values from the computation graph, take the int part and transform to np.ndarray
top_left_x = top_left_x.cpu().detach().int().numpy()
top_left_y = top_left_y.cpu().detach().int().numpy()
bottom_right_x = bottom_right_x.cpu().detach().int().numpy()
bottom_right_y = bottom_right_y.cpu().detach().int().numpy()
    # draw each prediction on the image and caption it with the class name and confidence
machine_readable_preds = []
machine_readable_preds.append('class,confidence,bx,by,bw,bh')
for i in range(len(prediction)):
## ADD BBOXES
# first we need to extract coords for both top left and bottom right corners
        # note: sometimes the corner coordinates lie outside of the image itself,
        # hence we need to clip them to the image -> min and max
top_left_coords = max(0, top_left_x[i]), max(0, top_left_y[i])
bottom_right_coords = min(W, bottom_right_x[i]), min(H, bottom_right_y[i])
# predicted class number
# todo dim (also see NMS with batch dim)
class_score, class_int = torch.max(prediction[i, 5:5+model.classes], dim=-1)
class_score, class_int = float(class_score), int(class_int)
# select the color for a class according to its label number and scale it to [0, 255]
bbox_color = color_map(class_int / model.classes)[:3]
bbox_color = tuple(map(lambda x: int(x * 255), bbox_color))
        ## ADD A LABEL FOR EACH BBOX INSIDE THE RECTANGLE WITH THE SAME COLOR
## AS THE BBOX ITSELF
# predicted class name to put on a bbox
class_name = num2name[class_int]
# text to name a box: class name and the probability in percents
text = f'{class_name}{(class_score * 100):.0f}%'
font = ImageFont.truetype(str(font_path), 14)
text_size = font.getsize(text)
# create a tag object and draw it on the source image
tag = Image.new('RGB', text_size, bbox_color)
top_left_coords_tag = top_left_coords[0], max(0, top_left_coords[1] - text_size[1])
source_img.paste(tag, top_left_coords_tag)
        # create a drawing context and draw the bbox rectangle on the source image
bbox_draw = ImageDraw.Draw(source_img)
bbox_draw.rectangle((top_left_coords, bottom_right_coords),
width=line_thickness, outline=bbox_color)
# adds the class label with confidence
bbox_draw.text(top_left_coords_tag, text, font=font)
        # add the prediction to the list of machine-readable predictions as a csv line
machine_readable_preds.append(
','.join([
f'{class_name.strip()}',
f'{class_score:.2f}',
f'{prediction[i, 0].item() / W:.2f}',
f'{prediction[i, 1].item() / H:.2f}',
f'{prediction[i, 2].item() / W:.2f}',
f'{prediction[i, 3].item() / H:.2f}',
])
)
    # enclose the list of machine-readable predictions in a markdown code block
machine_readable_preds = '\n'.join(machine_readable_preds)
machine_readable_preds = f'```\n{machine_readable_preds}\n```'
# if show, then, show and close the environment
if show:
plt.imshow(source_img)
if save:
source_img.save('output.jpg', 'JPEG')
return machine_readable_preds, source_img
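# A minimal usage sketch (the Darknet construction and the paths below are assumptions
# made for illustration; adapt them to how the model and assets are set up in this repo):
#     model = Darknet(cfg_path)                      # hypothetical construction
#     model.to('cpu').eval()
#     img = Image.open('dog.jpg').convert('RGB')     # any test image
#     preds, img_with_boxes = predict_and_save(
#         img, model, device='cpu', labels_path='./data/coco.names',
#         font_path='./data/FreeSansBold.ttf', orientation='undefined',
#         show=False, save=False)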
def show_image_w_bboxes_for_server(
img_path: str,
out_path: str,
archive_path: str,
labels_path: str,
font_path: str,
model: torch.nn.Module,
device: torch.device,
orientation: str) -> None:
'''
Reads an image from the disk and applies a detection algorithm specified in model.
Arguments
---------
img_path: str
A path to an image.
out_path: str
A path where to save the result image with detections. This image will
be used to send back to the user.
archive_path: str
Another path where the result image will be saved (archive).
Since `out_path` is always the same, we also use the archive path.
labels_path: str
        A path to the model labels (COCO).
    font_path: str
        A path to the font face used to draw the prediction labels.
model: Darknet
Model to apply to the image.
    device: torch.device or str
        A PyTorch device. Use this argument to control 'cuda' vs 'cpu'.
orientation: str
Orientation which front-end tries to extract from EXIF of an image.
Can be 'undefined' or some integer which can be used to orient the image.
Used in predict_and_save().
'''
# I want to log the processing time for each image
start = time()
    # predict_and_save returns both the machine-readable predictions (str or None)
    # and the image with the predictions drawn on it
    assert out_path is None or isinstance(out_path, str), 'out_path should be either NoneType or str'
# make sure the arguments are of correct types
assert isinstance(img_path, (str, Path)), '"img_path" should be str or Path'
# read an image
source_img = Image.open(img_path).convert('RGB')
with torch.no_grad():
predictions, img = predict_and_save(
source_img, model, device, labels_path, font_path, orientation, show=False
)
# selecting a name for a file for archiving
filename = f'{strftime("%y-%m-%dT%H-%M-%S", localtime())}.jpg'
archive_full_path = os.path.join(archive_path, filename)
img.save(archive_full_path, 'JPEG')
img.save(out_path, 'JPEG')
# calculating elapsed time and printing it to flask console
elapsed_time = round(time() - start, 2)
print(f'Processing time of {filename}: {elapsed_time} sec.')
print('=' * 70)
### SOME CODE FOR WIDER DATASET HANDLING
'''
The dataset folder is expected to have
the following structure:
./Submission_example/
0--Parade/
0_Parade_marchingband_1_20.txt
./wider_face_split/
wider_face_test.mat
wider_face_train.mat
wider_face_test_filelist.txt
wider_face_val.mat
wider_face_train_bbx_gt.txt
readme.txt
wider_face_val_bbx_gt.txt
./WIDER_train/
images/
0--Parade/
0_Parade_marchingband_1_100.jpg
...
1--Handshaking/
1_Handshaking_Handshaking_1_102.jpg
...
...
./WIDER_val/
(similar to ./WIDER_train/)
./WIDER_test/
(similar to ./WIDER_train/)
'''
def read_meta_from_file(data_root_path):
'''
Parses WIDER ground truth data.
Argument
--------
data_root_path: str
        A path to the root folder of the WIDER dataset (see the expected folder
        structure above); the ground truth annotations inside it are plain '.txt' files.
Output
------
meta: dict
A map between a file path and ground truth bounding box coordinates and some
attributes (x1, y1, w, h, blur, expression, illumination, invalid, occlusion, pose)
stored as list of lists. For more information about the attributes see readme.txt.
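    Example
    -------
    An illustrative record from 'wider_face_train_bbx_gt.txt' (the numbers are made up;
    the attribute order follows readme.txt):

        0--Parade/0_Parade_marchingband_1_20.jpg
        1
        449 330 122 149 0 0 0 0 0 0

    is parsed into a single meta entry whose key is the full path to that image and
    whose value is [[449, 330, 122, 149, 0, 0, 0, 0, 0, 0]].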
'''
split_path = os.path.join(data_root_path, 'wider_face_split')
train_data_path = os.path.join(data_root_path, 'WIDER_train/images')
train_meta_path = os.path.join(split_path, 'wider_face_train_bbx_gt.txt')
meta = {}
with open(train_meta_path, 'r') as rfile:
while True:
short_file_path = rfile.readline()
bbox_count = rfile.readline()
if short_file_path == '' or bbox_count == '':
rfile.close()
break
short_file_path = short_file_path.replace('\n', '')
bbox_count = int(bbox_count.replace('\n', ''))
full_file_path = os.path.join(train_data_path, short_file_path)
gt_bboxes = []
for _ in range(bbox_count):
attributes = rfile.readline()
attributes = attributes.replace('\n', '').split(' ')
attributes = [int(att) for att in attributes if len(att) > 0]
gt_bboxes.append(attributes)
meta[full_file_path] = gt_bboxes
return meta