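"""
dataset.py

PyTorch Dataset and transformation pipeline for Pascal VOC object detection:
a train/test transform function and a PascalVOCDataset class with a custom
collate function for variable numbers of objects per image.
"""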
import json
import os
import random

import torch
import torchvision.transforms.functional as FT
from PIL import Image
from torch.utils.data import Dataset

from transforms import *  # provides photometric_distort, expand, random_crop, flip, resize


def transform(image, boxes, labels, difficulties, split):
    """
    Apply the transformations defined in the transforms module.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :param difficulties: difficulties of detection of these objects, a tensor of dimensions (n_objects)
    :param split: one of 'TRAIN' or 'TEST', since different sets of transformations are applied
    :return: transformed image, transformed bounding box coordinates, transformed labels, transformed difficulties
    """
    assert split in {'TRAIN', 'TEST'}

    # Mean and standard deviation of the ImageNet data that our base VGG from torchvision was trained on
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    new_image = image
    new_boxes = boxes
    new_labels = labels
    new_difficulties = difficulties

    # Skip the following operations for evaluation/testing
    if split == 'TRAIN':
        # A series of photometric distortions in random order, each with a 50% chance of occurrence, as in the Caffe repo
        new_image = photometric_distort(new_image)

        # Convert PIL image to Torch tensor
        new_image = FT.to_tensor(new_image)

        # Expand image (zoom out) with a 50% chance - helpful for training detection of small objects
        # Fill surrounding space with the mean of the ImageNet data that our base VGG was trained on
        if random.random() < 0.5:
            new_image, new_boxes = expand(new_image, new_boxes, filler=mean)

        # Randomly crop image (zoom in)
        new_image, new_boxes, new_labels, new_difficulties = random_crop(new_image, new_boxes, new_labels,
                                                                         new_difficulties)

        # Convert Torch tensor to PIL image
        new_image = FT.to_pil_image(new_image)

        # Flip image with a 50% chance
        if random.random() < 0.5:
            new_image, new_boxes = flip(new_image, new_boxes)

    # Resize image to (300, 300) - this also converts absolute boundary coordinates to their fractional form
    new_image, new_boxes = resize(new_image, new_boxes, dims=(300, 300))

    # Convert PIL image to Torch tensor
    new_image = FT.to_tensor(new_image)

    # Normalize by the mean and standard deviation of the ImageNet data that our base VGG was trained on
    new_image = FT.normalize(new_image, mean=mean, std=std)

    return new_image, new_boxes, new_labels, new_difficulties


class PascalVOCDataset(Dataset):
    """
    Pascal VOC 2007/2012 dataset for object detection.
    """

    def __init__(self, data_folder: str, split: str, keep_difficult: bool = False):
        """
        :param data_folder: folder where the data files are stored
        :param split: one of 'TRAIN' or 'TEST'
        :param keep_difficult: keep ground truth objects marked as difficult, or discard them
        """
        self.split = split.upper()
        assert self.split in {'TRAIN', 'TEST'}
        self.data_folder = data_folder
        self.keep_difficult = keep_difficult

        # Read data files
        with open(os.path.join(data_folder, self.split + '_images.json'), 'r') as j:
            self.images = json.load(j)
        with open(os.path.join(data_folder, self.split + '_objects.json'), 'r') as j:
            self.objects = json.load(j)

        assert len(self.images) == len(self.objects)

    def __getitem__(self, i):
        """
        :param i: index of the image in the split
        :return: transformed image, bounding boxes, labels, and difficulties for that image
        """
        # Read image
        image = Image.open(self.images[i], mode='r')
        image = image.convert('RGB')

        # Get objects in this image (bounding boxes, labels, difficulties)
        objects = self.objects[i]
        boxes = torch.FloatTensor(objects['boxes'])  # (n_objects, 4)
        labels = torch.LongTensor(objects['labels'])  # (n_objects)
        difficulties = torch.BoolTensor(objects['difficulties'])  # (n_objects)

        # Discard difficult objects, if desired
        if not self.keep_difficult:
            boxes = boxes[~difficulties]
            labels = labels[~difficulties]
            difficulties = difficulties[~difficulties]

        # Apply transformations
        image, boxes, labels, difficulties = transform(image, boxes, labels, difficulties, split=self.split)

        return image, boxes, labels, difficulties

    def __len__(self):
        return len(self.images)
    @staticmethod
    def collate_fn(batch):
        """
        Since each image may have a different number of objects, we need a collate function
        (to be passed to the DataLoader).

        This describes how to combine tensors of different sizes: images are stacked into a
        single tensor, while boxes, labels, and difficulties are kept as lists of tensors.

        Note: this need not be defined in this class; it can be a standalone function.

        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of images, and three lists of N varying-size tensors each
                 (bounding boxes, labels, difficulties)
        """
        images, boxes, labels, difficulties = [], [], [], []

        for b in batch:
            images.append(b[0])
            boxes.append(b[1])
            labels.append(b[2])
            difficulties.append(b[3])

        images = torch.stack(images, dim=0)  # (N, 3, 300, 300)

        return images, boxes, labels, difficulties
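

if __name__ == '__main__':
    # Minimal usage sketch showing how collate_fn is wired into a DataLoader.
    # The data folder path and batch size are illustrative assumptions, not part
    # of this module: './data' is expected to contain TRAIN_images.json and
    # TRAIN_objects.json.
    from torch.utils.data import DataLoader

    dataset = PascalVOCDataset(data_folder='./data', split='TRAIN', keep_difficult=True)
    loader = DataLoader(dataset, batch_size=8, shuffle=True,
                        collate_fn=PascalVOCDataset.collate_fn)

    images, boxes, labels, difficulties = next(iter(loader))
    print(images.shape)                # torch.Size([8, 3, 300, 300])
    print(len(boxes), boxes[0].shape)  # 8 tensors, each of shape (n_objects, 4)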