-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
159 lines (132 loc) · 4.64 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os
import random
import numpy as np
import skimage.transform, skimage.io, skimage.color
import matplotlib.pyplot as plt
import pandas as pd
# File-name suffixes that mark a directory entry as an image during discovery.
_image_extension = ("jpg", "jpeg", "png")
class image_db(object):
    """Directory-backed image collection that serves (image, label) batches.

    Files directly inside ``imgdir`` whose name ends with one of the suffixes
    in ``_image_extension`` are registered and read through
    ``skimage.io.imread_collection``. Four index pools are maintained:

    - ``'train'`` / ``'test'``: disjoint slices of one shuffled ordering
    - ``'all'``: every image, shuffled
    - ``'list'``: every image in discovery order, never shuffled

    kwargs:
        - train_portion   portion of dataset for training (max 1.0), default 0.5
        - seed            random seed for reproducibility, default None
    """

    def __init__(self, imgdir, *args, **kwargs):
        ### get kwargs ###
        train_portion = kwargs.get('train_portion', 0.5)
        seed = kwargs.get('seed', None)

        ### set env ###
        np.random.seed(seed)
        # The pools are shuffled with the stdlib ``random`` module, so it has
        # to be seeded as well or ``seed`` would not affect the split.
        # A ``None`` seed leaves the caller's global ``random`` state alone.
        if seed is not None:
            random.seed(seed)

        ### discover images ###
        self.imgdir = imgdir
        self.fnames = []
        # Sorted so the discovery order does not depend on the file system.
        for name in sorted(os.listdir(self.imgdir)):
            full_path = os.path.join(self.imgdir, name)
            # isdir must look at the joined path; the bare entry name would be
            # resolved against the current working directory.
            if os.path.isdir(full_path):
                continue
            if name.lower().endswith(_image_extension):
                self.fnames.append(full_path)
        self.images = skimage.io.imread_collection(self.fnames, conserve_memory=True)
        self.size = len(self.images)
        print("%s images loaded from %s" % (self.size, self.imgdir))

        ### init pools for each mode ###
        self.modeval = ['train', 'test', 'all', 'list']
        self.is_shuffle = [True, True, True, False]
        self.imgindex_cnt = [0] * len(self.modeval)
        self.imgindex = [list(range(self.size)) for _ in self.modeval]
        for pool, shuffled in zip(self.imgindex, self.is_shuffle):
            if shuffled:
                random.shuffle(pool)

        ### split train/test ###
        # Both pools are cut from the same shuffled ordering so they never overlap.
        index_split = int(self.size * train_portion)
        shuffled_order = self.imgindex[0]
        self.imgindex[1] = shuffled_order[index_split:]
        self.imgindex[0] = shuffled_order[:index_split]

        ### label map ###
        self.tlabel_map = lambda x: x

    def transform_label(self, labels_map):
        """Install ``labels_map``, a callable applied to every raw label.

        The raw label handed to the callable is the image file name without
        directory and extension.
        """
        self.tlabel_map = labels_map

    def get_index(self, idx, size=None, cmap='rgb'):
        """Return ``(image, label)`` for the image at position ``idx``.

        Args:
            idx: position in ``self.images`` / ``self.fnames``.
            size: optional ``[rows, cols]`` to resize to; ``None`` keeps the
                original size.
            cmap: ``'rgb'``, ``'grey'`` or ``'hsv'``; see ``_rgb2cmap``.
        """
        ### get images ###
        img = self.images[idx]
        ### convert to desired cmap ###
        img = self._rgb2cmap(img, cmap)
        ### resize if necessary
        if size is not None:
            img = self._img_resize(img, size)
        stem = os.path.splitext(os.path.basename(self.fnames[idx]))[0]
        return img, self.tlabel_map(stem)

    def get_batch(self, num, mode='all', size=None, cmap='rgb'):
        """Return the next ``num`` images and labels of pool ``mode``.

        The pool is consumed cyclically: when its end is reached the cursor
        wraps to the start, reshuffling first if the pool is a shuffled one.

        Args:
            num: number of samples to return.
            mode: one of ``self.modeval``.
            size, cmap: forwarded to ``get_index``.

        Returns:
            ``(batch_imgs, batch_labels)`` as two numpy arrays.

        Raises:
            ValueError: ``mode`` is not a known pool name.
            IndexError: samples were requested from an empty pool.
        """
        m = self._mode_index(mode)
        pool = self.imgindex[m]
        if num > 0 and not pool:
            raise IndexError("no images available in mode " + repr(mode))

        batch_imgs, batch_labels = [], []
        for _ in range(num):
            ### get image and modify ###
            img, label = self.get_index(pool[self.imgindex_cnt[m]], size, cmap)
            batch_imgs.append(img)
            batch_labels.append(label)
            ### update indexing ###
            self.imgindex_cnt[m] += 1
            if self.imgindex_cnt[m] == len(pool):
                if self.is_shuffle[m]:
                    random.shuffle(pool)
                self.imgindex_cnt[m] = 0
        return np.array(batch_imgs), np.array(batch_labels)

    def get_size(self, mode='all'):
        """Return the number of images in pool ``mode``.

        Raises:
            ValueError: ``mode`` is not a known pool name.
        """
        return len(self.imgindex[self._mode_index(mode)])

    def _mode_index(self, mode):
        # Translate a pool name into its position in the per-mode lists.
        if mode not in self.modeval:
            raise ValueError(str(mode) + " not in " + str(self.modeval))
        return self.modeval.index(mode)

    def _rgb2cmap(self, img, cmap):
        """Convert ``img`` to the requested colour representation.

        - ``'grey'``: ``skimage.color.rgb2gray`` plus a trailing channel axis
        - ``'hsv'``: ``skimage.color.rgb2hsv``
        - ``'rgb'``: values divided by 255.0 (presumably 8-bit input - confirm)

        Any other ``cmap`` returns the image unchanged.
        """
        if cmap == 'grey':
            # rgb2gray is the long-standing name; the rgb2grey alias has been
            # removed from scikit-image.
            img = skimage.color.rgb2gray(img)
            img = img.reshape((img.shape[0], img.shape[1], 1))
        elif cmap == 'hsv':
            img = skimage.color.rgb2hsv(img)
        elif cmap == 'rgb':
            img = img / 255.0
        return img

    def _img_resize(self, img, size):
        """Resize a ``(rows, cols, channels)`` image channel by channel.

        ``size`` is the target ``[rows, cols]``; any sequence is accepted.
        """
        size = list(size)  # allow tuples: tuple + list would raise TypeError
        channels = img.shape[2]
        resized_img = np.zeros(size + [channels])
        for k in range(channels):
            resized_img[..., k] = skimage.transform.resize(img[..., k], size, mode='reflect')
        return resized_img
class Label:
    """Bundle of three lookup tables relating sample ids, class numbers and class names.

    Method names encode the conversion: ``i`` = sample id, ``n`` = class
    number, ``s`` = class name (string).
    """

    def __init__(self, id2num, num2name, name2num):
        # Tables are stored as given; no copy is made.
        self.id2num, self.num2name, self.name2num = id2num, num2name, name2num

    def i2s(self, i):
        """Sample id -> class name, going through the class number."""
        number = self.id2num[i]
        return self.num2name[number]

    def i2n(self, i):
        """Sample id -> class number."""
        table = self.id2num
        return table[i]

    def n2s(self, i):
        """Class number -> class name."""
        table = self.num2name
        return table[i]

    def s2n(self, i):
        """Class name -> class number."""
        table = self.name2num
        return table[i]
def cifar10_read_label(csv_fname):
    """Read a label CSV and build the id / number / name lookup tables.

    The CSV must provide an ``id`` column and a ``label`` column. Each distinct
    label name gets an integer, numbered in the order ``unique()`` returns them.
    Both mappings are printed.

    Args:
        csv_fname: path of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        A ``Label`` built from three dicts: id (as string) -> number,
        number -> name, and name -> number.
    """
    df = pd.read_csv(csv_fname)

    ### explore labels ###
    unique_label = df.label.unique()
    num2name = dict(enumerate(unique_label))
    name2num = {name: num for num, name in num2name.items()}
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('num2name %s' % (num2name,))
    print('name2num %s' % (name2num,))

    ### convert to num label ###
    numeric_label = df.label.map(name2num)
    # Ids are stringified so they can be looked up by file-name stem.
    id2num = pd.Series(numeric_label.values, index=df.id.astype(str)).to_dict()
    return Label(id2num, num2name, name2num)
if __name__ == '__main__':
    # Demo: load the labels and images, draw one batch and show it in a 4x4 grid.
    labels = cifar10_read_label("../trainLabels.csv")
    idb = image_db("../train")
    # Map each file-name stem (the sample id) to its class name.
    idb.transform_label(labels.i2s)

    size = [100, 100]
    mode = 'test'
    cmap = 'grey'
    num_images = 16  # fills the 4x4 subplot grid below
    batch_img, batch_label = idb.get_batch(num_images, size=size, mode=mode, cmap=cmap)
    print(batch_label)

    plt.figure(figsize=(10, 10))
    for i in range(num_images):
        plt.subplot(4, 4, i + 1)
        if cmap == 'grey':
            # Grey batches carry a single trailing channel; imshow needs 2-D.
            # The data is already converted by image_db, so no further /255
            # rescale is applied here.
            plt.imshow(batch_img[i][:, :, 0].astype(float), cmap='gray')
        else:
            plt.imshow(batch_img[i])
        plt.title(batch_label[i])
    plt.show()
    plt.close()