-
Notifications
You must be signed in to change notification settings - Fork 0
/
morderstats.py
324 lines (279 loc) · 13 KB
/
morderstats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
"""
morderstats.py
Multivariate Order Statistics
This script should compute the desired prediction regions of a certain type
(Mahalanobis, Halfspace Peeling, Direct Convex Hull Peeling)
and will write and plot all of them in the specified directory.
"""
from argparse import ArgumentParser
import os
import sys
import shutil
import itertools
import json
import numpy as np
import matplotlib.pyplot as plt
import scipy.spatial
import pandas as pd
try:
import pyhull
USE_PYHULL = True
except ImportError:
USE_PYHULL = False
import distributions
parser = ArgumentParser()
parser.add_argument("--sources-file",
help="The file which contains the data",
dest="sources_file",
type=str,
default=None)
parser.add_argument("--method",
help="The method for producing quantile regions."
"Can be one of the following methods:"
"mahal: uses mahalanobis distance to produce regions"
"halfspace: Uses halfspace peeling to produce regions"
"direct: Directly peels off convex hulls to produce regions",
dest="method",
type=str)
parser.add_argument("--write-file",
help="The name of the file to write the convex sets which define the regions",
dest="write_file",
type=str,
default='prediction_regions.csv')
parser.add_argument("--write-directory",
help="The name of the directory which will contain the written files and plots",
dest="write_directory",
type=str)
parser.add_argument("--alpha",
help="The alpha level for the prediction region "
"If it is unspecified, it will produce every hull "
"for alpha=.01-.99.",
dest="alpha",
type=float,
default=None)
parser.add_argument("--use-pyhull",
help="If this option is set, then the program will use the implementation "
"of the halfspace algorithm which is dependent on pyhull. If it is not "
"set, it will try to use the scipy implementation",
dest="use_pyhull",
action="store_true")
def main():
args = parser.parse_args()
if os.path.isdir(args.write_directory):
print("To run this program, this script will delete directory {}".format(args.write_directory))
response = input("Is this okay (y/n)? ")
while response not in 'yn':
response = input("Please answer with one of y or n: ")
if response == 'y':
shutil.rmtree(args.write_directory)
else:
sys.exit()
os.mkdir(args.write_directory)
global USE_PYHULL
if args.use_pyhull and not USE_PYHULL:
print("Cannot find the module pyhull, please make sure it is installed in the proper location.")
print("Will attempt to use the scipy implementation")
args.use_pyhull = False
if not args.use_pyhull:
USE_PYHULL = False
scipy_version = int(scipy.__version__.split('.')[1])
if scipy_version < 19:
print("Cannot use scipy implementation as your version of scipy is too old")
print("To use the scipy implementation, you must update to at least version 0.19.0")
sys.exit(1)
with open(args.sources_file) as f:
dataframe = pd.read_csv(f, header=None)
values = dataframe.values
print("Constructing Regions")
write_and_plot_regions(values, args.method, args.write_directory, args.write_file, args.alpha)
def write_and_plot_regions(points, method, write_directory, write_file, alpha=None):
"""
Delegates the work of generating regions to one of three methods
Args:
points (np.ndarray): A numpy array of points n x p where n is number of points, p is dimension of points
method (str): One of 'mahal', 'halfspace', 'direct'
write_directory (str): The directory which will contain the write files
write_file (str): The name of the file which contains the region information
alpha (float): The desired alpha level for the prediction region
"""
if method == 'mahal':
mahalanobis_regions(points, write_directory, write_file, alpha)
elif method == 'halfspace':
halfspace_regions(points, write_directory, write_file, alpha)
elif method == 'direct':
direct_regions(points, write_directory, write_file, alpha)
else:
print('The method for producing regions is either unfilled or unrecognized'
'--method must be set to one of mahal, halfspace, or direct'
'to produce regions')
sys.exit(1)
def mahalanobis_regions(points, write_directory, write_file, alpha=None):
"""
Computes the respective mahalanobis region for the specified alpha.
Then it writes the the representative points of the convex hull to the specified file
Args:
points (np.ndarray): A numpy array of points n x p where n is number of points, p is dimension of points
write_directory (str): The directory which will contain the write files
write_file (str): The name of the file which contains the region information
alpha (float): The desired alpha level for the prediction region, if None, then calculates for alpha=1,..,100
"""
region = distributions.MahalanobisRegion(points)
peels = []
if alpha is not None:
region.set_region(alpha)
hull = region.hull
realized_alpha = region.realized_alpha
peels = [hull.points[hull.vertices]]
alphas = [realized_alpha]
region.plot('mahalanobis_region', write_directory)
else:
alphas = []
for i in range(1, 100):
alpha = i / 100
try:
region.set_region(alpha)
except ValueError: # cannot make any more regions
break
hull = region.hull
realized_alpha = region.realized_alpha
hull_points = hull.points[hull.vertices]
peels.append(hull_points)
alphas.append(realized_alpha)
list_of_points = region.all_points
plot_regions(peels, list_of_points, write_directory, 'mahalanobis_regions')
write_regions(peels, alphas, write_directory, write_file)
def halfspace_regions(points, write_directory, write_file, alpha=None):
"""
Computes the respective halfspace peel for the specified alpha.
Then it writes the the representative points of the convex hull to the specified file
Args:
points (np.ndarray): A numpy array of points n x p where n is number of points, p is dimension of points
write_directory (str): The directory which will contain the write files
write_file (str): The name of the file which contains the region information
alpha (float): The desired alpha level for the prediction region, if None, then computes all convex hulls
"""
region = distributions.HalfspaceDepthRegion(points)
if USE_PYHULL:
distr = distributions.MultivariateEmpiricalDistribution(points, raw_data=True)
if alpha is not None:
list_of_points, hull, _, realized_alpha = distr.halfspacedepth_quantile_region(alpha)
peels = [hull.points[hull.vertices]]
alphas = [realized_alpha]
else:
distr.halfspacedepth_quantile_region(1)
peels = distr.allhulls
alphas = distr.alphas
list_of_points = np.array(distr.data_matrix)
plot_regions(peels, list_of_points, write_directory, 'halfspace_region')
else:
if alpha is not None:
region.set_region(alpha)
peels = [region.hull.points[region.hull.vertices]]
alphas = [region.realized_alpha]
region.plot('halfspace_region', write_directory)
else:
region.set_region(1)
peels = region.all_hulls
alphas = region.all_alphas
region.plot_sequence('halfspace_regions', write_directory)
write_regions(peels, alphas, write_directory, write_file)
def direct_regions(points, write_directory, write_file, alpha=None):
"""
Computes the respective direct convex hull peel for the specified alpha.
Then it writes the the representative points of the convex hull to the specified file
Args:
points (np.ndarray): The fitted multivariate empirical distribution to the data
write_directory: The directory which will contain the write files
write_file: The name of the file which contains the region information
alpha: The desired alpha level for the prediction region, if None, then computes all convex hulls
"""
region = distributions.DirectRegion(points)
if USE_PYHULL:
distr = distributions.MultivariateEmpiricalDistribution(points, raw_data=True)
if alpha is not None:
list_of_points, hull, _, realized_alpha = distr.direct_convex_hull_quantile_region(alpha)
peels = [hull.points[hull.vertices]]
alphas = [realized_alpha]
else:
distr.direct_convex_hull_quantile_region(1)
peels = distr.allhulls
alphas = distr.alphas
list_of_points = np.array(distr.data_matrix)
plot_regions(peels, list_of_points, write_directory, 'direct_region')
else:
if alpha is not None:
region.set_region(alpha)
peels = [region.hull.points[region.hull.vertices]]
alphas = [region.realized_alpha]
region.plot('direct_region', write_directory)
else:
region.set_region(1)
peels = region.all_hulls
alphas = region.all_alphas
region.plot_sequence('direct_regions', write_directory)
write_regions(peels, alphas, write_directory, write_file)
def plot_regions(peels, points_in_hull, plot_directory, plot_file):
"""
Plots each of the hulls in argument peels to the directory plot_directory
with name plot_file
Args:
peels: A list of peels (each is a list of points in defining polytope)
points_in_hull: list of the points
plot_directory: directory name to store the plots
plot_file: the name of the plot file
"""
print("Plotting to {}".format(plot_directory))
if not os.path.isdir(plot_directory):
os.mkdir(plot_directory)
for i, peel in enumerate(peels):
hull = scipy.spatial.ConvexHull(peel)
vertices = hull.points[hull.vertices]
for j, comb in enumerate(itertools.combinations(range(len(vertices[0])), 2)):
dimensions = list(zip(*vertices))
xs, ys = dimensions[comb[0]], dimensions[comb[1]]
projection_hull = scipy.spatial.ConvexHull(list(zip(xs, ys)))
projection_hull_vertices = projection_hull.points[projection_hull.vertices]
xs, ys = zip(*projection_hull_vertices)
plt.figure(j)
plt.plot(xs + (xs[0],), ys + (ys[0],), 'k-')
real_points = list(zip(*points_in_hull))
xs, ys = real_points[comb[0]], real_points[comb[1]]
plt.plot(xs, ys, 'b.')
for i, comb in enumerate(itertools.combinations(range(len(vertices[0])), 2)):
plt.figure(i)
dim1, dim2 = comb[0] + 1, comb[1] + 1 # change to 1 indexing
plt.title("{} Projection onto dimension {} versus dimension {}".format(plot_file, dim1, dim2))
plt.xlabel("Dimension " + str(dim1))
plt.ylabel("Dimension " + str(dim2))
plt.savefig(plot_directory + os.sep + plot_file + str(dim1) + 'vs' + str(dim2) + '.png')
def write_regions(peels, alphas, write_directory, write_file):
"""
This function writes out the prediction regions in two formats.
One is a json file and the other is a structured text file.
Each of the regions contain an alpha, the points, and the volume as fields.
Args:
peels: A list of peels (each is a list of points in defining polygon)
alphas: A list of alphas corresponding to the peels
write_directory: The directory which will contain the write files
write_file: The name of the json file
"""
print("Writing to {}".format(write_directory + os.sep + write_file))
list_of_regions = []
for i, peel in enumerate(peels):
hull = scipy.spatial.ConvexHull(peel)
vertices = hull.points[hull.vertices].tolist()
region = {}
region['points'] = vertices
region['alpha'] = alphas[i]
region['volume'] = hull.volume
list_of_regions.append(region)
with open(write_directory + os.sep + write_file, 'w') as f:
for region in list_of_regions:
f.write('Alpha: {}\n'.format(region['alpha']))
f.write('Volume: {}\n'.format(region['volume']))
f.write('Points:\n')
for point in region['points']:
f.write(','.join(map(str, point)) + '\n')
f.write('\n')
if __name__ == '__main__':
main()