# data.py -- forked from dwave-examples/feature-selection-cqm
# Copyright 2023 D-Wave Systems Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
import os
import numpy as np
import pandas as pd
import openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from dwave.plugins.sklearn import SelectFromQuadraticModel
class DataSetBase:
    """Base class for feature selection datasets.

    Subclasses should define the following attributes:
        X (pandas.DataFrame): Feature data with features as columns.
        y (array): Target data.
        n (int): Number of features.
        baseline_cv_score (float):
            Baseline cross-validation score with all features.
        score_range (tuple):
            Lower and upper values for displaying cross-validation scores.
        default_redundancy_penalty (float): Default redundancy penalty value.
        default_k (int): Default setting for number of features to select.
        name (str, optional):
            Identifier used in the redundancy cache file name.  When not
            set, the lower-cased class name is used instead.
    """

    # The redundancy matrix is only cached to disk when the dataset has more
    # features than this.
    _CACHE_MIN_FEATURES = 500

    def get_relevance(self):
        """Return array of values for relevance of each feature to the target.

        The relevance of a feature is the absolute value of its correlation
        coefficient with the target ``y``.
        """
        return np.array(
            [abs(np.corrcoef(x, self.y)[0, 1]) for x in self.X.values.T])

    def calc_redundancy(self):
        """Compute and return 2d array of feature redundancy values.

        Entry ``(i, j)`` is the absolute value of the correlation coefficient
        between feature columns ``i`` and ``j``.
        """
        return np.abs(np.corrcoef(self.X.values, rowvar=False))

    def get_redundancy(self):
        """Return 2d array of feature redundancy values, possibly cached to disk.

        For datasets with more than ``_CACHE_MIN_FEATURES`` features the
        matrix is stored in ``redundancy-<name>.pkl`` in the current working
        directory so that it does not need to be computed each time the app
        is launched.  This is probably not needed for dataset sizes that would
        be used with the app, as np.corrcoef is quite fast.
        """
        if self.n <= self._CACHE_MIN_FEATURES:
            return self.calc_redundancy()

        # Subclasses are not required to define `name`; fall back to the class
        # name so that the cache path never fails with AttributeError.
        name = getattr(self, 'name', type(self).__name__.lower())
        data_path = f'redundancy-{name}.pkl'

        if os.path.exists(data_path):
            # NOTE: pickle can execute arbitrary code on load; this file must
            # only ever be a cache previously written by this method.
            with open(data_path, 'rb') as f:
                return pickle.load(f)

        print('Calculating redundancy data...')
        data = self.calc_redundancy()
        print('Storing redundancy data')
        with open(data_path, 'wb') as f:
            pickle.dump(data, f)
        return data

    def get_selected_features(self, X_new):
        """Post-process result from plug-in to return selected feature indices.

        Args:
            X_new (np.ndarray):
                Reduced dataset with selected features (2d, one column per
                selected feature).

        Returns:
            list of int: Indices of the columns of ``X`` that are equal,
            element by element, to some column of ``X_new``, in increasing
            order.

        Raises:
            ValueError: If ``X_new`` does not have the same number of rows
                as ``X``.
        """
        X_full = self.X.values
        n_rows, n = X_full.shape
        m_rows, _ = X_new.shape
        if n_rows != m_rows:
            raise ValueError(
                f'X_new has {m_rows} rows but the dataset has {n_rows}')

        # The plug-in only returns the reduced data, so the selected features
        # are recovered by matching each original column against all columns
        # of X_new at once.
        return [i for i in range(n)
                if np.all(X_new == X_full[:, [i]], axis=0).any()]

    def solve_feature_selection(self, k, alpha):
        """Construct and solve feature selection CQM using plugin.

        Args:
            k (int):
                Number of features to select.
            alpha (float):
                Parameter between 0 and 1 that defines the relative weight of
                linear and quadratic coefficients.

        Returns:
            list of int: Indices of selected features.
        """
        selector = SelectFromQuadraticModel(num_features=k, alpha=alpha)
        X_new = selector.fit_transform(self.X.values, self.y)
        return self.get_selected_features(X_new)

    def score_indices_cv(self, indices, cv=3):
        """Compute the accuracy score of a random forest classifier trained using the specified features.

        Args:
            indices (array): Array of feature indices.
            cv (int): Number of folds.

        Returns:
            float: Cross-validation accuracy score.
        """
        clf = RandomForestClassifier()
        # NB: the classifier is both trained and evaluated `cv` times in a
        # NB: loop inside this call to `sklearn.model_selection.cross_val_score`:
        return np.mean(
            cross_val_score(clf, self.X.iloc[:, indices], self.y, cv=cv))

    def score_baseline_cv(self, reps=5):
        """Compute baseline accuracy score of a random forest classifier trained using all features.

        Args:
            reps (int): Number of times to repeat cross-validation.

        Returns:
            float: Average cross-validation accuracy score across `reps`
            repetitions.
        """
        indices = list(range(np.size(self.X, 1)))
        return np.mean([self.score_indices_cv(indices) for _ in range(reps)])
class Titanic(DataSetBase):
    """Titanic dataset read from the local file ``formatted_titanic.csv``.

    The ``survived`` column is the target; every other column is used as a
    feature and converted to float.
    """

    def __init__(self):
        label = 'survived'
        frame = pd.read_csv('formatted_titanic.csv')

        # Split the table into feature columns and the target column.
        self.X = frame.drop(columns=label).astype(float)
        self.y = frame[label].values
        self.n = self.X.shape[1]

        # Stored baseline score and display / default settings.
        self.baseline_cv_score = 0.69
        self.score_range = (0.6, 0.8)
        self.default_redundancy_penalty = 0.68
        self.default_k = 8
class Scene(DataSetBase):
    """Scene dataset fetched from OpenML (dataset id 312).

    The dataset's default target attribute is used as the target and cast to
    int; the feature columns are converted to float.
    """

    def __init__(self):
        # Stored baseline score and display / default settings.
        self.baseline_cv_score = 0.90
        self.score_range = (0.79, 0.95)
        self.default_k = 30
        self.default_redundancy_penalty = 0.4

        scene = openml.datasets.get_dataset(312)
        # get_data also returns the categorical indicator and the attribute
        # names, neither of which is needed here.
        features, labels, _, _ = scene.get_data(
            target=scene.default_target_attribute, dataset_format='dataframe')

        self.y = labels.values.astype(int)
        self.X = features.astype(float)
        self.n = self.X.shape[1]
def DataSet(name):
    """Return a new instance of the dataset class registered under `name`.

    Args:
        name (str):
            Name of feature selection dataset: either 'titanic' or 'scene'.
            The 'titanic' dataset contains 14 features, and the 'scene'
            dataset contains 299 features.

    Raises:
        KeyError: If `name` is not one of the known dataset names.
    """
    registry = dict(titanic=Titanic, scene=Scene)
    dataset_cls = registry[name]
    return dataset_cls()
if __name__ == '__main__':
    # When run as a script, print the baseline cross-validation score of each
    # dataset.  These are the values stored as `baseline_cv_score` in the
    # DataSet definitions.
    for name in ('titanic', 'scene'):
        score = DataSet(name).score_baseline_cv()
        print(f'Baseline score for {name}: {score}')