# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 18:57:54 2018
@author: Meron
"""
import pandas as pd
import numpy as np
from sklearn.exceptions import NotFittedError
from sklearn.cross_decomposition import PLSRegression
import sklearn.datasets
import scipy.stats
from scipy.io import loadmat
class sMC:
    """
    Significance Multivariate Correlation (sMC).

    Parameters
    ----------
    model : object
        fitted object from PLS regression.
    opt : int
        optimal number of components of the PLS model.
    alpha_mc : float
        the chosen significance level for the F-test.

    Attributes
    ----------
    importances : array, shape [n_features, 1]
        The quantified importance of each variable, in the same
        order as the variables used in the input model.

    Notes
    -----
    The algorithm assumes that the data is provided centred
    and scaled (optional).
Examples
--------
from sklearn.cross_decomposition import PLSRegression
import sklearn.datasets
data = sklearn.datasets.load_boston()
X = data['data']
y = data['target']
pls = PLSRegression()
pls.fit(X,y)
smc = sMC()
important_dataset = smc.fit_transform(pls,X,alpha_mc=0.0001)
References
----------
    T.N. Tran, N.L. Afanador, L.M.C. Buydens, L. Blanchet,
Interpretation of variable importance in Partial Least Squares with Significance Multivariate Correlation (sMC),
Chemometrics and Intelligent Laboratory Systems, Volume 138, 15 November 2014, Pages 153-160
DOI: http://dx.doi.org/10.1016/j.chemolab.2014.08.005
https://rdrr.io/github/khliland/plsVarSel/src/R/filters.R
"""
    def __init__(self):
        """
        Initialize self. See help(type(self)) for accurate signature.
        """
        self.model, self.importances, self.significant_variables = None, None, None
        self.opt, self.params, self.alpha_mc, self.smcFcrit = None, None, None, None
    def fit(self, model, X, opt=None, alpha_mc=None):
"""
Computes the importance to the features given a dataset and a fitted
classification/regression model with coefficients for each parameter
Get a quantified importance value for each parameter in the matrix X
a set of column vectors equal in length to the number of variables
included in the model. It contains one column of mSC scores for each
predicted y-block column. The important variables are those who passes
the F-value test, and has a F-value over the critical f-value
associated with the chosen significance level
Parameters
----------
model: object
object from a classifier or regression model with atribute coef_.
X : Pandas Dataframe or numpy ndarray
data matrix values
opt : int
optimal number of components of PLS model.
alpha_mc : float
the chosen significance level for f-test. Range <0,1>
default value: 0.05
Attributes
-------
importances : numpy array
SMC F-values for the list of variables
smcFcrit: Float
F-critical cut-off threshold value for significant important
variables (smcF>smcFcrit)
significant_variables: List
list with false and true values according to smcF>smcFcrit
representing the important columns in the given dataset
Development note
----------------
should remove model object as input and replace it with the
coefficients for the given model, this way the code will be easier to
read. Also the code will generalise better since it could be used on
any model which has coefficients and not be restricted to the sklearn
and hoggorm packages.
Returns
-------
self
"""
        self.alpha_mc = 0.05 if alpha_mc is None else alpha_mc
        X = np.asarray(X)
        # The algorithm assumes centred features; the next two lines guard
        # against uncentred input (without mutating the caller's array).
        if np.round(np.sum(X), 9) != 0:  # not centred
            X = X - np.mean(X, axis=0)
        if hasattr(model, 'coef_'):  # sklearn
            b = model.coef_
        elif hasattr(model, 'cvType'):  # hoggorm
            opt = np.shape(model.X_loadings())[1] if opt is None else opt
            b = model.regressionCoefficients(numComp=opt)
        else:
            raise NotImplementedError('This model object type is not '
                                      'supported. The supported objects are '
                                      'sklearn and hoggorm PLS.')
        n = np.shape(X)[0]
        # Project X onto the regression vector: yhat = Xb, and Xhat is the
        # part of X explained by the model direction b.
        yhat = np.dot(X, b)
        Xhat = np.dot(yhat, b.T)/(np.linalg.norm(b)**2)
        Xresidual = X - Xhat
        # Sums of squares of the model-explained and residual parts, per column.
        SSCregression = np.sum(Xhat**2, axis=0)
        SSResidual = np.sum(Xresidual**2, axis=0)
        MSCregression = SSCregression
        MSResidual = SSResidual/(n - 2)
        # F-statistic per variable, compared against the critical value of an
        # F(1, n-2) distribution at the chosen significance level.
        smcF = np.divide(MSCregression, MSResidual)
        self.smcFcrit = scipy.stats.f.ppf(1 - self.alpha_mc, 1, n - 2)
        self.importances = smcF
        self.significant_variables = smcF > self.smcFcrit
        return self
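
    # Usage sketch: after fit, `importances` holds the per-variable sMC
    # F-values and `significant_variables` the boolean mask that transform
    # applies (the names below mirror the attributes documented above):
    #   smc = sMC().fit(pls, X, alpha_mc=0.01)
    #   X_reduced = smc.transform(X)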
    def transform(self, X):
        """
        Perform feature reduction by selecting the features above the F-value
        threshold associated with the significance level described by
        alpha_mc.

        Parameters
        ----------
        X : ndarray or pandas DataFrame, shape [n_samples, n_features]
            The data used to scale along the features axis.

        Returns
        -------
        value : numpy ndarray or DataFrame, same type as the input
            an n x z matrix, where n is the number of samples in X and z is
            the number of features retained; z is determined by the
            F-critical cut-off threshold value.
        """
        if self.significant_variables is not None:
            if isinstance(X, pd.DataFrame):  # dataframe
                return X[X.columns[self.significant_variables]]
            elif isinstance(X, np.ndarray):  # numpy array
                return X[:, self.significant_variables]
            else:
                raise TypeError('X must be a pandas DataFrame or numpy ndarray')
        else:
            raise NotFittedError('This sMC instance is not fitted yet. Call '
                                 'the fit method with appropriate arguments '
                                 'before using this method.')
    def fit_transform(self, model, X, opt=None, alpha_mc=None):
        """
        Fit to data based upon the data and the desired model, then transform
        the data to include only the important variables.

        Fits the transformer to X and returns a transformed version of X.

        Parameters
        ----------
        model : object
            object from a classifier or regression model with attribute
            coef_, or a PLS object from hoggorm.
        X : pandas DataFrame or numpy ndarray
            data matrix used as predictors in the model.
        opt : int
            optimal number of components of the PLS model.
        alpha_mc : float
            the chosen significance level for the F-test.

        Returns
        -------
        value : numpy ndarray or DataFrame, same type as the input
            an n x z matrix, where n is the number of samples in X and z is
            the number of features retained, based upon the threshold value.
        """
        if isinstance(X, pd.DataFrame):
            # .to_numpy() replaces the deprecated DataFrame.get_values()
            self.fit(model, X.to_numpy(), opt, alpha_mc)
            return self.transform(X)
        elif isinstance(X, np.ndarray):
            self.fit(model, X, opt, alpha_mc)
            return self.transform(X)
        else:
            raise TypeError('X must be a pandas DataFrame or numpy ndarray')
    def get_params(self, deep=True):
        """
        Get parameters for this class.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this module and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        self._params = {'importances': self.importances,
                        'significant_variables': self.significant_variables,
                        'smcFcrit': self.smcFcrit,
                        'alpha_mc': self.alpha_mc}
        return self._params
    def set_params(self, **parameters):
        """
        Set the parameters of this class; old parameters are kept if the
        input is None.

        The method works on simple instances as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        Adapted from https://scikit-learn.org/stable/developers/contributing.html

        Returns
        -------
        self
        """
for parameter, value in parameters.items():
setattr(self, parameter, value)
return self
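
# A minimal sketch of the coefficient-based variant suggested in the
# "Development note" of sMC.fit: compute the sMC F-values directly from a
# regression coefficient vector instead of a fitted model object. The name
# smc_f_values and its signature are illustrative and not part of the class
# API; b is assumed to be a column vector of shape (n_features, 1).
def smc_f_values(b, X, alpha_mc=0.05):
    """Return (smcF, smcFcrit) for data X and regression coefficients b."""
    X = np.asarray(X)
    X = X - np.mean(X, axis=0)                       # ensure centred features
    n = np.shape(X)[0]
    yhat = np.dot(X, b)                              # projection on the model direction
    Xhat = np.dot(yhat, b.T)/(np.linalg.norm(b)**2)
    Xresidual = X - Xhat
    smcF = np.sum(Xhat**2, axis=0)/(np.sum(Xresidual**2, axis=0)/(n - 2))
    smcFcrit = scipy.stats.f.ppf(1 - alpha_mc, 1, n - 2)
    return smcF, smcFcrit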
if __name__ == "__main__":
def test_same_as_matlab():
"""
test that the sMC score is equal to those provided from matlab
"""
data = sklearn.datasets.load_boston()
X = data['data']
y = data['target']
pls = PLSRegression()
pls.fit(X,y)
smc_mat = loadmat('./validering/values_smc_1_centered.mat')['values']
coef = loadmat('./validering/beta_1_centered')['BETA']
        pls.coef_ = coef[1:]  # leave the intercept out
smc = sMC()
smc.fit(pls,X)
corrects = np.sum(np.round(smc.importances,10) == np.round(smc_mat,10))
assert (corrects==np.shape(X)[1])
def test_equal_f_value():
"""
For a dataset with equal columns, check that the F-value is equal
"""
data = sklearn.datasets.load_boston()
X = np.column_stack((data['data'][:,0],data['data'][:,0],data['data'][:,0]))
y = data['target']
pls = PLSRegression()
pls.fit(X,y)
smc = sMC()
smc.fit(pls,X)
assert len(set(smc.importances))==1
def test_correct_dim_out():
"""
Checks that the output dimensjons are reduced as it should.
"""
data = sklearn.datasets.load_boston()
X = data['data']
y = data['target']
pls = PLSRegression()
pls.fit(X,y)
smc = sMC()
smc.fit(pls,X)
        smc.significant_variables = np.array([False, False, False, True, False, False,
                                              True, False, False, False, True, True, True])
assert (np.shape(smc.transform(X))==(506,5))
def test_random_columns_low():
"""
Check that the imporance of randomly generated columns are low over some number of iterations
"""
from sklearn.model_selection import GridSearchCV
np.random.seed(99)
pls = PLSRegression()
        no_iter = 10
        smc = sMC()
        no_params = 18
        samples = 506
        variable_imp = {key: [] for key in range(no_params)}
        for i in range(no_iter):
            data = np.random.normal(size=(samples, no_params))
            noise_y = np.random.normal(0, 1, samples)
            y = 2*data[:, 0] - 3*data[:, 1] + noise_y
pls = PLSRegression()
params = {'n_components':list(range(1,no_params))}
gs=GridSearchCV(estimator = pls,
param_grid=params,
scoring='neg_mean_absolute_error',
cv=5)
gs.fit(data,y)
smc.fit(gs.best_estimator_,data)
for key in variable_imp.keys():
variable_imp[key].append(smc.importances[key])
variable_imp_means = [np.mean(variable_imp[i]) for i in range(no_params)]
        variable_imp_min = np.min(variable_imp_means[:2])    # min importance of the original features
        r_variable_imp_max = np.max(variable_imp_means[2:])  # max importance of the random features
assert variable_imp_min > r_variable_imp_max
    data = sklearn.datasets.load_boston()
    X = data['data']
    y = data['target']
    pls = PLSRegression()
    pls.fit(X, y)
    smc = sMC()
    smc.fit_transform(pls, pd.DataFrame(X), alpha_mc=0.01)
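
    # The test functions above are defined but never invoked; a minimal
    # sketch of running them directly. This assumes a scikit-learn version
    # that still ships load_boston (< 1.2); test_same_as_matlab additionally
    # needs the MATLAB reference files under ./validering/.
    # test_same_as_matlab()
    test_equal_f_value()
    test_correct_dim_out()
    test_random_columns_low()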