-
Notifications
You must be signed in to change notification settings - Fork 0
/
doregression.py
188 lines (169 loc) · 7.29 KB
/
doregression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from __future__ import with_statement
import copy
import csv
import gzip
import math
import matplotlib.pyplot as plt
import pdb
import plotly.plotly as py
import numpy as np
# Print arrays in full, never summarised with "...".
# The previous threshold='nan' (a string) only had this effect by accident on
# Python 2 with old NumPy; current NumPy rejects non-numeric and NaN
# thresholds. An infinite threshold is never exceeded by any array size.
np.set_printoptions(threshold=np.inf)
import scipy
import time
import urllib
from collections import namedtuple, Counter
from copy import deepcopy
from plotly.graph_objs import *
# NOTE(review): a Plotly username and API key are hard-coded here and therefore
# committed with the source. `py` is not referenced anywhere else in this
# script, so this sign-in looks unnecessary for the regression run -- TODO
# confirm, then remove the call or load the credentials from the environment /
# a local config file, and rotate the exposed key.
py.sign_in('Eftychia', '2puhmq6aj8')
from pylab import *
from scipy.stats.stats import pearsonr
from sklearn import datasets, linear_model, cross_validation, metrics, clone, gaussian_process, svm, preprocessing
from sklearn.cross_validation import KFold, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.externals.six.moves import xrange
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
##########################################################################################################################
# Load the precomputed arrays from the current working directory.
# All four .npy files must exist; np.load raises if one is missing.
arrayNames = np.load("arrayNames.npy")
arrayFeatures = np.load("arrayFeatures.npy")
codingsLengths = np.load("codingsLengths.npy")
arrayCodSequences = np.load("arrayCodSequences.npy")

# Report the shapes as a quick sanity check. Single-argument print(...) with
# %-formatting emits the same text on Python 2 and Python 3 and matches the
# print(...) calls used for the final summary of this script.
print("arrayNames :  %s" % (arrayNames.shape,))
print("arrayFeatures :  %s" % (arrayFeatures.shape,))
print("codingsLengths :  %s" % (codingsLengths.shape,))
print("arrayCodSequences :  %s" % (arrayCodSequences.shape,))

# Regression inputs: the coding lengths are the only predictor, and the last
# column of arrayFeatures is the target (presumably the log initiation rate,
# going by the plot labels further down -- TODO confirm).
X = codingsLengths
y = arrayFeatures[:, -1]

# 5-fold cross-validation over the sample indices. Despite the name `skf`,
# this is a plain KFold, not a StratifiedKFold.
skf = cross_validation.KFold(len(y), n_folds=5)
# # Do SVR
# allScores = []
# print "X and Y", X.shape, y.shape
# fold = 1
# for train_index, test_index in skf:
# #print("TRAIN:", train_index, "TEST:", test_index)
# X_train, X_test = X[train_index], X[test_index]
# y_train, y_test = y[train_index], y[test_index]
# print "X_train.shape",X_train.shape
# print "y_train.shape", y_train.shape
# svr_linear = svm.SVR(kernel='linear', degree=4, C=1, cache_size=300)
# # svr_linear = svm.SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma=0.0,
# # kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) #SupportVectorMachine - Support Vector Regression
# print "start training"
# X_train = X_train.reshape( (len(X_train), 1) )
# svr_linear.fit(X_train, y_train)
# print "done training"
# score = svr_linear.score(X_test, y_test)
# allScores.append(score)
# y_linear_predictTest = svr_linear.predict(X_test)
# y_linear_predictTrain = svr_linear.predict(X_train)
# ys = np.vstack([y_train , y_linear_predictTrain]).T
# ys = ys[ys[:, 0].argsort()]
# #print ys
# y_train = ys[:,0]
# y_linear_predictTrain = ys[:,1]
# ys = np.vstack([y_test , y_linear_predictTest]).T
# ys = ys[ys[:, 0].argsort()]
# #print ys
# y_test = ys[:,0]
# y_linear_predictTest = ys[:,1]
# #Images
# plt.close('all')
# plt.figure()
# plt.subplots_adjust(hspace=.5, wspace=.5)
# plt.subplot(1,2,1)
# plt.title('SVR train')
# allSampleIndexes=np.arange(len(X_train[:])) #we put the len so to give to each sample a number
# plt.scatter(allSampleIndexes, y_train, c='g', edgecolor='none', s=3, label='train')
# plt.hold('on')
# plt.scatter(allSampleIndexes, y_linear_predictTrain, c='r', edgecolor='none', s=3, label='prediction')
# plt.xlabel('#of rows')
# plt.ylabel('initiation rates (log)')
# art=[]
# lgd = plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
# art.append(lgd)
# plt.subplot(1,2,2)
# allSampleIndexes=np.arange(len(X_test[:])) #we put the len so to give to each sample a number
# plt.scatter(allSampleIndexes, y_test, c='g', edgecolor='none', s=3, label='test')
# plt.hold('on')
# plt.scatter(allSampleIndexes, y_linear_predictTest, c='r', edgecolor='none', s=3, label='prediction')
# plt.xlabel('#of rows')
# plt.ylabel('initiation rates (log)')
# plt.title('SVR test')
# lgd = plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
# art.append(lgd)
# plt.suptitle('SVR_length'+ `fold`+'Fold')
# plt.savefig('CodingRegression/svr_length'+ `csvFile` +'Fold '+`fold`+'.png', additional_artists=art, bbox_inches='tight')
# # plt.show()
# plt.clf()
# fold += 1
# # Print SVR scores
# allScores = np.array(allScores)
# print "SVR Scores:", allScores
# print("Accuracy: %0.2f (+/- %0.2f)" % (allScores.mean(), allScores.std()))
# print('MeanAbsoluteError Train: {}'.format(metrics.mean_absolute_error(y_linear_predictTrain, y_train)))
# print('MeanAbsoluteError Test: {}'.format(metrics.mean_absolute_error(y_linear_predictTest, y_test)))
# Do Random Forest
#
# For every cross-validation fold in `skf`: fit a RandomForestRegressor on the
# single-column feature X against y, record the test-set score, and save a
# two-panel figure (train | test) comparing actual and predicted values.
# After the loop, print the per-fold scores and the mean absolute errors.


def _plot_sorted_panel(position, title, actual, predicted, actual_label):
    """Draw one scatter panel of actual vs. predicted values.

    position     -- 1-based column of the panel in the 1x2 subplot grid
    title        -- panel title
    actual       -- true target values, already sorted ascending
    predicted    -- predictions re-ordered to line up with `actual`
    actual_label -- legend label for the true values ('train' / 'test')

    Returns the legend so the caller can hand it to savefig as an extra
    artist (the legend is anchored below the axes).
    """
    plt.subplot(1, 2, position)
    plt.title(title)
    # The x axis is just the rank of each sample after sorting by true value.
    sample_indexes = np.arange(len(actual))
    plt.scatter(sample_indexes, actual, c='g', edgecolor='none', s=3,
                label=actual_label)
    # No plt.hold('on') needed: successive scatter calls already share axes,
    # and plt.hold no longer exists in matplotlib >= 3.0.
    plt.scatter(sample_indexes, predicted, c='r', edgecolor='none', s=3,
                label='prediction')
    plt.xlabel('#of rows')
    plt.ylabel('initiation rates (log)')
    return plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)


allScores = []
trainErrors = []
testErrors = []
for fold, (train_index, test_index) in enumerate(skf, 1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # scikit-learn expects a 2-D (n_samples, n_features) matrix; X holds a
    # single feature, so turn each split into one column.
    X_train = X_train.reshape((len(X_train), 1))
    X_test = X_test.reshape((len(X_test), 1))

    rndmForest = RandomForestRegressor(n_estimators=100)
    print("Start training Random Forest")
    rndmForest.fit(X_train, y_train)
    print("Done training")

    # For a regressor, score() is the coefficient of determination (R^2).
    allScores.append(rndmForest.score(X_test, y_test))

    y_predictTrain = rndmForest.predict(X_train)
    y_predictTest = rndmForest.predict(X_test)

    # Keep the error of every fold so the final summary covers all folds
    # instead of only whatever the last iteration left behind.
    trainErrors.append(metrics.mean_absolute_error(y_train, y_predictTrain))
    testErrors.append(metrics.mean_absolute_error(y_test, y_predictTest))

    # Sort by the true value (predictions follow the same permutation) so the
    # plots show a monotone "actual" curve with the predictions around it.
    train_order = np.argsort(y_train)
    y_train, y_predictTrain = y_train[train_order], y_predictTrain[train_order]
    test_order = np.argsort(y_test)
    y_test, y_predictTest = y_test[test_order], y_predictTest[test_order]

    # Images
    plt.close('all')
    plt.figure()
    plt.subplots_adjust(hspace=.5, wspace=.5)
    art = [
        _plot_sorted_panel(1, 'rndmForest train scaled',
                           y_train, y_predictTrain, 'train'),
        _plot_sorted_panel(2, 'rndmForest test',
                           y_test, y_predictTest, 'test'),
    ]
    plt.suptitle('rndmForest_length%dFold' % fold)
    # The file name used to include `csvFile`, which is never defined in this
    # script and made the first savefig raise NameError. The legends sit
    # outside the axes, so pass them as bbox_extra_artists (the keyword
    # savefig actually understands) to keep them inside the tight bounding
    # box. The CodingRegression/ directory must already exist.
    plt.savefig('CodingRegression/rndmForest_lengthFold %d.png' % fold,
                bbox_extra_artists=art, bbox_inches='tight')
    # plt.show()
    plt.clf()

allScores = np.array(allScores)
print("Random Forest Scores: %s" % (allScores,))
# NOTE: the value labelled "Accuracy" is the mean R^2 over the folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (allScores.mean(), allScores.std()))
# Mean absolute error averaged over all folds.
print('MeanAbsoluteError Train: {}'.format(np.mean(trainErrors)))
print('MeanAbsoluteError Test: {}'.format(np.mean(testErrors)))