# exploratory_data_analysis.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
# To begin with, we will focus on a single smaller data set to get a sense of the data.
snapshot = pd.read_csv('data/latest-snapshot-quarterly.csv')
snapshot.info()  # info() prints its summary directly and returns None
print()
snapshot.hist(bins=50, figsize=(20, 15))
plt.show()
# "Current assets can be converted into cash is less than one year."
# On average, companies hold slightly more in total assets (~2K more) than in current assets
# Values appear to be in $USD
# SEC ID looks like the key for joining this data with the info in companies-names-industries.csv
# We would like to predict: diluted earnings per share
print(snapshot["EarningsPerShareDiluted"].describe(), '\n')
# Worst case scenario: each share is a huge loss (-$60)!
# The mean is higher than the median, suggesting a few high performers offset the "failures"
# We would like to find those high-performers.
# Let's see how our target variable correlates with the other economic variables (excluding date and ID)
corr_matrix = snapshot.drop(["SEC ID", "Report date"], axis=1).corr()
print(corr_matrix["EarningsPerShareDiluted"].sort_values(ascending=False), '\n')
# Non-negligible but weak positive correlation with ComprehensiveIncomeNetOfTax
# "Includes deferred gains or losses on qualifying hedges, unrealized holding gains or losses on available-for-sale
# securities, minimum pension liability, and cumulative translation adjustment."
# So, possibly, the more "potential" value a company holds, the higher its diluted EPS.
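# A quick visual check of that weak relationship (a sketch; the column names
# are taken from the correlation output above):
snapshot.plot(kind="scatter", x="ComprehensiveIncomeNetOfTax",
              y="EarningsPerShareDiluted", alpha=0.3)
plt.show()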
y = snapshot["EarningsPerShareDiluted"]
X = snapshot.drop(["EarningsPerShareDiluted", "Report date", "SEC ID"], axis=1)
# Dealing with missing values
print("Are there missing values in X?", any(pd.isna(X).any()))
print("Are there missing values in y?", pd.isna(y).any())
features = X.columns[X.isna().any()].tolist()
print("List of features:\n", ','.join(features))
print("Do all feature columns have NANs?", len(features) == X.shape[1])
# All feature columns have NANs
# Also need to remove NANs from target -- or do we?
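# One option, sketched here as an unused helper: drop rows with a missing
# target rather than invent one, so the models never train on imputed labels.
def drop_missing_target(X_df, y_ser):
    mask = y_ser.notna()
    return X_df[mask], y_ser[mask]
# Here we keep every row and impute instead: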
median_imputer = SimpleImputer(strategy="median")
X = median_imputer.fit_transform(X)
# The fill strategy for NANs appears to have little effect on downstream model performance
median_target_val = y.median()
y = y.fillna(median_target_val)
y = y.to_numpy()
# Feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Feature scaling also has little effect and does not explain the weak correlations seen below
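# Caveat: fitting the imputer and scaler on all of X before the train/test
# split below leaks test-set statistics into training. A leak-free sketch
# (not wired into the rest of this script) would bundle both into a Pipeline:
from sklearn.pipeline import Pipeline
preprocess = Pipeline([("impute", SimpleImputer(strategy="median")),
                       ("scale", StandardScaler())])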
# Feature importance
forest = ExtraTreesRegressor(n_estimators=250, random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
print("Feature importances:\n", importances)
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature rankings:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
# With such wide standard deviations across trees, none of the features appears particularly informative.
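# A sanity check on the impurity-based importances above (a sketch using
# sklearn's permutation_importance, available since sklearn 0.22):
from sklearn.inspection import permutation_importance
perm = permutation_importance(forest, X, y, n_repeats=10, random_state=0)
print("Permutation importances (mean):\n", perm.importances_mean)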
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("\nLinear Regression")
lr = LinearRegression()
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Poor fit: R^2 is low, and the gap between train and test MSE shows heavy overfitting
print("\nRandom Forest")
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=4)
rf.fit(X_train, y_train)
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Train R^2 is far above test R^2, and the train/test error gap still shows overfitting
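# GridSearchCV is imported above but never used; a sketch of how it could
# regularize the forest (the parameter grid is a guess, not tuned values):
param_grid = {"max_depth": [3, 5, 10], "min_samples_leaf": [1, 5, 20]}
grid = GridSearchCV(RandomForestRegressor(n_estimators=100, random_state=42),
                    param_grid, cv=5, scoring="neg_mean_squared_error",
                    n_jobs=4)
grid.fit(X_train, y_train)
print("Best forest params:", grid.best_params_)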
print("\nLasso Regression")
lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Train and test R^2 barely differ (both near zero), but the train/test error gap is still severe
print("\nRidge Regression")
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Similar to Lasso
print("\nElastic Net")
en = ElasticNet(alpha=0.1)
en.fit(X_train, y_train)
y_train_pred = en.predict(X_train)
y_test_pred = en.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Similar to Lasso and Ridge
print("\nAdaBoost")
ada = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Similar to Random Forests with slightly less overfitting
print("\nGradient Boosting")
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
y_train_pred = gb.predict(X_train)
y_test_pred = gb.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Similar to AdaBoost
print("\nXGBoost")
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Train R^2:", r2_score(y_train, y_train_pred))
print("Test R^2:", r2_score(y_test, y_test_pred))
# Similar to the other boosting methods
# We probably need more data to improve predictions.
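# Before blaming data volume alone, a sketch of k-fold cross-validation for
# a more robust error estimate than the single split used above:
from sklearn.model_selection import cross_val_score
cv_mse = -cross_val_score(GradientBoostingRegressor(), X, y, cv=5,
                          scoring="neg_mean_squared_error")
print("CV MSE per fold:", cv_mse, "mean:", cv_mse.mean())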
# Inspect the structure of the quarterly data for a single indicator:
assets = pd.read_csv('data/metrics/Assets-quarterly.csv')
assets = assets.drop("SEC ID", axis=1)
print('\n', assets.head())
print(assets.to_numpy())
# Each row is a company; each column is a quarter, from oldest to newest
# Load all quarterly reports as features, merge them, and load the target as well (sketched below)
print('\nQuarters', list(assets.columns))
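# A sketch of that merge (assumes every per-indicator file in data/metrics/
# follows the *-quarterly.csv naming pattern and shares the SEC ID column):
import glob
import os
frames = {}
for path in glob.glob("data/metrics/*-quarterly.csv"):
    indicator = os.path.basename(path).replace("-quarterly.csv", "")
    frames[indicator] = pd.read_csv(path).set_index("SEC ID")
# One row per (indicator, company), one column per quarter:
panel = pd.concat(frames, names=["Indicator"])
print(panel.head())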