poi_id.py
#!/usr/bin/python
from __future__ import print_function
import pickle
import os
from tester import dump_classifier_and_data
from learnEnron import (
    feature_format,
    feature_engineering,
    feature_selection,
    feature_scaling,
    tune
)
ro = True    # Remove outliers
fs = True    # Feature selection
fe = True    # Feature engineering
sc = True    # Feature scaling
tu = True    # Cross-validation and parameter optimization
gb = False   # Use gradient boosting
lr = True    # Use logistic regression
pipe = True  # Use pipeline: ANOVA feature selection > PCA > classifier
# features_list is a list of strings, each of which is a feature name.
# The first feature must be "poi".
features_list = ['poi',
                 'bonus',
                 'deferral_payments',
                 'deferred_income',
                 'director_fees',
                 'exercised_stock_options',
                 'expenses',
                 'loan_advances',
                 'long_term_incentive',
                 'other',
                 'restricted_stock',
                 'restricted_stock_deferred',
                 'salary',
                 'shared_receipt_with_poi',
                 'total_payments',
                 'total_stock_value',
                 'ratio_to_poi',
                 'ratio_from_poi',
                 'from_messages',
                 'to_messages',
                 'from_poi_to_this_person',
                 'from_this_person_to_poi'
                 ]
# Build an absolute path to the dataset so the script runs from any directory
file_dir = os.path.dirname(os.path.realpath(__file__))
f = os.path.join(file_dir, "resources", "data", "final_project_dataset.pkl")
# Open in binary mode ("rb") so pickle can deserialize the file
with open(f, "rb") as data_file:
    data_dict = pickle.load(data_file)
# Remove outliers
if ro:
    # "TOTAL" is the spreadsheet sum over all people, not a person
    data_dict.pop("TOTAL", None)
    # Not a person
    data_dict.pop("THE TRAVEL AGENCY IN THE PARK", None)
    # Contains only missing values
    data_dict.pop("LOCKHART E", None)
# Feature engineering
if fe:
    data_dict = feature_engineering.email_ratios(data_dict)
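# A minimal sketch of what the email-ratio features could look like. The
# real logic lives in learnEnron.feature_engineering.email_ratios, so the
# field handling below (especially the dataset's "NaN" strings) is an
# assumption, not the project's code. Defined for illustration; never called.
def _email_ratios_sketch(data_dict):
    """Hypothetical: add 'ratio_to_poi' and 'ratio_from_poi' per person."""
    for person in data_dict.values():
        sent = person.get("from_messages")
        received = person.get("to_messages")
        to_poi = person.get("from_this_person_to_poi")
        from_poi = person.get("from_poi_to_this_person")
        # The raw dataset stores missing values as the string "NaN",
        # so only compute a ratio when both counts are real integers
        if isinstance(sent, int) and sent > 0 and isinstance(to_poi, int):
            person["ratio_to_poi"] = float(to_poi) / sent
        else:
            person["ratio_to_poi"] = 0.0
        if isinstance(received, int) and received > 0 and isinstance(from_poi, int):
            person["ratio_from_poi"] = float(from_poi) / received
        else:
            person["ratio_from_poi"] = 0.0
    return data_dict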
# Feature selection
#
# When pipe is True, ANOVA feature selection via SelectKBest is
# performed inside GridSearchCV, which is more effective than the
# standalone approach below.
if fs:
    # Choose an AdaBoost classifier for feature selection.
    #
    # Overwrite this to try different scikit-learn classifiers.
    clf_fs = feature_selection.get_fs_clf()
    # Overwrite the feature list with the selected subset
    features_list = feature_selection.selection(
        data_dict,
        features_list,
        clf_fs,
        cut_off=0.01
    )
    print("Features to be used after feature selection:")
    print(features_list)
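# Sketch of the cut_off idea used above, assuming selection() ranks features
# by the fitted classifier's feature_importances_ and keeps those at or above
# the threshold. The helper name and ranking logic are assumptions about
# learnEnron; defined for illustration only, never called.
def _importance_cut_sketch(clf, feature_names, features, labels, cut_off=0.01):
    """Hypothetical: keep 'poi' plus features with importance >= cut_off."""
    clf.fit(features, labels)
    kept = [name for name, importance
            in zip(feature_names[1:], clf.feature_importances_)
            if importance >= cut_off]
    return ["poi"] + kept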
# Feature scaling
if sc:
    data_dict = feature_scaling.scale(data_dict, features_list)
    print("All features scaled")
# Store to my_dataset for easy export below.
my_dataset = data_dict
# Extract features and labels from dataset for local testing
data = feature_format.featureFormat(
    my_dataset,
    features_list,
    sort_keys=True
)
labels, features = feature_format.targetFeatureSplit(data)
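# For reference: the conventional split in these project tools treats
# column 0 ('poi') as the label and the remaining columns as features.
# A minimal restatement of that convention, defined for illustration only:
def _target_feature_split_sketch(data):
    """Hypothetical restatement: first column -> labels, rest -> features."""
    labels = [row[0] for row in data]
    features = [list(row[1:]) for row in data]
    return labels, features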
# No train/test split is used here.
#
# Final validation is done by tester.py against a held-out test set.
#
# GridSearchCV performs an inner-loop stratified K-fold
# cross-validation.
features_train = features
labels_train = labels
# Tune the classifier to achieve better than .3 precision and recall
#
# See the tune module for the full implementation; a hedged sketch of
# the pipeline variant follows this block.
if tu:
    if gb:
        clf = tune.param_optimize_gb(features_train, labels_train,
                                     grid_search=False)
    if lr:
        if pipe:
            clf = tune.param_optimize_lr_pipe(features_train, labels_train,
                                              grid_search=True, folds=3)
        else:
            clf = tune.param_optimize_lr(features_train, labels_train,
                                         grid_search=True)
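# A minimal sketch of the "pipe > ANOVA features > PCA > classifier" flow
# described above. Step names, parameter grid, and scoring are assumptions;
# see tune.param_optimize_lr_pipe for the project's actual implementation.
# Defined for illustration only, never called.
def _lr_pipe_sketch(features_train, labels_train, folds=3):
    """Hypothetical: SelectKBest -> PCA -> LogisticRegression in GridSearchCV."""
    from sklearn.pipeline import Pipeline
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    pipe = Pipeline([
        ("kbest", SelectKBest(f_classif)),  # ANOVA F-test feature selection
        ("pca", PCA()),                     # project the selected features
        ("clf", LogisticRegression()),      # final classifier
    ])
    param_grid = {
        "kbest__k": [5, 10, "all"],
        "pca__n_components": [2, 3, 5],
        "clf__C": [0.01, 0.1, 1.0, 10.0],
    }
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    search = GridSearchCV(pipe, param_grid, scoring="f1", cv=cv)
    search.fit(features_train, labels_train)
    return search.best_estimator_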
# Dump the classifier, dataset, and features_list for tester.py
# (requires tu to be True so that clf is defined above)
dump_classifier_and_data(clf, my_dataset, features_list)