diff --git a/.gitignore b/.gitignore
index 17c225de1fc..8a3a93664a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,13 @@
enron_mail_20110402.tgz
enron_mail_20110402/
enron_mail_20150507.tgz
+enron_mail_20150507.tar.gz
+enron_mail_20150507.tar
maildir/
text_learning/your_word_data.pkl
text_learning/your_email_authors.pkl
my_classifier.pkl
my_dataset.pkl
my_feature_list.pkl
+.idea
+
diff --git a/Project report.docx b/Project report.docx
new file mode 100644
index 00000000000..8997ad63776
Binary files /dev/null and b/Project report.docx differ
diff --git a/README.md b/README.md
index 9c2c9967cb5..26b1f343e6e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,34 @@
-ud120-projects
+
ud120-projects
==============
-Starter project code for students taking Udacity ud120
+My repository for the Udacity ud120 course
+
+## Content
+* Session exercises / mini projects
+* Enron project
+
+
+## IDE
+PyCharm Community Edition by JetBrains
+
+## Commands used
+**install sklearn**
+
+pip install scikit-learn
+
+**install natural language toolkit**
+
+pip install nltk
+
+**install matplotlib**
+
+pip install matplotlib
+
+## Environment (from requirements.txt)
+
+* nltk==3.2.1
+* numpy==1.13.3
+* scikit-learn==0.18
+* scipy==0.19.1
+
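+To set up the environment in one step (assuming pip is available), the pinned packages can be installed straight from the file:
+
+**install from requirements.txt**
+
+pip install -r requirements.txt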
+
diff --git a/choose_your_own/class_vis.py b/choose_your_own/class_vis.py
index 38957c9574e..4bf993d4571 100644
--- a/choose_your_own/class_vis.py
+++ b/choose_your_own/class_vis.py
@@ -46,5 +46,5 @@ def output_image(name, format, bytes):
data['name'] = name
data['format'] = format
data['bytes'] = base64.encodestring(bytes)
- print image_start+json.dumps(data)+image_end
+ print( image_start+json.dumps(data)+image_end)
diff --git a/choose_your_own/test.PNG b/choose_your_own/test.PNG
new file mode 100644
index 00000000000..ac55c5d3eb6
Binary files /dev/null and b/choose_your_own/test.PNG differ
diff --git a/choose_your_own/your_algorithm.py b/choose_your_own/your_algorithm.py
index 62a7573cfdf..67a59637651 100644
--- a/choose_your_own/your_algorithm.py
+++ b/choose_your_own/your_algorithm.py
@@ -30,7 +30,16 @@
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
-
+# KNN
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+
+clf_knn = KNeighborsClassifier(n_neighbors=4)
+clf_knn.fit(features_train, labels_train)
+pred_knn = clf_knn.predict(features_test)
+print("Accuracy for KNeighborsClassifier:", accuracy_score(labels_test, pred_knn))
+
+# Random Forest
+clf_rf = RandomForestClassifier(n_estimators=15, min_samples_split=6)
+clf_rf.fit(features_train, labels_train)
+pred_rf = clf_rf.predict(features_test)
+print("Accuracy for RandomForestClassifier:", accuracy_score(labels_test, pred_rf))
diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py
index ca9bacb9c83..ed00f32fb71 100644
--- a/datasets_questions/explore_enron_data.py
+++ b/datasets_questions/explore_enron_data.py
@@ -16,7 +16,48 @@
"""
import pickle
+import numpy as np
enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))
-
+# number of people in the dataset and number of features per person
+print(len(enron_data))
+print(len(list(enron_data.values())[0]))
+
+count = 0
+for person_name in enron_data.keys():
+ if(enron_data[person_name]["poi"]==1):
+ count = count+1
+print(count)
+
+total_poi = 0
+with open('../final_project/poi_names.txt', 'r') as file:
+    for line in file:
+        if '(y)' in line or '(n)' in line:
+            total_poi = total_poi + 1
+print(total_poi)
+print("Net Stock value of James Prentice: ", enron_data['PRENTICE JAMES']['total_stock_value'])
+print("Wesley Colwell to POI emails: ", enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
+print("Stock options of Jeffrey Skilling: ", enron_data['SKILLING JEFFREY K']['exercised_stock_options'])
+
+most_value_taken = max([(enron_data[person_name]['total_payments']) for person_name in ("LAY KENNETH L", "SKILLING JEFFREY K", "FASTOW ANDREW S")])
+print(most_value_taken)
+
+salaries_not_nan = 0
+known_emails = 0
+total_payments_nan = 0
+total_payments_nan_poi = 0
+for person_name in enron_data:
+    if not np.isnan(float(enron_data[person_name]['salary'])):
+        salaries_not_nan += 1
+    if enron_data[person_name]['email_address'] != 'NaN':
+        known_emails += 1
+    if np.isnan(float(enron_data[person_name]['total_payments'])):
+        total_payments_nan += 1
+        if enron_data[person_name]['poi']:
+            total_payments_nan_poi += 1
+
+print('Salaries available: ', salaries_not_nan)
+print('Available emails: ', known_emails)
+print('Number and percentage of people with NaN total payments: ', total_payments_nan, total_payments_nan * 100.0 / len(enron_data))
+print('Number and percentage of POIs with NaN total payments: ', total_payments_nan_poi, total_payments_nan_poi * 100.0 / count)
diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py
index 006afb8357b..14cdfe84125 100644
--- a/decision_tree/dt_author_id.py
+++ b/decision_tree/dt_author_id.py
@@ -24,7 +24,23 @@
#########################################################
### your code goes here ###
+#imports
+from sklearn import tree
+from sklearn.metrics import accuracy_score
+#
+# create classifier; min_samples_split=40 stops splitting small nodes to limit overfitting
+clf = tree.DecisionTreeClassifier(min_samples_split=40)
+# fit the classifier on training features and labels
+clf.fit(features_train, labels_train)
+
+#predict
+pred = clf.predict(features_test)
+
+# print
+print( "Accuracy:", accuracy_score(labels_test, pred))
+
+print( "No of features in date:", len(features_train[0]))
#########################################################
diff --git a/evaluation/evaluate_poi_identifier.py b/evaluation/evaluate_poi_identifier.py
index 0ca99d52d5f..0632a18e71e 100644
--- a/evaluation/evaluate_poi_identifier.py
+++ b/evaluation/evaluate_poi_identifier.py
@@ -13,8 +13,12 @@
import pickle
import sys
+import numpy as np
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+from sklearn.cross_validation import train_test_split
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )
@@ -27,5 +31,61 @@
### your code goes here
+features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30,
+ random_state=42)
+# create DT Classifier
+clf = DecisionTreeClassifier()
+# fit/train it
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
+#print
+print( "accuracy:", accuracy_score(labels_test, pred))
+
+### evaluation
+values, counts = np.unique(pred, return_counts=True)
+test_size = len(features_test)
+
+# print
+print("Predicted POIs:", zip(values, counts))
+print( "Total number in test set:", test_size)
+print( "Accuracy - all poi=0:", counts[0] / test_size)
+
+true_positives = 0
+for actual, predicted in zip(labels_test, pred):
+ if actual == 1 and predicted == 1:
+ true_positives += 1
+
+# print
+print( "TP - true positives:", true_positives)
+print( "Precision score:", precision_score(labels_test, pred))
+print( "Recall score:", recall_score(labels_test, pred))
+
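+# hand-made label lists for working the precision/recall computation by hand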
+prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
+true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
+
+
+def calc_precision_and_recall(actual, predicted):
+ print( "Doing precision and recall...")
+ true_positives = 0
+ false_positives = 0
+ false_negatives = 0
+ true_negatives = 0
+ for a, p in zip(actual, predicted):
+ if a == 1 and p == 1:
+ true_positives += 1
+ elif a == 1 and p == 0:
+ false_negatives += 1
+ elif a == 0 and p == 1:
+ false_positives += 1
+ else:
+ true_negatives += 1
+ print( "Precision:", true_positives / (true_positives + false_positives))
+ print( "Recall:", true_positives / (true_positives + false_negatives))
+
+
+calc_precision_and_recall(true_labels, prediction_labels)
diff --git a/feature_selection/find_signature.py b/feature_selection/find_signature.py
index c01a1f2111a..93c0be9a822 100644
--- a/feature_selection/find_signature.py
+++ b/feature_selection/find_signature.py
@@ -28,6 +28,8 @@
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()
+# get words
+words = vectorizer.get_feature_names()
### a classic way to overfit is to use a small number
### of data points and a large number of features;
@@ -38,6 +40,19 @@
### your code goes here
-
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+
+clf = DecisionTreeClassifier(min_samples_split=40)
+clf.fit(features_train, labels_train)
+pred = clf.predict(features_test)
+print( "Accuracy:", accuracy_score(labels_test, pred))
+
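+# a feature importance above 0.2 is unusually dominant for this task and likely flags an overfit "signature" word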
+print( "Important features:")
+for index, feature in enumerate(clf.feature_importances_):
+ if feature>0.2:
+ print( "Feature number", index)
+ print( "Importance", feature)
+ print( "Word", words[index])
diff --git a/final_project/poi_id.py b/final_project/poi_id.py
index 47912a7c51d..c1e39e53b7a 100644
--- a/final_project/poi_id.py
+++ b/final_project/poi_id.py
@@ -2,6 +2,7 @@
import sys
import pickle
+import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
@@ -10,14 +11,86 @@
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
-features_list = ['poi','salary'] # You will need to use more features
+features_list = ['poi', 'salary'] # You will need to use more features
### Load the dictionary containing the dataset
-with open("final_project_dataset.pkl", "r") as data_file:
- data_dict = pickle.load(data_file)
+# with open("final_project_dataset.pkl", "r") as data_file:
+# data_dict = pickle.load(data_file)
+
+# data_dict = pickle.load( open( "final_project_dataset.pkl", "rb" ) )
+
+with open('final_project_dataset.pkl', 'rb') as handle:
+ data_dict = pickle.load(handle)
+
+
### Task 2: Remove outliers
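+# "TOTAL" is the spreadsheet aggregate row; the other entries are hand-identified extreme points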
+identified_outliers = ["TOTAL", "LAVORATO JOHN J", "MARTIN AMANDA K", "URQUHART JOHN A", "MCCLELLAN GEORGE", "SHANKMAN JEFFREY A", "WHITE JR THOMAS E", "PAI LOU L", "HIRKO JOSEPH"]
+for outlier in identified_outliers:
+ data_dict.pop(outlier)
+
### Task 3: Create new feature(s)
+financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees']
+email_features = ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi']
+
+# count data_points
+data_points = len(data_dict)
+
+# initialise counts and tally POIs vs non-POIs
+poi_count = 0
+non_poi_count = 0
+for person in data_dict.values():
+    if person["poi"]:
+        poi_count += 1
+    else:
+        non_poi_count += 1
+
+# print
+print( "Data points:\t", data_points)
+print( "Number of non POIs:\t", non_poi_count)
+print( "Number of POIs:\t\t", poi_count)
+
+print( "POI ratio:\t\t", float(poi_count) / data_points)
+print( "Total features:\t", len(next(iter(data_dict.values()))))
+print( "Financial features:\t", len(financial_features))
+print( "Email features:\t", len(email_features))
+print( "")
+
+
+
+
+
+def outlier_visualization(data, x_index, y_index, x_label, y_label, fig_num):
+    # scatter two feature columns so extreme points stand out
+    matplotlib.pyplot.figure(fig_num)
+    for point in data:
+        f1 = point[x_index]
+        f2 = point[y_index]
+        matplotlib.pyplot.scatter(f1, f2)
+
+    matplotlib.pyplot.xlabel(x_label)
+    matplotlib.pyplot.ylabel(y_label)
+    matplotlib.pyplot.show()
+
+
+
+def visualize_outliers():
+    start = 1
+    for i in range(2, len(financial_features)):
+        outlier_visualization(financial_outliers, 0, i, 'salary', financial_features[i], start)
+        start += 1
+    start = 10
+
+    for i in range(2, len(email_features)):
+        outlier_visualization(email_outliers, 0, i, 'to_messages', email_features[i], start)
+        start += 1
+
+
+# outlier name
+def get_outlier(feature, value):
+    for person, features in data_dict.items():
+ if features[feature] == value:
+ print("Outlier is:", person, features['poi'])
+
+
+
+
+
### Store to my_dataset for easy export below.
my_dataset = data_dict
@@ -25,6 +98,16 @@
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
+financial_outliers = featureFormat(data_dict, financial_features)
+email_outliers = featureFormat(data_dict, email_features)
+
+
+#from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
+
+features_train, features_test, labels_train, labels_test = \
+ train_test_split(features, labels, test_size=0.3, random_state=42)
+
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
@@ -32,9 +115,18 @@
### http://scikit-learn.org/stable/modules/pipeline.html
# Provided to give you a starting point. Try a variety of classifiers.
+# import
from sklearn.naive_bayes import GaussianNB
+
+# create classifier
clf = GaussianNB()
+#fit/train
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
@@ -43,7 +135,8 @@
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
# Example starting point. Try investigating other evaluation techniques!
-from sklearn.cross_validation import train_test_split
+#from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = \
train_test_split(features, labels, test_size=0.3, random_state=42)
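+
+# A possible tuning sketch (not the classifier submitted above): grid-search the decision tree's
+# min_samples_split with an F1 score over the split defined just above; the parameter values here
+# are illustrative, not tuned results.
+# from sklearn.model_selection import GridSearchCV
+# from sklearn.tree import DecisionTreeClassifier
+# param_grid = {"min_samples_split": [2, 10, 20, 40]}
+# grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, scoring="f1")
+# grid.fit(features_train, labels_train)
+# print("Best parameters:", grid.best_params_)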
@@ -52,4 +145,9 @@
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
-dump_classifier_and_data(clf, my_dataset, features_list)
\ No newline at end of file
+visualize_outliers()
+
+dump_classifier_and_data(clf, my_dataset, features_list)
+
+
+
diff --git a/final_project/tester.py b/final_project/tester.py
index c0899dbe58e..3749091aaba 100644
--- a/final_project/tester.py
+++ b/final_project/tester.py
@@ -12,7 +12,9 @@
import pickle
import sys
-from sklearn.cross_validation import StratifiedShuffleSplit
+#from sklearn.cross_validation import StratifiedShuffleSplit
+from sklearn.model_selection import StratifiedShuffleSplit
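+# note: unlike the cross_validation version, model_selection's StratifiedShuffleSplit is
+# constructed with n_splits and iterated via cv.split(features, labels)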
+
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
@@ -55,9 +57,9 @@ def test_classifier(clf, dataset, feature_list, folds = 1000):
elif prediction == 1 and truth == 1:
true_positives += 1
else:
- print "Warning: Found a predicted label not == 0 or 1."
- print "All predictions should take value 0 or 1."
- print "Evaluating performance for processed predictions:"
+ print( "Warning: Found a predicted label not == 0 or 1.")
+ print( "All predictions should take value 0 or 1.")
+ print( "Evaluating performance for processed predictions:")
break
try:
total_predictions = true_negatives + false_negatives + false_positives + true_positives
@@ -66,13 +68,13 @@ def test_classifier(clf, dataset, feature_list, folds = 1000):
recall = 1.0*true_positives/(true_positives+false_negatives)
f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
- print clf
- print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
- print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
- print ""
+ print( clf)
+ print( PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
+ print( RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
+ print( "")
except:
- print "Got a divide by zero when trying out:", clf
- print "Precision or recall may be undefined due to a lack of true positive predicitons."
+ print( "Got a divide by zero when trying out:", clf)
+        print( "Precision or recall may be undefined due to a lack of true positive predictions.")
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
diff --git a/k_means/k_means_cluster.py b/k_means/k_means_cluster.py
index 6a2ba687017..eb42d6d8f51 100644
--- a/k_means/k_means_cluster.py
+++ b/k_means/k_means_cluster.py
@@ -13,7 +13,8 @@
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
-
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import MinMaxScaler
@@ -39,7 +40,11 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
### load in the dict of dicts containing all the data on each person in the dataset
-data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") )
+with open("../final_project/final_project_dataset.pkl", "rb") as f:
+ rawdataset = f.read()
+
+#data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") )
+data_dict = pickle.load( rawdataset)
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)
@@ -63,8 +68,13 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
plt.show()
### cluster here; create predictions of the cluster labels
-### for the data and store them to a list called pred
+est = KMeans(n_clusters=2)
+# fit/train it
+est.fit(finance_features)
+
+### for the data and store them to a list called pred
+pred = est.predict(finance_features)
@@ -73,4 +83,26 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
try:
Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
- print "no predictions object named pred found, no clusters to plot"
+ print( "no predictions object named pred found, no clusters to plot")
+
+
+salary = []
+exercised_stock_options = []
+for name in data_dict:
+ stock = data_dict[name]['exercised_stock_options']
+ sal = data_dict[name]['salary']
+ if not numpy.isnan(float(stock)):
+ exercised_stock_options.append(float(stock))
+ if not numpy.isnan(float(sal)):
+ salary.append(float(sal))
+
+#Feature rescaling
+scaler = MinMaxScaler()
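+# MinMaxScaler rescales each value x to (x - min) / (max - min), mapping the minimum to 0 and the maximum to 1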
+print( "After rescaling, salary $200,000:", scaler.fit_transform([[float(min(salary))], [200000], [float(max(salary))]]))
+print( "After rescaling, salary $100,000:", scaler.fit_transform([[float(min(exercised_stock_options))], [1000000], [float(max(exercised_stock_options))]]))
+
+print( "Minimum stock :", min(exercised_stock_options))
+print( "Maximum stock :", max(exercised_stock_options))
+
+print( "Minimum salary :", min(salary))
+print( "Maximum salary :", max(salary))
\ No newline at end of file
diff --git a/naive_bayes/nb_author_id.py b/naive_bayes/nb_author_id.py
index f69d57d8408..30f511f5599 100644
--- a/naive_bayes/nb_author_id.py
+++ b/naive_bayes/nb_author_id.py
@@ -26,7 +26,31 @@
#########################################################
### your code goes here ###
+# imports for this exercise
+from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import accuracy_score
+
+# create classifier
+clf = GaussianNB()
+# note time
+t0 = time()
+
+# fit the classifier on training features and labels
+clf.fit(features_train, labels_train)
+print("Training time", time()-t0, "s")
+
+# note time
+t1=time()
+
+# predict labels for the test features
+pred = clf.predict(features_test)
+print("Predicting time", time()-t1, "s")
+
+# calculate accuracy
+accuracy = accuracy_score(pred, labels_test)
+
+# print the accuracy
+print("Accuracy:", accuracy)
#########################################################
diff --git a/outliers/enron_outliers.py b/outliers/enron_outliers.py
index ac26d7fe9a8..d881848a2be 100644
--- a/outliers/enron_outliers.py
+++ b/outliers/enron_outliers.py
@@ -8,12 +8,19 @@
### read in data dictionary, convert to numpy array
-data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") )
+data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "rb"), fix_imports = True )
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)
### your code below
-
+for point in data:
+ salary = point[0]
+ bonus = point[1]
+ matplotlib.pyplot.scatter( salary, bonus )
+
+matplotlib.pyplot.xlabel("salary")
+matplotlib.pyplot.ylabel("bonus")
+matplotlib.pyplot.show()
diff --git a/outliers/outlier_cleaner.py b/outliers/outlier_cleaner.py
index c0ddb2acc79..db7276a8745 100644
--- a/outliers/outlier_cleaner.py
+++ b/outliers/outlier_cleaner.py
@@ -14,7 +14,12 @@ def outlierCleaner(predictions, ages, net_worths):
cleaned_data = []
### your code goes here
+    # rank points by absolute residual error and keep the 90% with the smallest errors
+    errors = [float(abs(p - n)) for p, n in zip(predictions, net_worths)]
+    data = sorted(zip(ages, net_worths, errors), key=lambda t: t[2])
+    cleaned_data = data[:int(len(predictions) * 0.9)]
return cleaned_data
diff --git a/outliers/outlier_removal_regression.py b/outliers/outlier_removal_regression.py
index d509cd9f22f..27240241b3d 100644
--- a/outliers/outlier_removal_regression.py
+++ b/outliers/outlier_removal_regression.py
@@ -25,7 +25,12 @@
### fill in a regression here! Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like
-
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression().fit(ages_train, net_worths_train)
+print("Slope: ", reg.coef_)
+# print('Regession intercept: ', reg.intercept_)
+# print('Regression score: ', reg.score(ages_train, net_worths_train))
+print("Score: ", reg.score(ages_test, net_worths_test))
@@ -50,8 +55,8 @@
predictions = reg.predict(ages_train)
cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
except NameError:
- print "your regression object doesn't exist, or isn't name reg"
- print "can't make predictions to use in identifying outliers"
+    print("your regression object doesn't exist, or isn't named reg")
+ print("can't make predictions to use in identifying outliers")
@@ -70,9 +75,9 @@
reg.fit(ages, net_worths)
plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
- print "you don't seem to have regression imported/created,"
- print " or else your regression object isn't named reg"
- print " either way, only draw the scatter plot of the cleaned data"
+ print("you don't seem to have regression imported/created,")
+ print(" or else your regression object isn't named reg")
+ print(" either way, only draw the scatter plot of the cleaned data")
plt.scatter(ages, net_worths)
plt.xlabel("ages")
plt.ylabel("net worths")
@@ -80,5 +85,5 @@
else:
- print "outlierCleaner() is returning an empty list, no refitting to be done"
+ print("outlierCleaner() is returning an empty list, no refitting to be done")
diff --git a/pca/eigenfaces.py b/pca/eigenfaces.py
index 074b860a253..b3cbc1e33ee 100644
--- a/pca/eigenfaces.py
+++ b/pca/eigenfaces.py
@@ -16,7 +16,7 @@
-print __doc__
+print(__doc__)
from time import time
import logging
@@ -53,10 +53,10 @@
target_names = lfw_people.target_names
n_classes = target_names.shape[0]
-print "Total dataset size:"
-print "n_samples: %d" % n_samples
-print "n_features: %d" % n_features
-print "n_classes: %d" % n_classes
+print( "Total dataset size:")
+print("n_samples: {0}".format(n_samples))
+print("n_features: {0}".format(n_features))
+print("n_classes: {0}".format(n_classes))
###############################################################################
@@ -68,24 +68,24 @@
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150
-print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
+print("Extracting the top {0} eigenfaces from {1} faces".format(n_components, X_train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
-print "done in %0.3fs" % (time() - t0)
+print( "done in %0.3fs" % (time() - t0))
eigenfaces = pca.components_.reshape((n_components, h, w))
-print "Projecting the input data on the eigenfaces orthonormal basis"
+print( "Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
-print "done in %0.3fs" % (time() - t0)
+print("done in {0:.3f}s".format(time() - t0))
###############################################################################
# Train a SVM classification model
-print "Fitting the classifier to the training set"
+print( "Fitting the classifier to the training set")
t0 = time()
param_grid = {
'C': [1e3, 5e3, 1e4, 5e4, 1e5],
@@ -94,21 +94,21 @@
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
-print "done in %0.3fs" % (time() - t0)
-print "Best estimator found by grid search:"
-print clf.best_estimator_
+print("done in {0:.3f}s".format(time() - t0))
+print( "Best estimator found by grid search:")
+print( clf.best_estimator_)
###############################################################################
# Quantitative evaluation of the model quality on the test set
-print "Predicting the people names on the testing set"
+print( "Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
-print "done in %0.3fs" % (time() - t0)
+print("done in {0:.3f}s".format(time() - t0))
-print classification_report(y_test, y_pred, target_names=target_names)
-print confusion_matrix(y_test, y_pred, labels=range(n_classes))
+print( classification_report(y_test, y_pred, target_names=target_names))
+print( confusion_matrix(y_test, y_pred, labels=range(n_classes)))
###############################################################################
diff --git a/regression/finance_regression.py b/regression/finance_regression.py
index efa10637a1f..3842ba3739f 100644
--- a/regression/finance_regression.py
+++ b/regression/finance_regression.py
@@ -29,16 +29,19 @@
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
-test_color = "b"
-
-
+test_color = "r"
### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.
-
-
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression().fit(feature_train, target_train)
+print("Regression output: ")
+print("Slope: ", reg.coef_)
+print("Intercept: ", reg.intercept_)
+print("Score for training: ", reg.score(feature_train, target_train))
+print("Score for testing: ", reg.score(feature_test, target_test))
diff --git a/requirements.txt b/requirements.txt
index 1d4ac04c20e..e278210faad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
nltk==3.2.1
-numpy==1.11.2
+numpy==1.13.3
scikit-learn==0.18
-scipy==0.18.1
+scipy==0.19.1
diff --git a/svm/svm_author_id.py b/svm/svm_author_id.py
index fda3f7fdb28..208b4adf01d 100644
--- a/svm/svm_author_id.py
+++ b/svm/svm_author_id.py
@@ -24,7 +24,23 @@
#########################################################
### your code goes here ###
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score
+from collections import Counter
+
+#clf = SVC(kernel='linear')
+clf = SVC(kernel='rbf', C=10000)
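+# a large C (10000) penalises training mistakes heavily, producing a more intricate decision boundary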
+t0 = time()
+clf.fit(features_train, labels_train)
+print( "training time:", round(time()-t0, 3), "s")
+
+t1 = time()
+pred = clf.predict(features_test)
+print( "Prediction time:", round(time()-t1, 3), "s")
+
+print( "Accuracy score:", accuracy_score(labels_test, pred))
+
+print( "Predictions for 10:", pred[10], print( "Predictions for 26:", pred[26], print( "Predictions for 50:", pred[50] ) ))
+
+c = Counter(pred)
+print( "Number of predictions for Chris(1):", c[1])
#########################################################
diff --git a/text_learning/vectorize_text.py b/text_learning/vectorize_text.py
index 629c6b0f317..38dd5fc0c9c 100644
--- a/text_learning/vectorize_text.py
+++ b/text_learning/vectorize_text.py
@@ -7,6 +7,8 @@
sys.path.append( "../tools/" )
from parse_out_email_text import parseOutText
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.corpus import stopwords
"""
Starter code to process the emails from Sara and Chris to extract
@@ -44,22 +46,28 @@
temp_counter += 1
if temp_counter < 200:
path = os.path.join('..', path[:-1])
- print path
+ print( path)
email = open(path, "r")
### use parseOutText to extract the text from the opened email
-
+ text = parseOutText(email)
### use str.replace() to remove any instances of the words
### ["sara", "shackleton", "chris", "germani"]
+            remove_words = ["sara", "shackleton", "chris", "germani"]
+            for word in remove_words:
+                text = text.replace(word, "")
### append the text to word_data
-
+ word_data.append(text)
### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
-
+ if name == "sara":
+ from_data.append(0)
+ else:
+ from_data.append(1)
email.close()
-print "emails processed"
+print( "emails processed")
from_sara.close()
from_chris.close()
@@ -71,5 +79,10 @@
### in Part 4, do TfIdf vectorization here
-
+vectorizer = TfidfVectorizer(stop_words="english")
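+# TfIdf weights each word by its frequency in a document, discounted by how common the word is across all documents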
+vectorizer.fit(word_data)
+vectorizer.transform(word_data)
+feature_words = vectorizer.get_feature_names()
+print( "Total words:", len(feature_words))
+print( "The word at word[34597]:", feature_words[34597])
diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py
index 2528b995904..edc1f74e60e 100644
--- a/tools/email_preprocess.py
+++ b/tools/email_preprocess.py
@@ -59,7 +59,7 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
features_test_transformed = selector.transform(features_test_transformed).toarray()
### info on the data
- print "no. of Chris training emails:", sum(labels_train)
- print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
+ print( "no. of Chris training emails:", sum(labels_train))
+ print( "no. of Sara training emails:", len(labels_train)-sum(labels_train))
return features_train_transformed, features_test_transformed, labels_train, labels_test
diff --git a/tools/feature_format.py b/tools/feature_format.py
index 7ca78ac291a..cf948d35506 100644
--- a/tools/feature_format.py
+++ b/tools/feature_format.py
@@ -67,7 +67,7 @@ def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True
try:
dictionary[key][feature]
except KeyError:
- print "error: key ", feature, " not present"
+ print( "error: key ", feature, " not present")
return
value = dictionary[key][feature]
if value=="NaN" and remove_NaN:
diff --git a/tools/parse_out_email_text.py b/tools/parse_out_email_text.py
index 43725b22d10..cea649a9f92 100644
--- a/tools/parse_out_email_text.py
+++ b/tools/parse_out_email_text.py
@@ -45,7 +45,7 @@ def parseOutText(f):
def main():
ff = open("../text_learning/test_email.txt", "r")
text = parseOutText(ff)
- print text
+ print( text)
diff --git a/tools/startup.py b/tools/startup.py
index 4638e0d115e..bc77280ab4d 100644
--- a/tools/startup.py
+++ b/tools/startup.py
@@ -1,47 +1,48 @@
#!/usr/bin/python
-print
-print "checking for nltk"
+print()
+print( "checking for nltk")
try:
import nltk
except ImportError:
- print "you should install nltk before continuing"
+ print( "you should install nltk before continuing")
-print "checking for numpy"
+print( "checking for numpy")
try:
import numpy
except ImportError:
- print "you should install numpy before continuing"
+ print( "you should install numpy before continuing")
-print "checking for scipy"
+print( "checking for scipy")
try:
import scipy
except:
- print "you should install scipy before continuing"
+ print( "you should install scipy before continuing")
-print "checking for sklearn"
+print( "checking for sklearn")
try:
import sklearn
except:
- print "you should install sklearn before continuing"
+ print( "you should install sklearn before continuing")
-print
-print "downloading the Enron dataset (this may take a while)"
-print "to check on progress, you can cd up one level, then execute "
-print "Enron dataset should be last item on the list, along with its current size"
-print "download will complete at about 423 MB"
+print()
+print( "downloading the Enron dataset (this may take a while)")
+print( "to check on progress, you can cd up one level, then execute ")
+print( "Enron dataset should be last item on the list, along with its current size")
+print( "download will complete at about 423 MB")
import urllib
url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz"
-urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz")
-print "download complete!"
+#old -> urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz")
+from urllib.request import urlretrieve
+# keep the .tar.gz name so the tarfile.open() call below finds the archive
+urlretrieve(url, filename="../enron_mail_20150507.tar.gz")
+print( "download complete!")
-print
-print "unzipping Enron dataset (this may take a while)"
+print()
+print( "unzipping Enron dataset (this may take a while)")
import tarfile
import os
os.chdir("..")
tfile = tarfile.open("enron_mail_20150507.tar.gz", "r:gz")
tfile.extractall(".")
-print "you're ready to go!"
+print( "you're ready to go!")
diff --git a/validation/validate_poi.py b/validation/validate_poi.py
index 03537a5cc07..c0fafa62bcc 100644
--- a/validation/validate_poi.py
+++ b/validation/validate_poi.py
@@ -14,6 +14,9 @@
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+from sklearn.cross_validation import train_test_split
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )
@@ -25,8 +28,19 @@
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)
+features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30, random_state=42)
### it's all yours from here forward!
+# create DecisionTree Classifier
+clf = DecisionTreeClassifier()
+# Fit/train it
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
+# print
+print( "Accuracy:", accuracy_score(labels_test, pred))