diff --git a/.gitignore b/.gitignore
index 17c225de1fc..8a3a93664a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,13 @@
 enron_mail_20110402.tgz
 enron_mail_20110402/
 enron_mail_20150507.tgz
+enron_mail_20150507.tar.gz
+enron_mail_20150507.tar
 maildir/
 text_learning/your_word_data.pkl
 text_learning/your_email_authors.pkl
 my_classifier.pkl
 my_dataset.pkl
 my_feature_list.pkl
+.idea
+
diff --git a/Project report.docx b/Project report.docx
new file mode 100644
index 00000000000..8997ad63776
Binary files /dev/null and b/Project report.docx differ
diff --git a/README.md b/README.md
index 9c2c9967cb5..26b1f343e6e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,34 @@
-ud120-projects
+
+ud120-projects
+
 ==============
-Starter project code for students taking Udacity ud120
+My repo for the Udacity ud120 course
+
+
+Content
+
+* Session exercises / mini projects
+* Enron project
+
+
+IDE
+
+PyCharm Community Edition by JetBrains
+
+
+Commands used
+
+**install scikit-learn**
+
+pip install scikit-learn
+
+**install the Natural Language Toolkit**
+
+pip install nltk
+
+**install matplotlib**
+
+pip install matplotlib
+
+
+Environment from requirements.txt
+
+nltk==3.2.1
+numpy==1.13.3
+scikit-learn==0.18
+scipy==0.19.1
+
+
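A note on the environment section of the README: the three pip install commands can be replaced by a single install against the pinned file, which keeps the package versions reproducible. Assuming a virtual environment is already active (the exact tool is the reader's choice), the one-line equivalent is

    pip install -r requirements.txt
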
diff --git a/choose_your_own/class_vis.py b/choose_your_own/class_vis.py
index 38957c9574e..4bf993d4571 100644
--- a/choose_your_own/class_vis.py
+++ b/choose_your_own/class_vis.py
@@ -46,5 +46,5 @@ def output_image(name, format, bytes):
     data['name'] = name
     data['format'] = format
     data['bytes'] = base64.encodestring(bytes)
-    print image_start+json.dumps(data)+image_end
+    print(image_start + json.dumps(data) + image_end)
diff --git a/choose_your_own/test.PNG b/choose_your_own/test.PNG
new file mode 100644
index 00000000000..ac55c5d3eb6
Binary files /dev/null and b/choose_your_own/test.PNG differ
diff --git a/choose_your_own/your_algorithm.py b/choose_your_own/your_algorithm.py
index 62a7573cfdf..67a59637651 100644
--- a/choose_your_own/your_algorithm.py
+++ b/choose_your_own/your_algorithm.py
@@ -30,7 +30,16 @@
 
 ### your code here!  name your classifier object clf if you want the
 ### visualization code (prettyPicture) to show you the decision boundary
-
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+
+# k-nearest neighbours
+clf_knn = KNeighborsClassifier(n_neighbors=4)
+clf_knn.fit(features_train, labels_train)
+pred_knn = clf_knn.predict(features_test)
+print("Accuracy for KNeighborsClassifier:", accuracy_score(labels_test, pred_knn))
+
+# random forest
+clf_rf = RandomForestClassifier(n_estimators=15, min_samples_split=6)
+clf_rf.fit(features_train, labels_train)
+pred_rf = clf_rf.predict(features_test)
+print("Accuracy for RandomForestClassifier:", accuracy_score(labels_test, pred_rf))
diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py
index ca9bacb9c83..ed00f32fb71 100644
--- a/datasets_questions/explore_enron_data.py
+++ b/datasets_questions/explore_enron_data.py
@@ -16,7 +16,48 @@
 """
 
 import pickle
+import numpy as np
 
 enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))
-
+# size of the dataset and number of features per person
+print(len(enron_data))
+print(len(list(enron_data.values())[0]))
+
+# number of POIs in the dataset
+poi_count = 0
+for person_name in enron_data:
+    if enron_data[person_name]["poi"]:
+        poi_count += 1
+print(poi_count)
+
+# number of POI names listed in poi_names.txt (lines marked (y) or (n))
+total_poi = 0
+with open('../final_project/poi_names.txt', 'r') as poi_file:
+    for line in poi_file:
+        if '(y)' in line or '(n)' in line:
+            total_poi += 1
+print(total_poi)
+
+print("Net stock value of James Prentice:", enron_data['PRENTICE JAMES']['total_stock_value'])
+print("Wesley Colwell to POI emails:", enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
+print("Stock options of Jeffrey Skilling:", enron_data['SKILLING JEFFREY K']['exercised_stock_options'])
+
+most_value_taken = max(enron_data[person_name]['total_payments']
+                       for person_name in ("LAY KENNETH L", "SKILLING JEFFREY K", "FASTOW ANDREW S"))
+print(most_value_taken)
+
+# missing values are stored as the string "NaN"
+salaries_not_nan = 0
+known_emails = 0
+total_payments_nan = 0
+total_payments_nan_poi = 0
+for person_name in enron_data:
+    if not np.isnan(float(enron_data[person_name]['salary'])):
+        salaries_not_nan += 1
+    if enron_data[person_name]['email_address'] != 'NaN':
+        known_emails += 1
+    if np.isnan(float(enron_data[person_name]['total_payments'])):
+        total_payments_nan += 1
+        if enron_data[person_name]["poi"]:
+            total_payments_nan_poi += 1
+
+print('Salaries available:', salaries_not_nan)
+print('Available emails:', known_emails)
+print('Number and percentage of people with NaN total payments:', total_payments_nan, total_payments_nan * 100 / len(enron_data))
+print('Number and percentage of POIs with NaN total payments:', total_payments_nan_poi, total_payments_nan_poi * 100 / poi_count)
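The exploration script above repeats the same "is this field missing" test for several features; in this dataset a missing value is simply the string "NaN". A small helper (hypothetical, not part of the course starter code) turns each of those counts into a one-liner:

    def count_valid(dataset, feature):
        """Count people whose value for `feature` is present (not the string "NaN")."""
        return sum(1 for person in dataset.values() if person.get(feature) != "NaN")

    # e.g. count_valid(enron_data, "salary") or count_valid(enron_data, "email_address")
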
diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py
index 006afb8357b..14cdfe84125 100644
--- a/decision_tree/dt_author_id.py
+++ b/decision_tree/dt_author_id.py
@@ -24,7 +24,23 @@
 
 #########################################################
 ### your code goes here ###
+# imports
+from sklearn import tree
+from sklearn.metrics import accuracy_score
+
+# create the classifier
+clf = tree.DecisionTreeClassifier(min_samples_split=40)
+
+# fit the classifier on the training features and labels
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
+# print the accuracy
+print("Accuracy:", accuracy_score(labels_test, pred))
+
+print("Number of features in the data:", len(features_train[0]))
 
 #########################################################
diff --git a/evaluation/evaluate_poi_identifier.py b/evaluation/evaluate_poi_identifier.py
index 0ca99d52d5f..0632a18e71e 100644
--- a/evaluation/evaluate_poi_identifier.py
+++ b/evaluation/evaluate_poi_identifier.py
@@ -13,8 +13,12 @@
 
 import pickle
 import sys
+import numpy as np
 sys.path.append("../tools/")
 from feature_format import featureFormat, targetFeatureSplit
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+#from sklearn.cross_validation import train_test_split (removed in newer scikit-learn)
+from sklearn.model_selection import train_test_split
 
 data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )
 
@@ -27,5 +31,61 @@
 
 ### your code goes here
+features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30,
+                                                                            random_state=42)
+# create a decision tree classifier
+clf = DecisionTreeClassifier()
+
+# fit/train it
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
+print("Accuracy:", accuracy_score(labels_test, pred))
+
+### evaluation
+values, counts = np.unique(pred, return_counts=True)
+test_size = len(features_test)
+
+print("Predicted POIs:", list(zip(values, counts)))
+print("Total number in test set:", test_size)
+print("Accuracy - all poi=0:", counts[0] / test_size)
+
+true_positives = 0
+for actual, predicted in zip(labels_test, pred):
+    if actual == 1 and predicted == 1:
+        true_positives += 1
+
+print("TP - true positives:", true_positives)
+print("Precision score:", precision_score(labels_test, pred))
+print("Recall score:", recall_score(labels_test, pred))
+
+prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
+true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
+
+
+def calc_precision_and_recall(actual, predicted):
+    print("Doing precision and recall...")
+    true_positives = 0
+    false_positives = 0
+    false_negatives = 0
+    true_negatives = 0
+    for a, p in zip(actual, predicted):
+        if a == 1 and p == 1:
+            true_positives += 1
+        elif a == 1 and p == 0:
+            false_negatives += 1
+        elif a == 0 and p == 1:
+            false_positives += 1
+        else:
+            true_negatives += 1
+    print("Precision:", true_positives / (true_positives + false_positives))
+    print("Recall:", true_positives / (true_positives + false_negatives))
+
+
+calc_precision_and_recall(true_labels, prediction_labels)
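Worked by hand, the two hard-coded lists above contain 6 true positives, 3 false positives, 2 false negatives and 9 true negatives, so calc_precision_and_recall should report precision 6/9 ≈ 0.67 and recall 6/8 = 0.75. A quick cross-check against scikit-learn's own metrics (same lists, nothing else assumed):

    from sklearn.metrics import precision_score, recall_score

    true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
    prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]

    print(precision_score(true_labels, prediction_labels))  # 6 / 9 ~ 0.667
    print(recall_score(true_labels, prediction_labels))     # 6 / 8 = 0.75
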
diff --git a/feature_selection/find_signature.py b/feature_selection/find_signature.py
index c01a1f2111a..93c0be9a822 100644
--- a/feature_selection/find_signature.py
+++ b/feature_selection/find_signature.py
@@ -28,6 +28,8 @@
 features_train = vectorizer.fit_transform(features_train)
 features_test  = vectorizer.transform(features_test).toarray()
 
+# get the words behind the TfIdf features
+words = vectorizer.get_feature_names()
 
 ### a classic way to overfit is to use a small number
 ### of data points and a large number of features;
@@ -38,6 +40,19 @@
 
 ### your code goes here
-
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+
+clf = DecisionTreeClassifier(min_samples_split=40)
+clf.fit(features_train, labels_train)
+pred = clf.predict(features_test)
+print("Accuracy:", accuracy_score(labels_test, pred))
+
+print("Important features:")
+for index, importance in enumerate(clf.feature_importances_):
+    if importance > 0.2:
+        print("Feature number", index)
+        print("Importance", importance)
+        print("Word", words[index])
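The loop above only reports features whose importance exceeds 0.2. A hypothetical follow-up (not required by the mini project) that lists the five strongest signature words regardless of threshold, reusing the same clf and words:

    import numpy as np

    # indices of the five largest feature importances, strongest first
    top_indices = np.argsort(clf.feature_importances_)[::-1][:5]
    for i in top_indices:
        print(words[i], clf.feature_importances_[i])
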
diff --git a/final_project/poi_id.py b/final_project/poi_id.py
index 47912a7c51d..c1e39e53b7a 100644
--- a/final_project/poi_id.py
+++ b/final_project/poi_id.py
@@ -2,6 +2,7 @@
 
 import sys
 import pickle
+import matplotlib.pyplot
 sys.path.append("../tools/")
 from feature_format import featureFormat, targetFeatureSplit
 
@@ -10,14 +11,86 @@
 ### Task 1: Select what features you'll use.
 ### features_list is a list of strings, each of which is a feature name.
 ### The first feature must be "poi".
-features_list = ['poi','salary'] # You will need to use more features
+features_list = ['poi', 'salary']  # You will need to use more features
 
 ### Load the dictionary containing the dataset
-with open("final_project_dataset.pkl", "r") as data_file:
-    data_dict = pickle.load(data_file)
+with open('final_project_dataset.pkl', 'rb') as handle:
+    data_dict = pickle.load(handle)
+
 
 ### Task 2: Remove outliers
+identified_outliers = ["TOTAL", "LAVORATO JOHN J", "MARTIN AMANDA K", "URQUHART JOHN A", "MCCLELLAN GEORGE", "SHANKMAN JEFFREY A", "WHITE JR THOMAS E", "PAI LOU L", "HIRKO JOSEPH"]
+for outlier in identified_outliers:
+    data_dict.pop(outlier)
+
 ### Task 3: Create new feature(s)
+financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees']
+email_features = ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi']
+
+# count the data points
+data_points = len(data_dict)
+
+# count POIs and non-POIs
+poi_count = 0
+non_poi_count = 0
+for person in data_dict.values():
+    if person["poi"]:
+        poi_count += 1
+    else:
+        non_poi_count += 1
+
+# print a quick summary of the dataset
+print("Data points:\t", data_points)
+print("Number of non POIs:\t", non_poi_count)
+print("Number of POIs:\t\t", poi_count)
+
+print("POI ratio:\t\t", poi_count / data_points)
+print("Total features:\t", len(next(iter(data_dict.values()))))
+print("Financial features:\t", len(financial_features))
+print("Email features:\t", len(email_features))
+print("")
+
+
+def outlier_visualization(data, x_index, y_index, x_name, y_name, figure_number):
+    matplotlib.pyplot.figure(figure_number)
+    for point in data:
+        matplotlib.pyplot.scatter(point[x_index], point[y_index])
+
+    matplotlib.pyplot.xlabel(x_name)
+    matplotlib.pyplot.ylabel(y_name)
+    matplotlib.pyplot.show()
+
+
+def visualize_outliers():
+    figure_number = 1
+    for i in range(2, len(financial_features)):
+        outlier_visualization(financial_outliers, 0, i, 'salary', financial_features[i], figure_number)
+        figure_number += 1
+
+    figure_number = 10
+    for i in range(2, len(email_features)):
+        outlier_visualization(email_outliers, 0, i, 'to_messages', email_features[i], figure_number)
+        figure_number += 1
+
+
+# look up the person behind an outlying value
+def get_outlier(feature, value):
+    for person, features in data_dict.items():
+        if features[feature] == value:
+            print("Outlier is:", person, features['poi'])
+
+
 ### Store to my_dataset for easy export below.
 my_dataset = data_dict
 
@@ -25,6 +98,16 @@
 data = featureFormat(my_dataset, features_list, sort_keys = True)
 labels, features = targetFeatureSplit(data)
 
+financial_outliers = featureFormat(data_dict, financial_features)
+email_outliers = featureFormat(data_dict, email_features)
+
+
+#from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
+
+features_train, features_test, labels_train, labels_test = \
+    train_test_split(features, labels, test_size=0.3, random_state=42)
+
 ### Task 4: Try a varity of classifiers
 ### Please name your classifier clf for easy export below.
 ### Note that if you want to do PCA or other multi-stage operations,
@@ -32,9 +115,18 @@
 ### http://scikit-learn.org/stable/modules/pipeline.html
 
 # Provided to give you a starting point. Try a variety of classifiers.
+# import
 from sklearn.naive_bayes import GaussianNB
+
+# create the classifier
 clf = GaussianNB()
 
+# fit/train it on the training data
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
 ### Task 5: Tune your classifier to achieve better than .3 precision and recall
 ### using our testing script. Check the tester.py script in the final project
 ### folder for details on the evaluation method, especially the test_classifier
@@ -43,7 +135,8 @@
 ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
 
 # Example starting point. Try investigating other evaluation techniques!
-from sklearn.cross_validation import train_test_split
+#from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 features_train, features_test, labels_train, labels_test = \
     train_test_split(features, labels, test_size=0.3, random_state=42)
 
@@ -52,4 +145,9 @@
 ### that the version of poi_id.py that you submit can be run on its own and
 ### generates the necessary .pkl files for validating your results.
 
-dump_classifier_and_data(clf, my_dataset, features_list)
\ No newline at end of file
+visualize_outliers()
+
+dump_classifier_and_data(clf, my_dataset, features_list)
+
+
+
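Task 4 asks for a variety of classifiers, but only GaussianNB is fitted above. A rough comparison sketch on the same train/test split (the min_samples_split value is an arbitrary assumption, and tester.py remains the authoritative evaluation):

    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import precision_score, recall_score

    for candidate in (GaussianNB(), DecisionTreeClassifier(min_samples_split=10)):
        candidate.fit(features_train, labels_train)
        candidate_pred = candidate.predict(features_test)
        print(candidate.__class__.__name__,
              precision_score(labels_test, candidate_pred),
              recall_score(labels_test, candidate_pred))
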
- print "Evaluating performance for processed predictions:" + print( "Warning: Found a predicted label not == 0 or 1.") + print( "All predictions should take value 0 or 1.") + print( "Evaluating performance for processed predictions:") break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives @@ -66,13 +68,13 @@ def test_classifier(clf, dataset, feature_list, folds = 1000): recall = 1.0*true_positives/(true_positives+false_negatives) f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives) f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall) - print clf - print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5) - print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives) - print "" + print( clf) + print( PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)) + print( RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)) + print( "") except: - print "Got a divide by zero when trying out:", clf - print "Precision or recall may be undefined due to a lack of true positive predicitons." + print( "Got a divide by zero when trying out:", clf) + print( "Precision or recall may be undefined due to a lack of true positive predicitons.") CLF_PICKLE_FILENAME = "my_classifier.pkl" DATASET_PICKLE_FILENAME = "my_dataset.pkl" diff --git a/k_means/k_means_cluster.py b/k_means/k_means_cluster.py index 6a2ba687017..eb42d6d8f51 100644 --- a/k_means/k_means_cluster.py +++ b/k_means/k_means_cluster.py @@ -13,7 +13,8 @@ import sys sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit - +from sklearn.cluster import KMeans +from sklearn.preprocessing import MinMaxScaler @@ -39,7 +40,11 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature ### load in the dict of dicts containing all the data on each person in the dataset -data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) +with open("../final_project/final_project_dataset.pkl", "rb") as f: + rawdataset = f.read() + +#data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) +data_dict = pickle.load( rawdataset) ### there's an outlier--remove it! 
data_dict.pop("TOTAL", 0) @@ -63,8 +68,13 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature plt.show() ### cluster here; create predictions of the cluster labels -### for the data and store them to a list called pred +est = KMeans(n_clusters=2) +# fit/train it +est.fit(finance_features) + +### for the data and store them to a list called pred +pred = est.predict(finance_features) @@ -73,4 +83,26 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: - print "no predictions object named pred found, no clusters to plot" + print( "no predictions object named pred found, no clusters to plot") + + +salary = [] +exercised_stock_options = [] +for name in data_dict: + stock = data_dict[name]['exercised_stock_options'] + sal = data_dict[name]['salary'] + if not numpy.isnan(float(stock)): + exercised_stock_options.append(float(stock)) + if not numpy.isnan(float(sal)): + salary.append(float(sal)) + +#Feature rescaling +scaler = MinMaxScaler() +print( "After rescaling, salary $200,000:", scaler.fit_transform([[float(min(salary))], [200000], [float(max(salary))]])) +print( "After rescaling, salary $100,000:", scaler.fit_transform([[float(min(exercised_stock_options))], [1000000], [float(max(exercised_stock_options))]])) + +print( "Minimum stock :", min(exercised_stock_options)) +print( "Maximum stock :", max(exercised_stock_options)) + +print( "Minimum salary :", min(salary)) +print( "Maximum salary :", max(salary)) \ No newline at end of file diff --git a/naive_bayes/nb_author_id.py b/naive_bayes/nb_author_id.py index f69d57d8408..30f511f5599 100644 --- a/naive_bayes/nb_author_id.py +++ b/naive_bayes/nb_author_id.py @@ -26,7 +26,31 @@ ######################################################### ### your code goes here ### +# create classifer +clf = GaussianNB() +# note time +t0 = time() + +# fit the classifier on training features and labels +clf.fit(features_train, labels_train) +print("Training time", time()-t0, "s") + +# note time +t1=time() + +# predict labels for the test features +pred = clf.predict(features_test) +print("Predicting time", time()-t1, "s") + +# calculate accuracy +accuracy = accuracy_score(pred, labels_test) + +# return the accuracy +# return(accuracy) + +print(accuracy) +# return the accuracy ######################################################### diff --git a/outliers/enron_outliers.py b/outliers/enron_outliers.py index ac26d7fe9a8..d881848a2be 100644 --- a/outliers/enron_outliers.py +++ b/outliers/enron_outliers.py @@ -8,12 +8,19 @@ ### read in data dictionary, convert to numpy array -data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) +data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "rb"), fix_imports = True ) features = ["salary", "bonus"] data = featureFormat(data_dict, features) ### your code below - +for point in data: + salary = point[0] + bonus = point[1] + matplotlib.pyplot.scatter( salary, bonus ) + +matplotlib.pyplot.xlabel("salary") +matplotlib.pyplot.ylabel("bonus") +matplotlib.pyplot.show() diff --git a/outliers/outlier_cleaner.py b/outliers/outlier_cleaner.py index c0ddb2acc79..db7276a8745 100644 --- a/outliers/outlier_cleaner.py +++ b/outliers/outlier_cleaner.py @@ -14,7 +14,12 @@ def outlierCleaner(predictions, ages, net_worths): cleaned_data = [] ### your code goes here + import operator + errors = [a - b for a, 
diff --git a/outliers/outlier_cleaner.py b/outliers/outlier_cleaner.py
index c0ddb2acc79..db7276a8745 100644
--- a/outliers/outlier_cleaner.py
+++ b/outliers/outlier_cleaner.py
@@ -14,7 +14,12 @@ def outlierCleaner(predictions, ages, net_worths):
     cleaned_data = []
 
     ### your code goes here
+    # sort the points by absolute residual error and keep the best 90%
+    errors = [abs(float(p - n)) for p, n in zip(predictions, net_worths)]
+    data = sorted(zip(ages, net_worths, errors), key=lambda point: point[2])
+    cleaned_data = data[:int(len(predictions) * 0.9)]
 
     return cleaned_data
diff --git a/outliers/outlier_removal_regression.py b/outliers/outlier_removal_regression.py
index d509cd9f22f..27240241b3d 100644
--- a/outliers/outlier_removal_regression.py
+++ b/outliers/outlier_removal_regression.py
@@ -25,7 +25,12 @@
 
 ### fill in a regression here!  Name the regression object reg so that
 ### the plotting code below works, and you can see what your regression looks like
-
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression().fit(ages_train, net_worths_train)
+print("Slope:", reg.coef_)
+# print('Regression intercept:', reg.intercept_)
+# print('Regression score on training data:', reg.score(ages_train, net_worths_train))
+print("Score:", reg.score(ages_test, net_worths_test))
 
@@ -50,8 +55,8 @@
     predictions = reg.predict(ages_train)
     cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
 except NameError:
-    print "your regression object doesn't exist, or isn't name reg"
-    print "can't make predictions to use in identifying outliers"
+    print("your regression object doesn't exist, or isn't named reg")
+    print("can't make predictions to use in identifying outliers")
 
@@ -70,9 +75,9 @@
         reg.fit(ages, net_worths)
         plt.plot(ages, reg.predict(ages), color="blue")
     except NameError:
-        print "you don't seem to have regression imported/created,"
-        print "   or else your regression object isn't named reg"
-        print "   either way, only draw the scatter plot of the cleaned data"
+        print("you don't seem to have regression imported/created,")
+        print("   or else your regression object isn't named reg")
+        print("   either way, only draw the scatter plot of the cleaned data")
     plt.scatter(ages, net_worths)
     plt.xlabel("ages")
     plt.ylabel("net worths")
@@ -80,5 +85,5 @@
 
 else:
-    print "outlierCleaner() is returning an empty list, no refitting to be done"
+    print("outlierCleaner() is returning an empty list, no refitting to be done")
diff --git a/pca/eigenfaces.py b/pca/eigenfaces.py
index 074b860a253..b3cbc1e33ee 100644
--- a/pca/eigenfaces.py
+++ b/pca/eigenfaces.py
@@ -16,7 +16,7 @@
 
-print __doc__
+print(__doc__)
 
 from time import time
 import logging
@@ -53,10 +53,10 @@
 target_names = lfw_people.target_names
 n_classes = target_names.shape[0]
 
-print "Total dataset size:"
-print "n_samples: %d" % n_samples
-print "n_features: %d" % n_features
-print "n_classes: %d" % n_classes
+print("Total dataset size:")
+print("n_samples: {0}".format(n_samples))
+print("n_features: {0}".format(n_features))
+print("n_classes: {0}".format(n_classes))
 
 
 ###############################################################################
@@ -68,24 +68,24 @@
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150
 
-print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
+print("Extracting the top {0} eigenfaces from {1} faces".format(n_components, X_train.shape[0]))
 t0 = time()
 pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
-print "done in %0.3fs" % (time() - t0)
+print("done in {0:.3f}s".format(time() - t0))
 
 eigenfaces = pca.components_.reshape((n_components, h, w))
 
-print "Projecting the input data on the eigenfaces orthonormal basis"
+print("Projecting the input data on the eigenfaces orthonormal basis")
 t0 = time()
 X_train_pca = pca.transform(X_train)
 X_test_pca = pca.transform(X_test)
-print "done in %0.3fs" % (time() - t0)
+print("done in {0:.3f}s".format(time() - t0)) ############################################################################### # Train a SVM classification model -print "Fitting the classifier to the training set" +print( "Fitting the classifier to the training set") t0 = time() param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], @@ -94,21 +94,21 @@ # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto' clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) clf = clf.fit(X_train_pca, y_train) -print "done in %0.3fs" % (time() - t0) -print "Best estimator found by grid search:" -print clf.best_estimator_ +print("done in {0:.3f}s".format(time() - t0)) +print( "Best estimator found by grid search:") +print( clf.best_estimator_) ############################################################################### # Quantitative evaluation of the model quality on the test set -print "Predicting the people names on the testing set" +print( "Predicting the people names on the testing set") t0 = time() y_pred = clf.predict(X_test_pca) -print "done in %0.3fs" % (time() - t0) +print("done in {0:.3f}s".format(time() - t0)) -print classification_report(y_test, y_pred, target_names=target_names) -print confusion_matrix(y_test, y_pred, labels=range(n_classes)) +print( classification_report(y_test, y_pred, target_names=target_names)) +print( confusion_matrix(y_test, y_pred, labels=range(n_classes))) ############################################################################### diff --git a/regression/finance_regression.py b/regression/finance_regression.py index efa10637a1f..3842ba3739f 100644 --- a/regression/finance_regression.py +++ b/regression/finance_regression.py @@ -29,16 +29,19 @@ from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42) train_color = "b" -test_color = "b" - - +test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. 
diff --git a/regression/finance_regression.py b/regression/finance_regression.py
index efa10637a1f..3842ba3739f 100644
--- a/regression/finance_regression.py
+++ b/regression/finance_regression.py
@@ -29,16 +29,19 @@
 from sklearn.cross_validation import train_test_split
 feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
 train_color = "b"
-test_color = "b"
-
-
+test_color = "r"
 
 ### Your regression goes here!
 ### Please name it reg, so that the plotting code below picks it up and
 ### plots it correctly. Don't forget to change the test_color above from "b" to
 ### "r" to differentiate training points from test points.
-
-
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression().fit(feature_train, target_train)
+print("Regression output:")
+print("Slope:", reg.coef_)
+print("Intercept:", reg.intercept_)
+print("Score for training:", reg.score(feature_train, target_train))
+print("Score for testing:", reg.score(feature_test, target_test))
diff --git a/requirements.txt b/requirements.txt
index 1d4ac04c20e..e278210faad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 nltk==3.2.1
-numpy==1.11.2
+numpy==1.13.3
 scikit-learn==0.18
-scipy==0.18.1
+scipy==0.19.1
diff --git a/svm/svm_author_id.py b/svm/svm_author_id.py
index fda3f7fdb28..208b4adf01d 100644
--- a/svm/svm_author_id.py
+++ b/svm/svm_author_id.py
@@ -24,7 +24,23 @@
 
 #########################################################
 ### your code goes here ###
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score
+from collections import Counter
+
+#clf = SVC(kernel='linear')
+clf = SVC(kernel='rbf', C=10000)
+
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training time:", round(time()-t0, 3), "s")
+
+t1 = time()
+pred = clf.predict(features_test)
+print("Prediction time:", round(time()-t1, 3), "s")
+
+print("Accuracy score:", accuracy_score(labels_test, pred))
+
+print("Prediction for element 10:", pred[10])
+print("Prediction for element 26:", pred[26])
+print("Prediction for element 50:", pred[50])
+
+c = Counter(pred)
+print("Number of predictions for Chris (1):", c[1])
 
 #########################################################
diff --git a/text_learning/vectorize_text.py b/text_learning/vectorize_text.py
index 629c6b0f317..38dd5fc0c9c 100644
--- a/text_learning/vectorize_text.py
+++ b/text_learning/vectorize_text.py
@@ -7,6 +7,8 @@
 sys.path.append( "../tools/" )
 from parse_out_email_text import parseOutText
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.corpus import stopwords
 
 """
     Starter code to process the emails from Sara and Chris to extract
@@ -44,22 +46,28 @@
         temp_counter += 1
         if temp_counter < 200:
             path = os.path.join('..', path[:-1])
-            print path
+            print(path)
             email = open(path, "r")
 
             ### use parseOutText to extract the text from the opened email
-
+            text = parseOutText(email)
 
             ### use str.replace() to remove any instances of the words
             ### ["sara", "shackleton", "chris", "germani"]
+            signature_words = ["sara", "shackleton", "chris", "germani"]
+            for word in signature_words:
+                text = text.replace(word, "")
 
             ### append the text to word_data
-
+            word_data.append(text)
 
             ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
-
+            if name == "sara":
+                from_data.append(0)
+            else:
+                from_data.append(1)
 
             email.close()
 
-print "emails processed"
+print("emails processed")
 from_sara.close()
 from_chris.close()
@@ -71,5 +79,10 @@
 
 
 ### in Part 4, do TfIdf vectorization here
-
+vectorizer = TfidfVectorizer(stop_words="english")
+tfidf = vectorizer.fit_transform(word_data)
+feature_words = vectorizer.get_feature_names()
+print("Total words:", len(feature_words))
+print("The word at word[34597]:", feature_words[34597])
of Chris training emails:", sum(labels_train)) + print( "no. of Sara training emails:", len(labels_train)-sum(labels_train)) return features_train_transformed, features_test_transformed, labels_train, labels_test diff --git a/tools/feature_format.py b/tools/feature_format.py index 7ca78ac291a..cf948d35506 100644 --- a/tools/feature_format.py +++ b/tools/feature_format.py @@ -67,7 +67,7 @@ def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True try: dictionary[key][feature] except KeyError: - print "error: key ", feature, " not present" + print( "error: key ", feature, " not present") return value = dictionary[key][feature] if value=="NaN" and remove_NaN: diff --git a/tools/parse_out_email_text.py b/tools/parse_out_email_text.py index 43725b22d10..cea649a9f92 100644 --- a/tools/parse_out_email_text.py +++ b/tools/parse_out_email_text.py @@ -45,7 +45,7 @@ def parseOutText(f): def main(): ff = open("../text_learning/test_email.txt", "r") text = parseOutText(ff) - print text + print( text) diff --git a/tools/startup.py b/tools/startup.py index 4638e0d115e..bc77280ab4d 100644 --- a/tools/startup.py +++ b/tools/startup.py @@ -1,47 +1,48 @@ #!/usr/bin/python -print -print "checking for nltk" +print() +print( "checking for nltk") try: import nltk except ImportError: - print "you should install nltk before continuing" + print( "you should install nltk before continuing") -print "checking for numpy" +print( "checking for numpy") try: import numpy except ImportError: - print "you should install numpy before continuing" + print( "you should install numpy before continuing") -print "checking for scipy" +print( "checking for scipy") try: import scipy except: - print "you should install scipy before continuing" + print( "you should install scipy before continuing") -print "checking for sklearn" +print( "checking for sklearn") try: import sklearn except: - print "you should install sklearn before continuing" + print( "you should install sklearn before continuing") -print -print "downloading the Enron dataset (this may take a while)" -print "to check on progress, you can cd up one level, then execute " -print "Enron dataset should be last item on the list, along with its current size" -print "download will complete at about 423 MB" +print() +print( "downloading the Enron dataset (this may take a while)") +print( "to check on progress, you can cd up one level, then execute ") +print( "Enron dataset should be last item on the list, along with its current size") +print( "download will complete at about 423 MB") import urllib url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz" -urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz") -print "download complete!" +#old -> urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz") +urllib.request.urlretrieve(url, filename="../enron_mail_20150507.tgz") +print( "download complete!") -print -print "unzipping Enron dataset (this may take a while)" +print() +print( "unzipping Enron dataset (this may take a while)") import tarfile import os os.chdir("..") tfile = tarfile.open("enron_mail_20150507.tar.gz", "r:gz") tfile.extractall(".") -print "you're ready to go!" 
+print( "you're ready to go!") diff --git a/validation/validate_poi.py b/validation/validate_poi.py index 03537a5cc07..c0fafa62bcc 100644 --- a/validation/validate_poi.py +++ b/validation/validate_poi.py @@ -14,6 +14,9 @@ import sys sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit +from sklearn.tree import DecisionTreeClassifier +from sklearn.metrics import accuracy_score +from sklearn.cross_validation import train_test_split data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") ) @@ -25,8 +28,19 @@ data = featureFormat(data_dict, features_list) labels, features = targetFeatureSplit(data) +features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30, random_state=42) ### it's all yours from here forward! +# create DecisionTree Classifier +clf = DecisionTreeClassifier() +# Fit/train it +clf.fit(features_train, labels_train) + +# predict +pred = clf.predict(features_test) + +# print +print( "Accuracy:", accuracy_score(labels_test, pred))