diff --git a/.gitignore b/.gitignore
index 17c225de1fc..8a3a93664a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,13 @@
 enron_mail_20110402.tgz
 enron_mail_20110402/
 enron_mail_20150507.tgz
+enron_mail_20150507.tar.gz
+enron_mail_20150507.tar
 maildir/
 text_learning/your_word_data.pkl
 text_learning/your_email_authors.pkl
 my_classifier.pkl
 my_dataset.pkl
 my_feature_list.pkl
+.idea
+
diff --git a/Project report.docx b/Project report.docx
new file mode 100644
index 00000000000..8997ad63776
Binary files /dev/null and b/Project report.docx differ
diff --git a/README.md b/README.md
index 9c2c9967cb5..26b1f343e6e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,34 @@
-ud120-projects
+
+ud120-projects
+
 ==============
-Starter project code for students taking Udacity ud120
+My repo for the Udacity ud120 course
+
+
+Content
+
+* Session exercises / mini projects
+* Enron project
+
+
+IDE
+
+PyCharm Community Edition by JetBrains
+
+
+Commands used
+
+**install scikit-learn**
+
+pip install scikit-learn
+
+**install the Natural Language Toolkit**
+
+pip install nltk
+
+**install matplotlib**
+
+pip install matplotlib
+
+
+Environment from requirements.txt
+
+nltk==3.2.1
+numpy==1.13.3
+scikit-learn==0.18
+scipy==0.19.1
+
+
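A note on the environment section of the README: the three pip install commands can be replaced by a single install against the pinned file, which keeps the package versions reproducible. Assuming a virtual environment is already active (the exact tool is the reader's choice), the one-line equivalent is

    pip install -r requirements.txt
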
diff --git a/choose_your_own/class_vis.py b/choose_your_own/class_vis.py
index 38957c9574e..4bf993d4571 100644
--- a/choose_your_own/class_vis.py
+++ b/choose_your_own/class_vis.py
@@ -46,5 +46,5 @@ def output_image(name, format, bytes):
     data['name'] = name
     data['format'] = format
     data['bytes'] = base64.encodestring(bytes)
-    print image_start+json.dumps(data)+image_end
+    print(image_start + json.dumps(data) + image_end)
diff --git a/choose_your_own/test.PNG b/choose_your_own/test.PNG
new file mode 100644
index 00000000000..ac55c5d3eb6
Binary files /dev/null and b/choose_your_own/test.PNG differ
diff --git a/choose_your_own/your_algorithm.py b/choose_your_own/your_algorithm.py
index 62a7573cfdf..67a59637651 100644
--- a/choose_your_own/your_algorithm.py
+++ b/choose_your_own/your_algorithm.py
@@ -30,7 +30,16 @@
 
 ### your code here!  name your classifier object clf if you want the
 ### visualization code (prettyPicture) to show you the decision boundary
-
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+
+# k-nearest neighbours
+clf_knn = KNeighborsClassifier(n_neighbors=4)
+clf_knn.fit(features_train, labels_train)
+pred_knn = clf_knn.predict(features_test)
+print("Accuracy for KNeighborsClassifier:", accuracy_score(labels_test, pred_knn))
+
+# random forest
+clf_rf = RandomForestClassifier(n_estimators=15, min_samples_split=6)
+clf_rf.fit(features_train, labels_train)
+pred_rf = clf_rf.predict(features_test)
+print("Accuracy for RandomForestClassifier:", accuracy_score(labels_test, pred_rf))
diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py
index ca9bacb9c83..ed00f32fb71 100644
--- a/datasets_questions/explore_enron_data.py
+++ b/datasets_questions/explore_enron_data.py
@@ -16,7 +16,48 @@
 """
 
 import pickle
+import numpy as np
 
 enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))
-
+# size of the dataset and number of features per person
+print(len(enron_data))
+print(len(list(enron_data.values())[0]))
+
+# number of POIs in the dataset
+poi_count = 0
+for person_name in enron_data:
+    if enron_data[person_name]["poi"]:
+        poi_count += 1
+print(poi_count)
+
+# number of POI names listed in poi_names.txt (lines marked (y) or (n))
+total_poi = 0
+with open('../final_project/poi_names.txt', 'r') as poi_file:
+    for line in poi_file:
+        if '(y)' in line or '(n)' in line:
+            total_poi += 1
+print(total_poi)
+
+print("Net stock value of James Prentice:", enron_data['PRENTICE JAMES']['total_stock_value'])
+print("Wesley Colwell to POI emails:", enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
+print("Stock options of Jeffrey Skilling:", enron_data['SKILLING JEFFREY K']['exercised_stock_options'])
+
+most_value_taken = max(enron_data[person_name]['total_payments']
+                       for person_name in ("LAY KENNETH L", "SKILLING JEFFREY K", "FASTOW ANDREW S"))
+print(most_value_taken)
+
+# missing values are stored as the string "NaN"
+salaries_not_nan = 0
+known_emails = 0
+total_payments_nan = 0
+total_payments_nan_poi = 0
+for person_name in enron_data:
+    if not np.isnan(float(enron_data[person_name]['salary'])):
+        salaries_not_nan += 1
+    if enron_data[person_name]['email_address'] != 'NaN':
+        known_emails += 1
+    if np.isnan(float(enron_data[person_name]['total_payments'])):
+        total_payments_nan += 1
+        if enron_data[person_name]["poi"]:
+            total_payments_nan_poi += 1
+
+print('Salaries available:', salaries_not_nan)
+print('Available emails:', known_emails)
+print('Number and percentage of people with NaN total payments:', total_payments_nan, total_payments_nan * 100 / len(enron_data))
+print('Number and percentage of POIs with NaN total payments:', total_payments_nan_poi, total_payments_nan_poi * 100 / poi_count)
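The exploration script above repeats the same "is this field missing" test for several features; in this dataset a missing value is simply the string "NaN". A small helper (hypothetical, not part of the course starter code) turns each of those counts into a one-liner:

    def count_valid(dataset, feature):
        """Count people whose value for `feature` is present (not the string "NaN")."""
        return sum(1 for person in dataset.values() if person.get(feature) != "NaN")

    # e.g. count_valid(enron_data, "salary") or count_valid(enron_data, "email_address")
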
diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py
index 006afb8357b..14cdfe84125 100644
--- a/decision_tree/dt_author_id.py
+++ b/decision_tree/dt_author_id.py
@@ -24,7 +24,23 @@
 
 #########################################################
 ### your code goes here ###
+# imports
+from sklearn import tree
+from sklearn.metrics import accuracy_score
+
+# create the classifier
+clf = tree.DecisionTreeClassifier(min_samples_split=40)
+
+# fit the classifier on the training features and labels
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
+# print the accuracy
+print("Accuracy:", accuracy_score(labels_test, pred))
+
+print("Number of features in the data:", len(features_train[0]))
 
 #########################################################
diff --git a/evaluation/evaluate_poi_identifier.py b/evaluation/evaluate_poi_identifier.py
index 0ca99d52d5f..0632a18e71e 100644
--- a/evaluation/evaluate_poi_identifier.py
+++ b/evaluation/evaluate_poi_identifier.py
@@ -13,8 +13,12 @@
 
 import pickle
 import sys
+import numpy as np
 sys.path.append("../tools/")
 from feature_format import featureFormat, targetFeatureSplit
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+#from sklearn.cross_validation import train_test_split (removed in newer scikit-learn)
+from sklearn.model_selection import train_test_split
 
 data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )
 
@@ -27,5 +31,61 @@
 
 ### your code goes here
+features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30,
+                                                                            random_state=42)
+# create a decision tree classifier
+clf = DecisionTreeClassifier()
+
+# fit/train it
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
+print("Accuracy:", accuracy_score(labels_test, pred))
+
+### evaluation
+values, counts = np.unique(pred, return_counts=True)
+test_size = len(features_test)
+
+print("Predicted POIs:", list(zip(values, counts)))
+print("Total number in test set:", test_size)
+print("Accuracy - all poi=0:", counts[0] / test_size)
+
+true_positives = 0
+for actual, predicted in zip(labels_test, pred):
+    if actual == 1 and predicted == 1:
+        true_positives += 1
+
+print("TP - true positives:", true_positives)
+print("Precision score:", precision_score(labels_test, pred))
+print("Recall score:", recall_score(labels_test, pred))
+
+prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
+true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
+
+
+def calc_precision_and_recall(actual, predicted):
+    print("Doing precision and recall...")
+    true_positives = 0
+    false_positives = 0
+    false_negatives = 0
+    true_negatives = 0
+    for a, p in zip(actual, predicted):
+        if a == 1 and p == 1:
+            true_positives += 1
+        elif a == 1 and p == 0:
+            false_negatives += 1
+        elif a == 0 and p == 1:
+            false_positives += 1
+        else:
+            true_negatives += 1
+    print("Precision:", true_positives / (true_positives + false_positives))
+    print("Recall:", true_positives / (true_positives + false_negatives))
+
+
+calc_precision_and_recall(true_labels, prediction_labels)
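Worked by hand, the two hard-coded lists above contain 6 true positives, 3 false positives, 2 false negatives and 9 true negatives, so calc_precision_and_recall should report precision 6/9 ≈ 0.67 and recall 6/8 = 0.75. A quick cross-check against scikit-learn's own metrics (same lists, nothing else assumed):

    from sklearn.metrics import precision_score, recall_score

    true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
    prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]

    print(precision_score(true_labels, prediction_labels))  # 6 / 9 ~ 0.667
    print(recall_score(true_labels, prediction_labels))     # 6 / 8 = 0.75
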
diff --git a/feature_selection/find_signature.py b/feature_selection/find_signature.py
index c01a1f2111a..93c0be9a822 100644
--- a/feature_selection/find_signature.py
+++ b/feature_selection/find_signature.py
@@ -28,6 +28,8 @@
 features_train = vectorizer.fit_transform(features_train)
 features_test  = vectorizer.transform(features_test).toarray()
 
+# get the words behind the TfIdf features
+words = vectorizer.get_feature_names()
 
 ### a classic way to overfit is to use a small number
 ### of data points and a large number of features;
@@ -38,6 +40,19 @@
 
 ### your code goes here
-
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+
+clf = DecisionTreeClassifier(min_samples_split=40)
+clf.fit(features_train, labels_train)
+pred = clf.predict(features_test)
+print("Accuracy:", accuracy_score(labels_test, pred))
+
+print("Important features:")
+for index, importance in enumerate(clf.feature_importances_):
+    if importance > 0.2:
+        print("Feature number", index)
+        print("Importance", importance)
+        print("Word", words[index])
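The loop above only reports features whose importance exceeds 0.2. A hypothetical follow-up (not required by the mini project) that lists the five strongest signature words regardless of threshold, reusing the same clf and words:

    import numpy as np

    # indices of the five largest feature importances, strongest first
    top_indices = np.argsort(clf.feature_importances_)[::-1][:5]
    for i in top_indices:
        print(words[i], clf.feature_importances_[i])
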
diff --git a/final_project/poi_id.py b/final_project/poi_id.py
index 47912a7c51d..c1e39e53b7a 100644
--- a/final_project/poi_id.py
+++ b/final_project/poi_id.py
@@ -2,6 +2,7 @@
 
 import sys
 import pickle
+import matplotlib.pyplot
 sys.path.append("../tools/")
 from feature_format import featureFormat, targetFeatureSplit
 
@@ -10,14 +11,86 @@
 ### Task 1: Select what features you'll use.
 ### features_list is a list of strings, each of which is a feature name.
 ### The first feature must be "poi".
-features_list = ['poi','salary'] # You will need to use more features
+features_list = ['poi', 'salary']  # You will need to use more features
 
 ### Load the dictionary containing the dataset
-with open("final_project_dataset.pkl", "r") as data_file:
-    data_dict = pickle.load(data_file)
+with open('final_project_dataset.pkl', 'rb') as handle:
+    data_dict = pickle.load(handle)
+
 
 ### Task 2: Remove outliers
+identified_outliers = ["TOTAL", "LAVORATO JOHN J", "MARTIN AMANDA K", "URQUHART JOHN A", "MCCLELLAN GEORGE", "SHANKMAN JEFFREY A", "WHITE JR THOMAS E", "PAI LOU L", "HIRKO JOSEPH"]
+for outlier in identified_outliers:
+    data_dict.pop(outlier)
+
 ### Task 3: Create new feature(s)
+financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees']
+email_features = ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi']
+
+# count the data points
+data_points = len(data_dict)
+
+# count POIs and non-POIs
+poi_count = 0
+non_poi_count = 0
+for person in data_dict.values():
+    if person["poi"]:
+        poi_count += 1
+    else:
+        non_poi_count += 1
+
+# print a quick summary of the dataset
+print("Data points:\t", data_points)
+print("Number of non POIs:\t", non_poi_count)
+print("Number of POIs:\t\t", poi_count)
+
+print("POI ratio:\t\t", poi_count / data_points)
+print("Total features:\t", len(next(iter(data_dict.values()))))
+print("Financial features:\t", len(financial_features))
+print("Email features:\t", len(email_features))
+print("")
+
+
+def outlier_visualization(data, x_index, y_index, x_name, y_name, figure_number):
+    matplotlib.pyplot.figure(figure_number)
+    for point in data:
+        matplotlib.pyplot.scatter(point[x_index], point[y_index])
+
+    matplotlib.pyplot.xlabel(x_name)
+    matplotlib.pyplot.ylabel(y_name)
+    matplotlib.pyplot.show()
+
+
+def visualize_outliers():
+    figure_number = 1
+    for i in range(2, len(financial_features)):
+        outlier_visualization(financial_outliers, 0, i, 'salary', financial_features[i], figure_number)
+        figure_number += 1
+
+    figure_number = 10
+    for i in range(2, len(email_features)):
+        outlier_visualization(email_outliers, 0, i, 'to_messages', email_features[i], figure_number)
+        figure_number += 1
+
+
+# look up the person behind an outlying value
+def get_outlier(feature, value):
+    for person, features in data_dict.items():
+        if features[feature] == value:
+            print("Outlier is:", person, features['poi'])
+
+
 ### Store to my_dataset for easy export below.
 my_dataset = data_dict
 
@@ -25,6 +98,16 @@
 data = featureFormat(my_dataset, features_list, sort_keys = True)
 labels, features = targetFeatureSplit(data)
 
+financial_outliers = featureFormat(data_dict, financial_features)
+email_outliers = featureFormat(data_dict, email_features)
+
+
+#from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
+
+features_train, features_test, labels_train, labels_test = \
+    train_test_split(features, labels, test_size=0.3, random_state=42)
+
 ### Task 4: Try a varity of classifiers
 ### Please name your classifier clf for easy export below.
 ### Note that if you want to do PCA or other multi-stage operations,
@@ -32,9 +115,18 @@
 ### http://scikit-learn.org/stable/modules/pipeline.html
 
 # Provided to give you a starting point. Try a variety of classifiers.
+# import
 from sklearn.naive_bayes import GaussianNB
+
+# create the classifier
 clf = GaussianNB()
 
+# fit/train it on the training data
+clf.fit(features_train, labels_train)
+
+# predict
+pred = clf.predict(features_test)
+
 ### Task 5: Tune your classifier to achieve better than .3 precision and recall
 ### using our testing script. Check the tester.py script in the final project
 ### folder for details on the evaluation method, especially the test_classifier
@@ -43,7 +135,8 @@
 ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
 
 # Example starting point. Try investigating other evaluation techniques!
-from sklearn.cross_validation import train_test_split
+#from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 features_train, features_test, labels_train, labels_test = \
     train_test_split(features, labels, test_size=0.3, random_state=42)
 
@@ -52,4 +145,9 @@
 ### that the version of poi_id.py that you submit can be run on its own and
 ### generates the necessary .pkl files for validating your results.
 
-dump_classifier_and_data(clf, my_dataset, features_list)
\ No newline at end of file
+visualize_outliers()
+
+dump_classifier_and_data(clf, my_dataset, features_list)
+
+
+
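Task 4 asks for a variety of classifiers, but only GaussianNB is fitted above. A rough comparison sketch on the same train/test split (the min_samples_split value is an arbitrary assumption, and tester.py remains the authoritative evaluation):

    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import precision_score, recall_score

    for candidate in (GaussianNB(), DecisionTreeClassifier(min_samples_split=10)):
        candidate.fit(features_train, labels_train)
        candidate_pred = candidate.predict(features_test)
        print(candidate.__class__.__name__,
              precision_score(labels_test, candidate_pred),
              recall_score(labels_test, candidate_pred))
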
- print "Evaluating performance for processed predictions:" + print( "Warning: Found a predicted label not == 0 or 1.") + print( "All predictions should take value 0 or 1.") + print( "Evaluating performance for processed predictions:") break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives @@ -66,13 +68,13 @@ def test_classifier(clf, dataset, feature_list, folds = 1000): recall = 1.0*true_positives/(true_positives+false_negatives) f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives) f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall) - print clf - print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5) - print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives) - print "" + print( clf) + print( PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)) + print( RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)) + print( "") except: - print "Got a divide by zero when trying out:", clf - print "Precision or recall may be undefined due to a lack of true positive predicitons." + print( "Got a divide by zero when trying out:", clf) + print( "Precision or recall may be undefined due to a lack of true positive predicitons.") CLF_PICKLE_FILENAME = "my_classifier.pkl" DATASET_PICKLE_FILENAME = "my_dataset.pkl" diff --git a/k_means/k_means_cluster.py b/k_means/k_means_cluster.py index 6a2ba687017..eb42d6d8f51 100644 --- a/k_means/k_means_cluster.py +++ b/k_means/k_means_cluster.py @@ -13,7 +13,8 @@ import sys sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit - +from sklearn.cluster import KMeans +from sklearn.preprocessing import MinMaxScaler @@ -39,7 +40,11 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature ### load in the dict of dicts containing all the data on each person in the dataset -data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) +with open("../final_project/final_project_dataset.pkl", "rb") as f: + rawdataset = f.read() + +#data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) +data_dict = pickle.load( rawdataset) ### there's an outlier--remove it! 
data_dict.pop("TOTAL", 0) @@ -63,8 +68,13 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature plt.show() ### cluster here; create predictions of the cluster labels -### for the data and store them to a list called pred +est = KMeans(n_clusters=2) +# fit/train it +est.fit(finance_features) + +### for the data and store them to a list called pred +pred = est.predict(finance_features) @@ -73,4 +83,26 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: - print "no predictions object named pred found, no clusters to plot" + print( "no predictions object named pred found, no clusters to plot") + + +salary = [] +exercised_stock_options = [] +for name in data_dict: + stock = data_dict[name]['exercised_stock_options'] + sal = data_dict[name]['salary'] + if not numpy.isnan(float(stock)): + exercised_stock_options.append(float(stock)) + if not numpy.isnan(float(sal)): + salary.append(float(sal)) + +#Feature rescaling +scaler = MinMaxScaler() +print( "After rescaling, salary $200,000:", scaler.fit_transform([[float(min(salary))], [200000], [float(max(salary))]])) +print( "After rescaling, salary $100,000:", scaler.fit_transform([[float(min(exercised_stock_options))], [1000000], [float(max(exercised_stock_options))]])) + +print( "Minimum stock :", min(exercised_stock_options)) +print( "Maximum stock :", max(exercised_stock_options)) + +print( "Minimum salary :", min(salary)) +print( "Maximum salary :", max(salary)) \ No newline at end of file diff --git a/naive_bayes/nb_author_id.py b/naive_bayes/nb_author_id.py index f69d57d8408..30f511f5599 100644 --- a/naive_bayes/nb_author_id.py +++ b/naive_bayes/nb_author_id.py @@ -26,7 +26,31 @@ ######################################################### ### your code goes here ### +# create classifer +clf = GaussianNB() +# note time +t0 = time() + +# fit the classifier on training features and labels +clf.fit(features_train, labels_train) +print("Training time", time()-t0, "s") + +# note time +t1=time() + +# predict labels for the test features +pred = clf.predict(features_test) +print("Predicting time", time()-t1, "s") + +# calculate accuracy +accuracy = accuracy_score(pred, labels_test) + +# return the accuracy +# return(accuracy) + +print(accuracy) +# return the accuracy ######################################################### diff --git a/outliers/enron_outliers.py b/outliers/enron_outliers.py index ac26d7fe9a8..d881848a2be 100644 --- a/outliers/enron_outliers.py +++ b/outliers/enron_outliers.py @@ -8,12 +8,19 @@ ### read in data dictionary, convert to numpy array -data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) +data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "rb"), fix_imports = True ) features = ["salary", "bonus"] data = featureFormat(data_dict, features) ### your code below - +for point in data: + salary = point[0] + bonus = point[1] + matplotlib.pyplot.scatter( salary, bonus ) + +matplotlib.pyplot.xlabel("salary") +matplotlib.pyplot.ylabel("bonus") +matplotlib.pyplot.show() diff --git a/outliers/outlier_cleaner.py b/outliers/outlier_cleaner.py index c0ddb2acc79..db7276a8745 100644 --- a/outliers/outlier_cleaner.py +++ b/outliers/outlier_cleaner.py @@ -14,7 +14,12 @@ def outlierCleaner(predictions, ages, net_worths): cleaned_data = [] ### your code goes here + import operator + errors = [a - b for a, 
diff --git a/outliers/outlier_cleaner.py b/outliers/outlier_cleaner.py
index c0ddb2acc79..db7276a8745 100644
--- a/outliers/outlier_cleaner.py
+++ b/outliers/outlier_cleaner.py
@@ -14,7 +14,12 @@ def outlierCleaner(predictions, ages, net_worths):
     cleaned_data = []
 
     ### your code goes here
+    # sort the points by absolute residual error and keep the best 90%
+    errors = [abs(float(p - n)) for p, n in zip(predictions, net_worths)]
+    data = sorted(zip(ages, net_worths, errors), key=lambda point: point[2])
+    cleaned_data = data[:int(len(predictions) * 0.9)]
 
     return cleaned_data
diff --git a/outliers/outlier_removal_regression.py b/outliers/outlier_removal_regression.py
index d509cd9f22f..27240241b3d 100644
--- a/outliers/outlier_removal_regression.py
+++ b/outliers/outlier_removal_regression.py
@@ -25,7 +25,12 @@
 
 ### fill in a regression here!  Name the regression object reg so that
 ### the plotting code below works, and you can see what your regression looks like
-
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression().fit(ages_train, net_worths_train)
+print("Slope:", reg.coef_)
+# print('Regression intercept:', reg.intercept_)
+# print('Regression score on training data:', reg.score(ages_train, net_worths_train))
+print("Score:", reg.score(ages_test, net_worths_test))
 
@@ -50,8 +55,8 @@
     predictions = reg.predict(ages_train)
     cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
 except NameError:
-    print "your regression object doesn't exist, or isn't name reg"
-    print "can't make predictions to use in identifying outliers"
+    print("your regression object doesn't exist, or isn't named reg")
+    print("can't make predictions to use in identifying outliers")
 
@@ -70,9 +75,9 @@
         reg.fit(ages, net_worths)
         plt.plot(ages, reg.predict(ages), color="blue")
     except NameError:
-        print "you don't seem to have regression imported/created,"
-        print "   or else your regression object isn't named reg"
-        print "   either way, only draw the scatter plot of the cleaned data"
+        print("you don't seem to have regression imported/created,")
+        print("   or else your regression object isn't named reg")
+        print("   either way, only draw the scatter plot of the cleaned data")
     plt.scatter(ages, net_worths)
     plt.xlabel("ages")
     plt.ylabel("net worths")
@@ -80,5 +85,5 @@
 
 else:
-    print "outlierCleaner() is returning an empty list, no refitting to be done"
+    print("outlierCleaner() is returning an empty list, no refitting to be done")
diff --git a/pca/eigenfaces.py b/pca/eigenfaces.py
index 074b860a253..b3cbc1e33ee 100644
--- a/pca/eigenfaces.py
+++ b/pca/eigenfaces.py
@@ -16,7 +16,7 @@
 
-print __doc__
+print(__doc__)
 
 from time import time
 import logging
@@ -53,10 +53,10 @@
 target_names = lfw_people.target_names
 n_classes = target_names.shape[0]
 
-print "Total dataset size:"
-print "n_samples: %d" % n_samples
-print "n_features: %d" % n_features
-print "n_classes: %d" % n_classes
+print("Total dataset size:")
+print("n_samples: {0}".format(n_samples))
+print("n_features: {0}".format(n_features))
+print("n_classes: {0}".format(n_classes))
 
 
 ###############################################################################
@@ -68,24 +68,24 @@
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150
 
-print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
+print("Extracting the top {0} eigenfaces from {1} faces".format(n_components, X_train.shape[0]))
 t0 = time()
 pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
-print "done in %0.3fs" % (time() - t0)
+print("done in {0:.3f}s".format(time() - t0))
 
 eigenfaces = pca.components_.reshape((n_components, h, w))
 
-print "Projecting the input data on the eigenfaces orthonormal basis"
+print("Projecting the input data on the eigenfaces orthonormal basis")
 t0 = time()
 X_train_pca = pca.transform(X_train)
 X_test_pca = pca.transform(X_test)
-print "done in %0.3fs" % (time() - t0)
+print("done in {0:.3f}s".format(time() - t0)) ############################################################################### # Train a SVM classification model -print "Fitting the classifier to the training set" +print( "Fitting the classifier to the training set") t0 = time() param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], @@ -94,21 +94,21 @@ # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto' clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) clf = clf.fit(X_train_pca, y_train) -print "done in %0.3fs" % (time() - t0) -print "Best estimator found by grid search:" -print clf.best_estimator_ +print("done in {0:.3f}s".format(time() - t0)) +print( "Best estimator found by grid search:") +print( clf.best_estimator_) ############################################################################### # Quantitative evaluation of the model quality on the test set -print "Predicting the people names on the testing set" +print( "Predicting the people names on the testing set") t0 = time() y_pred = clf.predict(X_test_pca) -print "done in %0.3fs" % (time() - t0) +print("done in {0:.3f}s".format(time() - t0)) -print classification_report(y_test, y_pred, target_names=target_names) -print confusion_matrix(y_test, y_pred, labels=range(n_classes)) +print( classification_report(y_test, y_pred, target_names=target_names)) +print( confusion_matrix(y_test, y_pred, labels=range(n_classes))) ############################################################################### diff --git a/regression/finance_regression.py b/regression/finance_regression.py index efa10637a1f..3842ba3739f 100644 --- a/regression/finance_regression.py +++ b/regression/finance_regression.py @@ -29,16 +29,19 @@ from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42) train_color = "b" -test_color = "b" - - +test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. 
diff --git a/regression/finance_regression.py b/regression/finance_regression.py
index efa10637a1f..3842ba3739f 100644
--- a/regression/finance_regression.py
+++ b/regression/finance_regression.py
@@ -29,16 +29,19 @@
 from sklearn.cross_validation import train_test_split
 feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
 train_color = "b"
-test_color = "b"
-
-
+test_color = "r"
 
 ### Your regression goes here!
 ### Please name it reg, so that the plotting code below picks it up and
 ### plots it correctly. Don't forget to change the test_color above from "b" to
 ### "r" to differentiate training points from test points.
-
-
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression().fit(feature_train, target_train)
+print("Regression output:")
+print("Slope:", reg.coef_)
+print("Intercept:", reg.intercept_)
+print("Score for training:", reg.score(feature_train, target_train))
+print("Score for testing:", reg.score(feature_test, target_test))
diff --git a/requirements.txt b/requirements.txt
index 1d4ac04c20e..e278210faad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 nltk==3.2.1
-numpy==1.11.2
+numpy==1.13.3
 scikit-learn==0.18
-scipy==0.18.1
+scipy==0.19.1
diff --git a/svm/svm_author_id.py b/svm/svm_author_id.py
index fda3f7fdb28..208b4adf01d 100644
--- a/svm/svm_author_id.py
+++ b/svm/svm_author_id.py
@@ -24,7 +24,23 @@
 
 #########################################################
 ### your code goes here ###
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score
+from collections import Counter
+
+#clf = SVC(kernel='linear')
+clf = SVC(kernel='rbf', C=10000)
+
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training time:", round(time()-t0, 3), "s")
+
+t1 = time()
+pred = clf.predict(features_test)
+print("Prediction time:", round(time()-t1, 3), "s")
+
+print("Accuracy score:", accuracy_score(labels_test, pred))
+
+print("Prediction for element 10:", pred[10])
+print("Prediction for element 26:", pred[26])
+print("Prediction for element 50:", pred[50])
+
+c = Counter(pred)
+print("Number of predictions for Chris (1):", c[1])
 
 #########################################################
diff --git a/text_learning/vectorize_text.py b/text_learning/vectorize_text.py
index 629c6b0f317..38dd5fc0c9c 100644
--- a/text_learning/vectorize_text.py
+++ b/text_learning/vectorize_text.py
@@ -7,6 +7,8 @@
 sys.path.append( "../tools/" )
 from parse_out_email_text import parseOutText
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.corpus import stopwords
 
 """
     Starter code to process the emails from Sara and Chris to extract
@@ -44,22 +46,28 @@
         temp_counter += 1
         if temp_counter < 200:
             path = os.path.join('..', path[:-1])
-            print path
+            print(path)
             email = open(path, "r")
 
             ### use parseOutText to extract the text from the opened email
-
+            text = parseOutText(email)
 
             ### use str.replace() to remove any instances of the words
             ### ["sara", "shackleton", "chris", "germani"]
+            signature_words = ["sara", "shackleton", "chris", "germani"]
+            for word in signature_words:
+                text = text.replace(word, "")
 
             ### append the text to word_data
-
+            word_data.append(text)
 
             ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
-
+            if name == "sara":
+                from_data.append(0)
+            else:
+                from_data.append(1)
 
             email.close()
 
-print "emails processed"
+print("emails processed")
 from_sara.close()
 from_chris.close()
@@ -71,5 +79,10 @@
 
 
 ### in Part 4, do TfIdf vectorization here
-
+vectorizer = TfidfVectorizer(stop_words="english")
+tfidf = vectorizer.fit_transform(word_data)
+feature_words = vectorizer.get_feature_names()
+print("Total words:", len(feature_words))
+print("The word at word[34597]:", feature_words[34597])
of Chris training emails:", sum(labels_train)) + print( "no. of Sara training emails:", len(labels_train)-sum(labels_train)) return features_train_transformed, features_test_transformed, labels_train, labels_test diff --git a/tools/feature_format.py b/tools/feature_format.py index 7ca78ac291a..cf948d35506 100644 --- a/tools/feature_format.py +++ b/tools/feature_format.py @@ -67,7 +67,7 @@ def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True try: dictionary[key][feature] except KeyError: - print "error: key ", feature, " not present" + print( "error: key ", feature, " not present") return value = dictionary[key][feature] if value=="NaN" and remove_NaN: diff --git a/tools/parse_out_email_text.py b/tools/parse_out_email_text.py index 43725b22d10..cea649a9f92 100644 --- a/tools/parse_out_email_text.py +++ b/tools/parse_out_email_text.py @@ -45,7 +45,7 @@ def parseOutText(f): def main(): ff = open("../text_learning/test_email.txt", "r") text = parseOutText(ff) - print text + print( text) diff --git a/tools/startup.py b/tools/startup.py index 4638e0d115e..bc77280ab4d 100644 --- a/tools/startup.py +++ b/tools/startup.py @@ -1,47 +1,48 @@ #!/usr/bin/python -print -print "checking for nltk" +print() +print( "checking for nltk") try: import nltk except ImportError: - print "you should install nltk before continuing" + print( "you should install nltk before continuing") -print "checking for numpy" +print( "checking for numpy") try: import numpy except ImportError: - print "you should install numpy before continuing" + print( "you should install numpy before continuing") -print "checking for scipy" +print( "checking for scipy") try: import scipy except: - print "you should install scipy before continuing" + print( "you should install scipy before continuing") -print "checking for sklearn" +print( "checking for sklearn") try: import sklearn except: - print "you should install sklearn before continuing" + print( "you should install sklearn before continuing") -print -print "downloading the Enron dataset (this may take a while)" -print "to check on progress, you can cd up one level, then execute " -print "Enron dataset should be last item on the list, along with its current size" -print "download will complete at about 423 MB" +print() +print( "downloading the Enron dataset (this may take a while)") +print( "to check on progress, you can cd up one level, then execute ") +print( "Enron dataset should be last item on the list, along with its current size") +print( "download will complete at about 423 MB") import urllib url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz" -urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz") -print "download complete!" +#old -> urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz") +urllib.request.urlretrieve(url, filename="../enron_mail_20150507.tgz") +print( "download complete!") -print -print "unzipping Enron dataset (this may take a while)" +print() +print( "unzipping Enron dataset (this may take a while)") import tarfile import os os.chdir("..") tfile = tarfile.open("enron_mail_20150507.tar.gz", "r:gz") tfile.extractall(".") -print "you're ready to go!" 
+print( "you're ready to go!") diff --git a/validation/validate_poi.py b/validation/validate_poi.py index 03537a5cc07..c0fafa62bcc 100644 --- a/validation/validate_poi.py +++ b/validation/validate_poi.py @@ -14,6 +14,9 @@ import sys sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit +from sklearn.tree import DecisionTreeClassifier +from sklearn.metrics import accuracy_score +from sklearn.cross_validation import train_test_split data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") ) @@ -25,8 +28,19 @@ data = featureFormat(data_dict, features_list) labels, features = targetFeatureSplit(data) +features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30, random_state=42) ### it's all yours from here forward! +# create DecisionTree Classifier +clf = DecisionTreeClassifier() +# Fit/train it +clf.fit(features_train, labels_train) + +# predict +pred = clf.predict(features_test) + +# print +print( "Accuracy:", accuracy_score(labels_test, pred))