run the model on 36 features #3

Draft · wants to merge 1 commit into main
Binary file modified .DS_Store
Binary file not shown.
26 changes: 12 additions & 14 deletions dividend_policy_predictor.py
@@ -166,9 +166,6 @@ def rank_columns_by_correlation(df, threshold=0.9):
plt.xticks(rotation=45, ha='right', rotation_mode="anchor")
plt.show()

-# ============================================================================
-
-
# Now let's remove the features one by one from the least important one
X_train_temp = X_train_oversampled.copy()
X_validate_temp = X_validate_transformed.copy()
@@ -221,6 +218,7 @@ def rank_columns_by_correlation(df, threshold=0.9):
with open('result_df.pkl', 'rb') as file:
    result_df = pickle.load(file)

+# ============================================================================
# Model Selection

X_train_oversampled.info()
@@ -242,7 +240,7 @@ def rank_columns_by_correlation(df, threshold=0.9):

# Create and save model
best_model_lr = LogisticRegression(**best_params_lr, solver='liblinear', n_jobs=-1)
-with open('best_models_lr.pkl', 'wb') as file:
+with open('storage_files/best_models_lr.pkl', 'wb') as file:
    pickle.dump(best_model_lr, file)


@@ -278,7 +276,7 @@ def objective_function(trial):

# Create and save model
best_model_lr = LogisticRegression(**best_params_lr, solver='liblinear', n_jobs=-1)
-with open('best_models_lr.pkl', 'wb') as file:
+with open('storage_files/best_models_lr.pkl', 'wb') as file:
    pickle.dump(best_model_lr, file)

# Decision Tree
@@ -312,7 +310,7 @@ def objective_function(trial):

# Create and save model
best_model_dt = DecisionTreeClassifier(**best_params_dt)
-with open('best_models_dt.pkl', 'wb') as file:
+with open('storage_files/best_models_dt.pkl', 'wb') as file:
    pickle.dump(best_model_dt, file)

# KNN
@@ -347,7 +345,7 @@ def objective_function(trial):

# Create and save model
best_model_knn = KNeighborsClassifier(**best_params_knn)
-with open('best_models_knn.pkl', 'wb') as file:
+with open('storage_files/best_models_knn.pkl', 'wb') as file:
    pickle.dump(best_model_knn, file)

# Random Forest
@@ -383,7 +381,7 @@ def objective_function(trial):

# Create and save model
best_model_rf = RandomForestClassifier(**best_params_rf, n_jobs=-1)
-with open('best_models_rf.pkl', 'wb') as file:
+with open('storage_files/best_models_rf.pkl', 'wb') as file:
    pickle.dump(best_model_rf, file)

# XgBoost
@@ -443,7 +441,7 @@ def objective_function(trial):
print("Best ROC-AUC Score: ", study_xgb.best_value)

best_model_xgb = XGBClassifier(**best_params_xgb, use_label_encoder=False, n_jobs=-1)
-with open('best_models_xgb.pkl', 'wb') as file:
+with open('storage_files/best_models_xgb.pkl', 'wb') as file:
    pickle.dump(best_model_xgb, file)

# Label encode categorical features with many categories
@@ -457,15 +455,15 @@ def objective_function(trial):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Model selection - Compare Performance
-with open('best_models_lr.pkl', 'rb') as file:
+with open('storage_files/best_models_lr.pkl', 'rb') as file:
    best_model_lr = pickle.load(file)
-with open('best_models_dt.pkl', 'rb') as file:
+with open('storage_files/best_models_dt.pkl', 'rb') as file:
    best_model_dt = pickle.load(file)
-with open('best_models_knn.pkl', 'rb') as file:
+with open('storage_files/best_models_knn.pkl', 'rb') as file:
    best_model_knn = pickle.load(file)
-with open('best_models_rf.pkl', 'rb') as file:
+with open('storage_files/best_models_rf.pkl', 'rb') as file:
    best_model_rf = pickle.load(file)
-with open('best_models_xgb.pkl', 'rb') as file:
+with open('storage_files/best_models_xgb.pkl', 'rb') as file:
    best_model_xgb = pickle.load(file)

print("Testing Performances...Please wait")
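The path changes above touch five copies of the same Optuna tune-then-pickle pattern, one per classifier. For reference, here is a minimal runnable sketch of that pattern; synthetic data stands in for X_train_oversampled / y_train_oversampled, and the one-parameter search space is illustrative rather than the script's actual one.

```python
# Sketch of the tune-then-pickle pattern the blocks above repeat per model.
# Assumptions: synthetic data replaces X_train_oversampled /
# y_train_oversampled, and the search space is illustrative.
import os
import pickle
import optuna
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X_train_oversampled, y_train_oversampled = make_classification(
    n_samples=500, n_features=20, random_state=0)

def objective_function(trial):
    # Sample hyperparameters, score them with cross-validated ROC-AUC.
    params = {'C': trial.suggest_float('C', 1e-3, 1e2, log=True)}
    model = LogisticRegression(**params, solver='liblinear')
    return cross_val_score(model, X_train_oversampled, y_train_oversampled,
                           scoring='roc_auc', cv=5).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective_function, n_trials=50)
best_params_lr = study.best_params
print("Best ROC-AUC Score: ", study.best_value)

# Rebuild the best model and pickle it under storage_files/, matching the
# paths this commit introduces.
os.makedirs('storage_files', exist_ok=True)
best_model_lr = LogisticRegression(**best_params_lr, solver='liblinear', n_jobs=-1)
with open('storage_files/best_models_lr.pkl', 'wb') as file:
    pickle.dump(best_model_lr, file)
```

Writing every artifact under storage_files/ keeps the repository root clean, which appears to be the point of the path changes in this commit.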
75 changes: 39 additions & 36 deletions feature_engineering.py
@@ -8,17 +8,6 @@
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, make_scorer
import sklearn.metrics as skm
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.svm import SVC
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
-from sklearn.compose import ColumnTransformer
-from sklearn.model_selection import cross_val_score, GridSearchCV
-import pickle
-import optuna
-from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

@@ -36,24 +25,33 @@
X_validate_temp = X_validate_transformed.copy()

# Initialize the result dataframe
-result_df = pd.DataFrame(columns=['Features_Removed', 'ROC_Score'])
+result_df = pd.DataFrame(columns=['Features_Removed', 'ROC_Score', 'f1',
+                                  'accuracy', 'precision', 'recall'])

# First, evaluate performance using all features
randomForestModel = RandomForestClassifier(max_features=None)
print('Random Forest fitting started.')
randomForestModel.fit(X_train_temp, y_train_oversampled)
# Predict probabilities on test data
y_pred_probs = randomForestModel.predict_proba(X_validate_temp)[:, 1]
y_pred = randomForestModel.predict(X_validate_temp)
# Different metrics of classification
roc_score = skm.roc_auc_score(y_test, y_pred_probs)
# f1_score = skm.f1_score(y_test, y_pred_probs, average=None)


f1_score = skm.f1_score(y_test, y_pred, average=None)[0]
accuracy = skm.accuracy_score(y_test, y_pred)
precision = skm.precision_score(y_test, y_pred, average=None)[0]
recall = skm.recall_score(y_test, y_pred, average=None)[0]
# Append the result to the result dataframe
-roc_dict = {'Features_Removed': 'None', 'no_features_used': len(X_train_temp.columns), 'ROC_Score': roc_score}
+roc_dict = {
+    'Features_Removed': 'None',
+    'no_features_used': len(X_train_temp.columns),
+    'ROC_Score': roc_score, 'f1': f1_score, 'accuracy': accuracy,
+    'precision': precision, 'recall': recall}
result_df = pd.DataFrame([roc_dict])
total_num = len(X_train_temp.columns)
print(f"Feature_Removed: None, Number of features used: {len(X_train_temp.columns)}, ROC_AUC_Score: {roc_score}")
print(f"Feature_Removed: None, Number of features used: {len(X_train_temp.columns)}, \
ROC_AUC_Score: {roc_score}, F1: {f1_score}, Accuracy: {accuracy}, Precision: {precision} \
Recall: {recall}")
result_df.to_csv("storage_files/result_df.csv",index=False)

# Sort importance_table by Importance in ascending order to start with the least important
@@ -71,24 +69,36 @@
# Compute ROC score
roc_score = skm.roc_auc_score(y_test, y_pred_probs)
# Append the result to the result dataframe
-roc_dict = {'Features_Removed': row['Feature'], 'no_features_used': len(X_train_temp.columns), 'ROC_Score': roc_score}
+roc_dict = {
+    'Features_Removed': row['Feature'],
+    'no_features_used': len(X_train_temp.columns),
+    'ROC_Score': roc_score, 'f1': f1_score, 'accuracy': accuracy,
+    'precision': precision, 'recall': recall}
+pd.DataFrame([roc_dict]).to_csv("storage_files/result_df.csv",mode='a',index=False,header=False)
result_df = pd.concat([result_df, pd.DataFrame([roc_dict])])
-print(
-    f"Feature_Removed: {row['Feature']}, Number of features used: {len(X_train_temp.columns)}, ROC_AUC_Score: {roc_score}")
+print(f"Feature_Removed: {row['Feature']}, Number of features used: {len(X_train_temp.columns)}, \
+ROC_AUC_Score: {roc_score}, F1: {f1_score}, Accuracy: {accuracy}, Precision: {precision} \
+Recall: {recall}")
# If only one feature left, break the loop
if X_train_temp.shape[1] == 1:
    break

-if __name__ == '__main__':
-    # Plot a bar chart to visualize ROC scores
+def plot(metric: str):
+    """
+    metric: 'ROC_Score', 'f1', 'accuracy', 'precision' or 'recall'.
+    """
    plt.figure(figsize=(20, 10))
-    sns.barplot(data=result_df, x="no_features_used", y="ROC_Score")
-    plt.title("ROC scores")
+    sns.barplot(data=result_df, x="no_features_used", y=metric)
+    plt.title(metric)
    plt.subplots_adjust(bottom=0.2, top=0.95)
    plt.xticks(rotation=45, ha='right', rotation_mode="anchor")
    plt.show()

+if __name__ == '__main__':
+    for metric in ['ROC_Score', 'f1', 'accuracy', 'precision', 'recall']:
+        plot(metric)


# Find out the one with max roc_score.
max_roc_score = result_df.iloc[0]['ROC_Score']
max_inds = [0]
@@ -100,15 +110,8 @@
max_inds.append(index)
max_no_features = [total_num-i for i in max_inds]
print(f'Conclusion: The best model is to use {max_no_features} features.')
-
-# Save the results
-# with open('storage_files/result_df.pkl', 'wb') as file:
-#     pickle.dump(result_df, file)
-# with open('storage_files/importance_table_sorted.pkl', 'wb') as file:
-#     pickle.dump(importance_table_sorted, file)
-
-# Load the results
-# with open('storage_files/result_df.pkl', 'rb') as file:
-#     result_df = pickle.load(file)
-
-
+f = open('storage_files/max_no_features.txt','w')
+for i in max_no_features:
+    f.write(str(i))
+    f.write('\n')
+f.close()
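For context on the new loop, here is a self-contained sketch of the same backward-elimination scheme on synthetic data (the f0..f9 feature names are illustrative). One deliberate difference: the diff's loop carries f1_score, accuracy, precision, and recall over from the all-features run, whereas this sketch recomputes every metric on each iteration, so each row of result_df reflects the current feature subset.

```python
# Sketch: drop the least important feature each round and re-score.
# Synthetic data and feature names are illustrative, not the project's.
import pandas as pd
import sklearn.metrics as skm
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)])
X_train, X_validate, y_train, y_test = train_test_split(X, y, random_state=0)

rows = []
while X_train.shape[1] >= 1:
    model = RandomForestClassifier(max_features=None).fit(X_train, y_train)
    y_pred_probs = model.predict_proba(X_validate)[:, 1]
    y_pred = model.predict(X_validate)
    # Recompute every metric for the current feature subset.
    rows.append({'no_features_used': X_train.shape[1],
                 'ROC_Score': skm.roc_auc_score(y_test, y_pred_probs),
                 'f1': skm.f1_score(y_test, y_pred),
                 'accuracy': skm.accuracy_score(y_test, y_pred),
                 'precision': skm.precision_score(y_test, y_pred),
                 'recall': skm.recall_score(y_test, y_pred)})
    if X_train.shape[1] == 1:
        break
    # Least important feature according to the freshly fitted forest.
    least = X_train.columns[model.feature_importances_.argmin()]
    X_train = X_train.drop(columns=least)
    X_validate = X_validate.drop(columns=least)

result_df = pd.DataFrame(rows)
print(result_df)
```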
Binary file removed importance_table_sorted.pkl
Binary file not shown.
1 change: 1 addition & 0 deletions model_training_top_36_features.ipynb

Large diffs are not rendered by default.
