run the model on 36 features #3

Draft · wants to merge 1 commit into main
Binary file modified .DS_Store
Binary file not shown.
26 changes: 12 additions & 14 deletions dividend_policy_predictor.py
@@ -166,9 +166,6 @@ def rank_columns_by_correlation(df, threshold=0.9):
plt.xticks(rotation=45, ha='right', rotation_mode="anchor")
plt.show()

-# ============================================================================
-
-
# Now let's remove the features one by one from the least important one
X_train_temp = X_train_oversampled.copy()
X_validate_temp = X_validate_transformed.copy()
@@ -221,6 +218,7 @@ def rank_columns_by_correlation(df, threshold=0.9):
with open('result_df.pkl', 'rb') as file:
    result_df = pickle.load(file)

+# ============================================================================
# Model Selection

X_train_oversampled.info()
@@ -242,7 +240,7 @@ def rank_columns_by_correlation(df, threshold=0.9):

# Create and save model
best_model_lr = LogisticRegression(**best_params_lr, solver='liblinear', n_jobs=-1)
-with open('best_models_lr.pkl', 'wb') as file:
+with open('storage_files/best_models_lr.pkl', 'wb') as file:
    pickle.dump(best_model_lr, file)


@@ -278,7 +276,7 @@ def objective_function(trial):

# Create and save model
best_model_lr = LogisticRegression(**best_params_lr, solver='liblinear', n_jobs=-1)
-with open('best_models_lr.pkl', 'wb') as file:
+with open('storage_files/best_models_lr.pkl', 'wb') as file:
    pickle.dump(best_model_lr, file)

# Decision Tree
@@ -312,7 +310,7 @@ def objective_function(trial):

# Create and save model
best_model_dt = DecisionTreeClassifier(**best_params_dt)
-with open('best_models_dt.pkl', 'wb') as file:
+with open('storage_files/best_models_dt.pkl', 'wb') as file:
    pickle.dump(best_model_dt, file)

# KNN
@@ -347,7 +345,7 @@ def objective_function(trial):

# Create and save model
best_model_knn = KNeighborsClassifier(**best_params_knn)
-with open('best_models_knn.pkl', 'wb') as file:
+with open('storage_files/best_models_knn.pkl', 'wb') as file:
    pickle.dump(best_model_knn, file)

# Random Forest
@@ -383,7 +381,7 @@ def objective_function(trial):

# Create and save model
best_model_rf = RandomForestClassifier(**best_params_rf, n_jobs=-1)
-with open('best_models_rf.pkl', 'wb') as file:
+with open('storage_files/best_models_rf.pkl', 'wb') as file:
    pickle.dump(best_model_rf, file)

# XgBoost
@@ -443,7 +441,7 @@ def objective_function(trial):
print("Best ROC-AUC Score: ", study_xgb.best_value)

best_model_xgb = XGBClassifier(**best_params_xgb, use_label_encoder=False, n_jobs=-1)
-with open('best_models_xgb.pkl', 'wb') as file:
+with open('storage_files/best_models_xgb.pkl', 'wb') as file:
    pickle.dump(best_model_xgb, file)

# Label encode categorical features with many categories
@@ -457,15 +455,15 @@ def objective_function(trial):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Model selection - Compare Performance
-with open('best_models_lr.pkl', 'rb') as file:
+with open('storage_files/best_models_lr.pkl', 'rb') as file:
    best_model_lr = pickle.load(file)
-with open('best_models_dt.pkl', 'rb') as file:
+with open('storage_files/best_models_dt.pkl', 'rb') as file:
    best_model_dt = pickle.load(file)
-with open('best_models_knn.pkl', 'rb') as file:
+with open('storage_files/best_models_knn.pkl', 'rb') as file:
    best_model_knn = pickle.load(file)
-with open('best_models_rf.pkl', 'rb') as file:
+with open('storage_files/best_models_rf.pkl', 'rb') as file:
    best_model_rf = pickle.load(file)
-with open('best_models_xgb.pkl', 'rb') as file:
+with open('storage_files/best_models_xgb.pkl', 'rb') as file:
    best_model_xgb = pickle.load(file)

print("Testing Performances...Please wait")
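The path changes above touch five copies of the same Optuna tune-then-pickle pattern, one per classifier. For reference, here is a minimal runnable sketch of that pattern; synthetic data stands in for X_train_oversampled / y_train_oversampled, and the one-parameter search space is illustrative rather than the script's actual one.

```python
# Sketch of the tune-then-pickle pattern the blocks above repeat per model.
# Assumptions: synthetic data replaces X_train_oversampled /
# y_train_oversampled, and the search space is illustrative.
import os
import pickle
import optuna
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X_train_oversampled, y_train_oversampled = make_classification(
    n_samples=500, n_features=20, random_state=0)

def objective_function(trial):
    # Sample hyperparameters, score them with cross-validated ROC-AUC.
    params = {'C': trial.suggest_float('C', 1e-3, 1e2, log=True)}
    model = LogisticRegression(**params, solver='liblinear')
    return cross_val_score(model, X_train_oversampled, y_train_oversampled,
                           scoring='roc_auc', cv=5).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective_function, n_trials=50)
best_params_lr = study.best_params
print("Best ROC-AUC Score: ", study.best_value)

# Rebuild the best model and pickle it under storage_files/, matching the
# paths this commit introduces.
os.makedirs('storage_files', exist_ok=True)
best_model_lr = LogisticRegression(**best_params_lr, solver='liblinear', n_jobs=-1)
with open('storage_files/best_models_lr.pkl', 'wb') as file:
    pickle.dump(best_model_lr, file)
```

Writing every artifact under storage_files/ keeps the repository root clean, which appears to be the point of the path changes in this commit.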
75 changes: 39 additions & 36 deletions feature_engineering.py
@@ -8,17 +8,6 @@
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, make_scorer
import sklearn.metrics as skm
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.svm import SVC
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
-from sklearn.compose import ColumnTransformer
-from sklearn.model_selection import cross_val_score, GridSearchCV
-import pickle
-import optuna
-from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

@@ -36,24 +25,33 @@
X_validate_temp = X_validate_transformed.copy()

# Initialize the result dataframe
-result_df = pd.DataFrame(columns=['Features_Removed', 'ROC_Score'])
+result_df = pd.DataFrame(columns=['Features_Removed', 'ROC_Score', 'f1',
+                                  'accuracy', 'precision', 'recall'])

# First, evaluate performance using all features
randomForestModel = RandomForestClassifier(max_features=None)
print('Random Forest fitting started.')
randomForestModel.fit(X_train_temp, y_train_oversampled)
# Predict probabilities on test data
y_pred_probs = randomForestModel.predict_proba(X_validate_temp)[:, 1]
y_pred = randomForestModel.predict(X_validate_temp)
# Different metrics of classification
roc_score = skm.roc_auc_score(y_test, y_pred_probs)
# f1_score = skm.f1_score(y_test, y_pred_probs, average=None)


f1_score = skm.f1_score(y_test, y_pred, average=None)[0]
accuracy = skm.accuracy_score(y_test, y_pred)
precision = skm.precision_score(y_test, y_pred, average=None)[0]
recall = skm.recall_score(y_test, y_pred, average=None)[0]
# Append the result to the result dataframe
-roc_dict = {'Features_Removed': 'None', 'no_features_used': len(X_train_temp.columns), 'ROC_Score': roc_score}
+roc_dict = {
+    'Features_Removed': 'None',
+    'no_features_used': len(X_train_temp.columns),
+    'ROC_Score': roc_score, 'f1': f1_score, 'accuracy': accuracy,
+    'precision': precision, 'recall': recall}
result_df = pd.DataFrame([roc_dict])
total_num = len(X_train_temp.columns)
print(f"Feature_Removed: None, Number of features used: {len(X_train_temp.columns)}, ROC_AUC_Score: {roc_score}")
print(f"Feature_Removed: None, Number of features used: {len(X_train_temp.columns)}, \
ROC_AUC_Score: {roc_score}, F1: {f1_score}, Accuracy: {accuracy}, Precision: {precision} \
Recall: {recall}")
result_df.to_csv("storage_files/result_df.csv",index=False)

# Sort importance_table by Importance in ascending order to start with the least important
@@ -71,24 +69,36 @@
# Compute ROC score
roc_score = skm.roc_auc_score(y_test, y_pred_probs)
# Append the result to the result dataframe
-roc_dict = {'Features_Removed': row['Feature'], 'no_features_used': len(X_train_temp.columns), 'ROC_Score': roc_score}
+roc_dict = {
+    'Features_Removed': row['Feature'],
+    'no_features_used': len(X_train_temp.columns),
+    'ROC_Score': roc_score, 'f1': f1_score, 'accuracy': accuracy,
+    'precision': precision, 'recall': recall}
+pd.DataFrame([roc_dict]).to_csv("storage_files/result_df.csv",mode='a',index=False,header=False)
result_df = pd.concat([result_df, pd.DataFrame([roc_dict])])
-print(
-    f"Feature_Removed: {row['Feature']}, Number of features used: {len(X_train_temp.columns)}, ROC_AUC_Score: {roc_score}")
+print(f"Feature_Removed: {row['Feature']}, Number of features used: {len(X_train_temp.columns)}, \
+ROC_AUC_Score: {roc_score}, F1: {f1_score}, Accuracy: {accuracy}, Precision: {precision} \
+Recall: {recall}")
# If only one feature left, break the loop
if X_train_temp.shape[1] == 1:
    break

-if __name__ == '__main__':
-    # Plot a bar chart to visualize ROC scores
+def plot(metric: str):
+    """
+    metric: 'ROC_Score', 'f1', 'accuracy', 'precision' or 'recall'.
+    """
    plt.figure(figsize=(20, 10))
-    sns.barplot(data=result_df, x="no_features_used", y="ROC_Score")
-    plt.title("ROC scores")
+    sns.barplot(data=result_df, x="no_features_used", y=metric)
+    plt.title(metric)
    plt.subplots_adjust(bottom=0.2, top=0.95)
    plt.xticks(rotation=45, ha='right', rotation_mode="anchor")
    plt.show()

+if __name__ == '__main__':
+    for metric in ['ROC_Score', 'f1', 'accuracy', 'precision', 'recall']:
+        plot(metric)


# Find out the one with max roc_score.
max_roc_score = result_df.iloc[0]['ROC_Score']
max_inds = [0]
@@ -100,15 +110,8 @@
max_inds.append(index)
max_no_features = [total_num-i for i in max_inds]
print(f'Conclusion: The best model is to use {max_no_features} features.')
-
-# Save the results
-# with open('storage_files/result_df.pkl', 'wb') as file:
-#     pickle.dump(result_df, file)
-# with open('storage_files/importance_table_sorted.pkl', 'wb') as file:
-#     pickle.dump(importance_table_sorted, file)
-
-# Load the results
-# with open('storage_files/result_df.pkl', 'rb') as file:
-#     result_df = pickle.load(file)
-
-
+f = open('storage_files/max_no_features.txt','w')
+for i in max_no_features:
+    f.write(str(i))
+    f.write('\n')
+f.close()
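For context on the new loop, here is a self-contained sketch of the same backward-elimination scheme on synthetic data (the f0..f9 feature names are illustrative). One deliberate difference: the diff's loop carries f1_score, accuracy, precision, and recall over from the all-features run, whereas this sketch recomputes every metric on each iteration, so each row of result_df reflects the current feature subset.

```python
# Sketch: drop the least important feature each round and re-score.
# Synthetic data and feature names are illustrative, not the project's.
import pandas as pd
import sklearn.metrics as skm
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)])
X_train, X_validate, y_train, y_test = train_test_split(X, y, random_state=0)

rows = []
while X_train.shape[1] >= 1:
    model = RandomForestClassifier(max_features=None).fit(X_train, y_train)
    y_pred_probs = model.predict_proba(X_validate)[:, 1]
    y_pred = model.predict(X_validate)
    # Recompute every metric for the current feature subset.
    rows.append({'no_features_used': X_train.shape[1],
                 'ROC_Score': skm.roc_auc_score(y_test, y_pred_probs),
                 'f1': skm.f1_score(y_test, y_pred),
                 'accuracy': skm.accuracy_score(y_test, y_pred),
                 'precision': skm.precision_score(y_test, y_pred),
                 'recall': skm.recall_score(y_test, y_pred)})
    if X_train.shape[1] == 1:
        break
    # Least important feature according to the freshly fitted forest.
    least = X_train.columns[model.feature_importances_.argmin()]
    X_train = X_train.drop(columns=least)
    X_validate = X_validate.drop(columns=least)

result_df = pd.DataFrame(rows)
print(result_df)
```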
Binary file removed importance_table_sorted.pkl
Binary file not shown.
1 change: 1 addition & 0 deletions model_training_top_36_features.ipynb

Large diffs are not rendered by default.
