# -*- coding: utf-8 -*-
"""
Created on Tue Dec 29 12:22:05 2020
@author: Albert
This file was created to show exploratory data analysis for BCG's virtual experience. Thank you.
"""
import pandas as pd
import os
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
from scipy import stats
sns.set(color_codes=True)
print(os.getcwd())  # confirm the current working directory before loading data
# EDA and Data Cleaning Outline
# 1. Loading data through pandas
#    A. Loading three dataframes
# 2. Data cleaning
#    A. Dropping duplicates
#    B. Removing columns
#    C. Missing data / empty values
#    D. Combining three dataframes
# 3. General statistics of a dataframe
#    A. Data types
#    B. Data statistics
# 4. Format / convert data
#    A. Dates
# 5. Data visualization
#    A. Churn percentage
#    B. Detecting outliers using boxplots
#    C. Using a correlation matrix
#    D. Final thoughts
# 6. Saving the data as an Excel file
# Feature Engineering
# 7. Detecting outliers
#    A. Standard deviation
#    B. Percentiles
# 8. Binning
#    A. Numerical
#    B. Categorical
# 9. Scaling
#    A. Normalization
# 10. Date extraction
# 11. Feature selection
#
""" ---------------------------- SUB TASK ONE CLEAN THE DATA ---------------------------- """
# 1. Import data into dataframes
df_train = pd.read_csv('C:/Users/Albert/BCG_data/ml_case_training_data.csv')
df_hist = pd.read_csv('C:/Users/Albert/BCG_data/ml_case_training_hist_data.csv')
df_churn = pd.read_csv('C:/Users/Albert/BCG_data/ml_case_training_output.csv')
# Check missing values per column (run interactively to inspect the output)
df_train.isnull().sum()
df_hist.isnull().sum()
# 2. Data Cleaning
""" A. DROP DUPLICATES - remove duplicate id's """
df_train = df_train.drop_duplicates(subset=['id'])
df_hist = df_hist.drop_duplicates(subset=['id'])
""" point out columns with large numbers of missing values in the dataframe df_missing """
df_missing = df_train.columns[df_train.isnull().any()]
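# One way to justify the column removals below is to inspect the share of missing
# values per column first:
missing_pct = df_train.isnull().mean() * 100
print(missing_pct[missing_pct > 0].sort_values(ascending=False))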
""" B. remove 'activity_new', 'channel_sales', 'campaign_disc_ele','date_first_activ', 'forecast_base_bill_ele',
'forecast_base_bill_year', 'forecast_bill_12m', 'forecast_cons', 'forecast_discount_energy',
'forecast_price_energy_p1', 'forecast_price_energy_p2','forecast_price_pow_p1' columns from df """
df_train = df_train.drop(columns = ['activity_new', 'channel_sales', 'campaign_disc_ele','date_first_activ', 'forecast_base_bill_ele',
'forecast_base_bill_year', 'forecast_bill_12m', 'forecast_cons', 'forecast_discount_energy',
'forecast_price_energy_p1', 'forecast_price_energy_p2','forecast_price_pow_p1'])
""" C. Remove columns with missing values are listed in the dataframe df_missing """
df_train = df_train.dropna(subset=['date_end', 'pow_max', 'margin_gross_pow_ele', 'margin_net_pow_ele', 'net_margin'])
df_hist = df_hist.dropna(subset=['price_p1_var', 'price_p2_var', 'price_p3_var', 'price_p1_fix', 'price_p2_fix', 'price_p3_fix'])
""" D. Combining the three dataframes train hist and churn into one df """
"""merge output dataframe with case_training by id to include churn values in case_training df"""
df = pd.merge(df_train, df_hist, on='id')
df = pd.merge(df, df_churn, on='id')
df.shape
""" ---------------------------- SUB TASK TWO EXPLORATORY DATA ANALYSIS ---------------------------- """
"""CHECK IF CURRENT MISSING VALUES MAKE SENSE"""
df_missing = df.columns[df.isnull().any()]
df[df_missing].isna().sum()
# 3. General Statistics
""" A. Data types in the dataframe"""
df.dtypes
""" B. BASIC STATISTICS of the dataframe"""
df.describe()
# 4. Data Format
""" A. DATA TYPE CONVERSION """
# Date columns are stored as objects (strings) and could be converted to datetime format,
# but since this analysis is based on price and churn, we will not be doing any data type conversion.
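# For reference, a minimal sketch of the conversion we are skipping (the column names
# here are assumed from the dataset; adjust to whatever date columns remain in df):
#for col in ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']:
#    df[col] = pd.to_datetime(df[col], errors='coerce')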
# 5. Data Visualization
""" A. CHURN DATA VISUALIZATION"""
# Calculate churn percentage (it's about 10%)
churn = df[["id","churn"]]
churn_total = churn.groupby(churn["churn"]).count()
churn_percentage = churn_total/churn_total.sum()*100
def plot_stacked_bars(dataframe, title_, size_=(18, 10), rot_=0, legend_="upper right"):
    """
    Plot stacked bars with annotations
    """
    ax = dataframe.plot(kind="bar",
                        stacked=True,
                        figsize=size_,
                        rot=rot_,
                        title=title_)
    # Annotate bars
    annotate_stacked_bars(ax, textsize=14)
    # Rename legend
    plt.legend(["Retention", "Churn"], loc=legend_)
    # Labels
    plt.ylabel("Company base (%)")
    plt.show()

def annotate_stacked_bars(ax, pad=0.99, colour="white", textsize=13):
    """
    Add value annotations to the bars
    """
    # Iterate over the plotted rectangles/bars
    for p in ax.patches:
        # Calculate annotation value
        value = str(round(p.get_height(), 1))
        # If the value is 0, do not annotate
        if value == '0.0':
            continue
        ax.annotate(value,
                    ((p.get_x() + p.get_width()/2)*pad - 0.05, (p.get_y() + p.get_height()/2)*pad),
                    color=colour,
                    size=textsize,
                    )
plot_stacked_bars(churn_percentage.transpose(),"Churning status", (5,5), legend_="lower right")
""" B. DETECTING OUTLIERS USING BOX PLOTS"""
#sns.boxplot(x=df['cons_gas_12m'])
#sns.boxplot(x=df['forecast_cons_year'])
#sns.boxplot(x=df['nb_prod_act'])
#sns.boxplot(x=df['num_years_antig'])
sns.boxplot(x=df['cons_12m']) #has significant outlier in boxplot
sns.boxplot(x=df['net_margin']) #has significant outlier in boxplot
sns.boxplot(x=df['margin_gross_pow_ele'])
sns.boxplot(x=df['cons_last_month']) #has significant outlier in boxplot
sns.boxplot(x=df['margin_net_pow_ele'])
sns.boxplot(x=df['price_p1_var']) #has significant outlier in boxplot
sns.boxplot(x=df['price_p2_var'])
sns.boxplot(x=df['price_p3_var'])
sns.boxplot(x=df['price_p1_fix']) #has significant outlier in boxplot
sns.boxplot(x=df['price_p2_fix'])
sns.boxplot(x=df['price_p3_fix'])
sns.boxplot(x=df['churn'])
""" C. USING A CORRELATION MATRIX"""
# Creating a correlation heatmap using cons_12m, cons_last_month, and churn as rows and columns
df[['cons_12m','cons_last_month','churn']].corr()
sns.heatmap(df[['cons_12m','cons_last_month','churn']].corr(), annot=True, cmap = 'Reds')
plt.show()
# Creating a correlation heatmap using net_margin, margin_gross_pow_ele, margin_net_pow_ele, and churn as rows and columns
df[['net_margin','margin_gross_pow_ele', 'margin_net_pow_ele', 'churn']].corr()
sns.heatmap(df[['net_margin','margin_gross_pow_ele','margin_net_pow_ele', 'churn']].corr(), annot=True, cmap = 'Reds')
plt.show()
# Creating a correlation heatmap using price_p1_var, price_p2_var, price_p3_var, and churn as rows and columns
df[['price_p1_var','price_p2_var', 'price_p3_var', 'churn']].corr()
sns.heatmap(df[['price_p1_var','price_p2_var','price_p3_var', 'churn']].corr(), annot=True, cmap = 'Reds')
plt.show()
# Creating a correlation heatmap using price_p1_fix, price_p2_fix, price_p3_fix, and churn as rows and columns
df[['price_p1_fix','price_p2_fix', 'price_p3_fix', 'churn']].corr()
sns.heatmap(df[['price_p1_fix','price_p2_fix','price_p3_fix', 'churn']].corr(), annot=True, cmap = 'Reds')
plt.show()
# We get low correlations for the margin and consumption variables with respect to churn.
# Use a point-biserial correlation to double-check and to inspect p-values as well.
numeric_cols = ['cons_12m', 'cons_last_month', 'net_margin', 'margin_gross_pow_ele',
                'margin_net_pow_ele', 'imp_cons', 'price_p1_var', 'price_p2_var',
                'price_p3_var', 'price_p1_fix', 'price_p2_fix', 'price_p3_fix']
for col in numeric_cols:
    print(col, stats.pointbiserialr(df[col], df['churn']))
""" D. Final Thoughts """
# Based on the p-values from the point-biserial correlations above, cons_12m, cons_last_month, net_margin,
# margin_gross_pow_ele, margin_net_pow_ele, price_p2_var, price_p3_var, price_p2_fix, and price_p3_fix
# are statistically significant. The correlation matrix also shows a higher correlation between price
# and churn in periods 2 and 3 than for any other variable. I would like to further explore the effects
# of pricing in periods two and three on customer churn.
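# A minimal sketch of the significance check described above, assuming the conventional
# 0.05 cutoff on the point-biserial p-value (index [1] of the returned result):
significant = [col for col in numeric_cols
               if stats.pointbiserialr(df[col], df['churn'])[1] < 0.05]
print(significant)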
# 6. Saving the data as an Excel file
""" ---------------------------- Save final dataframe df to file directory for submission ---------------------------- """
#df.to_excel('cleaned_data.xlsx')
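# Note: to_excel needs an Excel writer engine such as openpyxl installed; if Excel is
# not strictly required, CSV avoids that dependency:
#df.to_csv('cleaned_data.csv', index=False)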
# 7. Detecting Outliers
""" A. Using Standard Deviation """
#Dropping the outlier rows using standard deviation (note that price_p1_var and price_p1_fix were ommited)
factor = 3
upper_lim = df['cons_12m'].mean () + df['cons_12m'].std () * factor
lower_lim = df['cons_12m'].mean () - df['cons_12m'].std () * factor
df = df[(df['cons_12m'] < upper_lim) & (df['cons_12m'] > lower_lim)]
#Dropping the outlier rows using standard deviation
factor = 3
upper_lim = df['net_margin'].mean () + df['net_margin'].std () * factor
lower_lim = df['net_margin'].mean () - df['net_margin'].std () * factor
df = df[(df['net_margin'] < upper_lim) & (df['net_margin'] > lower_lim)]
#Dropping the outlier rows using standard deviation
factor = 3
upper_lim = df['cons_last_month'].mean () + df['cons_last_month'].std () * factor
lower_lim = df['cons_last_month'].mean () - df['cons_last_month'].std () * factor
df = df[(df['net_margin'] < upper_lim) & (df['net_margin'] > lower_lim)]
# Using this method we can see a reduction in total rows from 16042 to 15462 (96% of data remains)
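""" B. Using Percentiles """
# A sketch of the percentile-based alternative listed in the outline above, trimming to
# the 1st-99th percentile band (the cutoffs are a judgment call; kept commented out so
# the row counts above still refer to the standard-deviation method):
#upper_lim = df['cons_12m'].quantile(0.99)
#lower_lim = df['cons_12m'].quantile(0.01)
#df = df[(df['cons_12m'] < upper_lim) & (df['cons_12m'] > lower_lim)]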