diff --git a/Python/bike-sharing/Chapter5-interpretable_models.ipynb b/Python/bike-sharing/Chapter5-interpretable_models.ipynb new file mode 100644 index 00000000..1a0b43ad --- /dev/null +++ b/Python/bike-sharing/Chapter5-interpretable_models.ipynb @@ -0,0 +1,804 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 5: Interpretable Models" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "\n", + "from preprocess_bike_data import data_pipeline\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading in the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "df = data_pipeline()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping columns not used\n", + "\n", + "sub_features = ['season', 'holiday','workingday', 'weathersit']\n", + "df.drop(sub_features + ['yr', 'mnth', 'weekday'], inplace=True, axis=1)\n", + "\n", + "df_copy = df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "# Scaling the data\n", + "\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "scaling = {}\n", + "\n", + "for col in ['temp', 'hum', 'windspeed']:\n", + " sc = MinMaxScaler()\n", + " df[col] = sc.fit_transform(df[col].values.reshape(-1,1))\n", + " scaling[col] = sc" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + 
"text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df['temp'].hist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Establishing Features and Target" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "features = [col for col in df.columns if col != 'cnt']\n", + "target = 'cnt'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Linear Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "const 1497.148579\n", + "temp 4175.879086\n", + "hum -1689.932626\n", + "windspeed -1381.678357\n", + "days_since_2011 4.926432\n", + "season_SPRING 473.715303\n", + "season_SUMMER -287.387420\n", + "season_WINTER -425.602853\n", + "holiday_NO HOLIDAY 686.115442\n", + "workingday_WORKING DAY 124.920938\n", + "weathersit_MISTY -379.398530\n", + "weathersit_RAIN/SNOW/STORM -1901.539915\n", + "dtype: float64" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import statsmodels.api as sm\n", + "\n", + "X = sm.add_constant(df[features])\n", + "model = sm.OLS(df['cnt'], X)\n", + "results = model.fit()\n", + "results.params" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
OLS Regression Results
Dep. Variable: cnt R-squared: 0.794
Model: OLS Adj. R-squared: 0.790
Method: Least Squares F-statistic: 251.2
Date: Sat, 10 Dec 2022 Prob (F-statistic): 1.05e-237
Time: 22:38:09 Log-Likelihood: -5993.0
No. Observations: 731 AIC: 1.201e+04
Df Residuals: 719 BIC: 1.207e+04
Df Model: 11
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err t P>|t| [0.025 0.975]
const 1497.1486 324.587 4.612 0.000 859.897 2134.400
temp 4175.8791 265.667 15.718 0.000 3654.304 4697.454
hum -1689.9326 308.226 -5.483 0.000 -2295.063 -1084.803
windspeed -1381.6784 223.979 -6.169 0.000 -1821.409 -941.948
days_since_2011 4.9264 0.173 28.507 0.000 4.587 5.266
season_SPRING 473.7153 109.947 4.309 0.000 257.860 689.570
season_SUMMER -287.3874 134.225 -2.141 0.033 -550.906 -23.869
season_WINTER -425.6029 110.820 -3.840 0.000 -643.172 -208.034
holiday_NO HOLIDAY 686.1154 203.301 3.375 0.001 286.980 1085.251
workingday_WORKING DAY 124.9209 73.267 1.705 0.089 -18.921 268.763
weathersit_MISTY -379.3985 87.553 -4.333 0.000 -551.289 -207.508
weathersit_RAIN/SNOW/STORM -1901.5399 223.640 -8.503 0.000 -2340.605 -1462.475
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Omnibus: 91.525 Durbin-Watson: 0.911
Prob(Omnibus): 0.000 Jarque-Bera (JB): 194.706
Skew: -0.719 Prob(JB): 5.25e-43
Kurtosis: 5.079 Cond. No. 5.56e+03


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.56e+03. This might indicate that there are
strong multicollinearity or other numerical problems." + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: cnt R-squared: 0.794\n", + "Model: OLS Adj. R-squared: 0.790\n", + "Method: Least Squares F-statistic: 251.2\n", + "Date: Sat, 10 Dec 2022 Prob (F-statistic): 1.05e-237\n", + "Time: 22:38:09 Log-Likelihood: -5993.0\n", + "No. Observations: 731 AIC: 1.201e+04\n", + "Df Residuals: 719 BIC: 1.207e+04\n", + "Df Model: 11 \n", + "Covariance Type: nonrobust \n", + "==============================================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "----------------------------------------------------------------------------------------------\n", + "const 1497.1486 324.587 4.612 0.000 859.897 2134.400\n", + "temp 4175.8791 265.667 15.718 0.000 3654.304 4697.454\n", + "hum -1689.9326 308.226 -5.483 0.000 -2295.063 -1084.803\n", + "windspeed -1381.6784 223.979 -6.169 0.000 -1821.409 -941.948\n", + "days_since_2011 4.9264 0.173 28.507 0.000 4.587 5.266\n", + "season_SPRING 473.7153 109.947 4.309 0.000 257.860 689.570\n", + "season_SUMMER -287.3874 134.225 -2.141 0.033 -550.906 -23.869\n", + "season_WINTER -425.6029 110.820 -3.840 0.000 -643.172 -208.034\n", + "holiday_NO HOLIDAY 686.1154 203.301 3.375 0.001 286.980 1085.251\n", + "workingday_WORKING DAY 124.9209 73.267 1.705 0.089 -18.921 268.763\n", + "weathersit_MISTY -379.3985 87.553 -4.333 0.000 -551.289 -207.508\n", + "weathersit_RAIN/SNOW/STORM -1901.5399 223.640 -8.503 0.000 -2340.605 -1462.475\n", + "==============================================================================\n", + "Omnibus: 91.525 Durbin-Watson: 0.911\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 194.706\n", + "Skew: -0.719 Prob(JB): 5.25e-43\n", + "Kurtosis: 5.079 Cond. No. 
5.56e+03\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The condition number is large, 5.56e+03. This might indicate that there are\n", + "strong multicollinearity or other numerical problems.\n", + "\"\"\"" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Weight Plot" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "def get_model_stats(results):\n", + " \n", + " coef = results.params.to_dict()\n", + " std_error = results.bse.to_dict()\n", + " conf_intervals = results.conf_int(alpha=0.5).to_dict()\n", + "\n", + " for key, val in coef.items():\n", + " coef[key] = [val]\n", + "\n", + " t = pd.DataFrame(coef).T\n", + " t['std_error'] = t.index.map(std_error)\n", + " t.columns = ['coefficients', 'std_error']\n", + " t['lower'] = t.index.map(conf_intervals[0])\n", + " t['upper'] = t.index.map(conf_intervals[1])\n", + " \n", + " return t, coef" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "t, coef = get_model_stats(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting the Weight Plot\n", + "\n", + "plt.rcParams['figure.dpi'] = 200\n", + "fig, ax = plt.subplots(figsize=(11,6))\n", + "ax.scatter(t['coefficients'], t.index)\n", + "ax.errorbar(t['coefficients'], t.index, xerr=2*t['std_error'], fmt = '.')\n", + "ax.axvline(0, linestyle='-.', color='red')\n", + "ax.set_title(\"Weight Plot\")\n", + "ax.set_xlabel(\"Weights\")\n", + "ax.set_ylabel(\"Features\")\n", + "plt.grid('True')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Weight Plot shows that the feature, \"weathersit_RAIN/SNOW/STORM\", has a strong negative effect on the bike rentals. Of all the features, only \"workingday_WORKING DAY\" has a coefficient whose 95% confidence interval contains 0, which means its effect is not statistically significant at the 5% level. The interval for \"season_SUMMER\" comes close to 0 but, like the one for \"season_WINTER\", still excludes it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Effect Plot\n", + "The weight plot shows how much *weight* a feature has on the prediction, but not how much the feature actually contributes to the predictions, because that also depends on the scale of its values. An effect plot shows this contribution: each feature's weight is multiplied by its actual feature values. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "for col in [col for col in df.columns if col != 'cnt']:\n", + " df['effect_'+col] = df[col] * coef[col][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['effect_temp',\n", + " 'effect_hum',\n", + " 'effect_windspeed',\n", + " 'effect_days_since_2011',\n", + " 'effect_season_SPRING',\n", + " 'effect_season_SUMMER',\n", + " 'effect_season_WINTER',\n", + " 'effect_holiday_NO HOLIDAY',\n", + " 'effect_workingday_WORKING DAY',\n", + " 'effect_weathersit_MISTY',\n", + " 'effect_weathersit_RAIN/SNOW/STORM']" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "effect_columns = [col for col in df.columns if \"effect_\" in col]\n", + "effect_columns" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1,1)\n", + "\n", + "ax.boxplot(df[effect_columns], vert=False)\n", + "ax.set_yticklabels(effect_columns)\n", + "ax.axvline(0, linestyle='-.', color='red')\n", + "ax.set_ylabel(\"Features\")\n", + "ax.set_xlabel(\"Effect\")\n", + "ax.set_title(\"Feature Effect on Predictions\")\n", + "plt.grid();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plot Individual Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "# new_observation = [1.604356, 51.8261, 6.000868, 5, 0, 0, 1, 1, 1, 0, 0]\n", + "\n", + "# fig, ax = plt.subplots(1,1)\n", + "# ax.boxplot(df[effect_columns], vert=False)\n", + "# for idx, col in enumerate(effect_columns):\n", + "# ax.annotate('x', (idx, new_observation[idx]))\n", + "# ax.set_yticklabels(effect_columns)\n", + "# ax.axvline(0, linestyle='-.', color='red')\n", + "# ax.set_ylabel(\"Features\")\n", + "# ax.set_xlabel(\"Effect\")\n", + "# ax.set_title(\"Feature Effect on Predictions\")\n", + "# plt.grid();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Interaction Effect" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a interaction effect of workingday and temperature\n", + "\n", + "df_copy['workingday_temp'] = df_copy['workingday_WORKING DAY'] * df_copy['temp']" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['temp',\n", + " 'hum',\n", + " 'windspeed',\n", + " 'days_since_2011',\n", + " 'season_SPRING',\n", + " 'season_SUMMER',\n", + " 'season_WINTER',\n", + " 'holiday_NO HOLIDAY',\n", + " 'workingday_WORKING DAY',\n", + " 'weathersit_MISTY',\n", + " 'weathersit_RAIN/SNOW/STORM',\n", + " 'workingday_temp']" + ] + 
}, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interaction_columns = [col for col in df_copy.columns if col not in ['weathersit', 'workingday', 'weekday', 'cnt', 'season', 'yr', 'mnth', 'holiday']]\n", + "interaction_columns = [col for col in interaction_columns if 'effect' not in col]\n", + "interaction_columns" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "X = sm.add_constant(df_copy[interaction_columns])\n", + "interaction_model = sm.OLS(df_copy['cnt'], X)\n", + "interaction_results = interaction_model.fit()" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "interaction_df, interaction_coef = get_model_stats(interaction_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
OLS Regression Results
Dep. Variable: cnt R-squared: 0.796
Model: OLS Adj. R-squared: 0.792
Method: Least Squares F-statistic: 232.9
Date: Sat, 10 Dec 2022 Prob (F-statistic): 4.70e-238
Time: 22:38:11 Log-Likelihood: -5989.4
No. Observations: 731 AIC: 1.200e+04
Df Residuals: 718 BIC: 1.206e+04
Df Model: 12
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err t P>|t| [0.025 0.975]
const 1937.9905 329.281 5.886 0.000 1291.521 2584.460
temp 125.4363 8.895 14.102 0.000 107.973 142.899
hum -17.5037 3.156 -5.546 0.000 -23.700 -11.307
windspeed -42.0682 6.864 -6.129 0.000 -55.544 -28.592
days_since_2011 4.9307 0.172 28.654 0.000 4.593 5.268
season_SPRING 467.2725 109.499 4.267 0.000 252.297 682.248
season_SUMMER -289.4902 133.648 -2.166 0.031 -551.877 -27.103
season_WINTER -426.5500 110.342 -3.866 0.000 -643.182 -209.918
holiday_NO HOLIDAY 674.4010 202.471 3.331 0.001 276.895 1071.907
workingday_WORKING DAY 451.8710 141.699 3.189 0.001 173.678 730.064
weathersit_MISTY -382.1347 87.181 -4.383 0.000 -553.296 -210.974
weathersit_RAIN/SNOW/STORM -1898.2107 222.679 -8.524 0.000 -2335.390 -1461.032
workingday_temp -21.7967 8.099 -2.691 0.007 -37.696 -5.897
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Omnibus: 92.795 Durbin-Watson: 0.904
Prob(Omnibus): 0.000 Jarque-Bera (JB): 196.738
Skew: -0.729 Prob(JB): 1.90e-43
Kurtosis: 5.081 Cond. No. 4.72e+03


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.72e+03. This might indicate that there are
strong multicollinearity or other numerical problems." + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: cnt R-squared: 0.796\n", + "Model: OLS Adj. R-squared: 0.792\n", + "Method: Least Squares F-statistic: 232.9\n", + "Date: Sat, 10 Dec 2022 Prob (F-statistic): 4.70e-238\n", + "Time: 22:38:11 Log-Likelihood: -5989.4\n", + "No. Observations: 731 AIC: 1.200e+04\n", + "Df Residuals: 718 BIC: 1.206e+04\n", + "Df Model: 12 \n", + "Covariance Type: nonrobust \n", + "==============================================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "----------------------------------------------------------------------------------------------\n", + "const 1937.9905 329.281 5.886 0.000 1291.521 2584.460\n", + "temp 125.4363 8.895 14.102 0.000 107.973 142.899\n", + "hum -17.5037 3.156 -5.546 0.000 -23.700 -11.307\n", + "windspeed -42.0682 6.864 -6.129 0.000 -55.544 -28.592\n", + "days_since_2011 4.9307 0.172 28.654 0.000 4.593 5.268\n", + "season_SPRING 467.2725 109.499 4.267 0.000 252.297 682.248\n", + "season_SUMMER -289.4902 133.648 -2.166 0.031 -551.877 -27.103\n", + "season_WINTER -426.5500 110.342 -3.866 0.000 -643.182 -209.918\n", + "holiday_NO HOLIDAY 674.4010 202.471 3.331 0.001 276.895 1071.907\n", + "workingday_WORKING DAY 451.8710 141.699 3.189 0.001 173.678 730.064\n", + "weathersit_MISTY -382.1347 87.181 -4.383 0.000 -553.296 -210.974\n", + "weathersit_RAIN/SNOW/STORM -1898.2107 222.679 -8.524 0.000 -2335.390 -1461.032\n", + "workingday_temp -21.7967 8.099 -2.691 0.007 -37.696 -5.897\n", + "==============================================================================\n", + "Omnibus: 92.795 Durbin-Watson: 0.904\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 196.738\n", + "Skew: -0.729 Prob(JB): 1.90e-43\n", + "Kurtosis: 5.081 Cond. No. 
4.72e+03\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The condition number is large, 4.72e+03. This might indicate that there are\n", + "strong multicollinearity or other numerical problems.\n", + "\"\"\"" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interaction_results.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "temp = interaction_results.params['temp']\n", + "workingday = interaction_results.params['workingday_WORKING DAY']\n", + "working_day_temp = interaction_results.params['workingday_temp']\n", + "intercept = interaction_results.params['const']" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(df_copy['temp'], temp*df_copy['temp'] + intercept, linestyle=\"--\", label='NO WORKING DAY')\n", + "plt.plot(df_copy['temp'], (temp+working_day_temp)*df_copy['temp'] + (intercept + working_day_temp), linestyle=\"--\", label='WORKING DAY')\n", + "plt.legend()\n", + "plt.grid()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Python/bike-sharing/__pycache__/preprocess_bike_data.cpython-310.pyc b/Python/bike-sharing/__pycache__/preprocess_bike_data.cpython-310.pyc new file mode 100644 index 00000000..b65e3125 Binary files /dev/null and b/Python/bike-sharing/__pycache__/preprocess_bike_data.cpython-310.pyc differ diff --git a/Python/bike-sharing/bike_sharing_preprocessing.ipynb b/Python/bike-sharing/bike_sharing_preprocessing.ipynb new file mode 100644 index 00000000..8552b1a2 --- /dev/null +++ b/Python/bike-sharing/bike_sharing_preprocessing.ipynb @@ -0,0 +1,2720 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reading in the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
instantdtedayseasonyrmnthholidayweekdayworkingdayweathersittempatemphumwindspeedcasualregisteredcnt
012011-01-0110106020.3441670.3636250.8058330.160446331654985
122011-01-0210100020.3634780.3537390.6960870.248539131670801
232011-01-0310101110.1963640.1894050.4372730.24830912012291349
342011-01-0410102110.2000000.2121220.5904350.16029610814541562
452011-01-0510103110.2269570.2292700.4369570.1869008215181600
\n", + "
" + ], + "text/plain": [ + " instant dteday season yr mnth holiday weekday workingday \\\n", + "0 1 2011-01-01 1 0 1 0 6 0 \n", + "1 2 2011-01-02 1 0 1 0 0 0 \n", + "2 3 2011-01-03 1 0 1 0 1 1 \n", + "3 4 2011-01-04 1 0 1 0 2 1 \n", + "4 5 2011-01-05 1 0 1 0 3 1 \n", + "\n", + " weathersit temp atemp hum windspeed casual registered \\\n", + "0 2 0.344167 0.363625 0.805833 0.160446 331 654 \n", + "1 2 0.363478 0.353739 0.696087 0.248539 131 670 \n", + "2 1 0.196364 0.189405 0.437273 0.248309 120 1229 \n", + "3 1 0.200000 0.212122 0.590435 0.160296 108 1454 \n", + "4 1 0.226957 0.229270 0.436957 0.186900 82 1518 \n", + "\n", + " cnt \n", + "0 985 \n", + "1 801 \n", + "2 1349 \n", + "3 1562 \n", + "4 1600 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reading in the data\n", + "\n", + "DIR = \"../../data/\"\n", + "DATA = \"bike-sharing-daily.csv\"\n", + "day_bike_rentals = pd.read_csv(DIR+DATA)\n", + "day_bike_rentals.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Cleaning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Categorical Features" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Label Lists\n", + "\n", + "weekdays = ['SUN', 'MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT']\n", + "holidays = ['NO HOLIDAY', 'HOLIDAY']\n", + "working_day = ['NO WORKING DAY', 'WORKING DAY']\n", + "season = ['WINTER', 'SPRING', 'SUMMER', 'FALL']\n", + "weathersit = ['GOOD', 'MISTY', 'RAIN/SNOW/STORM']\n", + "months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to create a dictionary mapping index of list to the respective label\n", + "\n", + "def create_dictionary(label_list, start_at_zero=True):\n", + " d = {}\n", + " if 
start_at_zero:\n", + " for idx, val in enumerate(label_list):\n", + " d[idx] = val\n", + " else:\n", + " for idx, val in enumerate(label_list):\n", + " d[idx+1] = val\n", + " return d" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Mapping the indices to actual labels\n", + "\n", + "day_bike_rentals['weekday'] = day_bike_rentals['weekday'].map(create_dictionary(weekdays))\n", + "day_bike_rentals['holiday'] = day_bike_rentals['holiday'].map(create_dictionary(holidays))\n", + "day_bike_rentals['workingday'] = day_bike_rentals['workingday'].map(create_dictionary(working_day))\n", + "day_bike_rentals['season'] = day_bike_rentals['season'].map(create_dictionary(season, start_at_zero=False))\n", + "day_bike_rentals['weathersit'] = day_bike_rentals['weathersit'].map(create_dictionary(weathersit, start_at_zero=False))\n", + "day_bike_rentals['mnth'] = day_bike_rentals['mnth'].map(create_dictionary(months, start_at_zero=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Numerical Features" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Denormalizing the temperature\n", + "\n", + "# Function that denormalizes temperature\n", + "def inverse_min_max(row, tmin, tmax):\n", + " return row * (tmax - tmin) + tmin\n", + "\n", + "# t_min=-8, t_max=+39\n", + "day_bike_rentals['temp'] = day_bike_rentals['temp'].apply(inverse_min_max, args=(-8, 39))\n", + "\n", + "# t_min=-16, t_max=+50\n", + "day_bike_rentals['atemp'] = day_bike_rentals['atemp'].apply(inverse_min_max, args=(-16, 50))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# DeNormalized wind speed. 
The stored values were divided by 67 (max), so multiply by 67 to restore them\n", + "\n", + "day_bike_rentals['windspeed'] = day_bike_rentals['windspeed'].apply(lambda row: row * 67)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Denormalizing humidity. The stored values were divided by 100 (max), so multiply by 100 to restore them\n", + "\n", + "day_bike_rentals['hum'] = day_bike_rentals['hum'].apply(lambda row: row * 100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Temporal Features" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Processing time features\n", + "\n", + "day_bike_rentals['yr'] = day_bike_rentals['yr'].apply(lambda row: 2011 if row == 0 else 2012)\n", + "day_bike_rentals['dteday'] = pd.to_datetime(day_bike_rentals['dteday'])\n", + "day_bike_rentals['days_since_2011'] = (day_bike_rentals['dteday'] - day_bike_rentals['dteday'].min()).dt.days" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "# Extracting only the needed features\n", + "\n", + "day_bike_rentals = day_bike_rentals[[col for col in day_bike_rentals.columns if col not in ['instant', 'dteday', 'registered', 'casual', 'atemp']]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dummifying Categorical Features" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Collecting only the categorical features\n", + "\n", + "categorical_features = [col for col in day_bike_rentals.columns if day_bike_rentals[col].dtype == 'object']\n", + "categorical_features" + ] + }, + { + 
"cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['season', 'holiday', 'workingday', 'weathersit']" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Collecting all the features that needs to be dummified \n", + "sub_features = ['season', 'holiday','workingday', 'weathersit']\n", + "sub_features" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
temphumwindspeedcntdays_since_2011season_SPRINGseason_SUMMERseason_WINTERholiday_NO HOLIDAYworkingday_WORKING DAYweathersit_MISTYweathersit_RAIN/SNOW/STORM
08.17584980.583310.74988298500011010
19.08346669.608716.65211380110011010
21.22910843.727316.636703134920011100
31.40000059.043510.739832156230011100
42.66697943.695712.522300160040011100
\n", + "
" + ], + "text/plain": [ + " temp hum windspeed cnt days_since_2011 season_SPRING \\\n", + "0 8.175849 80.5833 10.749882 985 0 0 \n", + "1 9.083466 69.6087 16.652113 801 1 0 \n", + "2 1.229108 43.7273 16.636703 1349 2 0 \n", + "3 1.400000 59.0435 10.739832 1562 3 0 \n", + "4 2.666979 43.6957 12.522300 1600 4 0 \n", + "\n", + " season_SUMMER season_WINTER holiday_NO HOLIDAY workingday_WORKING DAY \\\n", + "0 0 1 1 0 \n", + "1 0 1 1 0 \n", + "2 0 1 1 1 \n", + "3 0 1 1 1 \n", + "4 0 1 1 1 \n", + "\n", + " weathersit_MISTY weathersit_RAIN/SNOW/STORM \n", + "0 1 0 \n", + "1 1 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 " + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Dummifying categorical features\n", + "cleaned_bike_data = pd.concat([day_bike_rentals, pd.get_dummies(day_bike_rentals[sub_features], drop_first=True)], axis=1)\n", + "\n", + "# Dropping unnecessary columns\n", + "bike = cleaned_bike_data.drop(sub_features + ['yr', 'mnth', 'weekday'], axis=1)\n", + "\n", + "# Viewing the result\n", + "bike.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%50%75%max
temp731.015.2830858.603397-5.2208717.84292515.42165122.80457532.498349
hum731.062.78940614.2429100.00000052.00000062.66670073.02085097.250000
windspeed731.012.7625765.1923571.5002449.04165012.12532515.62537134.000021
cnt731.04504.3488371937.21145222.0000003152.0000004548.0000005956.0000008714.000000
days_since_2011731.0365.000000211.1658120.000000182.500000365.000000547.500000730.000000
season_SPRING731.00.2517100.4342930.0000000.0000000.0000001.0000001.000000
season_SUMMER731.00.2571820.4373800.0000000.0000000.0000001.0000001.000000
season_WINTER731.00.2476060.4319170.0000000.0000000.0000000.0000001.000000
holiday_NO HOLIDAY731.00.9712720.1671550.0000001.0000001.0000001.0000001.000000
workingday_WORKING DAY731.00.6839950.4652330.0000000.0000001.0000001.0000001.000000
weathersit_MISTY731.00.3378930.4733160.0000000.0000000.0000001.0000001.000000
weathersit_RAIN/SNOW/STORM731.00.0287280.1671550.0000000.0000000.0000000.0000001.000000
\n", + "
" + ], + "text/plain": [ + " count mean std min \\\n", + "temp 731.0 15.283085 8.603397 -5.220871 \n", + "hum 731.0 62.789406 14.242910 0.000000 \n", + "windspeed 731.0 12.762576 5.192357 1.500244 \n", + "cnt 731.0 4504.348837 1937.211452 22.000000 \n", + "days_since_2011 731.0 365.000000 211.165812 0.000000 \n", + "season_SPRING 731.0 0.251710 0.434293 0.000000 \n", + "season_SUMMER 731.0 0.257182 0.437380 0.000000 \n", + "season_WINTER 731.0 0.247606 0.431917 0.000000 \n", + "holiday_NO HOLIDAY 731.0 0.971272 0.167155 0.000000 \n", + "workingday_WORKING DAY 731.0 0.683995 0.465233 0.000000 \n", + "weathersit_MISTY 731.0 0.337893 0.473316 0.000000 \n", + "weathersit_RAIN/SNOW/STORM 731.0 0.028728 0.167155 0.000000 \n", + "\n", + " 25% 50% 75% max \n", + "temp 7.842925 15.421651 22.804575 32.498349 \n", + "hum 52.000000 62.666700 73.020850 97.250000 \n", + "windspeed 9.041650 12.125325 15.625371 34.000021 \n", + "cnt 3152.000000 4548.000000 5956.000000 8714.000000 \n", + "days_since_2011 182.500000 365.000000 547.500000 730.000000 \n", + "season_SPRING 0.000000 0.000000 1.000000 1.000000 \n", + "season_SUMMER 0.000000 0.000000 1.000000 1.000000 \n", + "season_WINTER 0.000000 0.000000 0.000000 1.000000 \n", + "holiday_NO HOLIDAY 1.000000 1.000000 1.000000 1.000000 \n", + "workingday_WORKING DAY 0.000000 1.000000 1.000000 1.000000 \n", + "weathersit_MISTY 0.000000 0.000000 1.000000 1.000000 \n", + "weathersit_RAIN/SNOW/STORM 0.000000 0.000000 0.000000 1.000000 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bike.describe().T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def data_pipeline():\n", + " \n", + " # Reading in the data\n", + " day_bike_rentals = pd.read_csv(DIR+DATA)\n", + " \n", + " # Mapping numerical codes with actual values\n", + " day_bike_rentals['weekday'] = 
day_bike_rentals['weekday'].map(create_dictionary(weekdays))\n", + " day_bike_rentals['holiday'] = day_bike_rentals['holiday'].map(create_dictionary(holidays))\n", + " day_bike_rentals['workingday'] = day_bike_rentals['workingday'].map(create_dictionary(working_day))\n", + " day_bike_rentals['season'] = day_bike_rentals['season'].map(create_dictionary(season, start_at_zero=False))\n", + " day_bike_rentals['weathersit'] = day_bike_rentals['weathersit'].map(create_dictionary(weathersit, start_at_zero=False))\n", + " day_bike_rentals['mnth'] = day_bike_rentals['mnth'].map(create_dictionary(months, start_at_zero=False))\n", + " \n", + " # Reversing min_max\n", + " day_bike_rentals['temp'] = day_bike_rentals['temp'].apply(inverse_min_max, args=(-8, 39))\n", + " day_bike_rentals['atemp'] = day_bike_rentals['atemp'].apply(inverse_min_max, args=(-16, 50))\n", + " \n", + " # Unnormalizing the data\n", + " day_bike_rentals['windspeed'] = day_bike_rentals['windspeed'].apply(lambda row: row * 67)\n", + " day_bike_rentals['hum'] = day_bike_rentals['hum'].apply(lambda row: row * 100)\n", + " \n", + " # Converting labels into actual year\n", + " day_bike_rentals['yr'] = day_bike_rentals['yr'].apply(lambda row: 2011 if row == 0 else 2012)\n", + " \n", + " # Converting 'dteday' to datetime object\n", + " day_bike_rentals['dteday'] = pd.to_datetime(day_bike_rentals['dteday'])\n", + "\n", + " # Calculating days since 2011\n", + " day_bike_rentals['days_since_2011'] = (day_bike_rentals['dteday'] - day_bike_rentals['dteday'].min()).dt.days\n", + " \n", + " # Feature selecting\n", + " day_bike_rentals = day_bike_rentals[[col for col in day_bike_rentals.columns if col not in ['instant', 'dteday', 'registered', 'casual', 'atemp']]]\n", + " \n", + " # Dummifying categorical features\n", + " day_bike_rentals = pd.concat([day_bike_rentals, pd.get_dummies(day_bike_rentals[sub_features], drop_first=True)], axis=1)\n", + " \n", + " # Returning the cleaned data\n", + " return day_bike_rentals " 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seasonyrmnthholidayweekdayworkingdayweathersittemphumwindspeedcntdays_since_2011
0WINTER2011JANNO HOLIDAYSATNO WORKING DAYMISTY8.17584980.583310.7498829850
1WINTER2011JANNO HOLIDAYSUNNO WORKING DAYMISTY9.08346669.608716.6521138011
2WINTER2011JANNO HOLIDAYMONWORKING DAYGOOD1.22910843.727316.63670313492
3WINTER2011JANNO HOLIDAYTUEWORKING DAYGOOD1.40000059.043510.73983215623
4WINTER2011JANNO HOLIDAYWEDWORKING DAYGOOD2.66697943.695712.52230016004
\n", + "
" + ], + "text/plain": [ + " season yr mnth holiday weekday workingday weathersit temp \\\n", + "0 WINTER 2011 JAN NO HOLIDAY SAT NO WORKING DAY MISTY 8.175849 \n", + "1 WINTER 2011 JAN NO HOLIDAY SUN NO WORKING DAY MISTY 9.083466 \n", + "2 WINTER 2011 JAN NO HOLIDAY MON WORKING DAY GOOD 1.229108 \n", + "3 WINTER 2011 JAN NO HOLIDAY TUE WORKING DAY GOOD 1.400000 \n", + "4 WINTER 2011 JAN NO HOLIDAY WED WORKING DAY GOOD 2.666979 \n", + "\n", + " hum windspeed cnt days_since_2011 \n", + "0 80.5833 10.749882 985 0 \n", + "1 69.6087 16.652113 801 1 \n", + "2 43.7273 16.636703 1349 2 \n", + "3 59.0435 10.739832 1562 3 \n", + "4 43.6957 12.522300 1600 4 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ef_bike = day_bike_rentals[[col for col in day_bike_rentals.columns if col not in ['instant', 'dteday', 'registered', 'casual', 'atemp']]]\n", + "ef_bike.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "season 4\n", + "yr 2\n", + "mnth 12\n", + "holiday 2\n", + "weekday 7\n", + "workingday 2\n", + "weathersit 3\n", + "temp 499\n", + "hum 595\n", + "windspeed 650\n", + "cnt 696\n", + "days_since_2011 731\n", + "dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ef_bike.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GOOD 463\n", + "MISTY 247\n", + "RAIN/SNOW/STORM 21\n", + "Name: weathersit, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ef_bike['weathersit'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "df_bike = pd.get_dummies(ef_bike['weathersit'], drop_first=True)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# Splitting the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(731, 2)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_bike.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Training data\n", + "\n", + "X = bike[[col for col in bike.columns if col != 'cnt']]\n", + "y = bike['cnt']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "\n", + "lr = LinearRegression()\n", + "lr.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dummy coeficients: [ 110.70958159 -17.37719923 -42.51347178 4.92643186\n", + " 473.71530337 -287.38742028 -425.60285274 686.11544159\n", + " 124.92093811 -379.39852979 -1901.53991489], Dummy intercept: 2138.9296219985613\n" + ] + } + ], + "source": [ + "print(f\"Dummy coeficients: {lr.coef_}, Dummy intercept: {lr.intercept_}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Effect Coding" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 731 entries, 0 to 730\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 MISTY 731 non-null int64\n", + " 1 RAIN/SNOW/STORM 731 non-null int64\n", + "dtypes: int64(2)\n", + "memory usage: 11.5 KB\n" + ] + } + ], + "source": [ + "# Converting numerical values to type int64\n", + "''' \n", + " Currently it is int8 and int8 cannot interpret negative values \n", + "'''\n", + "\n", + "df_bike = df_bike.astype(int)\n", + "df_bike.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Setting the reference category to -1\n", + "\n", + "df_bike.loc[(df_bike['MISTY']==0) & (df_bike['RAIN/SNOW/STORM']==0),'MISTY'] = -1\n", + "df_bike.loc[(df_bike['MISTY']==-1) & (df_bike['RAIN/SNOW/STORM']==0),'RAIN/SNOW/STORM'] = -1" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Effect Coding coeficients: [ 110.70958159 -17.37719923 -42.51347178 4.92643186\n", + " 473.71530337 -287.38742028 -425.60285274 686.11544159\n", + " 124.92093811 -379.39852979 -1901.53991489]\n", + "Effect Coding intercept: 2138.9296219985613\n" + ] + } + ], + "source": [ + "# Printing the coeficients and intercept of LR\n", + "\n", + "print(f\"Effect Coding coeficients: {lr.coef_}\\nEffect Coding intercept: {lr.intercept_}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Effect of 'GOOD': 2045.5972396403506\n", + "Effect of 'MISTY': 2249.639203591673\n", + "Effect of 'RAIN': 2121.5524227636606\n" + ] + } + ], + "source": [ + "# Calculating effect coding\n", + "\n", + "effect_of_good = lr.intercept_ - (lr.coef_[0]+lr.coef_[1])\n", + "effect_of_misty = lr.intercept_ + lr.coef_[0]\n", + "effect_of_rain = lr.intercept_ + lr.coef_[1]\n", + "print(f\"Effect of 'GOOD': {effect_of_good}\\nEffect of 'MISTY': {effect_of_misty}\\nEffect of 'RAIN': {effect_of_rain}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Splitting Data into Train and Test Sets" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = bike[[col for col in bike.columns if col != 'cnt']]\n", + "y = bike['cnt']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "mms = MinMaxScaler()\n", + "\n", + "scale_columns = ['temp', 'hum', 'windspeed']\n", + "\n", + "for col in scale_columns:\n", + " mms.fit(X_train[col].values.reshape(-1,1))\n", + " X_train[col] 
= mms.transform(X_train[col].values.reshape(-1,1))\n", + " X_test[col] = mms.transform(X_test[col].values.reshape(-1,1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Linear Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Linear Regession\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 4119.68595694, -1270.84325983, -1128.71029573, 4.8310698 ,\n", + " 468.29339945, -200.30842603, -360.94542215, 485.53541273,\n", + " 96.92766521, -300.13333285, -1896.75540058])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Training the Data\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "lr = LinearRegression()\n", + "lr.fit(X_train,y_train)\n", + "\n", + "print(\"Linear Regession\")\n", + "lr.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE: 884897.294264837\n" + ] + } + ], + "source": [ + "# Evaluate the data\n", + "\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "predictions = lr.predict(X_test)\n", + "print(f\"MSE: {mean_squared_error(predictions, y_test)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lasso Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lasso Regression: \n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 4086.18142137, -1251.03700225, -1114.61277435, 4.83844788,\n", + " 475.98848393, -181.43877689, -358.58597195, 465.58406442,\n", + " 96.32689137, -301.20879631, -1885.99734402])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 
Imports\n", + "from sklearn.linear_model import Lasso\n", + "\n", + "# Fitting the model\n", + "lasso = Lasso(alpha=0.5)\n", + "lasso.fit(X_train, y_train)\n", + "print(\"Lasso Regression: \")\n", + "lasso.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE: 751649.6486909935\n", + "MSE: 886927.2630415942\n" + ] + } + ], + "source": [ + "# Evaluate the data\n", + "\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "train_mse = mean_squared_error(lasso.predict(X_train), y_train)\n", + "predictions = lasso.predict(X_test)\n", + "\n", + "print(f\"MSE: {train_mse}\")\n", + "print(f\"MSE: {mean_squared_error(predictions, y_test)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hyperparameter Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mean_fit_time',\n", + " 'mean_score_time',\n", + " 'mean_test_score',\n", + " 'param_alpha',\n", + " 'params',\n", + " 'rank_test_score',\n", + " 'split0_test_score',\n", + " 'split1_test_score',\n", + " 'split2_test_score',\n", + " 'split3_test_score',\n", + " 'split4_test_score',\n", + " 'std_fit_time',\n", + " 'std_score_time',\n", + " 'std_test_score']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Using GridSearch and CV to find best alpha\n", + "\n", + "lasso = Lasso()\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "parameters = {\"alpha\":np.arange(.1,1.1, 0.1)}\n", + "grid_search = GridSearchCV(\n", + " estimator=Lasso(),\n", + " param_grid=parameters,\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "sorted(grid_search.cv_results_.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { 
+ "text/plain": [ + "0.1" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search.best_params_['alpha']" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train performance: 751596.8213950731, Test performance: 885291.6766173533\n" + ] + } + ], + "source": [ + "lasso = Lasso(alpha=0.1)\n", + "\n", + "# Fitting the model\n", + "lasso.fit(X_train, y_train)\n", + "\n", + "# Calculating the train performance\n", + "train_predictions = lasso.predict(X_train)\n", + "train_mse = mean_squared_error(train_predictions, y_train)\n", + "\n", + "# Calculating the test performance\n", + "test_predictions = lasso.predict(X_test)\n", + "test_mse = mean_squared_error(y_test, test_predictions)\n", + "\n", + "print(f\"Train performance: {train_mse}, Test performance: {test_mse}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 4113.03486783, -1266.8947437 , -1125.89284108, 4.83253953,\n", + " 469.8202845 , -196.55402595, -360.46931257, 481.54641279,\n", + " 96.80660755, -300.34475081, -1894.59623269])" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lasso.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature is line\n", + "# y: weights are weights\n", + "# x: lambda" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# for num in np.arange(.1,1.1, 0.01):\n", + "# print(np.log(num))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Decision Trees" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seasonyrmnthholidayweekdayworkingdayweathersittemphumwindspeedcntdays_since_2011
0WINTER2011JANNO HOLIDAYSATNO WORKING DAYMISTY8.17584980.583310.7498829850
1WINTER2011JANNO HOLIDAYSUNNO WORKING DAYMISTY9.08346669.608716.6521138011
2WINTER2011JANNO HOLIDAYMONWORKING DAYGOOD1.22910843.727316.63670313492
3WINTER2011JANNO HOLIDAYTUEWORKING DAYGOOD1.40000059.043510.73983215623
4WINTER2011JANNO HOLIDAYWEDWORKING DAYGOOD2.66697943.695712.52230016004
\n", + "
" + ], + "text/plain": [ + " season yr mnth holiday weekday workingday weathersit temp \\\n", + "0 WINTER 2011 JAN NO HOLIDAY SAT NO WORKING DAY MISTY 8.175849 \n", + "1 WINTER 2011 JAN NO HOLIDAY SUN NO WORKING DAY MISTY 9.083466 \n", + "2 WINTER 2011 JAN NO HOLIDAY MON WORKING DAY GOOD 1.229108 \n", + "3 WINTER 2011 JAN NO HOLIDAY TUE WORKING DAY GOOD 1.400000 \n", + "4 WINTER 2011 JAN NO HOLIDAY WED WORKING DAY GOOD 2.666979 \n", + "\n", + " hum windspeed cnt days_since_2011 \n", + "0 80.5833 10.749882 985 0 \n", + "1 69.6087 16.652113 801 1 \n", + "2 43.7273 16.636703 1349 2 \n", + "3 59.0435 10.739832 1562 3 \n", + "4 43.6957 12.522300 1600 4 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ef_bike.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# Getting data\n", + "\n", + "dt_columns = ['temp', 'season', 'days_since_2011','windspeed', 'hum', 'cnt']\n", + "\n", + "dt_df = ef_bike[dt_columns]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Label Encoding 'season'" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3h/n7g9d8x521gg4nfvf31y3_840000gn/T/ipykernel_15068/2549798644.py:5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " dt_df['labeled_season'] = le.fit_transform(dt_df['season'])\n" + ] + } + ], + "source": [ + "\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "le = LabelEncoder()\n", + "\n", + "dt_df['labeled_season'] = le.fit_transform(dt_df['season'])" + ] + }, + { + 
"cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "# Splitting the data into train test split\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = dt_df[[col for col in dt_df.columns if col not in ['cnt','season']]]\n", + "y = dt_df['cnt']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tempdays_since_2011windspeedhumlabeled_season
08.175849010.74988280.58333
19.083466116.65211369.60873
21.229108216.63670343.72733
31.400000310.73983259.04353
42.666979412.52230043.69573
\n", + "
" + ], + "text/plain": [ + " temp days_since_2011 windspeed hum labeled_season\n", + "0 8.175849 0 10.749882 80.5833 3\n", + "1 9.083466 1 16.652113 69.6087 3\n", + "2 1.229108 2 16.636703 43.7273 3\n", + "3 1.400000 3 10.739832 59.0435 3\n", + "4 2.666979 4 12.522300 43.6957 3" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.tree import DecisionTreeRegressor\n", + "\n", + "dt = DecisionTreeRegressor()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "DecisionTreeRegressor()" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dt.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.22038283, 0.64386564, 0.04414508, 0.08394337, 0.00766308])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dt.feature_importances_" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['temp', 'days_since_2011', 'windspeed', 'hum', 'labeled_season'],\n", + " dtype=object)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import tree\n", + "\n", + "dt.feature_names_in_" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 temp\n", + "1 days_since_2011\n", + "2 windspeed\n", + "3 hum\n", + "4 labeled_season\n", + "dtype: object" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series(dt.feature_names_in_)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featuresimportance
4labeled_season0.007663
2windspeed0.044145
3hum0.083943
0temp0.220383
1days_since_20110.643866
\n", + "
" + ], + "text/plain": [ + " features importance\n", + "4 labeled_season 0.007663\n", + "2 windspeed 0.044145\n", + "3 hum 0.083943\n", + "0 temp 0.220383\n", + "1 days_since_2011 0.643866" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fi_df = pd.DataFrame({'features':dt.feature_names_in_, 'importance': dt.feature_importances_}).sort_values('importance')\n", + "fi_df" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAbkAAAD4CAYAAABxJ5hVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZdUlEQVR4nO3df5RddX3u8fdjCGVkICMEUxmEQYRQSSxjRgpKaYK9BvQWY4iWGtTgj2jlR7ssEdLQe+nVmti4/AFV2tBiqsIdNCsOAdRBSaZABCVhQoaAUwOJlcFewXaCg0Mcwuf+sffIyWQmcybzY5/zzfNa66zs893fs/dz9krmmb3PyTmKCMzMzFL0sqIDmJmZjReXnJmZJcslZ2ZmyXLJmZlZslxyZmaWrEOKDmAvmTp1ajQ0NJQ9/7nnnuPwww8fv0DjzPmLU83ZwfmLVmn5N2/e/ExEHDPYOpdcBWloaGDTpk1lz29ra2P27NnjF2icOX9xqjk7OH/RKi2/pJ8Otc6XK83MLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWX53pZmZFaalvYuVrZ081d3LsXU1LJk7nXmN9WO2/RGfyUm6VtKVY5agvH0eK2nNBOznf0jaLKkj//PcknWz8vHtkq6TpHz8XZK2SXpRUlPJ/KMlbZDUI+kfxju7mVm1aWnvYunaDrq6ewmgq7uXpWs7aGnvGrN9VMXlyoh4KiIWTMCungH+JCJmAu8Hvlay7gbgw8DJ+e28fPwRYD5wz4BtPQ/8DTChvxCYmVWLla2d9Pbt2Wust28PK1s7x2wfKuf75CQtI/uh/wvgZ8BmYBewGDgU2A68F5gEbAVOiYg+SUcCDwOnAH8OfBR4AXg0Ii4aYl9/BHwxvxvAOcDRwB0RMUPSIuAC4OXAScC3IuIT+WPPAz6d53gmIt4i6XDgemAGMBm4NiJuK+M5C/gl8CrgKGBDRJyar/szYHZEfKRkfhtwZURsGrCdRUBTRFw2xH4Wkx1Hpk2bNqu5uXm4aL/V09NDbW1t2fMrjfMXp5qzg/MXbazyd3TtGnLdzPopZW9nzpw5myOiabB1w74mJ2kWcBFwej7/IbKSWxsRN+ZzPgV8MCKuz3/Yvx1oyR+3Ni+8q4ETI2K3pLr97PJK4NKI2CipluyMaKDTgUZgN9Ap6fp83o3AORGxQ9JR+dxlwPqI+EC+3x9J+n5EPDfMU78QeCjPWw88WbLuSWBMLhpHxCpgFUBTU1OM5FMEKu1TB0bK+YtTzdnB+Ys2VvmXrVhPV3fvPuP1dTVcvnD024fyLlf+IdnZ0q8j4llgXT4+Q9K9kjqAhcBp+fg/A5fky5cAX8mXtwI3S7qY7GxuKBuBz0m6AqiLiMHm3h0RuyLieeBR4ATgTOCeiNgBEBH/lc99K3C1pC1AG3AYcPz+nrCk04DPAB/Z3
zwzMztwS+ZOp2bypL3GaiZPYsnc6WO2j9G8JrcauCx//epvycqDiNgINEiaDUyKiEfy+W8HvgS8AXhQ0qBnkRGxAvgQUANslHTqINN2lyzvYf9npAIujIjT89vxEfHYkJOl44BvAe+LiMfz4S7guJJpx+VjZmZ2gOY11rN8/kzq62oQ2Rnc8vkzx/TdleX8F4J7gNWSlufz/wT4J+AI4OeSJpOdyZX+0P8qcAvwSQBJLwNeHREbJN1HdhmzFugeuDNJJ0VEB9Ah6Y3AqcCWMnI+AHxZ0on9lyvzs7lW4HJJl0dESGqMiPbBNpBfzrwTuDovawAi4ueSnpV0JvBD4H1kr/OZmdkozGusH9NSG2jYM7mIeAi4lewNJN8BHsxX/Q3ZD/yNwI8HPOxm4BXA/83vTwK+nl/abAeui4juIXb5l5IekbQV6Mv3OayIeJrsDRxrJT2cZ4asaCcDWyVty+8P5TLgtcD/krQlv70yX/cxskux24HH+3NJeqekJ4GzgDsltfZvTNJO4HPAIklPSnpdOc/FzMzGRln/GTwi/g74u0FW3TDEQ84G1vQXWUT05WPl7OvyQYZ3kr07kohYTXaptH/+/yxZ/g4DSjEieinztbWI+BTwqSHWberPMGD8W2SXNwd7TEM5+zUzs/Ex5p94kr/T8XzgbWO9bTMzs5EY85Ib4kxsH5IuAf5iwPDGiLh0rDMNsu+5ZO+eLLUjIt453vs2M7OJU9hnV0bEV3jpvxdM9L5byd6QYmZmCauKj/UyMzM7EC45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWYcUHaCSSKoD3hMRXy46i1W3lvYuVrZ28lR3L8fW1bBk7nTmNdYXHcvsoOMzub3VAR8rOoRVt5b2Lpau7aCru5cAurp7Wbq2g5b2rqKjmR10XHJ7WwGcJGmLpJWSlkh6UNJWSX8LIKlB0o8lrZb075JulvTHkjZK+omkM/J510r6mqT78/EPF/rMbMKsbO2kt2/PXmO9fXtY2dpZUCKzg5ciougMFUNSA3BHRMyQ9FZgAfARQMA64O+B/wC2A43ANuBB4GHgg8AFwCURMU/StcA7gTOBw4F24A8i4qkB+1wMLAaYNm3arObm5rLz9vT0UFtbe6BPt3Cp5u/o2jXkY2bWTxnPSGVL9dhXC+cfW3PmzNkcEU2DrfNrckN7a35rz+/XAieTldyOiOgAkLQNuDsiQlIH0FCyjdsiohfolbQBOANoKd1JRKwCVgE0NTXF7Nmzyw7Y1tbGSOZXmlTzL1uxnq7u3n3G6+tquHzhvvOLkOqxrxbOP3F8uXJoApZHxOn57bUR8S/5ut0l814suf8ie//iMPA02afNB4Elc6dTM3nSXmM1kyexZO70ghKZHbxccnv7FXBEvtwKfEBSLYCkekmvHOH23iHpMElHA7PJLm1a4uY11rN8/kzq62oQ2Rnc8vkz/e5KswL4cmWJiPhl/gaSR4DvALcA90sC6AEuBvbsZxMDbQU2AFOBTw58Pc7SNa+x3qVmVgFccgNExHsGDH1xkGkzSuYvKlneWboO2BoR7xvLfGZmVj5frjQzs2T5TG6cRMS1RWcwMzvY+UzOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMk
uWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkHVJ0gEomqQG4IyJmFJ2lmrS0d7GytZOnuns5tq6GJXOnM6+xvuhYZnYQcsnZmGpp72Lp2g56+/YA0NXdy9K1HQAuOjObcL5cObxJkm6UtE3SXZJqJLVJagKQNFXSznx5kaQWSd+TtFPSZZI+Lqld0gOSjir0mUyAla2dvy24fr19e1jZ2llQIjM7mCkiis5QsfLLlduBpojYIukbwDrgQ8CVEbFJ0lRgU0Q0SFoEXAM0Aoflj70qIv5R0ueBn0bEFwbsYzGwGGDatGmzmpuby87X09NDbW3tKJ/l2Oro2jXkupn1U/a6X4n5R6Ka81dzdnD+olVa/jlz5myOiKbB1vly5fB2RMSWfHkz0DDM/A0R8SvgV5J2Abfn4x3A6wdOjohVwCqApqammD17dtnB2traGMn8ibBsxXq6unv3Ga+vq+HyhbP3GqvE/CNRzfmrOTs4f9GqKb8vVw5vd8nyHrJfDF7gpWN32H7mv1hy/0UOgl8qlsydTs3kSXuN1UyexJK50wtKZGYHM5fcgdkJzMqXFxSYo+LMa6xn+fyZ1NfVILIzuOXzZ/pNJ2ZWiOTPLMbJZ4Fv5K+n3Vl0mEozr7HepWZmFcEltx8RsROYUXL/syWrS19fuyZfvxpYXTK/oWR5r3VmZjb+fLnSzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0tWVZecpG9LqhvB/AZJj4xjpP3tu6eI/Zarpb2LN69Yz4lX38mbV6ynpb2r6EhmZqN2SNEBRiMi3lZ0hhS0tHexdG0HvX17AOjq7mXp2g4A5jXWFxnNzGxUKvpMTtISSVfky5+XtD5fPlfSzZJ2Spqan6E9JulGSdsk3SWpJp87S9LDkh4GLi3Z9mmSfiRpi6Stkk7Ot/PjfNuPSVoj6eUl2/k3SZsltUp6VT5+kqTv5uP3Sjo1Hz9R0v2SOiR9aoIP3YisbO38bcH16+3bw8rWzoISmZmNDUVE0RmGJOlM4K8i4l2S7gV+B3gz8NfAfwJLgSagFtgONEXEFknfANZFxNclbQUui4h7JK0Ezo+IGZKuBx6IiJslHQpMAqYBO4CzI2KjpJuAR4EvAv8GvCMinpb0p8DciPiApLuBj0bETyT9AbA8Is6VtA5YExFflXQp8JmIqB3kOS4GFgNMmzZtVnNzc9nHp6enh9rafTY5Yh1du4ZcN7N+yqi3P5Sxyl+Uas5fzdnB+YtWafnnzJmzOSKaBltX6ZcrNwOzJB0J7AYeIiu1PwSuICu5fjsiYkvJ4xry1+vqIuKefPxrwPn58v3AMknHAWvzkgL4WURszOd8Pd/Pd4EZwPfyOZOAn0uqBd4EfDMfh6yIISvjC0v2+5nBnmBErAJWATQ1NcXs2bPLOS4AtLW1MZL5Q1m2Yj1d3b37jNfX1XD5wtFvfyhjlb8o1Zy/mrOD8xetmvJX9OXKiOgjO7NaBPwAuBeYA7wWeGzA9N0ly3sYpsAj4hbgAqAX+Lakc/tXDZwKCNgWEafnt5kR8Vay49ddMn56RPzegMdWvCVzp1MzedJeYzWTJ7Fk7vSCEpmZjY2KLrncvcCVwD358keB9ijjOmtEdAPdks7Ohxb2r5P0G
uCJiLgOuA14fb7qeEln5cvvAe4DOoFj+sclTZZ0WkQ8C+yQ9K58XJJ+P3/sRuCigfutRPMa61k+fyb1dTWI7Axu+fyZftOJmVW9Sr9cCVmxLQPuj4jnJD2fj5XrEuAmSQHcVTL+buC9kvrIXt/7NHAkWaFdWvJ63A0R8RtJC4DrJE0hO25fALaRFdgNkq4BJgPNwMPAXwC3SLqKrEQr2rzGepeamSWn4ksuIu4mK4/++6eULDfki8+QvWbWP/7ZkuXNQP/ZFcAn8vEVwIrSfeWv/b0QERcPkmMLcM4g4zuA84YYP6tk6JpBnp6ZmY2jarhcaWZmdkAq/kxuIkXETkrOCM3MrLr5TM7MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFkuOTMzS5ZLzszMkuWSMzOzZLnkzMwsWS45MzNLlkvOzMyS5ZIzM7NkueTMzCxZLjkzM0uWS87MzJLlkjMzs2S55MzMLFmHFB3ARq+lvYuVrZ081d3LsXU1LJk7nXmN9UXHMjMr3KjP5CT1DLO+QdIjI9zmakkLRjB/xPtIRUt7F0vXdtDV3UsAXd29LF3bQUt7V9HRzMwK58uVVW5laye9fXv2Guvt28PK1s6CEpmZVQ5FxOg2IPVERK2kWuA24BXAZOCaiLhNUgPwXWAz8AZgG/C+iPi1pFnA54Ba4BlgUUT8XNJq4I6IWLOfObOAm/IYdwHnR8SMITKeBnwFOJSs2C+MiJ9Iuhi4Ih//IfCxiNgj6QbgjUANsCYi/ne+nRXABcALwF0RcWX+/G4CpgJPA5dExH/kz+FZoAn4XeATEbFmkGyLgcUA06ZNm9Xc3Fz2se/p6WHHrj1Drp9ZP6XsbRWhp6eH2traomMcsGrOX83ZwfmLVmn558yZszkimgZdGRGjugE9+Z+HAEfmy1OB7YCABiCAN+frbgKuJCvCHwDH5ON/CtyUL68GFgwzZytwTr68EnhkPxmvBxbmy4eSldfvAbcDk/PxL5OVL8BR+Z+TgDbg9cDRQCcv/WJQl/95O/D+fPkDQEvJc/gmWam+Dtg+3LGcNWtWjMSGDRviTcvvjhOuumOf25uW3z2ibRVhw4YNRUcYlWrOX83ZI5y/aJWWH9gUQ/xcHcvLlQI+LWkr8H2gHpiWr/tZRGzMl78OnA1MB2YA35O0BbgGOG7ANgedI6kuL5l78nlfGybb/cBfS7oKOCEieoG3ALOAB/NtvwV4TT7/3ZIeAtqB08hKahfwPPAvkuYDv87nngXcUpLj7JL9tkTEixHxaMmxGFNL5k6nZvKkvcZqJk9iydzp47E7M7OqMpbvrlwIHAPMiog+STuBw/J1A6+JBlkpbouIs/azzUHn5CVXtoi4RdIPgbcD35b0kXzb/xoRSwds+0SyM803RsR/55cdD4uIFySdQVaGC4DLgHOH2fXuAc9lzPW/i9LvrjQz29dYltwU4Bd5wc0BTihZd7yksyLifuA9wH1kl/6O6R+XNBk4JSK2lTxuyDmSuiWdHRH3kRXskCS9BngiIq6TdDzZ5ce7gNskfT4ifiHpKOAI4EjgOWCXpGnA+UBb/prjyyPi25I2Ak/km/8BcBHZWdxC4N4DPH4HbF5jvUvNzGwQY1lyNwO3S+oANgE/LlnXCVwq6SbgUeCGiPhN/t8ErpM0Jc/yBbI3pgAwzJxLgJskBVlh7c+7gfdK6gP+E/h0RPyXpGuAuyS9DOgDLo2IByS15/l/BvRfZj2CrBQPIzsr+3g+fjnwFUlLyN94Uv4hMzOz8TTqkouI2vzPZ8henxrMqUM8dgtwziDji8qYsxn4/ZKhT+wn4wpgxSDjtwK37
m//A5wxyNyfMshly4Hb6D9OZmY2cfz/5MzMLFlJfayXpLnAZwYM74iIdxaRx8zMipVUyUVEK9BadA4zM6sMo/7EExs7kp4GfjqCh0wl+xSYauX8xanm7OD8Rau0/CdExDGDrXDJVTFJm2Koj7KpAs5fnGrODs5ftGrK7zeemJlZslxyZmaWLJdcdVtVdIBRcv7iVHN2cP6iVU1+vyZnZmbJ8pmcmZklyyVnZmbJcslVAUnnSeqUtF3S1YOs/x1Jt+brf5h/W3lFKCP7OZIekvRC/mHcFaWM/B+X9KikrZLulnTCYNspShn5PyqpQ9IWSfdJel0ROYcyXP6SeRdKCkkV9bb2Mo7/IklP58d/i6QPFZFzKOUcf0nvzv8NbJN0y2BzCjXUt6n6Vhk3sm8nf5zsC10PBR4GXjdgzseAf8yXLwJuLTr3CLI3kH310VeBBUVnPoD8c8i+ggngzyvl2I8g/5ElyxcA3y0690jy5/OOAO4BHgCais49wuO/CPiHorOOIv/JZF8u/Yr8/iuLzj3w5jO5yncGsD0inoiI3wDNwDsGzHkH8K/58hrgLZLG5UtaR2jY7BGxMyK2Ai8WEXAY5eTfEBH93xL/APt+u32Rysn/bMndw9n3C46LVM7ffYBPkn1m7fMTGa4M5eavVOXk/zDwpYj4b4CI+MUEZxyWS67y1ZN9r12/J/OxQedExAvALuDoCUm3f+Vkr2Qjzf9B4Dvjmmhkysov6VJJjwN/D1wxQdnKMWx+SW8AXh0Rd05ksDKV+/fnwvxy9xpJr56YaGUpJ/8pwCmSNkp6QNJ5E5auTC45szEg6WKgCVhZdJaRiogvRcRJwFXANUXnKVf+ZcefA/6q6CyjcDvQEBGvB77HS1dkqsUhZJcsZwN/Btwoqa7IQAO55CpfF1D6291x+digcyQdAkwBfjkh6favnOyVrKz8kv4YWAZcEBG7JyhbOUZ6/JuBeeMZaISGy38EMANok7QTOBNYV0FvPhn2+EfEL0v+zvwzMGuCspWjnL8/TwLrIqIvInYA/05WehXDJVf5HgROlnSipEPJ3liybsCcdcD78+UFwPrIXwUuWDnZK9mw+SU1Av9EVnCV9npEOflLfyC9HfjJBOYbzn7zR8SuiJgaEQ0R0UD2mugFEbGpmLj7KOf4v6rk7gXAYxOYbzjl/PttITuLQ9JUssuXT0xgxuEV/c4X34a/AW8j+w3pcWBZPvZ/yP5BAxwGfBPYDvwIeE3RmUeQ/Y1kvw0+R3b2ua3ozCPM/33g/wFb8tu6ojOPMP8XgW159g3AaUVnHkn+AXPbqKB3V5Z5/Jfnx//h/PifWnTmEeYX2SXjR4EO4KKiMw+8+WO9zMwsWb5caWZmyXLJmZlZslxyZmaWLJecmZklyyVnZmbJcsmZmVmyXHJmZpas/w8np+WRMaySVgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(fi_df['importance'], fi_df['features'])\n", + "plt.grid()" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "# le.inverse_transform(dt_df['labeled_season'])" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "season_map = {\n", + " 'WINTER': 1,\n", + " 'FALL': 2,\n", + " 'SPRING': 3,\n", + " 'SUMMER': 4\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3h/n7g9d8x521gg4nfvf31y3_840000gn/T/ipykernel_15068/3250286714.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " dt_df['my_season'] = dt_df['season'].map(season_map)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tempseasondays_since_2011windspeedhumcntlabeled_seasonmy_season
08.175849WINTER010.74988280.583398531
19.083466WINTER116.65211369.608780131
21.229108WINTER216.63670343.7273134931
31.400000WINTER310.73983259.0435156231
42.666979WINTER412.52230043.6957160031
...........................
7263.945849WINTER72623.45891165.2917211431
7273.906651WINTER72710.41655759.0000309531
7283.906651WINTER7288.33366175.2917134131
7294.024151WINTER72923.50051848.3333179631
7302.144151WINTER73010.37468257.7500272931
\n", + "

731 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " temp season days_since_2011 windspeed hum cnt \\\n", + "0 8.175849 WINTER 0 10.749882 80.5833 985 \n", + "1 9.083466 WINTER 1 16.652113 69.6087 801 \n", + "2 1.229108 WINTER 2 16.636703 43.7273 1349 \n", + "3 1.400000 WINTER 3 10.739832 59.0435 1562 \n", + "4 2.666979 WINTER 4 12.522300 43.6957 1600 \n", + ".. ... ... ... ... ... ... \n", + "726 3.945849 WINTER 726 23.458911 65.2917 2114 \n", + "727 3.906651 WINTER 727 10.416557 59.0000 3095 \n", + "728 3.906651 WINTER 728 8.333661 75.2917 1341 \n", + "729 4.024151 WINTER 729 23.500518 48.3333 1796 \n", + "730 2.144151 WINTER 730 10.374682 57.7500 2729 \n", + "\n", + " labeled_season my_season \n", + "0 3 1 \n", + "1 3 1 \n", + "2 3 1 \n", + "3 3 1 \n", + "4 3 1 \n", + ".. ... ... \n", + "726 3 1 \n", + "727 3 1 \n", + "728 3 1 \n", + "729 3 1 \n", + "730 3 1 \n", + "\n", + "[731 rows x 8 columns]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dt_df['my_season'] = dt_df['season'].map(season_map)\n", + "dt_df" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "# Splitting the data into train test split\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = dt_df[[col for col in dt_df.columns if col not in ['cnt','season', 'labeled_season']]]\n", + "y = dt_df['cnt']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Features')" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "dt = DecisionTreeRegressor()\n", + "dt.fit(X,y)\n", + "fi_df = pd.DataFrame({'features':dt.feature_names_in_, 'importance': dt.feature_importances_}).sort_values('importance')\n", + "plt.scatter(fi_df['importance'], fi_df['features'])\n", + "plt.grid()\n", + "plt.xlabel('Importance')\n", + "plt.ylabel('Features')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On average, 70% of node purity was explained by (contributed to) 'days_since_2011'." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Partial Dependence Plot\n", + "1. Run a model (preferably Tree model)\n", + "2. from sklearn.inspection import plot_partial_dependence" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RandomForestRegressor(n_estimators=50, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestRegressor(n_estimators=50, random_state=0)" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf = RandomForestRegressor(\n", + " n_estimators=50,\n", + " random_state=0\n", + ")\n", + "\n", + "rf.fit(\n", + " X_train,\n", + " y_train,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.inspection import plot_partial_dependence, PartialDependenceDisplay\n", + "\n", + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "PartialDependenceDisplay.from_estimator(\n", + " estimator=rf,\n", + " X=X_train,\n", + " features=X_train.columns,\n", + " target=y_train,\n", + " ax=ax\n", + ")\n", + "ax.set_title(\"Partial Dependence Plot\")\n", + "plt.tight_layout();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Temperature affects the Bike Count by:\n", + "* positive correlation from 0-15 degrees C\n", + "* levels off from 15-25 degrees C\n", + "* starts a slight dip after 25 degrees C\n", + "\n", + "Days Since 2011\n", + "* Overall positive trend\n", + "* Cyclic\n", + "\n", + "Windspeed\n", + "* As wind speed increases, the bike count decreases\n", + "\n", + "Humidity\n", + "* Similar to windspeed\n", + "* As Humidity increases, the number of bikes rented decreases slightly\n", + "\n", + "Season\n", + "* Seems not to affect the number of rented bikes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ALE Plots" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "from PyALE import ale" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "le = LabelEncoder()\n", + "cleaned_bike_data['mnth'] = le.fit_transform(cleaned_bike_data['mnth'])" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "# Splitting the data into train test split\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = cleaned_bike_data[[col for col in cleaned_bike_data.columns if col not in ['cnt', 'season', 'working_day', 'weathersit']]]\n", + "y = cleaned_bike_data['cnt']\n", + "\n", + "X_train, 
X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "could not convert string to float: 'NO HOLIDAY'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/tqrahman/Desktop/Biased_Outliers/interpretable-ml-book/Python/bike-sharing/bike-sharing-dataset.ipynb Cell 82\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m rf \u001b[39m=\u001b[39m RandomForestRegressor()\n\u001b[0;32m----> 2\u001b[0m rf\u001b[39m.\u001b[39;49mfit(X_train, y_train)\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:331\u001b[0m, in \u001b[0;36mBaseForest.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[39mif\u001b[39;00m issparse(y):\n\u001b[1;32m 330\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39msparse multilabel-indicator for y is not supported.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 331\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[1;32m 332\u001b[0m X, y, multi_output\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, accept_sparse\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcsc\u001b[39;49m\u001b[39m\"\u001b[39;49m, dtype\u001b[39m=\u001b[39;49mDTYPE\n\u001b[1;32m 333\u001b[0m )\n\u001b[1;32m 334\u001b[0m \u001b[39mif\u001b[39;00m sample_weight \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 
335\u001b[0m sample_weight \u001b[39m=\u001b[39m _check_sample_weight(sample_weight, X)\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/base.py:596\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 594\u001b[0m y \u001b[39m=\u001b[39m check_array(y, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params)\n\u001b[1;32m 595\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 596\u001b[0m X, y \u001b[39m=\u001b[39m check_X_y(X, y, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mcheck_params)\n\u001b[1;32m 597\u001b[0m out \u001b[39m=\u001b[39m X, y\n\u001b[1;32m 599\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m no_val_X \u001b[39mand\u001b[39;00m check_params\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mensure_2d\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m):\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/utils/validation.py:1074\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1069\u001b[0m estimator_name \u001b[39m=\u001b[39m _check_estimator_name(estimator)\n\u001b[1;32m 1070\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 1071\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m requires y to be passed, but the target y is None\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 1072\u001b[0m )\n\u001b[0;32m-> 1074\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 1075\u001b[0m 
X,\n\u001b[1;32m 1076\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[1;32m 1077\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49maccept_large_sparse,\n\u001b[1;32m 1078\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[1;32m 1079\u001b[0m order\u001b[39m=\u001b[39;49morder,\n\u001b[1;32m 1080\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[1;32m 1081\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[1;32m 1082\u001b[0m ensure_2d\u001b[39m=\u001b[39;49mensure_2d,\n\u001b[1;32m 1083\u001b[0m allow_nd\u001b[39m=\u001b[39;49mallow_nd,\n\u001b[1;32m 1084\u001b[0m ensure_min_samples\u001b[39m=\u001b[39;49mensure_min_samples,\n\u001b[1;32m 1085\u001b[0m ensure_min_features\u001b[39m=\u001b[39;49mensure_min_features,\n\u001b[1;32m 1086\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[1;32m 1087\u001b[0m input_name\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mX\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 1088\u001b[0m )\n\u001b[1;32m 1090\u001b[0m y \u001b[39m=\u001b[39m _check_y(y, multi_output\u001b[39m=\u001b[39mmulti_output, y_numeric\u001b[39m=\u001b[39my_numeric, estimator\u001b[39m=\u001b[39mestimator)\n\u001b[1;32m 1092\u001b[0m check_consistent_length(X, y)\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/utils/validation.py:856\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 854\u001b[0m array \u001b[39m=\u001b[39m array\u001b[39m.\u001b[39mastype(dtype, casting\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39munsafe\u001b[39m\u001b[39m\"\u001b[39m, copy\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 855\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 856\u001b[0m array \u001b[39m=\u001b[39m 
np\u001b[39m.\u001b[39;49masarray(array, order\u001b[39m=\u001b[39;49morder, dtype\u001b[39m=\u001b[39;49mdtype)\n\u001b[1;32m 857\u001b[0m \u001b[39mexcept\u001b[39;00m ComplexWarning \u001b[39mas\u001b[39;00m complex_warning:\n\u001b[1;32m 858\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 859\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mComplex data not supported\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{}\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(array)\n\u001b[1;32m 860\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39mcomplex_warning\u001b[39;00m\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/generic.py:2064\u001b[0m, in \u001b[0;36mNDFrame.__array__\u001b[0;34m(self, dtype)\u001b[0m\n\u001b[1;32m 2063\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__array__\u001b[39m(\u001b[39mself\u001b[39m, dtype: npt\u001b[39m.\u001b[39mDTypeLike \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m np\u001b[39m.\u001b[39mndarray:\n\u001b[0;32m-> 2064\u001b[0m \u001b[39mreturn\u001b[39;00m np\u001b[39m.\u001b[39;49masarray(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_values, dtype\u001b[39m=\u001b[39;49mdtype)\n", + "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'NO HOLIDAY'" + ] + } + ], + "source": [ + "rf = RandomForestRegressor()\n", + "rf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PyALE._ALE_generic:INFO: Continuous feature detected.\n", + "PyALE._ALE_generic:INFO: Continuous feature detected.\n", + "PyALE._ALE_generic:INFO: Continuous feature detected.\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# fig, [ax1, ax2, ax3] = plt.subplots(nrows=1,ncols=3,figsize=(12,6))\n", + "\n", + "ale_temp = ale(\n", + " X=cleaned_bike_data, \n", + " model=rf, \n", + " feature=['temp'], \n", + " grid_size=50, \n", + " include_CI=False,\n", + " # ax=ax1\n", + ")\n", + "\n", + "ale_hum = ale(\n", + " X=X_train, \n", + " model=rf, \n", + " feature=['hum'], \n", + " grid_size=50, \n", + " include_CI=False,\n", + " # ax=ax2\n", + ")\n", + "\n", + "ale_ws = ale(\n", + " X=X_train, \n", + " model=rf, \n", + " feature=['windspeed'], \n", + " grid_size=50, \n", + " include_CI=False,\n", + " # ax=ax3\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PyALE._ALE_generic:INFO: Discrete feature detected.\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjgAAAEYCAYAAABRMYxdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAABBmUlEQVR4nO3deXxU1f3/8deHsCcsrjiAgAqOVRTrSl0QUaxaFZ3WfRelWrSKWkC0iloXBPXrUuEX0QKKuE5VLIpLVarWBXFfRqmCgqPQyr4HPr8/7g0GzDIJmdzM5P18POaRmXvv3PvOXEI+Ofecc83dEREREcknjaIOICIiIlLbVOCIiIhI3lGBIyIiInlHBY6IiIjkHRU4IiIikndU4IiIiEjeUYEjkoPM7BUzOzfqHGWZ2VIz2z7qHA2FmY0xsz9v4j56m9mc2sokUp+owJG8ZmYXmtl0M1tlZuM2WtfbzNaFv5iXmtkcM3vUzPbOYL9F4XueLWfdLDM7tJzlGx+v9PGrCo4xy8xWhNv8YGbjzKyoGt8+ZtbFzNzMGlfnfVXkLvdzcvcid/+qpsepQa5xZvaXGrxvqpkdFj7f0cweM7P/mtkiM/vQzC41s4JsHb+2uPv57n59VMcXqe9U4Ei++w74C3B/RevdvQhoBfQEPgf+ZWaHVLHf3wKrgL5mtk118oSFQNnHvyvZ/ugw3x7AXsBV1ThWbarp51QjFqj1/5/MrJDgc3zVzHYA3gK+BXZ19zbA8eH6VrV97NqUSQEm0tCpwJG85u5Jd38S+F8V27m7z3H3q4GxwIgqdn0mMAb4EDitNrJWkW8u8CzQfeN1ZtbIzK4ys9lmNs/MJphZm3D1tPDrwspai6qRo8LPKWwp6ho+P9LMPjWzJWY218wuL7NdPzN738wWm9l/zOzwcPkrZnaDmb0OLAe2N7OdzOwFM/vRzFJmdkK47QDgVGBw+H1NDpe3N7MnzGy+mX1tZn/c6Fs4BHjd3VcB1wJvuPul7p4Ov7+Uu5/i7gvD/T1mZt
+HrTvTzGyXmh7fzFqY2XgzW2Bmn5nZ4LKXh8zsF+FnsNDMPjGzY8qsG2dmo81sipktAw7euAWpks/17PB4S8zsKzP7ffXPvEjuUYEj8nNJYI/wr/2fMbPOQG9gYvg4I9uBzGxb4EjgvXJWnxU+Dga2B4qAu8N1vcKvbTNoLaquyj6n+4Dfu3srgqLsnwBmtg8wAfgT0DbMN6vM+04HBhC0oMwHXgAeArYGTgLuMbOd3b2Y4LO/Jfy+jg5bfCYDHwAdCIqZS8zs12X2fyTwj/D5ocDjVXyPzwLdwuPPCI9JDY9/DdCF4Bz1pUxhbGZNwvc+Hx7rImCimcXLZDkFuCH8bF4rG7KKz3UecBTQGjgbuN3M9qji+xbJeSpwRH7uO8AIflGU53TgQ3f/FHgY2MXMfpnhvtuHf6GXfZRbSIWeNLOFBL/QXgVuLGebU4Hb3P0rd18KXAGcZJvQ7yZDlX1Oa4Cdzay1uy9w9xnh8v7A/e7+gruvc/e57v55mfeNc/dP3L0EOByY5e5/c/cSd38PeILgMlJ59ga2cvfr3H112B/oXoLCqNSRwJTw+RZAurJv0N3vd/clYYvPcKBHmdax6h7/BODG8POYA9xZ5r09CQrTm8P3/hN4Bji5zDZPufvr4ee2cqNjV/i5uvs/3P0/YevbqwRF1IGVfd8i+UAFjsjPdQAcWFjB+jP46S/5uQSFx5kZ7vs7d2+70WNZJdsfG27T2d3/4O4rytmmPTC7zOvZQGOgXSaBbMMOz50y/D6g8s/ptwTFxGwze7XMpbFtgf9Uss9vyzzvDOxbthgkKOYq6vPUmY0KSGAY4edgZrsCi9y99Bj/A2IVBTGzAjO7Obzcs5ifWkS2rMnxCc5T2e+v7PP2wLfuvq7MstkEn3F522+sws/VzI4wszfDy3wLCc5LRd+DSN7I9l94IrnoOGBGeYWHme1HcMniCjO7LFzcCuhuZpeHLQ917TuCX66lOgElwA9s+AuyXGHn4Zqo8HNy93eAfuGllwuBRwl+CX8L7FBZnDLPvwVedfe+GWxbuv3X7t6tgu3Ltt4AvEhQiP2tgu1PAfoRXMqaBbQBFhC0WtXk+GmgI/Bp+HrbMuu+A7Y1s0ZlipxOwBdlttn4eBsf+2efq5k1I2j1OoOgBWiNmT1Z5nsQyVtqwZG8ZmaNzaw5UAAUmFnz8i7dWKCDmV0DnEvwl3d5ziToF7IzsHv46A60AI4os12T8FjNKzpmLZoEDDKz7SwYRn4j8EhYbM0H1hH0+9hkmXxOZtbUzE41szbuvgZYHGaAoG/O2WZ2iAWdozuY2U4VHO4ZYEczO93MmoSPvc3sF+H6Hzb6vt4GlpjZkLBDb4GZdbefhrOX7X8DQZ+Y/cxspIUj4cysq5k9aGZtCQrXVQQtPS35+eXB6h7/UYLCeDMz60BQ+JV6i6Bj9eDw++wNHE1wCTQTFX2uTYFmBP8OSszsCOCwDPcpktNU4Ei+uwpYAQwl6NS5gg2HWrc3s6XAUuAdYFegt7s/v/GOwkLpBOAud/++zONr4AE2vEw1JTxW6WN42eNt9PjtJn6P94fHnwZ8Dawk6KSKuy8n6Jj6enjZpGcNj5Hx5xQ6HZgVXto5n+DSEu7+NmFHV2ARweW9zuXtwN2XEPwyPomgheN7glFbzcJN7iPo57PQzJ5097UEnWl3Dz+H/xKM9GoTFiw7A2+U2f9/gF8RdPz9xMwWEbR2TAeWEHTanQ3MJWh1eXOjiBkfP9z+OmBOuO5Fgg7Oq8IsqwkKmiPC990DnLFR/6QKVfS5hp/hHwmKqwUErVJPZ7JPkVxn7pW1eoqI5D4Lhpf/zt1PiDpLKTO7ADjJ3Q+KOotIPlILjog0BAsJWjciY2YxM9s/vIQUBy4D/h5lJpF8phYcEZE6YMH8Sf8AtiMouB4GrggvT4lILVOBIyIiInlHl6hEREQk7+TFPDiNGjXyFi1aRB1DREQk7yxfvtzdPecaRPKiwGnRogXLllU2Ga
yIiIjUhJmVN4N6vZdzFZmIiIhIVVTgiIiISN5RgSMiIiJ5RwWOiIiI5B0VOCIiIpJ3VOCI1BMTJ06kS5cuNGrUiC5dujBx4sSoI4mI5Ky8GCYukusmTpzIgAEDWL58OQCzZ89mwIABAJx66qlRRhMRyUl5cauGwsJC1zw4ksu6dOnC7Nmzf7a8c+fOzJo1q+4DiYiEzGy5uxdGnaO6VOCI1ANmVuG6fPgZFZHclasFji5RidQDnTt3LrcFp02bNixdupSioqIIUtUPPyxeGXWEBqFd6+ZRRxCpVepkLFIPnHfeeT9bVlBQwKJFi+jatStjxoyhpKQkgmQiIrlJBY5IxNydF198kcLCQjp27IiZ0blzZ8aPH8+bb75Jt27dGDZsGEuWLIk6qohIztAlKpGIPfXUU7zyyivcfffdDBw48Gfrp02bxqxZs9hss81Yu3YtF1xwAeeeey777LNPBGlFRHKDOhmLRMjd2W233Vi7di0ffvghjRtX/jfHF198wYEHHsi8efM48cQTufHGG9l+++3rKG001AenbqgPjlQkVzsZ6xKVSITMjMmTJzNhwoQqixuAHXfckZkzZ3L11VczefJkdtppJwYNGoQKfBGRDakFRyQia9asoUmTJjV+/3fffcfw4cN57bXX+OCDD2jSpAnuXumQ81ykFpy6oRYcqYhacESkWgYNGsRRRx3F2rVra/T+9u3bU1xczIwZM2jSpAlLly6lZ8+ePPDAA6xbt66W04qI5BZ1Mpa8Vl//+v/iiy8ofuBRTj/9NP67bA2wZpP2t2j1Sr7+eg6rC1pw9h8uYdRf7+XPf/4zBx54YO0EroL++pfK1Nefw3yjn8MNqQVHJALXXXcdhYUtufzyy2ttn9tt14UpU6Zw9913s2jhIk466SROPfU09c8RkQYpsgLHzLY1s5fN7FMz+8TMLg6Xb25mL5jZl+HXzaLKKJINr7zyCi+//DKXXDKIzTffvFb33ahRI4477jhenTaNP//5alq2bEHLli0BWLFiRa0eS0SkPousk7GZxYCYu88ws1bAu8CxwFnAj+5+s5kNBTZz9yGV7UudjKUi9bFp/Pjjj2fu3Lm8/PIrNGvWtE6O+d1339G3b19OP/10Bg4cSKtWrWp1/9lsGq+P5zAf6RzmvmydQ3UyriZ3T7v7jPD5EuAzoAPQDxgfbjaeoOgRyRv3338/9913X50VNwBmjejduzd33XUX+++3P+PGjWPNmk3r9yMiUp/Vi2HiZtYFmAZ0B75x97bhcgMWlL7e6D0DgAEATZs23XPVqlV1FVdySH36y3HFihU0adIko/lusuWDDz7g+uuv59///jddu3Zl6tSpNG++6X/16a//3KdzmPuiasFJptL3A0cB8xLxWPeN1l0GjAK2SsRj/02m0gbcARwJLAfOSsRjM7KRO/JOxmZWBDwBXOLui8uu86D6KrcCc/did9/L3feK8heGSKZGjRrFYYcdxsqV0f1n36NHDx577DHGj5/Asccet764+eqrryPLJCI5bxxw+MYLk6n0tsBhwDdlFh8BdAsfA4DR2QoVaYFjZk0IipuJ7p4MF/8Q9s8p7aczL6p8IrVl1qxZjB07lt13371WWkw2hZlx6KGHMGjQJUDQqtOr14Gcd955KnREpNoS8dg04MdyVt0ODGbDhop+wIREPOaJeOxNoG0ylY5lI1dkTR/h5af7gM/c/bYyq54GzgRuDr8+FUE8QM2qdaUhzN3wl7/8haZNmzJkSKX95SPRtWtXLrv0Mu4ZfQ9Tp07ljDPOYNCgQWyxxRZRRxOR+qGxmU0v87rY3Ysre0Myle4HzE3EYx8kU+myqzoA35Z5PSdctsFGtSHKFpz9gdOBPmb2fvg4kqCw6WtmXwKHhq9Fcta///1vnn32WQYOHEi7du2ijvMzhYWFDLp0EK+//jqnnHIK48dP4LDDDmP16tVRRxOR+qGktEtI+KiquGkJDAOurpt45YusBcfdXwMqumnOIX
WZRSSbJk58iPbt23P++edHHaVSW2+9NTfffDP9+/fn889TNG3aFHdn6tSp9O3bl4KCgqgjikhu2AHYDihtvekIzEim0vsAc4Fty2zbMVxW69Q7VyTL/u//bufbb+dE3vcmU926daNbt25AMClh//79+cUvfsFVV11F7969ow0nIvVeIh77CNi69HUylZ4F7BWOonoauDCZSj8M7AssSsRjtX55CurBKCqRfLV8+XIWLVpE48aN2W67LlHHqZHevXszevQYli1bxqmnnsrJJ5/MJ598EnUsEalHkqn0JODfQDyZSs9JptL9K9l8CvAVMBO4F/hDtnLVi3lwNlW2ZjJWJ+O6ka/zb9xyyy08+MCDvDrtVTbbLLfvOLJq1WoeeGACt99+O0VFrXj99dc2mM8nX89hQ6JzmPs0k/GG1IIjkgVz585lzJgxHHDgATlf3AA0a9aUc889lzfeeIOxY++lcePGrFy5ijvuuIPFixdXvQMRkTqmAkckC2688SYAhg0bFnGS2tWmTRt23XVXAF577TVGjhzJfvvtxx133KFRVyJSr2RU4JjZZma2i5ltb2YqikQqMWPGDJ588u8MGDCAjh07Rh0naw499BCee+45unfvziWXXMLOO+/MY489Rj5c9haR3FdhsWJmbcxsmJl9BLwJ/D/gUWC2mT1mZgfXVUiRXDJ16vNstdVWXHjhhVFHybru3bszadIknnvuOVq2bMmtt94adSQREaCSTsZm9gIwAZjs7gs3WrcnwSR9H7n7fdkOWRV1Ms5t+di5cf78+Wy11VaRHDsK7Vo3Z+3atcyfP59tttmGefPm8cc//pHhw4ez0047bdK+9XNYN/Lx57ChUSfjDVXYguPufd39gY2Lm3Ddu+5+SX0obkTqi5UrVzJ79myABlXclCooKGCbbbYB4L333mPKlCl0796dCy64gO+//z7idCLS0FR2iWqPyh51GVIkFxQXF3PQQQfx7bffVr1xnvv1r3/NzJkzOf/88xk7dixdu3bl+uuvV/8cEakzlc1kXHoxvTmwF/ABwa0VdgOmA7/KbjSR3DFv3jzuuutuDulzCNtuu23Vb2gAtt56a+6++27++Mc/csUVV/Dpp58S3GMX3H39cxGRbKjsEtXB7n4wwR0+9whvsLUn8EuydN8IkVx1yy23sHr1Kq7685+jjlLv7LjjjjzxxBNMmDABgI8//pgePXrwzDPPqEVHRLImkyHfcXf/qPSFu38M/CJ7kURyy8cff8zDDz9M//79c/aWDHWhSZMmACxcuJAVK1Zw9NFH06dPH6ZPnx5xMhHJR5kUOB+a2Vgz6x0+7gU+zHYwkVwxffq7bLnlllx88cVRR8kJBxxwAJ9++il33XUXH3/8MXvvvTf9+/dXa46I1Koq70VlZs2BC4Be4aJpwGh3rzfj/jRMPLflw/DU5cuX07Jlyzo5Vn1U03O4aNEibrnlFpo1a8bVV1+Nu7NkyRJat269fhv9HNaNfPg5bOg0THxDGd1s08xaAJ3cPZX9SNWnAie35ep/rKtXr+bDDz9ir732zNoxckVtncNnn32WU089lSuvvJILL7yQZs2a6eewjuTqz6H8RAXOhqq8RGVmxwDvA8+Fr3c3s6eznEuk3vvb3/5Gv37H8OGHumJbWzp16sQ+++zD5Zdfzk477cRDDz3EunXroo4lIjkokz441wD7AAsB3P19YLvsRRKp/3788Uduv/12DjqoN7vttlvUcfLGLrvswnPPPcfzzz9P27ZtOfXUUzn77LOjjiUiOaiyeXBKrXH3RRvNWaHegNKg3XrrrSxbtpxrrrk66ih5qW/fvrz77rtMnDiRFTQFYM2aNcyaNYtu3bpFnE5EckEmLTifmNkpQIGZdTOzu4A3spxLpN764osvmDDhAU477VTi8XjUcfJWo0aNOP300+nXrx8AkyZN4uCD+/CnPw3mhx9+iDidiNR3mRQ4FwG7AKuAh4BFwCVZzCRSr3311Ve0j8W4/P
LLo47SoBx99NH0738Ojz76CPvvvz+jRo1i6dKlUccSkXqq0lFUZlYAvBjOaFxvaRRVbsvF0RslJSU0bpzJFd6GoS7P4axZsxgxYgRPP/00vXr1YtKkSVk7dkOSiz+HsiGNotpQpf9Du/taM1tnZm3cfVFdhRKpj0pKSpg6dSpHHHGEipsIdenShdGjR3PeeedR+vfZwoWLePvtt+jbt6/ucSVSx5Kp9P3AUcC8RDzWPVw2EjgaWA38Bzg7EY8tDNddAfQH1gJ/TMRjU7ORK5NLVEuBj8zsPjO7s/RRGwc3s/vNbJ6ZfVxm2eZm9oKZfRl+3aw2jiWyqSZOfIgBAwbwr3/9K+ooAuyxxx7sueceADzwwATOPvtsfvvb3/Hee+9FnEykwRkHHL7RsheA7ol4bDfgC+AKgGQqvTNwEkHXl8OBe5KpdEE2QmVS4CSBPxPMYPxumUdtGMfPP5ShwEvu3g14KXwtEqlFixYxatRIevbsSa9evap+g9SpCy64gJtuuomZM2dy1FFHcf755zNr1qyoY4k0CIl4bBrw40bLnk/EYyXhyzeBjuHzfsDDiXhsVSIe+xqYSTAVTa2rsp3d3cdnayZjd59mZl02WtwP6B0+Hw+8AgypzeOKVNedd97JggULuPbaa3UJpB5q3LgxZ5xxBolEgjFjxjBmzBjWrF7DffffF3U0kXzQ2MzK3hW32N2Lq/H+c4BHwucdCAqeUnPCZbWuygLHzI4GRgFNge3MbHfgOnc/JhuBgHbung6ffw+0qyDXAGAAQNOmTbMURSTo1Dp27FhOOOEEunfvHnUcqURRURGXX345p59+OmvWrAHg669n8eyzUzjnnP40b94s4oQiOanE3feqyRuTqfSVQAkwsXYjVS2TS1TD+flMxttnLVEZHgzxKneYl7sXu/te7r6XOnxKNv33v/9jxx13ZMgQNSTminbt2tGxY9AiPmXKP7jhhhvo1etAHn/8cd36QaSOJFPpswg6H5+aiMdKf5fPBbYts1nHcFmty6TAWVPOCKps/g/xg5nFAMKv87J4LJEq7bXXnjz//PO0a1duY6LUcwMHDuTRRx9l88035+KLL+aII47gtddejzqWSF5LptKHA4OBYxLx2PIyq54GTkqm0s2SqfR2QDfg7WxkqI8zGT8NnBk+PxN4KovHEqnQunXrGDduPCtWrFC/mxy3//77M2XKFO6++24WLljI1KnPRR1JJG8kU+lJwL+BeDKVnpNMpfsDdwOtgBeSqfT7yVR6DEAiHvsEeBT4lOAm3gMT8djabOSqdKI/ADNrCVwJHBYumgpc7+6rNvngZpMIOhRvCfxAcGPPJwm++U7AbOAEd/+xgl0Amugv19XXCcYeeeRRLr10EKNHj+GYY46uxVT5p76ew/KsXLmKNWtW06pVK958800eeeQRBv9pMLH2sVo9Tq7JpXMo5dNEfxvKpMA53t0fq2pZlFTg5Lb6+B/rsmXLOOCAA+jQoQOTJ09WC04V6uM5zMS4ceMZPvwaCgoKOO+88xg4cCCtWrXK2vHqs1w9h/ITFTgbyuQS1RUZLhPJG3/961+ZN28ew4drWHg+O+usM5k2bRqHH344d911F/vttx+PPPJI1W8UkXqvwuFHZnYEcCTQYaOZi1sTDPkSyUtz585lzJgx9OvXj7322jPqOJJlnTp14q9//SsDBgzg+uuvZ9GiYEzFunXrMDMVuCI5qrIWnO+A6cBKNpzB+Gng19mPJhKNVatW0XPfngwbNizqKFKHevTowWOPPUb//v0BeOKJJzj22GOZPn16Fe+UfJJMJtl3n33p2LEj++6zL8lkMupIUkMVtuC4+wfAB2b2kLuvqcNMIpHafvvteWjSQ1HHkAiYGQUFwW1xmjVrzjfffEO/fv044ogjGDbsSrbffruIE0o2JZNJBg8ezIoVKwCYM3cOgwcPBiCRSEQZTWogk07G+xNM9teZoCAygjn46mSyv0yok3Fuqy+dG92dO+64g5
NOOpltttGcN9VRX85hbVu+fDn/b8z/457R97Bq1SqGDBnCwIEDI8uTTfl2DktKSli2bBnLly2n7WZtadGiBfPnz+f9995n6bKlLFu2bP3jxBNPpEOHDvTosTv//e/8n+2rY4eOvPX2W3X+PVSXOhlvKJMpgO8DBhFcnsrKWHWR+mDy5MmMHDmSdu3acfLJJ0cdR+qBli1bMujSQZx2+mnceuutdO3aFWD9X/gtWrSIMl7eWbNmDd9+O4fly5exdOlPRUj37t3ZfvvtSae/529/u59ly35av3zZci74wx848MADeOedd+h/Tn+WLlvKqlU/zWQybtx4+vY9lPffe5+zzj7rZ8ft2bMnHTp0KLe4gaAlR3JPJgXOInd/NutJRCK0cuVKbvjLDeyyyy6ccMIJUceRemarrbbi5ptvXv969OjRTJw4kcGDB/O73/1u/WWthsDdWblyJUuXLmXp0qUsWbKE1q1b06VLF9auXcuDDz7IkiVL1q9funQpv9yvN4cffjiLFi3irLPOZlnYgrJ06TKWL1/GoEGD+MMf/sB3333HgQce8LNjXn/99Wy//fYsWrSQ4uJiioqKKGxZSMvClhQWFrJmzWoAttxyK35z1G8oLCyksLCIwnD9L36xEwB777MPzz77LIWFhbRsWUhRUSEtW7Zcf/46duhYbjHTsUPHny2T+i+TAudlMxsJJIH1JbG7z8haKpE6VlxczJy5c7jt9tsa1C8rqZkDDjiAl156iUsvvZTi4mKuuuoqevfuzd///ndG3DyCud/NpUP7DgwZOiTyvhtr1qxh9erVFBYGVxg++eQTfvxxwfoiY9myZbRr144zju8HwGWXXUY6nV5fvCxdupQ+ffowYsQIANq0acOSJUs2OMbvf/97xowZg5lx1llnrV/eqFEjWrVqRdM2W3H44YfTtGlTCgoaEYvFKCoqWl9k7LrrbgBsvfXW3HXXXesLkMLClhQVFa2/TUo8HmfWrFkVfq/bbdeFm266qcL1bdu2oW3b3SpcP2TokA364EDQSjdkqO5Dl4sy6YPzcjmL3d37ZCdS9akPTm6L+tr/vHnz2H//A+h14IHcd/99WcuSz6I+h1FwdyZPfoabbrqRb775hr59+/Laa6/97JfjLbfcUq0iZ/ny5SxevKRMK8dS3J39998fgGeeeYYvv5y5wfotttiC4cOHA3DBBRfw7rvvri9e1qxZw957782TTz4JQO/eB/Pll19scMxevXrxz3/8HYB99tmHBQsW0KpVK4qKiigqKqJPnz5cfvnlANxwww0UFBRQVFS0fptu3bqx225B4fD111+vf1/z5s0xs3p7DsuTTCbrXZGaKfXB2VCVBU4uUIGT26L+5Th//nxuuukmLrroj2y3XZesZclnUZ/DKK1atZoHHpjA3Xffzfz5P+/D0bp1a84880yGDh0KBP1Bnn/++Q36mTRq1IjXXw9uAHr++eczefLkDfbRrl07ZswIGs3POOMMXnrpJZo1axZeiilkxx13ZMKECQCMGjWKOXPmrF9XWFhIp06dOO644wB45513WLt27QYtKEVFRWy3zeZZ+4zq+znMFypwNpRJC0474EagvbsfYWY7A79y93rzp64KnNzWkH855gudQ+jQoUOF6woKCvjyy5k0a9aUO++8k6lTpwb9SMJLMa1bt+LGG28E4JVXXllfoBQVFtGysCWtW7dm1113BYIOzk2aNKFx40x6GGRO5zD3qcDZUCYFzrPA34Ar3b2HmTUG3nP3XesiYCZU4OS2qP5jdXduuOEGjj32WLp37561DA2BfjnCvvvsW24H1Q4dOvD2229HkKh6dA5znwqcDWVyL6ot3f1RYB2Au5eg4eKSB55//nlGjx6tmWqlVgwZOuRnw8ZbtGix/tKUiNStTNo4l5nZFoADmFlPYFFWU4lk2erVq7nuuuvo2rUbp512WtRxJA+UdkTN1Q6qIvkmkwLnUoL7T+1gZq8DWwG/y2oqkSwbN24cs2
bN4oEHHqj1vgzScCUSCRU0IvVElf+zu/sMMzsIiBPcpiGle1NJLluwYAG33347Bx3Umz596s1sByIiUouqLHDMbCAw0d0/CV9vZmYnu/s9WU8nkgXNmzfnvPPO4ze/+U3UUUREJEsyGUX1vrvvvtGy99z9l9kMVh0aRZXbNHoj9+kc5j6dw9wX1SiqZCp9P3AUMC8Rj3UPl20OPAJ0AWYBJyTisQXJVNqAO4AjgeXAWYl4LCt3RshkFFWBmVnpCzMrAJpmI4xItg0bNowXX3wp6hgiIvlkHHD4RsuGAi8l4rFuwEvha4AjgG7hYwAwOluhMilwngMeMbNDzOwQYFK4TCSnvPLKK4wfP54vv/wy6igiInkjEY9NA37caHE/YHz4fDxwbJnlExLxmCfisTeBtslUOpaNXJkMHxlCUGVdEL5+ARibjTAi2VJSUsLw4dfSuXNnzjnnnKjjiIjkksZmVnbCsGJ3L67iPe0S8Vg6fP490C583gH4tsx2c8JlaWpZJqOo1gFjwodITpo48SG+/PIL7r33Xpo10xVWEZFqKHH3vWr65kQ85slUus5vfFnhJSozm2xmR5tZk3LWbW9m15mZ/hSWem/x4sWMGjWSnj17csQRR0QdR0SkIfih9NJT+HVeuHwusG2Z7TqGy2pdZS045xFM8vd/ZvYjMB9oDmwHzATudvenshEKwMwOJ+hpXQCMdfebs3UsyW9FRUVcffXV/OIXv6BMf3kREcmep4EzgZvDr0+VWX5hMpV+GNgXWFTmUlatqnKYOICZdQFiwArgC3dfno0wZY5XAHwB9CW4PvcOcLK7f1re9homnts0PDX36RzmPp3D3BfhMPFJQG9gS+AH4BrgSeBRoBMwm2CY+I/hMPG7CUZdLQfOTsRjWbkhYEYFTl0zs18Bw9391+HrKwDc/abytleBk9uy+R/r6QMuYtddd+XMM8/I2jFEvxzzgc5h7tPdxDeUyTDxKFTUy1okY9OmTWPSpIf48ceNRy+KiEi+y9m7DJrZAILh6zRtqlExsqF169YxaNAg2rdvz/nn/z7qOCIiUsfqa4FTZS/rcAx+MQSXqOoumuSCCRMmMGPGDEaPe4gWLVpEHUdEROpYlZeozGx/M3vBzL4ws6/M7Gsz+yrLud4BupnZdmbWFDiJoOe1SJVWrVrFlVdeyb777suxxx4bdRwREYlAJi049wGDgHeBtdmNE3D3EjO7EJhKMEz8/tK7mYtUpVmzZkyaNImioiINCxcRaaAyKXAWufuzWU+yEXefAkyp6+NKbnN3zIxevXoBGr0hItJQZVLgvGxmI4EksKp0obtn5fbmIpvinHPOYeutt2bEiBFRRxERkQhlUuDsG34tex8KB/rUfhyRmnv77bcZN24cw4YNizqKiIhELJObbR5cF0FENoW7M2jQILbZZhuGDh0adRwREYlYlQWOmbUhmHa5V7joVeA6d1+UzWAi1fHoo4/yxhtvMHbsWFq1ahV1HBERiVgmMxnfDywBTggfi4G/ZTOUSHWsW7eOq6++mt13352zzjor6jgiIlIPZNIHZwd3/22Z19ea2ftZyiNSbY0aNWLKlCksXryYgoKCqOOIiEg9kEmBs8LMDnD31yCY+I/gruIikVuzZg1NmjRhhx12iDqKiIjUI5kUOBcA48O+OAb8CJyVzVAimfrDH/7Ajz/+yOOPP65J/UREZL0q++C4+/vu3gPYDdjV3X/p7h9kP5pI5T744APuu+8+OnfurOJGREQ2UGELjpmd5u4PmtmlGy0HwN1vy3I2kQq5O5deeimbb745f/7zn6OOIyIiWZBMpdsBNwLtE/HYEclUemfgV4l47L6q3ltZC05h+LVVOY+iTYsssmkmT57MP//5T4YPH85mm20WdRwREcmOcQT3pWwfvv4CuCSTN1bYguPu/y98+qK7v152XdjRWCQyI0aMYKedduL3v/991FFERCR7tkzEY48mU+krABLxWEkylc7oxt+ZdDK+C9
gjg2UideYf//gHc+bMoUmTJlFHERGR7FmWTKW3ILhFFMlUuieQ0UTDlfXB+RWwH7DVRv1wWgOabEQisWzZMpo3b07btm1p27Zt1HFERBq8ZCo9CDiXoAj5CDgbiAEPA1sA7wKnJ+Kx1TXY/WXA08AOyVT6dWAr4PhM3lhZH5ymBH1tGrNh/5vFwO9qEFJkkw0dOpS9996b1atr8nMiIiK1KZlKdwD+COyViMe6EzSAnASMAG5PxGNdgQVA/5rsPxGPvQscRNDg8ntgl0Q8ltFI7sr64LwKvGpm49x9dk2CidSmzz77jNGjRzNgwACaNm0adRwREQk0BlokU+k1QEsgDfQBTgnXjweGA6Oru+NkKv0fYGQiHhtTZtkziXjsqKrem8m9qMaaWdvSF2a2mZlNrW5IkU11+eWXU1RUxLXXXht1FBGRhqSxmU0v8xhQuiIRj80FRgHfEBQ2iwguSS1MxGMl4WZzgA41PPYa4OBkKv23ZCpd+pdtRvvKpMDZ0t0Xlr5w9wXA1tWOKLIJnn/+eaZMmcJVV13FVlttFXUcEZGGpMTd9yrzKC5dkUylNwP6AdsRDOUuBA6vxWMvT8RjJwKfAf9KptKdCDscVyWTAmedmXUqfWFmnTPduUhtKS4uZocdduCiiy6KOoqIiPzkUODrRDw2PxGPrQGSwP5A22QqXdoNpiMwt4b7N4BEPHYLcCXwfLi/KmUyTPxK4DUzezU80IHAgMrfIlK7Jk2axDfffEOzZs2ijiIiIj/5BuiZTKVbEtyI+xBgOvAywYCkh4EzgadquP+rS58k4rEXk6n0r8P9VanKAsfdnzOzPYCe4aJL3P2/NYopUk1LliwBoFWrVrpjuIhIPZOIx95KptKPAzOAEuA9oBj4B/BwMpX+S7isylsrlJVMpXdKxGOfA3OTqfTG8+49k8k+KpsHZyd3/zwsbgC+C792MrNO7j6jOmFFauL666/nwQcf5PPPP6d169ZRxxERkY0k4rFrgGs2WvwVsM8m7PZSgqtFt4avN+4a06eqHVTWgnMZcF6ZnZflmexcZFN89dVX3HHHHZxyyikqbkREGpaxyVR6m0Q8djBAMpU+E/gtMItgyHmVKpsH57zw68GbHFOkBgYPHkyTJk244YYboo4iIiJ1awxBB2aSqXQv4CbgImB3gktgVU44XNklqkRlb3T3ZDWCbrzv4wkqsF8A+7j79DLrriCY8XAt8Ed315w7DdC0adN44oknuO6662jfvn3VbxARkXxSkIjHfgyfnwgUJ+KxJ4Ankqn0+5nsoLJLVEeHX7cmmCL5n+Hrg4E3CIaC1dTHQAL4f2UXmtnOBFM870Iwnv5FM9vR3TO6c6jkj7///e907NiRyy67LOooIiJS9wqSqXTjcLLAQ9hw9HYmI8ArngfH3c9297OBJsDO7v5bd/8tQfGxSbdwdvfP3D1Vzqp+wMPuvsrdvwZmsmmdlCRH3XbbbUyfPp2WLVtGHUVEROreJODVZCr9FMHw838BJFPprmR4N/FMJvrb1t3TZV7/AHSqaONN1AH4tszrCqd3NrMBpdNGl5SUlLeJ5KClS5cye/ZszIx27dpFHUdERCKQiMduIBjsNA44IBGPlY6iakTQF6dKmTTzvBTee2pS+PpE4MWq3mRmLwLblLPqSnev6YQ/64VTRRcDFBYWamblPHHLLbcwcuRIZs6cSYcONb11iYiI5LpEPPZmOcu+yPT9mUz0d6GZHQf0ChcVu/vfM3jfoZmGKGMusG2Z15syvbPkmG+//ZZRo0bRr18/FTciIrJJMuqoQzBD4RJ3f9HMWppZK3dfkoU8TwMPmdltBJ2MuwFvZ+E4Ug8NGzaMdevWMWLEiKijiIhIjquyD46ZnQc8zk8jnjoAT27KQc3sODObA/wK+Ed4CQx3/wR4FPgUeA4YqBFUDcPbb7/Ngw8+yKWXXkrnzp2jjiMiIjkukxacgQQjmd4CcPcvzWzrTTloeImr3Mtc7n4DoJndGp
g33niD9u3bc8UVV0QdRURE8kAmo6hWufvq0hdm1pif3xNCZJNccsklpFIpWrVqFXUUERHJA5kUOK+a2TCghZn1BR4DJmc3ljQUK1eu4q233gKgqKgo4jQiIpIvMilwhgDzgY+A3wNTgKuyGUoajnvvLaZnz5588sknUUcREZE8UmkfHDMrAD5x952Ae+smkjQU8+bN48477+KYY45hl112iTqOiIjkkUpbcMIRTCkzy9bMxdKA3XLLSFavXsWoUaOijiIiInkmk1FUmwGfmNnbwLLShe5+TNZSSd775JNPePjhSZx77rl069Yt6jgiIpJnMilw/pz1FNLg/Oc/X9GxY0cuuWRQ1FFERCQPmXvVI77NbBuCuXAceMfdv892sOooLCz0ZcuWVb1hNf2weGWt71N+UlJSQuPGjWnXunnWjqFzWDd0DnOfzmHuy9Y5NLPl7l5Y0fpkKt0WGAt0J6gTzgFSwCNAF2AWcEIiHluQlYAVyGQm43MJbpeQAH4HvGlm52Q7mOSn1atX8+yzz+LuNG6c6Z1CRESkHrsDeC4Rj+0E9AA+A4YCLyXisW7AS+HrOpXJMPE/Ab9097Pc/UxgT4Kh4yLVNm7cOM4999z1c9+IiEjuSqbSbQhuxn0fQCIeW52IxxYC/YDx4WbjgWPrOlsmf0L/Dyh7Y80l4TKRalmwYAG33347Bx3Um549e0YdR0REMtPYzKaXeV3s7sXh8+0I5sr7WzKV7gG8C1wMtEvEY+lwm++BdnWWNpRJC85M4C0zG25m1wBvAl+Y2aVmdml240k+ufXWW1m6dBnXXHN11FFERCRzJe6+V5lHcZl1jYE9gNGJeOyXBKOtN7gclYjHnAhu8ZRJgfMfgruHl4Z7CvgaaBU+RKr05ZdfMn78BE477VTi8XjUcUREpHbMAeYk4rHSfgePExQ8PyRT6RhA+HVeXQer8hKVu19bF0Ekv/3vf/9jp53iXH755VFHERGRWpKIx75PptLfJlPpeCIeSwGHAJ+GjzOBm8OvT9V1toyGidd3GiaeG9wdM/vZcg1PzX06h7lP5zD3RThMfHeCYeJNga+AswmuED0KdAJmEwwT/zErASugcbqSVSUlJUyc+BAnnngizZs3izqOiIjUskQ89j6wVzmrDqnjKBvIpA+OSI099NBDDBt2Ba+++krUUUREpAGpsgXHzLYCziOYjXD99u6uyf6kUosXL2bUyFHsu29PDjvssKjjiIhIA5LJJaqngH8BLwJrsxtH8sldd93Fjwt+ZPjwa8rteyMiIpItmRQ4Ld1dMxdLtcyePZt7772X3/3ud+y2225RxxERkQYmkz44z5jZkVlPInll1apV9OzZk6FDr4g6ioiINEBVDhM3syVAIbAaWBMudndvneVsGdMw8dym4am5T+cw9+kc5r6ohonXV1W24Lh7K3dv5O7Nw+et6lNxI/XLunXruP32/2P+/PlRRxERkQYso2HiZnaMmY0KH0dt6kHNbKSZfW5mH5rZ382sbZl1V5jZTDNLmdmvN/VYUrcef/wJRo0ayb/+9a+oo4iISANWZYFjZjcT3Bm0dOrli83spk087gtAd3ffDfgCuCI81s7AScAuwOHAPWZWsInHkjqybNkybr75JnbffXeOPfbYqOOIiEgDlskoqiOB3d19HYCZjQfeIyxKasLdny/z8k3gd+HzfsDD7r4K+NrMZgL7AP+u6bGk7txzzz388MMPFBcX06iR5pAUEZHoZPpbqG2Z521qOcM5wLPh8w7At2XWzQmX/YyZDTCz6WY2vaSkpJYjSXV99913jBkzhmOOOYa99ipvxm4REZG6k0kLzk3Ae2b2MmBAL2BoVW8ysxeBbcpZdaW7PxVucyVQAkzMOHHI3YuBYghGUVX3/VK7CgoKOOqoo/jTn/4UdRQREZGqCxx3n2RmrwB7h4uGuPv3Gbzv0MrWm9lZwFHAIf7TWPW5wLZlNusYLpN6rl27dtxxxx1RxxAREQEyvETl7m
l3fzp8VFncVMXMDgcGA8e4+/Iyq54GTjKzZma2HdANeHtTjyfZ4+5ce+21fPbZZ1FHERERWS+qnqB3A62AF8zsfTMbA+DunwCPEozWeg4Y6O66/1U9NnnyZIqLi3nvvfeijiIiIrJelTMZ5wLNZByNlStXcVCvXrRu05rnnnuOgoKajejXDKq5T+cw9+kc5j7NZLyhTObBeSCTZdLw3HtvMXPmzuGaa4bXuLgRERHJhkwuUe1S9kU48d6e2YkjuWLevHnceedd/PrXv+aAA/aPOo6IiMgGKhxFZWZXAMOAFma2uHQxwU03i+sgm9RjhYWFnHtuf44//oSoo4iIiPxMJncTv8ndazxrcV1QH5zcpmv/uU/nMPfpHOa+KPvgJFPpAmA6MDcRjx2VTKW3Ax4GtgDeBU5PxGOrsxKwAplconrbzNbPXmxmbc3s2OxFkvrM3Rk8eDCvvvpq1FFERKT+uBgoO1/ICOD2RDzWFVgA9K/rQJkUONe4+6LSF+6+ELgma4mkXnvhhReYOHEiX331VdRRRESkHkim0h2B3wBjw9cG9AEeDzcZDxxb17kyKXDK2yaTWzxInlm9ejXXXXcdXbt25bTTTos6joiI1I3Gpfd+DB8DNlr/fwST964LX28BLEzEY6U3iqzwvpLZlEmhMt3MbgP+Gr4eSHA9TRqY8ePH8/XXXzNhwgSaNGkSdRwREakbJe5e7l2Uk6n0UcC8RDz2bjKV7l23sSqXSQvORQQjpx4h6DC0kqDIkQZk4YKF3HbbbRzU6yD69OkTdRwREakf9geOSabSswhqhD7AHUDbZCpd2ogSyX0lM7nZ5jJgqJkVhs+lAWrVuhVXX301e+yxB2YWdRwREakHEvHYFcAVAGELzuWJeOzUZCr9GPA7gqLnTOCpus6WyUzG+5nZp4S9o82sh5ndk/VkUq8UFBRw8sknE4/Ho44iIiL13xDg0mQqPZOgT859dR0gk3lw3iKowp5291+Gyz529+51kC8jmgcnuy6++GJ69uzJySefnJX9a/6N3KdzmPt0DnOf7kW1oYzuJu7u3260SHf4biCmTZvG448/zsKFC6OOIiIikrFMRlF9a2b7AW5mTfj5ZD6Sp0pKShg+fDidOnXinHPqfI4mERGRGsukwDmfoEd0B4Je0M+jUVQNwqRJk0ilUhQXF9OsWdOo44iIiGSsspttjnD3IcDB7n5qHWaSemDFihWMvGUk++7bkyOPPDLqOCIiItVSWQvOkWY2lGD412N1lEfqiRYtWvDXe+5hiy220LBwERHJOZUVOM8R3CCryMwWAwZ46Vd3b10H+SQC7o6ZceCBB0QdRUREpEYqG0V1lbu3Bf7h7q3dvVXZr3WUTyJw4YUXccstt0QdQ0REpMYqK3D+HX5dXBdBpH546623ePLJv9Okse41JSIiuauyS1RNzewUYD8zS2y80t2T2YslUVi3bh3XXDOcWCzG+RecH3UcERGRGquswDkfOBVoCxy90ToHVODkmccff4KPPvqQO++8kxYtWkQdR0REpMYqLHDc/TXgNTOb7u51fg8JqVtr167l1lGj6NGjB8cdd1zUcURERDZJhX1wzGwwgLvfZ2bHb7Tuxk05qJldb2Yfmtn7Zva8mbUPl5uZ3WlmM8P1e2zKcSRzBQUFPDTpIUaNupVGjTK6g4eIiEi9VdlvspPKPL9io3WHb+JxR7r7bu6+O/AMcHW4/AigW/gYAIzexONIBtasWQPADjvswM47/yLiNCIiIpuusgLHKnhe3utqcfeyI7MKCfr0APQDJnjgTaCtmcU25VhStUsvvZSBAwdS1Z3lRUREckVlBY5X8Ly819VmZjeY2bcEHZlLW3A6AGXvXD4nXFbe+weY2XQzm15SUrKpcRqs9957j2QySceO22rGYhERyRuVFTg9zGyxmS0Bdgufl77etaodm9mLZvZxOY9+AO5+pbtvC0wELqxucHcvdve93H2vxo0zuWeobMzdufbaa9lyy6
246KJqnwIREZF6q7JRVAWbsmN3PzTDTScCU4BrCO5Wvm2ZdR3DZZIFkyc/wzvvvMPIkSMpKiqKOo6IiEitiWS4jJl1K/OyH/B5+Pxp4IxwNFVPYJG7p+s8YAPg7txzz1/ZeeedOfHEE6OOIyIiUquiurZzs5nFgXXAbIJJBSFoyTkSmAksB86OJl7+MzMenvQw8+bPo6BgkxrrRESkgUqm0tsCE4B2BP1zixPx2B3JVHpz4BGgCzALOCERjy2oy2yRtOC4+2/dvXs4VPxod58bLnd3H+juO7j7ru4+PYp8+W758uWsW7eOtpu1Zccdd4w6joiI5K4S4LJEPLYz0BMYmEyldwaGAi8l4rFuwEvh6zqlGd0aoGuuGc4xxxyDRp+JiMimSMRj6UQ8NiN8vgT4jGD0cz9gfLjZeODYus6mAqeB+fTTz5g0aRJ77rknGn0mIiIZaFw6LUv4GFDeRslUugvwS+AtoF0iHivtQ/s9wSWsOqUCpwEJhoUPp02b1gwadGnUcUREJDeUlE7LEj6KN94gmUoXAU8AlyTisbKT+ZKIx5xamD+vulTgNCAvvPACr732Gpdddhlt27aJOo6IiOSBZCrdhKC4mZiIx5Lh4h+SqXQsXB8D5tV1LhU4DciDDz5I165dOf3006OOIiIieSCZShtwH/BZIh67rcyqp4Ezw+dnAk/VdTZ1wmhAxo4dSzqdpkmTJlFHERGR/LA/cDrwUTKVfj9cNgy4GXg0mUr3J5gO5oS6DqYCpwFYsmQJjRo1orCwkM6dO0cdR0RE8kQiHnuNim/AfUhdZtmYLlE1ALfcMpLevXuzfPnyqKOIiIjUCRU4eW7mzJmMGzeOPn360LJly6jjiIiI1AkVOHnuuuuuo2XLFvzpT3+KOoqIiEidUYGTx6ZNm8ZLL73ExRdfzJZbbhl1HBERkTqjAiePTZkyhU6dOtG//7lRRxEREalTGkWVx2666Sbmz59Ps2ZNo44iIiJSp9SCk4eWLFlC+rs0ZsbWW28ddRwREZE6pwInD9155530OqgX//vf/6KOIiIiEgkVOHlm9uzZ3HvvvfzmN79hiy22iDqOiIhIJFTg5IlkMsm+++zLfvvtR0lJCT169Ig6koiISGTUyTgPJJNJBg8ezIoVKwBwd2644QbatGlDIpGIOJ2IiEjdUwtOHhhx84j1xU2pFStWMOLmERElEhERiZYKnDwwZ+6cai0XERHJdypw8kDHDh2rtVxERCTfqcDJA0OGDqFFixYbLGvRogVDhg6JKJGIiEi01Mk4D5R2JB5x8wjmfjeXDu07MGToEHUwFhGRBsvcPbqDm10GjAK2cvf/mpkBdwBHAsuBs9x9RlX7KSws9GXLltV6vh8Wr6z1fcrPtWvdPGv71jmsGzqHuU/nMPdl6xya2XJ3L8zKzrMosktUZrYtcBjwTZnFRwDdwscAYHQE0URERCTHRdkH53ZgMFC2CakfMMEDbwJtzSwWSToRERHJWZH0wTGzfsBcd/8guCq1Xgfg2zKv54TL0nUYT0RERDKUTKUPJ+heUgCMTcRjN0ccCchigWNmLwLblLPqSmAYweWpTdn/AILLWDRt2nRTdiUiIiI1kEylC4C/An0JGiXeSabSTyfisU+jTZbFAsfdDy1vuZntCmwHlLbedARmmNk+wFxg2zKbdwyXlbf/YqAYgk7GtZdcREREMrQPMDMRj30FkEylHybobpK/BU5F3P0jYOvS12Y2C9grHEX1NHChmT0M7AsscvcqL08tX77czWxFVds1EI2BkqhDyCbROcx9Ooe5T+fwJy3MbHqZ18VhIwOU37Vk3zpLVon6Ng/OFIIh4jMJhomfncmb3F0TFobMbLq77xV1Dqk5ncPcp3OY+3QOc1/kBY67dynz3IGB0aURERGRasi4a0ldi7zAERERkZz1DtAtmUpvR1DYnAScEm2kgC7t5J/iqjeRek7nMPfpHOY+ncMMJOKxEuBCYCrwGfBoIh77JNpUgU
hv1SAiIiKSDWrBERERkbyjAkdERETyjgqcPGFm95vZPDP7OOosUn1mtq2ZvWxmn5rZJ2Z2cdSZpPrMrLmZvW1mH4Tn8dqoM0n1mVmBmb1nZs9EnUVqTgVO/hgHHB51CKmxEuAyd98Z6AkMNLOdI84k1bcK6OPuPYDdgcPNrGe0kaQGLiboMCs5TAVOnnD3acCPUeeQmnH3tLvPCJ8vIfjPtUO0qaS6PLA0fNkkfGgkRw4xs47Ab4CxUWeRTaMCR6SeMbMuwC+BtyKOIjUQXt54H5gHvODuOo+55f+AwcC6iHPIJlKBI1KPmFkR8ARwibsvjjqPVJ+7r3X33QlmdN3HzLpHHEkyZGZHAfPc/d2os8imU4EjUk+YWROC4maiuyejziObxt0XAi+jvnG5ZH/gmPAm0A8DfczswWgjSU2pwBGpB8zMgPuAz9z9tqjzSM2Y2VZm1jZ83gLoC3weaSjJmLtf4e4dw3skngT8091PiziW1JAKnDxhZpOAfwNxM5tjZv2jziTVsj9wOsFfjO+HjyOjDiXVFgNeNrMPCe7R84K7a6ixSAR0qwYRERHJO2rBERERkbyjAkdERETyjgocERERyTsqcERERCTvqMARERGRvKMCR0RERPKOChwRERHJOypwRAQz62Jmn5vZODP7wswmmtmhZva6mX1pZvuEX7cKt29kZjNLX5ezv+PN7GMz+8DMpoXLCsxspJm9Y2Yfmtnvw+VFZvaSmc0ws4/MrF+4vNDM/hHu42MzOzFcfoiZvRdue7+ZNQuXzzKza8vsZ6e6+OxEpH5SgSMipboCtwI7hY9TgAOAy4FhwIPAqeG2hwIfuPv8CvZ1NfBrd+8BHBMu6w8scve9gb2B88xsO2AlcJy77wEcDNwa3rricOA7d+/h7t2B58ysOTAOONHddwUaAxeUOe5/w/2MDnOLSAOlAkdESn3t7h+5+zrgE+AlD6Y6/wjoAtwPnBFuew7wt0r29TowzszOAwrCZYcBZ5jZ+8BbwBZAN8CAG8PbG7wIdADahcfta2YjzOxAd18ExMOcX4T7HA/0KnPc0puUvhtmFpEGqnHUAUSk3lhV5vm6Mq/XAY3d/Vsz+8HM+gD78FNrzs+4+/lmti/wG+BdM9uToJC5yN2nlt3WzM4CtgL2dPc14Z2cm7v7F2a2B3Ak8Bczewl4KsPvYS36/02kQVMLjohUx1iCS1WPufvaijYysx3c/S13vxqYD2wLTAUuMLMm4TY7mlkh0AaYFxY3BwOdw/XtgeXu/iAwEtgDSAFdzKxreKjTgVez8Y2KSG7TXzgiUh1PE1yaquzyFMBIMyu9/PQS8AHwIcFloxlhH5v5wLHARGCymX0ETAc+D/exa7ifdcAa4AJ3X2lmZwOPmVljgjt2j6m9b09E8oXuJi4iGTOzvYDb3f3AqLOIiFRGLTgikhEzG0owYqnCvjciIvWFWnBEpMbM7Erg+I0WP+buN0SRR0SklAocERERyTsaRSUiIiJ5RwWOiIiI5B0VOCIiIpJ3VOCIiIhI3vn/KkQVZQVsnKsAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ale_eff = ale(\n", + " X=X_train, \n", + " model=rf, \n", + " feature=['my_season'], \n", + " grid_size=50, \n", + " include_CI=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['temp', 'hum', 'windspeed', 'days_since_2011', 'season_SPRING',\n", + " 'season_SUMMER', 'season_WINTER', 'holiday_NO HOLIDAY',\n", + " 'workingday_WORKING DAY', 'weathersit_MISTY',\n", + " 'weathersit_RAIN/SNOW/STORM'],\n", + " dtype='object')" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ale_eff = ale(\n", + " X=X_train, \n", + " model=rf, \n", + " feature=[''], \n", + " grid_size=50, \n", + " include_CI=False,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.5 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Python/bike-sharing/preprocess_bike_data.py b/Python/bike-sharing/preprocess_bike_data.py new file mode 100644 index 00000000..15c07ab6 --- /dev/null +++ b/Python/bike-sharing/preprocess_bike_data.py @@ -0,0 +1,70 @@ +# Imports +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +# Setting up the path +DIR = "../../data/" +DATA = "bike-sharing-daily.csv" + +# Label Lists +weekdays 
= ['SUN', 'MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT'] +holidays = ['NO HOLIDAY', 'HOLIDAY'] +working_day = ['NO WORKING DAY', 'WORKING DAY'] +season = ['WINTER', 'SPRING', 'SUMMER', 'FALL'] +weathersit = ['GOOD', 'MISTY', 'RAIN/SNOW/STORM'] +months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] + +# Function to create a dictionary mapping index of list to the respective label +def create_dictionary(label_list, start_at_zero=True): + d = {} + if start_at_zero: + for idx, val in enumerate(label_list): + d[idx] = val + else: + for idx, val in enumerate(label_list): + d[idx+1] = val + return d + +# Function that denormalizes temperature +def inverse_min_max(row, tmin, tmax): + return row * (tmax - tmin) + tmin + +def data_pipeline(): + + # Reading in the data + day_bike_rentals = pd.read_csv(DIR+DATA) + + # Mapping numerical codes with actual values + day_bike_rentals['weekday'] = day_bike_rentals['weekday'].map(create_dictionary(weekdays)) + day_bike_rentals['holiday'] = day_bike_rentals['holiday'].map(create_dictionary(holidays)) + day_bike_rentals['workingday'] = day_bike_rentals['workingday'].map(create_dictionary(working_day)) + day_bike_rentals['season'] = day_bike_rentals['season'].map(create_dictionary(season, start_at_zero=False)) + day_bike_rentals['weathersit'] = day_bike_rentals['weathersit'].map(create_dictionary(weathersit, start_at_zero=False)) + day_bike_rentals['mnth'] = day_bike_rentals['mnth'].map(create_dictionary(months, start_at_zero=False)) + + # Reversing min_max + day_bike_rentals['temp'] = day_bike_rentals['temp'].apply(inverse_min_max, args=(-8, 39)) + day_bike_rentals['atemp'] = day_bike_rentals['atemp'].apply(inverse_min_max, args=(-16, 50)) + + # Unnormalizing the data + day_bike_rentals['windspeed'] = day_bike_rentals['windspeed'].apply(lambda row: row * 67) + day_bike_rentals['hum'] = day_bike_rentals['hum'].apply(lambda row: row * 100) + + # Converting labels into actual year + day_bike_rentals['yr']
= day_bike_rentals['yr'].apply(lambda row: 2011 if row == 0 else 2012) + + # Converting 'dteday' to datetime object + day_bike_rentals['dteday'] = pd.to_datetime(day_bike_rentals['dteday']) + + # Calculating days since 2011 + day_bike_rentals['days_since_2011'] = (day_bike_rentals['dteday'] - day_bike_rentals['dteday'].min()).dt.days + + # Feature selecting + day_bike_rentals = day_bike_rentals[[col for col in day_bike_rentals.columns if col not in ['instant', 'dteday', 'registered', 'casual', 'atemp']]] + + # Dummifying categorical features + day_bike_rentals = pd.concat([day_bike_rentals, pd.get_dummies(day_bike_rentals[['season', 'holiday','workingday', 'weathersit']], drop_first=True)], axis=1) + + # Returning the cleaned data + return day_bike_rentals \ No newline at end of file diff --git a/Python/cervical-cancer/Chapter5-interpretable_models.ipynb b/Python/cervical-cancer/Chapter5-interpretable_models.ipynb new file mode 100644 index 00000000..9045ced2 --- /dev/null +++ b/Python/cervical-cancer/Chapter5-interpretable_models.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from preprocess_cervical_cancer_data import data_pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeNumber.of.sexual.partnersFirst.sexual.intercourseNum.of.pregnanciesSmokesSmokes..years.Hormonal.ContraceptivesHormonal.Contraceptives..years.IUDIUD..years.STDsSTDs..number.STDs..Number.of.diagnosisSTDs..Time.since.first.diagnosisSTDs..Time.since.last.diagnosisBiopsy
018415100.000.000.0000110
115114100.000.000.0000110
234115100.000.000.0000110
3525164137.013.000.0000110
446321400.0115.000.0000110
\n", + "
" + ], + "text/plain": [ + " Age Number.of.sexual.partners ... STDs..Time.since.last.diagnosis Biopsy\n", + "0 18 4 ... 1 0\n", + "1 15 1 ... 1 0\n", + "2 34 1 ... 1 0\n", + "3 52 5 ... 1 0\n", + "4 46 3 ... 1 0\n", + "\n", + "[5 rows x 16 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cervical = data_pipeline()\n", + "cervical.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "some_risk_factors = ['Hormonal.Contraceptives', 'Smokes', 'Num.of.pregnancies', 'STDs..Number.of.diagnosis', 'IUD']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LogisticRegression()" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# from sklearn.linear_model import LogisticRegression\n", + "\n", + "# lr = LogisticRegression()\n", + "# lr.fit(cervical[some_risk_factors], cervical['Biopsy'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function value: 0.297389\n", + " Iterations 7\n" + ] + } + ], + "source": [ + "import statsmodels.api as sm\n", + "\n", + "lr = sm.Logit(cervical['Biopsy'], cervical[some_risk_factors]).fit()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Logit Regression Results
Dep. Variable: Biopsy No. Observations: 858
Model: Logit Df Residuals: 853
Method: MLE Df Model: 4
Date: Tue, 06 Dec 2022 Pseudo R-squ.: -0.2490
Time: 10:41:08 Log-Likelihood: -255.16
converged: True LL-Null: -204.30
Covariance Type: nonrobust LLR p-value: 1.000
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err z P>|z| [0.025 0.975]
Hormonal.Contraceptives -1.2914 0.220 -5.857 0.000 -1.723 -0.859
Smokes -0.3487 0.365 -0.955 0.339 -1.064 0.367
Num.of.pregnancies -0.8171 0.099 -8.284 0.000 -1.010 -0.624
STDs..Number.of.diagnosis 0.3884 0.329 1.181 0.238 -0.256 1.033
IUD 0.6721 0.416 1.617 0.106 -0.143 1.487
" + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " Logit Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Biopsy No. Observations: 858\n", + "Model: Logit Df Residuals: 853\n", + "Method: MLE Df Model: 4\n", + "Date: Tue, 06 Dec 2022 Pseudo R-squ.: -0.2490\n", + "Time: 10:41:08 Log-Likelihood: -255.16\n", + "converged: True LL-Null: -204.30\n", + "Covariance Type: nonrobust LLR p-value: 1.000\n", + "=============================================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "---------------------------------------------------------------------------------------------\n", + "Hormonal.Contraceptives -1.2914 0.220 -5.857 0.000 -1.723 -0.859\n", + "Smokes -0.3487 0.365 -0.955 0.339 -1.064 0.367\n", + "Num.of.pregnancies -0.8171 0.099 -8.284 0.000 -1.010 -0.624\n", + "STDs..Number.of.diagnosis 0.3884 0.329 1.181 0.238 -0.256 1.033\n", + "IUD 0.6721 0.416 1.617 0.106 -0.143 1.487\n", + "=============================================================================================\n", + "\"\"\"" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "weights = lr.params.to_dict()\n", + "std_error = lr.bse.to_dict()\n", + "odds_ratio = {}\n", + "for key, val in weights.items():\n", + " odds_ratio[key] = np.exp(val)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
weightsodds_ratiostd_error
Hormonal.Contraceptives-1.2913530.2748990.220483
Smokes-0.3487320.7055820.365052
Num.of.pregnancies-0.8171200.4417020.098634
STDs..Number.of.diagnosis0.3883921.4746080.328834
IUD0.6721031.9583510.415768
\n", + "
" + ], + "text/plain": [ + " weights odds_ratio std_error\n", + "Hormonal.Contraceptives -1.291353 0.274899 0.220483\n", + "Smokes -0.348732 0.705582 0.365052\n", + "Num.of.pregnancies -0.817120 0.441702 0.098634\n", + "STDs..Number.of.diagnosis 0.388392 1.474608 0.328834\n", + "IUD 0.672103 1.958351 0.415768" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for key, val in weights.items():\n", + " weights[key] = [val]\n", + "\n", + "t = pd.DataFrame(weights, index=['weights']).T\n", + "t['odds_ratio'] = t.index.map(odds_ratio)\n", + "t['std_error'] = t.index.map(std_error)\n", + "\n", + "t" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Python/cervical-cancer/__pycache__/preprocess_cervical_cancer_data.cpython-310.pyc b/Python/cervical-cancer/__pycache__/preprocess_cervical_cancer_data.cpython-310.pyc new file mode 100644 index 00000000..5141eb60 Binary files /dev/null and b/Python/cervical-cancer/__pycache__/preprocess_cervical_cancer_data.cpython-310.pyc differ diff --git a/Python/cervical-cancer/cervical_cancer_data.ipynb b/Python/cervical-cancer/cervical_cancer_data.ipynb new file mode 100644 index 00000000..42d6d442 --- /dev/null +++ b/Python/cervical-cancer/cervical_cancer_data.ipynb @@ -0,0 +1,733 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "\n", + "import pandas as pd\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.inspection import PartialDependenceDisplay\n", + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeNumber.of.sexual.partnersFirst.sexual.intercourseNum.of.pregnanciesSmokesSmokes..years.Hormonal.ContraceptivesHormonal.Contraceptives..years.IUDIUD..years.STDsSTDs..number.STDs..Number.of.diagnosisSTDs..Time.since.first.diagnosisSTDs..Time.since.last.diagnosisBiopsy
018415100.000.000.000011Healthy
115114100.000.000.000011Healthy
234115100.000.000.000011Healthy
3525164137.013.000.000011Healthy
446321400.0115.000.000011Healthy
\n", + "
" + ], + "text/plain": [ + " Age Number.of.sexual.partners First.sexual.intercourse \\\n", + "0 18 4 15 \n", + "1 15 1 14 \n", + "2 34 1 15 \n", + "3 52 5 16 \n", + "4 46 3 21 \n", + "\n", + " Num.of.pregnancies Smokes Smokes..years. Hormonal.Contraceptives \\\n", + "0 1 0 0.0 0 \n", + "1 1 0 0.0 0 \n", + "2 1 0 0.0 0 \n", + "3 4 1 37.0 1 \n", + "4 4 0 0.0 1 \n", + "\n", + " Hormonal.Contraceptives..years. IUD IUD..years. STDs STDs..number. \\\n", + "0 0.0 0 0.0 0 0 \n", + "1 0.0 0 0.0 0 0 \n", + "2 0.0 0 0.0 0 0 \n", + "3 3.0 0 0.0 0 0 \n", + "4 15.0 0 0.0 0 0 \n", + "\n", + " STDs..Number.of.diagnosis STDs..Time.since.first.diagnosis \\\n", + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 0 1 \n", + "4 0 1 \n", + "\n", + " STDs..Time.since.last.diagnosis Biopsy \n", + "0 1 Healthy \n", + "1 1 Healthy \n", + "2 1 Healthy \n", + "3 1 Healthy \n", + "4 1 Healthy " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reading in the data\n", + "\n", + "DIR = \"../../data/\"\n", + "DATA = \"cervical.csv\"\n", + "cervical = pd.read_csv(DIR+DATA)\n", + "cervical.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Converting the \"Biopsy\" to binary\n", + "\n", + "cervical['Biopsy'] = cervical['Biopsy'].apply(lambda row: 1 if row=='Cancer' else 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cervical.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeNumber.of.sexual.partnersFirst.sexual.intercourseNum.of.pregnanciesSmokesSmokes..years.Hormonal.ContraceptivesHormonal.Contraceptives..years.IUDIUD..years.STDsSTDs..number.STDs..Number.of.diagnosisSTDs..Time.since.first.diagnosisSTDs..Time.since.last.diagnosis
count858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000858.000000
mean26.8205132.51165516.9790212.1923080.1433571.2012410.6864801.9723940.0967370.4446040.0920750.1550120.0874131.4254081.398601
std8.4979481.6447592.7976531.4343950.3506414.0606230.4641943.5978880.2957711.8142180.2893000.5296170.3025452.2015502.113931
min13.0000001.00000010.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.0000001.000000
25%20.0000002.00000015.0000001.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.0000001.000000
50%25.0000002.00000017.0000002.0000000.0000000.0000001.0000000.2500000.0000000.0000000.0000000.0000000.0000001.0000001.000000
75%32.0000003.00000018.0000003.0000000.0000000.0000001.0000002.0000000.0000000.0000000.0000000.0000000.0000001.0000001.000000
max84.00000028.00000032.00000011.0000001.00000037.0000001.00000030.0000001.00000019.0000001.0000004.0000003.00000022.00000022.000000
\n", + "
" + ], + "text/plain": [ + " Age Number.of.sexual.partners First.sexual.intercourse \\\n", + "count 858.000000 858.000000 858.000000 \n", + "mean 26.820513 2.511655 16.979021 \n", + "std 8.497948 1.644759 2.797653 \n", + "min 13.000000 1.000000 10.000000 \n", + "25% 20.000000 2.000000 15.000000 \n", + "50% 25.000000 2.000000 17.000000 \n", + "75% 32.000000 3.000000 18.000000 \n", + "max 84.000000 28.000000 32.000000 \n", + "\n", + " Num.of.pregnancies Smokes Smokes..years. \\\n", + "count 858.000000 858.000000 858.000000 \n", + "mean 2.192308 0.143357 1.201241 \n", + "std 1.434395 0.350641 4.060623 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 1.000000 0.000000 0.000000 \n", + "50% 2.000000 0.000000 0.000000 \n", + "75% 3.000000 0.000000 0.000000 \n", + "max 11.000000 1.000000 37.000000 \n", + "\n", + " Hormonal.Contraceptives Hormonal.Contraceptives..years. IUD \\\n", + "count 858.000000 858.000000 858.000000 \n", + "mean 0.686480 1.972394 0.096737 \n", + "std 0.464194 3.597888 0.295771 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 \n", + "50% 1.000000 0.250000 0.000000 \n", + "75% 1.000000 2.000000 0.000000 \n", + "max 1.000000 30.000000 1.000000 \n", + "\n", + " IUD..years. STDs STDs..number. 
STDs..Number.of.diagnosis \\\n", + "count 858.000000 858.000000 858.000000 858.000000 \n", + "mean 0.444604 0.092075 0.155012 0.087413 \n", + "std 1.814218 0.289300 0.529617 0.302545 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 0.000000 \n", + "50% 0.000000 0.000000 0.000000 0.000000 \n", + "75% 0.000000 0.000000 0.000000 0.000000 \n", + "max 19.000000 1.000000 4.000000 3.000000 \n", + "\n", + " STDs..Time.since.first.diagnosis STDs..Time.since.last.diagnosis \n", + "count 858.000000 858.000000 \n", + "mean 1.425408 1.398601 \n", + "std 2.201550 2.113931 \n", + "min 1.000000 1.000000 \n", + "25% 1.000000 1.000000 \n", + "50% 1.000000 1.000000 \n", + "75% 1.000000 1.000000 \n", + "max 22.000000 22.000000 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cervical.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Organizing features and label\n", + "\n", + "features = [col for col in cervical.columns if col != 'Biopsy']\n", + "X = cervical[features]\n", + "y = cervical['Biopsy']" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier()" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Running a random forest\n", + "\n", + "rf = RandomForestClassifier()\n", + "rf.fit(X,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting a Partial Dependence Plot\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 20))\n", + "PartialDependenceDisplay.from_estimator(\n", + " estimator=rf,\n", + " X=X,\n", + " features=features,\n", + " target=y,\n", + " ax=ax\n", + ")\n", + "ax.set_title(\"Partial Dependence Plot\")\n", + "plt.tight_layout();" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "data = cervical[['Age', 'Num.of.pregnancies']]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeNum.of.pregnancies
0181
1151
2341
3524
4464
.........
853340
854321
855250
856332
857291
\n", + "

858 rows × 2 columns

\n", + "
# Imports
from pathlib import Path

import pandas as pd


def data_pipeline(data_dir="../../data/", filename="cervical.csv"):
    """Load the cervical cancer CSV and encode the ``Biopsy`` target as 0/1.

    Args:
        data_dir: Directory containing the CSV file. The default is the
            original hard-coded location; a relative value is resolved
            against the current working directory, not this module's path.
        filename: Name of the CSV file inside ``data_dir``.

    Returns:
        pandas.DataFrame: The CSV contents, with the ``Biopsy`` column
        replaced by integers: 1 where the original value equals the string
        ``'Cancer'`` and 0 for every other value.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when the file
            does not exist.
        KeyError: If the CSV has no ``Biopsy`` column.
    """
    cervical = pd.read_csv(Path(data_dir) / filename)

    # Converting target into binary variable. A vectorised comparison replaces
    # the per-row lambda; the explicit int64 cast keeps the dtype that the
    # original ``apply`` produced, independent of platform defaults.
    cervical['Biopsy'] = (cervical['Biopsy'] == 'Cancer').astype('int64')

    return cervical