Skip to content

Commit

Permalink
updating mortgage E2E for new API
Browse files Browse the repository at this point in the history
  • Loading branch information
mtjrider committed Mar 18, 2019
1 parent cc0b697 commit 084756b
Showing 1 changed file with 109 additions and 42 deletions.
151 changes: 109 additions & 42 deletions mortgage/E2E.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nfs/majones/conda/envs/rapids/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n",
" data = yaml.load(f.read()) or {}\n",
"/home/nfs/majones/conda/envs/rapids/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n",
" defaults = yaml.load(f)\n"
]
}
],
"source": [
"import numpy as np\n",
"import dask_xgboost as dxgb_gpu\n",
Expand All @@ -74,9 +85,41 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"border: 2px solid white;\">\n",
"<tr>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Client</h3>\n",
"<ul>\n",
" <li><b>Scheduler: </b>tcp://10.33.227.165:37501\n",
" <li><b>Dashboard: </b><a href='http://10.33.227.165:8787/status' target='_blank'>http://10.33.227.165:8787/status</a>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Cluster</h3>\n",
"<ul>\n",
" <li><b>Workers: </b>8</li>\n",
" <li><b>Cores: </b>8</li>\n",
" <li><b>Memory: </b>1.08 TB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: scheduler='tcp://10.33.227.165:37501' processes=8 cores=8>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import subprocess\n",
"\n",
Expand All @@ -99,22 +142,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# to download data for this notebook, visit https://rapidsai.github.io/demos/datasets/mortgage-data and update the following paths accordingly\n",
"acq_data_path = \"/path/to/mortgage/acq\"\n",
"perf_data_path = \"/path/to/mortgage/perf\"\n",
"col_names_path = \"/path/to/mortgage/names.csv\"\n",
"acq_data_path = \"/datasets/mortgage/mortgage/acq\"\n",
"perf_data_path = \"/datasets/mortgage/mortgage/perf_1000M\"\n",
"col_names_path = \"/datasets/mortgage/mortgage/names.csv\"\n",
"start_year = 2000\n",
"end_year = 2016 # end_year is inclusive\n",
"part_count = 16 # the number of data files to train against"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -136,9 +179,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'tcp://10.33.227.165:33247': True,\n",
" 'tcp://10.33.227.165:33773': True,\n",
" 'tcp://10.33.227.165:37184': True,\n",
" 'tcp://10.33.227.165:38419': True,\n",
" 'tcp://10.33.227.165:42962': True,\n",
" 'tcp://10.33.227.165:43195': True,\n",
" 'tcp://10.33.227.165:45119': True,\n",
" 'tcp://10.33.227.165:45562': True}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.run(initialize_rmm_pool)"
]
Expand All @@ -152,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -174,7 +235,7 @@
" if str(data_type) == \"category\":\n",
" df[column] = df[column].astype('int32').fillna(-1)\n",
" if str(data_type) in ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']:\n",
" df[column] = df[column].fillna(-1)\n",
" df[column] = df[column].fillna(np.dtype(data_type).type(-1))\n",
" return df\n",
"\n",
"def run_gpu_workflow(quarter=1, year=2000, perf_file=\"\", **kwargs):\n",
Expand Down Expand Up @@ -331,39 +392,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def create_ever_features(gdf, **kwargs):\n",
" everdf = gdf[['loan_id', 'current_loan_delinquency_status']]\n",
" everdf = everdf.groupby('loan_id', method='hash').max()\n",
" everdf = everdf.groupby('loan_id', method='hash', as_index=False).max()\n",
" del(gdf)\n",
" everdf['ever_30'] = (everdf['max_current_loan_delinquency_status'] >= 1).astype('int8')\n",
" everdf['ever_90'] = (everdf['max_current_loan_delinquency_status'] >= 3).astype('int8')\n",
" everdf['ever_180'] = (everdf['max_current_loan_delinquency_status'] >= 6).astype('int8')\n",
" everdf.drop_column('max_current_loan_delinquency_status')\n",
" everdf['ever_30'] = (everdf['current_loan_delinquency_status'] >= 1).astype('int8')\n",
" everdf['ever_90'] = (everdf['current_loan_delinquency_status'] >= 3).astype('int8')\n",
" everdf['ever_180'] = (everdf['current_loan_delinquency_status'] >= 6).astype('int8')\n",
" everdf.drop_column('current_loan_delinquency_status')\n",
" return everdf"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def create_delinq_features(gdf, **kwargs):\n",
" delinq_gdf = gdf[['loan_id', 'monthly_reporting_period', 'current_loan_delinquency_status']]\n",
" del(gdf)\n",
" delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()\n",
" delinq_30['delinquency_30'] = delinq_30['min_monthly_reporting_period']\n",
" delinq_30.drop_column('min_monthly_reporting_period')\n",
" delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()\n",
" delinq_90['delinquency_90'] = delinq_90['min_monthly_reporting_period']\n",
" delinq_90.drop_column('min_monthly_reporting_period')\n",
" delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()\n",
" delinq_180['delinquency_180'] = delinq_180['min_monthly_reporting_period']\n",
" delinq_180.drop_column('min_monthly_reporting_period')\n",
" delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash', as_index=False).min()\n",
" delinq_30['delinquency_30'] = delinq_30['monthly_reporting_period']\n",
" delinq_30.drop_column('monthly_reporting_period')\n",
" delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash', as_index=False).min()\n",
" delinq_90['delinquency_90'] = delinq_90['monthly_reporting_period']\n",
" delinq_90.drop_column('monthly_reporting_period')\n",
" delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash', as_index=False).min()\n",
" delinq_180['delinquency_180'] = delinq_180['monthly_reporting_period']\n",
" delinq_180.drop_column('monthly_reporting_period')\n",
" del(delinq_gdf)\n",
" delinq_merge = delinq_30.merge(delinq_90, how='left', on=['loan_id'], type='hash')\n",
" delinq_merge['delinquency_90'] = delinq_merge['delinquency_90'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))\n",
Expand All @@ -377,7 +438,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -393,7 +454,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -430,7 +491,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -441,7 +502,7 @@
" tmpdf = joined_df[['loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12', 'upb_12']]\n",
" tmpdf['josh_months'] = tmpdf['timestamp_year'] * 12 + tmpdf['timestamp_month']\n",
" tmpdf['josh_mody_n'] = ((tmpdf['josh_months'].astype('float64') - 24000 - y) / 12).floor()\n",
" tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'})\n",
" tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash', as_index=False).agg({'delinquency_12': 'max','upb_12': 'min'})\n",
" tmpdf['delinquency_12'] = (tmpdf['max_delinquency_12']>3).astype('int32')\n",
" tmpdf['delinquency_12'] +=(tmpdf['min_upb_12']==0).astype('int32')\n",
" tmpdf.drop_column('max_delinquency_12')\n",
Expand All @@ -459,7 +520,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -473,13 +534,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def final_performance_delinquency(gdf, joined_df, **kwargs):\n",
" merged = null_workaround(gdf)\n",
" joined_df = null_workaround(joined_df)\n",
" joined_df['timestamp_month'] = joined_df['timestamp_month'].astype('int8')\n",
" joined_df['timestamp_year'] = joined_df['timestamp_year'].astype('int16')\n",
" merged['timestamp_month'] = merged['monthly_reporting_period'].dt.month\n",
" merged['timestamp_month'] = merged['timestamp_month'].astype('int8')\n",
" merged['timestamp_year'] = merged['monthly_reporting_period'].dt.year\n",
Expand All @@ -492,7 +555,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -504,7 +567,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -524,7 +587,11 @@
" df['delinquency_12'] = df['delinquency_12'] > 0\n",
" df['delinquency_12'] = df['delinquency_12'].fillna(False).astype('int32')\n",
" for column in df.columns:\n",
" df[column] = df[column].fillna(-1)\n",
" if column != 'delinquency_12': df[column] = df[column].astype('float32')\n",
" else: df[column] = df[column].astype('int32')\n",
" df[column] = df[column].fillna(np.dtype(str(df[column].dtype)).type(-1))\n",
" print('DF DTYPES')\n",
" print(df.dtypes)\n",
" return df.to_arrow(preserve_index=False)"
]
},
Expand Down Expand Up @@ -721,7 +788,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.6.7"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 084756b

Please sign in to comment.