From 69df5a4d4105a93be8d29e30d9e60701c140dd80 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Mon, 3 Dec 2018 15:30:03 -0800 Subject: [PATCH 01/21] adding utilities and notebooks compatible with the refactored cudf --- notebooks/E2E.ipynb | 667 +++++++++++++++++++++++++++++++++++++ utils/conda_create_cudf.sh | 39 +++ utils/dask-cluster.py | 63 ++++ utils/dask-setup.sh | 100 ++++++ utils/dask.conf | 7 + utils/start_jupyter.sh | 5 + utils/stop_jupyter.sh | 7 + 7 files changed, 888 insertions(+) create mode 100644 notebooks/E2E.ipynb create mode 100755 utils/conda_create_cudf.sh create mode 100644 utils/dask-cluster.py create mode 100755 utils/dask-setup.sh create mode 100644 utils/dask.conf create mode 100755 utils/start_jupyter.sh create mode 100755 utils/stop_jupyter.sh diff --git a/notebooks/E2E.ipynb b/notebooks/E2E.ipynb new file mode 100644 index 00000000..fdbad63d --- /dev/null +++ b/notebooks/E2E.ipynb @@ -0,0 +1,667 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports and Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import dask_xgboost as dxgb_gpu\n", + "import dask\n", + "import dask_cudf\n", + "from dask.delayed import delayed\n", + "from dask.distributed import Client, wait\n", + "import xgboost as xgb\n", + "import cudf\n", + "from cudf.dataframe import DataFrame\n", + "from collections import OrderedDict\n", + "import gc\n", + "from glob import glob\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "IPADDR=($(hostname --all-ip-addresses))\n", + "bash -c \"../utils/dask-setup.sh 0\"\n", + "bash -c \"../utils/dask-setup.sh 8 8786 8787 8790 ${IPADDR[0]} MASTER\"\n", + "# ^------------------------------ this tells the scheduler how many GPU workers you have on your node" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "import dask\n", + "from dask.delayed import delayed\n", + "from dask.distributed import Client, wait\n", + "\n", + "cmd = \"hostname --all-ip-addresses\"\n", + "process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n", + "output, error = process.communicate()\n", + "IPADDR = str(output.decode()).split()[0]\n", + "_client = IPADDR + str(\":8786\")\n", + " \n", + "client = dask.distributed.Client(_client)\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acq_data_path = \"/datasets/mortgage/acquisition\"\n", + "perf_data_path = \"/datasets/mortgage/perf_clean_full_split\"\n", + "col_names_path = \"/datasets/mortgage/names.csv\"\n", + "start_year = 2000\n", + "end_year = 2017 # end_year is not inclusive\n", + "part_count = 16 # the number of data files to train against" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def initialize_rmm_pool():\n", + " from librmm_cffi import librmm_config as rmm_cfg\n", + "\n", + " rmm_cfg.use_pool_allocator = True\n", + " #rmm_cfg.initial_pool_size = 2<<30 # set to 2GiB. 
Default is 1/2 total GPU memory\n", + " import cudf\n", + " return cudf._gdf.rmm_initialize()\n", + "\n", + "def initialize_rmm_no_pool():\n", + " from librmm_cffi import librmm_config as rmm_cfg\n", + " \n", + " rmm_cfg.use_pool_allocator = False\n", + " import cudf\n", + " return cudf._gdf.rmm_initialize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.run(initialize_rmm_pool)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_dask_task(func, **kwargs):\n", + " task = func(**kwargs)\n", + " return task\n", + "\n", + "def process_quarter_gpu(year=2000, quarter=1, perf_file=\"\"):\n", + " ml_arrays = run_dask_task(delayed(run_gpu_workflow),\n", + " quarter=quarter,\n", + " year=year,\n", + " perf_file=perf_file)\n", + " return client.compute(ml_arrays,\n", + " optimize_graph=False,\n", + " fifo_timeout=\"0ms\")\n", + "\n", + "def null_workaround(df, **kwargs):\n", + " for column, data_type in df.dtypes.items():\n", + " if str(data_type) == \"category\":\n", + " df[column] = df[column].astype('int32').fillna(-1)\n", + " if str(data_type) in ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']:\n", + " df[column] = df[column].fillna(-1)\n", + " return df\n", + "\n", + "def run_gpu_workflow(quarter=1, year=2000, perf_file=\"\", **kwargs):\n", + " names = gpu_load_names()\n", + " acq_gdf = gpu_load_acquisition_csv(acquisition_path= acq_data_path + \"/Acquisition_\"\n", + " + str(year) + \"Q\" + str(quarter) + \".txt\")\n", + " acq_gdf = acq_gdf.merge(names, how='left', on=['seller_name'])\n", + " acq_gdf.drop_column('seller_name')\n", + " acq_gdf['seller_name'] = acq_gdf['new']\n", + " acq_gdf.drop_column('new')\n", + " perf_df_tmp = gpu_load_performance_csv(perf_file)\n", + " gdf = perf_df_tmp\n", + " everdf = create_ever_features(gdf)\n", + " delinq_merge = create_delinq_features(gdf)\n", + " everdf = join_ever_delinq_features(everdf, delinq_merge)\n", + " del(delinq_merge)\n", + " joined_df = create_joined_df(gdf, everdf)\n", + " testdf = create_12_mon_features(joined_df)\n", + " joined_df = combine_joined_12_mon(joined_df, testdf)\n", + " del(testdf)\n", + " perf_df = final_performance_delinquency(gdf, joined_df)\n", + " del(gdf, joined_df)\n", + " final_gdf = join_perf_acq_gdfs(perf_df, acq_gdf)\n", + " del(perf_df)\n", + " del(acq_gdf)\n", + " final_gdf = last_mile_cleaning(final_gdf)\n", + " return final_gdf\n", + "\n", + "def gpu_load_performance_csv(performance_path, **kwargs):\n", + " \"\"\" Loads performance data\n", + "\n", + " Returns\n", + " -------\n", + " GPU DataFrame\n", + " \"\"\"\n", + " \n", + " cols = [\n", + " \"loan_id\", \"monthly_reporting_period\", \"servicer\", \"interest_rate\", \"current_actual_upb\",\n", + " \"loan_age\", \"remaining_months_to_legal_maturity\", \"adj_remaining_months_to_maturity\",\n", + " \"maturity_date\", \"msa\", \"current_loan_delinquency_status\", \"mod_flag\", \"zero_balance_code\",\n", + " \"zero_balance_effective_date\", \"last_paid_installment_date\", \"foreclosed_after\",\n", + " \"disposition_date\", \"foreclosure_costs\", \"prop_preservation_and_repair_costs\",\n", + " \"asset_recovery_costs\", \"misc_holding_expenses\", \"holding_taxes\", \"net_sale_proceeds\",\n", + " \"credit_enhancement_proceeds\", \"repurchase_make_whole_proceeds\", \"other_foreclosure_proceeds\",\n", + " \"non_interest_bearing_upb\", \"principal_forgiveness_upb\", \"repurchase_make_whole_proceeds_flag\",\n", + 
" \"foreclosure_principal_write_off_amount\", \"servicing_activity_indicator\"\n", + " ]\n", + " \n", + " dtypes = OrderedDict([\n", + " (\"loan_id\", \"int64\"),\n", + " (\"monthly_reporting_period\", \"date\"),\n", + " (\"servicer\", \"category\"),\n", + " (\"interest_rate\", \"float64\"),\n", + " (\"current_actual_upb\", \"float64\"),\n", + " (\"loan_age\", \"float64\"),\n", + " (\"remaining_months_to_legal_maturity\", \"float64\"),\n", + " (\"adj_remaining_months_to_maturity\", \"float64\"),\n", + " (\"maturity_date\", \"date\"),\n", + " (\"msa\", \"float64\"),\n", + " (\"current_loan_delinquency_status\", \"int32\"),\n", + " (\"mod_flag\", \"category\"),\n", + " (\"zero_balance_code\", \"category\"),\n", + " (\"zero_balance_effective_date\", \"date\"),\n", + " (\"last_paid_installment_date\", \"date\"),\n", + " (\"foreclosed_after\", \"date\"),\n", + " (\"disposition_date\", \"date\"),\n", + " (\"foreclosure_costs\", \"float64\"),\n", + " (\"prop_preservation_and_repair_costs\", \"float64\"),\n", + " (\"asset_recovery_costs\", \"float64\"),\n", + " (\"misc_holding_expenses\", \"float64\"),\n", + " (\"holding_taxes\", \"float64\"),\n", + " (\"net_sale_proceeds\", \"float64\"),\n", + " (\"credit_enhancement_proceeds\", \"float64\"),\n", + " (\"repurchase_make_whole_proceeds\", \"float64\"),\n", + " (\"other_foreclosure_proceeds\", \"float64\"),\n", + " (\"non_interest_bearing_upb\", \"float64\"),\n", + " (\"principal_forgiveness_upb\", \"float64\"),\n", + " (\"repurchase_make_whole_proceeds_flag\", \"category\"),\n", + " (\"foreclosure_principal_write_off_amount\", \"float64\"),\n", + " (\"servicing_activity_indicator\", \"category\")\n", + " ])\n", + "\n", + " print(performance_path)\n", + " \n", + " return cudf.read_csv(performance_path, names=cols, delimiter='|', dtype=list(dtypes.values()), skiprows=1)\n", + "\n", + "def gpu_load_acquisition_csv(acquisition_path, **kwargs):\n", + " \"\"\" Loads acquisition data\n", + "\n", + " Returns\n", + " -------\n", + " GPU DataFrame\n", + " \"\"\"\n", + " \n", + " cols = [\n", + " 'loan_id', 'orig_channel', 'seller_name', 'orig_interest_rate', 'orig_upb', 'orig_loan_term', \n", + " 'orig_date', 'first_pay_date', 'orig_ltv', 'orig_cltv', 'num_borrowers', 'dti', 'borrower_credit_score', \n", + " 'first_home_buyer', 'loan_purpose', 'property_type', 'num_units', 'occupancy_status', 'property_state',\n", + " 'zip', 'mortgage_insurance_percent', 'product_type', 'coborrow_credit_score', 'mortgage_insurance_type', \n", + " 'relocation_mortgage_indicator'\n", + " ]\n", + " \n", + " dtypes = OrderedDict([\n", + " (\"loan_id\", \"int64\"),\n", + " (\"orig_channel\", \"category\"),\n", + " (\"seller_name\", \"category\"),\n", + " (\"orig_interest_rate\", \"float64\"),\n", + " (\"orig_upb\", \"int64\"),\n", + " (\"orig_loan_term\", \"int64\"),\n", + " (\"orig_date\", \"date\"),\n", + " (\"first_pay_date\", \"date\"),\n", + " (\"orig_ltv\", \"float64\"),\n", + " (\"orig_cltv\", \"float64\"),\n", + " (\"num_borrowers\", \"float64\"),\n", + " (\"dti\", \"float64\"),\n", + " (\"borrower_credit_score\", \"float64\"),\n", + " (\"first_home_buyer\", \"category\"),\n", + " (\"loan_purpose\", \"category\"),\n", + " (\"property_type\", \"category\"),\n", + " (\"num_units\", \"int64\"),\n", + " (\"occupancy_status\", \"category\"),\n", + " (\"property_state\", \"category\"),\n", + " (\"zip\", \"int64\"),\n", + " (\"mortgage_insurance_percent\", \"float64\"),\n", + " (\"product_type\", \"category\"),\n", + " (\"coborrow_credit_score\", \"float64\"),\n", + " 
(\"mortgage_insurance_type\", \"float64\"),\n", + " (\"relocation_mortgage_indicator\", \"category\")\n", + " ])\n", + " \n", + " print(acquisition_path)\n", + " \n", + " return cudf.read_csv(acquisition_path, names=cols, delimiter='|', dtype=list(dtypes.values()), skiprows=1)\n", + "\n", + "def gpu_load_names(**kwargs):\n", + " \"\"\" Loads names used for renaming the banks\n", + " \n", + " Returns\n", + " -------\n", + " GPU DataFrame\n", + " \"\"\"\n", + "\n", + " cols = [\n", + " 'seller_name', 'new'\n", + " ]\n", + " \n", + " dtypes = OrderedDict([\n", + " (\"seller_name\", \"category\"),\n", + " (\"new\", \"category\"),\n", + " ])\n", + "\n", + " return cudf.read_csv(col_names_path, names=cols, delimiter='|', dtype=list(dtypes.values()), skiprows=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GPU ETL and Feature Engineering Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_ever_features(gdf, **kwargs):\n", + " everdf = gdf[['loan_id', 'current_loan_delinquency_status']]\n", + " everdf = everdf.groupby('loan_id', method='hash').max()\n", + " del(gdf)\n", + " everdf['ever_30'] = (everdf['max_current_loan_delinquency_status'] >= 1).astype('int8')\n", + " everdf['ever_90'] = (everdf['max_current_loan_delinquency_status'] >= 3).astype('int8')\n", + " everdf['ever_180'] = (everdf['max_current_loan_delinquency_status'] >= 6).astype('int8')\n", + " everdf.drop_column('max_current_loan_delinquency_status')\n", + " return everdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_delinq_features(gdf, **kwargs):\n", + " delinq_gdf = gdf[['loan_id', 'monthly_reporting_period', 'current_loan_delinquency_status']]\n", + " del(gdf)\n", + " delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()\n", + " delinq_30['delinquency_30'] = delinq_30['min_monthly_reporting_period']\n", + " delinq_30.drop_column('min_monthly_reporting_period')\n", + " delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()\n", + " delinq_90['delinquency_90'] = delinq_90['min_monthly_reporting_period']\n", + " delinq_90.drop_column('min_monthly_reporting_period')\n", + " delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()\n", + " delinq_180['delinquency_180'] = delinq_180['min_monthly_reporting_period']\n", + " delinq_180.drop_column('min_monthly_reporting_period')\n", + " del(delinq_gdf)\n", + " delinq_merge = delinq_30.merge(delinq_90, how='left', on=['loan_id'], type='hash')\n", + " delinq_merge['delinquency_90'] = delinq_merge['delinquency_90'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))\n", + " delinq_merge = delinq_merge.merge(delinq_180, how='left', on=['loan_id'], type='hash')\n", + " delinq_merge['delinquency_180'] = delinq_merge['delinquency_180'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))\n", + " del(delinq_30)\n", + " del(delinq_90)\n", + " del(delinq_180)\n", + " return delinq_merge" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def join_ever_delinq_features(everdf_tmp, delinq_merge, **kwargs):\n", + 
" everdf = everdf_tmp.merge(delinq_merge, on=['loan_id'], how='left', type='hash')\n", + " del(everdf_tmp)\n", + " del(delinq_merge)\n", + " everdf['delinquency_30'] = everdf['delinquency_30'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))\n", + " everdf['delinquency_90'] = everdf['delinquency_90'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))\n", + " everdf['delinquency_180'] = everdf['delinquency_180'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))\n", + " return everdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_joined_df(gdf, everdf, **kwargs):\n", + " test = gdf[['loan_id', 'monthly_reporting_period', 'current_loan_delinquency_status', 'current_actual_upb']]\n", + " del(gdf)\n", + " test['timestamp'] = test['monthly_reporting_period']\n", + " test.drop_column('monthly_reporting_period')\n", + " test['timestamp_month'] = test['timestamp'].dt.month\n", + " test['timestamp_year'] = test['timestamp'].dt.year\n", + " test['delinquency_12'] = test['current_loan_delinquency_status']\n", + " test.drop_column('current_loan_delinquency_status')\n", + " test['upb_12'] = test['current_actual_upb']\n", + " test.drop_column('current_actual_upb')\n", + " test['upb_12'] = test['upb_12'].fillna(999999999)\n", + " test['delinquency_12'] = test['delinquency_12'].fillna(-1)\n", + " \n", + " joined_df = test.merge(everdf, how='left', on=['loan_id'], type='hash')\n", + " del(everdf)\n", + " del(test)\n", + " \n", + " joined_df['ever_30'] = joined_df['ever_30'].fillna(-1)\n", + " joined_df['ever_90'] = joined_df['ever_90'].fillna(-1)\n", + " joined_df['ever_180'] = joined_df['ever_180'].fillna(-1)\n", + " joined_df['delinquency_30'] = joined_df['delinquency_30'].fillna(-1)\n", + " joined_df['delinquency_90'] = joined_df['delinquency_90'].fillna(-1)\n", + " joined_df['delinquency_180'] = joined_df['delinquency_180'].fillna(-1)\n", + " \n", + " joined_df['timestamp_year'] = joined_df['timestamp_year'].astype('int32')\n", + " joined_df['timestamp_month'] = joined_df['timestamp_month'].astype('int32')\n", + " \n", + " return joined_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_12_mon_features(joined_df, **kwargs):\n", + " testdfs = []\n", + " n_months = 12\n", + " for y in range(1, n_months + 1):\n", + " tmpdf = joined_df[['loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12', 'upb_12']]\n", + " tmpdf['josh_months'] = tmpdf['timestamp_year'] * 12 + tmpdf['timestamp_month']\n", + " tmpdf['josh_mody_n'] = ((tmpdf['josh_months'].astype('float64') - 24000 - y) / 12).floor()\n", + " tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'})\n", + " tmpdf['delinquency_12'] = (tmpdf['max_delinquency_12']>3).astype('int32')\n", + " tmpdf['delinquency_12'] +=(tmpdf['min_upb_12']==0).astype('int32')\n", + " tmpdf.drop_column('max_delinquency_12')\n", + " tmpdf['upb_12'] = tmpdf['min_upb_12']\n", + " tmpdf.drop_column('min_upb_12')\n", + " tmpdf['timestamp_year'] = (((tmpdf['josh_mody_n'] * n_months) + 24000 + (y - 1)) / 12).floor().astype('int16')\n", + " tmpdf['timestamp_month'] = np.int8(y)\n", + " tmpdf.drop_column('josh_mody_n')\n", + " testdfs.append(tmpdf)\n", + " del(tmpdf)\n", + " del(joined_df)\n", + "\n", + " return cudf.concat(testdfs)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def combine_joined_12_mon(joined_df, testdf, **kwargs):\n", + " joined_df.drop_column('delinquency_12')\n", + " joined_df.drop_column('upb_12')\n", + " joined_df['timestamp_year'] = joined_df['timestamp_year'].astype('int16')\n", + " joined_df['timestamp_month'] = joined_df['timestamp_month'].astype('int8')\n", + " return joined_df.merge(testdf, how='left', on=['loan_id', 'timestamp_year', 'timestamp_month'], type='hash')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def final_performance_delinquency(gdf, joined_df, **kwargs):\n", + " merged = null_workaround(gdf)\n", + " joined_df = null_workaround(joined_df)\n", + " merged['timestamp_month'] = merged['monthly_reporting_period'].dt.month\n", + " merged['timestamp_month'] = merged['timestamp_month'].astype('int8')\n", + " merged['timestamp_year'] = merged['monthly_reporting_period'].dt.year\n", + " merged['timestamp_year'] = merged['timestamp_year'].astype('int16')\n", + " merged = merged.merge(joined_df, how='left', on=['loan_id', 'timestamp_year', 'timestamp_month'], type='hash')\n", + " merged.drop_column('timestamp_year')\n", + " merged.drop_column('timestamp_month')\n", + " return merged" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def join_perf_acq_gdfs(perf, acq, **kwargs):\n", + " perf = null_workaround(perf)\n", + " acq = null_workaround(acq)\n", + " return perf.merge(acq, how='left', on=['loan_id'], type='hash')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def last_mile_cleaning(df, **kwargs):\n", + " drop_list = [\n", + " 'loan_id', 'orig_date', 'first_pay_date', 'seller_name',\n", + " 'monthly_reporting_period', 'last_paid_installment_date', 'maturity_date', 'ever_30', 'ever_90', 'ever_180',\n", + " 'delinquency_30', 'delinquency_90', 'delinquency_180', 'upb_12',\n", + " 'zero_balance_effective_date','foreclosed_after', 'disposition_date','timestamp'\n", + " ]\n", + " for column in drop_list:\n", + " df.drop_column(column)\n", + " for col, dtype in df.dtypes.iteritems():\n", + " if str(dtype)=='category':\n", + " df[col] = df[col].cat.codes\n", + " df[col] = df[col].astype('float32')\n", + " df['delinquency_12'] = df['delinquency_12'] > 0\n", + " df['delinquency_12'] = df['delinquency_12'].fillna(False).astype('int32')\n", + " for column in df.columns:\n", + " df[column] = df[column].fillna(-1)\n", + " return df.to_arrow(index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process the data using the functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dask + cuDF multi-year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "# NOTE: The ETL calculates additional features which are then dropped before creating the XGBoost DMatrix.\n", + "# This can be optimized to avoid calculating the dropped features.\n", + "\n", + "gpu_dfs = []\n", + "gpu_time = 0\n", + "quarter = 1\n", + "year = start_year\n", + "count = 0\n", + "while year != end_year:\n", + " for file in glob(os.path.join(perf_data_path + \"/Performance_\" + str(year) + \"Q\" + str(quarter) + \"*\")):\n", + " gpu_dfs.append(process_quarter_gpu(year=year, quarter=quarter, perf_file=file))\n", + " count += 1\n", + " quarter += 1\n", 
+ " if quarter == 5:\n", + " year += 1\n", + " quarter = 1\n", + "wait(gpu_dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.run(cudf._gdf.rmm_finalize)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.run(initialize_rmm_no_pool)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GPU Machine Learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dxgb_gpu_params = {\n", + " 'nround': 100,\n", + " 'max_depth': 8,\n", + " 'max_leaves': 2**8,\n", + " 'alpha': 0.9,\n", + " 'eta': 0.1,\n", + " 'gamma': 0.1,\n", + " 'learning_rate': 0.1,\n", + " 'subsample': 1,\n", + " 'reg_lambda': 1,\n", + " 'scale_pos_weight': 2,\n", + " 'min_child_weight': 30,\n", + " 'tree_method': 'gpu_hist',\n", + " 'n_gpus': 1,\n", + " 'distributed_dask': True,\n", + " 'loss': 'ls',\n", + " 'objective': 'gpu:reg:linear',\n", + " 'max_features': 'auto',\n", + " 'criterion': 'friedman_mse',\n", + " 'grow_policy': 'lossguide',\n", + " 'verbose': True\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "gpu_dfs = [delayed(DataFrame.from_arrow)(gpu_df) for gpu_df in gpu_dfs[:part_count]]\n", + "gpu_dfs = [gpu_df for gpu_df in gpu_dfs]\n", + "wait(gpu_dfs)\n", + "\n", + "tmp_map = [(gpu_df, list(client.who_has(gpu_df).values())[0]) for gpu_df in gpu_dfs]\n", + "new_map = {}\n", + "for key, value in tmp_map:\n", + " if value not in new_map:\n", + " new_map[value] = [key]\n", + " else:\n", + " new_map[value].append(key)\n", + "\n", + "del(tmp_map)\n", + "gpu_dfs = []\n", + "for list_delayed in new_map.values():\n", + " gpu_dfs.append(delayed(cudf.concat)(list_delayed))\n", + "\n", + "del(new_map)\n", + "gpu_dfs = [(gpu_df[['delinquency_12']], gpu_df[delayed(list)(gpu_df.columns.difference(['delinquency_12']))]) for gpu_df in gpu_dfs]\n", + "gpu_dfs = [(gpu_df[0].persist(), gpu_df[1].persist()) for gpu_df in gpu_dfs]\n", + "\n", + "gpu_dfs = [dask.delayed(xgb.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs]\n", + "gpu_dfs = [gpu_df.persist() for gpu_df in gpu_dfs]\n", + "gc.collect()\n", + "wait(gpu_dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "labels = None\n", + "bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/utils/conda_create_cudf.sh b/utils/conda_create_cudf.sh new file mode 100755 index 00000000..0b176402 --- /dev/null +++ b/utils/conda_create_cudf.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +export PYTHON_VERSION=3.6 +export NUMBA_VERSION=0.40.0 +export NUMPY_VERSION=1.14.5 +export PANDAS_VERSION=0.20.3 +export PYARROW_VERSION=0.10 + +echo -e "\n" +echo "attempting to remove current conda environment 
cudf" +conda-env remove --name cudf --quiet --yes +echo "creating dev environment for cudf" +echo -e "\n" +conda update --name base --yes conda && \ +conda install --yes python=$PYTHON_VERSION && \ +conda create --name cudf --yes python=$PYTHON_VERSION && \ +conda install --name cudf --yes --channel conda-forge \ + --channel numba \ + --channel nvidia \ + nvstrings \ + bokeh \ + cmake \ + dask \ + pytest \ + pycparser \ + cffi \ + cython \ + jupyterlab \ + numba=$NUMBA_VERSION \ + numpy=$NUMPY_VERSION \ + numpy-base=$NUMPY_VERSION \ + pandas=$PANDAS_VERSION \ + pyarrow=$PYARROW_VERSION \ + scikit-learn \ + scipy && \ +conda clean --all --yes && \ +echo -e "\n" && \ +echo "successfully created environment cudf" && \ +echo -e "\n" diff --git a/utils/dask-cluster.py b/utils/dask-cluster.py new file mode 100644 index 00000000..fd051e10 --- /dev/null +++ b/utils/dask-cluster.py @@ -0,0 +1,63 @@ +import subprocess + +dask_conf_path = "./dask.conf" +with open(dask_conf_path, "r") as file: + dask_conf = file.read() + +_dask_conf = dask_conf.split("\n") +dask_conf = list() +for i, line in enumerate(_dask_conf): + line = line.split() + if 0 < len(line): + dask_conf.append(line) + +cmd = "bash ./dask-setup.sh 0" + +print(cmd) + +process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) +output, error = process.communicate() + +cmd = "hostname --all-ip-addresses" +process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) +output, error = process.communicate() +IPADDR = str(output.decode()).split()[0] + +NWORKERS_PER_NODE = None; +DASK_SCHED_PORT = None; +DASK_SCHED_BOKEH_PORT = None; +DASK_WORKER_BOKEH_PORT = None; +MASTER_IPADDR = None; +WHOAMI = None + +for line in dask_conf: + if line[0] == "NWORKERS_PER_NODE": + NWORKERS_PER_NODE = line[1] + if line[0] == "DASK_SCHED_PORT": + DASK_SCHED_PORT = line[1] + if line[0] == "DASK_SCHED_BOKEH_PORT": + DASK_SCHED_BOKEH_PORT = line[1] + if line[0] == "DASK_WORKER_BOKEH_PORT": + DASK_WORKER_BOKEH_PORT = line[1] + if line[1] == "MASTER": + MASTER_IPADDR = line[0] + if line[0] == IPADDR: + WHOAMI = line[1] + +cmd = "bash ./dask-setup.sh " + str(NWORKERS_PER_NODE) +cmd = cmd + " " + str(DASK_SCHED_PORT) +cmd = cmd + " " + str(DASK_SCHED_BOKEH_PORT) +cmd = cmd + " " + str(DASK_WORKER_BOKEH_PORT) +cmd = cmd + " " + str(MASTER_IPADDR) +cmd = cmd + " " + str(WHOAMI) + +print(cmd) + +process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) +output, error = process.communicate() + +cmd = "screen -list" + +process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) +output, error = process.communicate() +print(output.decode()) \ No newline at end of file diff --git a/utils/dask-setup.sh b/utils/dask-setup.sh new file mode 100755 index 00000000..7ba64b91 --- /dev/null +++ b/utils/dask-setup.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# export NCCL_P2P_DISABLE=1 +# export NCCL_SOCKET_IFNAME=ib + +export DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False +export DASK_DISTRIBUTED__SCHEDULER__BANDWIDTH=1 + +NWORKERS_PER_NODE=$1 +DASK_SCHED_PORT=$2 +DASK_SCHED_BOKEH_PORT=$3 +DASK_WORKER_BOKEH_PORT=$4 +MASTER_IPADDR=$5 +WHOAMI=$6 + +DASK_LOCAL_DIR=./.dask +NUM_GPUS=$(nvidia-smi --list-gpus | wc --lines) +MY_IPADDR=($(hostname --all-ip-addresses)) + +mkdir -p $DASK_LOCAL_DIR + +echo -e "\n" + +echo "shutting down current dask cluster if it exists..." 
+NUM_SCREENS=$(screen -list | grep --only-matching --extended-regexp '[0-9]\ Socket|[0-9]{1,10}\ Sockets' | grep --only-matching --extended-regexp '[0-9]{1,10}') +SCREENS=($(screen -list | grep --only-matching --extended-regexp '[0-9]{1,10}\.dask|[0-9]{1,10}\.gpu' | grep --only-matching --extended-regexp '[0-9]{1,10}')) +if [[ $NUM_SCREENS > 0 ]]; then + screen -wipe + for screen_id in $(seq 1 $NUM_SCREENS); + do + index=$screen_id-1 + echo ${SCREENS[$index]} + screen -S ${SCREENS[$index]} -X quit + done +fi +echo "... cluster shut down" + +echo -e "\n" + +if [[ "0" -lt "$NWORKERS_PER_NODE" ]] && [[ "$NWORKERS_PER_NODE" -le "$NUM_GPUS" ]]; then + + if [[ "$WHOAMI" = "MASTER" ]]; then + echo "initializing dask scheduler..." + screen -dmS dask_scheduler bash -c "source activate cudf_dev && dask-scheduler" + sleep 5 + echo "... scheduler started" + fi + + echo -e "\n" + + echo "starting $NWORKERS_PER_NODE worker(s)..." + declare -a WIDS + for worker_id in $(seq 1 $NWORKERS_PER_NODE); + do + start=$(( worker_id - 1 )) + end=$(( NWORKERS_PER_NODE - 1 )) + other=$(( start - 1 )) + devs=$(seq --separator=, $start $end) + second=$(seq --separator=, 0 $other) + if [ "$second" != "" ]; then + devs="$devs,$second" + fi + echo "... starting gpu worker $worker_id" + # change the following command to read "... cuda-memcheck dask-worker ..." for debugging + export create_worker="source activate cudf_dev && \ + dask-worker $MASTER_IPADDR:$DASK_SCHED_PORT \ + --host=${MY_IPADDR[0]} --no-nanny \ + --nprocs=1 --nthreads=1 \ + --memory-limit=0 --name ${MY_IPADDR[0]}_gpu_$worker_id \ + --local-directory $DASK_LOCAL_DIR/$name" + # the following specifies the location for the log files ... uncomment for debugging + # export logfile="${DASK_LOCAL_DIR}/gpu_worker_${worker_id}_log.txt" + env CUDA_VISIBLE_DEVICES=$devs screen -dmS gpu_worker_$worker_id \ + bash -c "$create_worker" + # bash -c 'script -c "$create_worker" "$logfile"' # uncomment this line for debugging + WIDS[$id]=$! + done + sleep 5 + + echo -e "\n" + + echo "... $NWORKERS_PER_NODE worker(s) successfully started" + + echo -e "\n" +fi + +if [[ "$NWORKERS_PER_NODE" -eq "0" ]]; then + NUM_SCREENS=$(screen -list | grep --only-matching --extended-regexp '[0-9]\ Socket|[0-9]{1,10}\ Sockets' | grep --only-matching --extended-regexp '[0-9]{1,10}') + if [[ $NUM_SCREENS == "" ]]; then + echo "cluster shut down successfully" + echo "verifying status:" + screen -list + fi +fi + +if [[ "0" -lt "$NWORKERS_PER_NODE" ]]; then + echo "printing status ..." 
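+  # each worker runs in a detached screen session named gpu_worker_<id>, so the
+  # listing below should show one session per worker, plus dask_scheduler on the MASTER node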
+ echo -e "\n" + screen -list + echo -e "\n" +fi diff --git a/utils/dask.conf b/utils/dask.conf new file mode 100644 index 00000000..040f0bfc --- /dev/null +++ b/utils/dask.conf @@ -0,0 +1,7 @@ +NWORKERS_PER_NODE 8 + +12.34.567.890 MASTER + +DASK_SCHED_PORT 8786 +DASK_SCHED_BOKEH_PORT 8787 +DASK_WORKER_BOKEH_PORT 8790 \ No newline at end of file diff --git a/utils/start_jupyter.sh b/utils/start_jupyter.sh new file mode 100755 index 00000000..10ee4da2 --- /dev/null +++ b/utils/start_jupyter.sh @@ -0,0 +1,5 @@ +#!/bin/bash +nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' > /dev/null 2>&1 & +echo -e "\n" +echo "nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='rapids' > /dev/null 2>&1 &" +echo -e "\n" diff --git a/utils/stop_jupyter.sh b/utils/stop_jupyter.sh new file mode 100755 index 00000000..aaaa0028 --- /dev/null +++ b/utils/stop_jupyter.sh @@ -0,0 +1,7 @@ +#!/bin/bash +ps aux | grep jupyter | \ + grep --extended-regexp "$USER[\ ]{1,10}[0-9]{1,10}" | \ + grep --only-matching --extended-regexp "$USER[\ ]{1,10}[0-9]{1,10}" | \ + grep --only-matching --extended-regexp "[\ ]{1,10}[0-9]{1,10}" | \ + xargs kill -9 +sleep 2 \ No newline at end of file From 0b0a121beaa766e1bac61a47282d5a12f0f6dcca Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Mon, 3 Dec 2018 15:31:39 -0800 Subject: [PATCH 02/21] renaming dir for clarity --- {notebooks => mortgage}/E2E.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {notebooks => mortgage}/E2E.ipynb (100%) diff --git a/notebooks/E2E.ipynb b/mortgage/E2E.ipynb similarity index 100% rename from notebooks/E2E.ipynb rename to mortgage/E2E.ipynb From e9a993012d261eb077c14045634e5c795fe1c92c Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Mon, 3 Dec 2018 15:33:22 -0800 Subject: [PATCH 03/21] disabling nccl p2p --- utils/dask-setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/dask-setup.sh b/utils/dask-setup.sh index 7ba64b91..bcbddfdd 100755 --- a/utils/dask-setup.sh +++ b/utils/dask-setup.sh @@ -1,5 +1,5 @@ #!/bin/bash -# export NCCL_P2P_DISABLE=1 +export NCCL_P2P_DISABLE=1 # export NCCL_SOCKET_IFNAME=ib export DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False From df7c718c38c914cecf02fde281813b6639d910f3 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 09:21:29 -0800 Subject: [PATCH 04/21] ENH updated script names and gave dask setup script a new DEBUG option for increased logging --- ...da_create_cudf.sh => conda-create-cudf.sh} | 0 utils/dask-setup.sh | 35 ++++++++++++------- utils/start-jupyter.sh | 5 +++ utils/start_jupyter.sh | 5 --- utils/{stop_jupyter.sh => stop-jupyter.sh} | 0 5 files changed, 28 insertions(+), 17 deletions(-) rename utils/{conda_create_cudf.sh => conda-create-cudf.sh} (100%) create mode 100755 utils/start-jupyter.sh delete mode 100755 utils/start_jupyter.sh rename utils/{stop_jupyter.sh => stop-jupyter.sh} (100%) diff --git a/utils/conda_create_cudf.sh b/utils/conda-create-cudf.sh similarity index 100% rename from utils/conda_create_cudf.sh rename to utils/conda-create-cudf.sh diff --git a/utils/dask-setup.sh b/utils/dask-setup.sh index bcbddfdd..06cdba22 100755 --- a/utils/dask-setup.sh +++ b/utils/dask-setup.sh @@ -11,6 +11,7 @@ DASK_SCHED_BOKEH_PORT=$3 DASK_WORKER_BOKEH_PORT=$4 MASTER_IPADDR=$5 WHOAMI=$6 +DEBUG=$7 DASK_LOCAL_DIR=./.dask NUM_GPUS=$(nvidia-smi --list-gpus | wc --lines) @@ -60,18 +61,28 @@ if [[ "0" -lt "$NWORKERS_PER_NODE" ]] && [[ "$NWORKERS_PER_NODE" -le "$NUM_GPUS" devs="$devs,$second" fi 
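    # e.g. with 8 workers per node, worker 3 ends up with CUDA_VISIBLE_DEVICES="2,3,4,5,6,7,0,1":
    # every worker sees all GPUs, but the rotation starts at a different physical device,
    # so each worker defaults to a different GPU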
echo "... starting gpu worker $worker_id" - # change the following command to read "... cuda-memcheck dask-worker ..." for debugging - export create_worker="source activate cudf_dev && \ - dask-worker $MASTER_IPADDR:$DASK_SCHED_PORT \ - --host=${MY_IPADDR[0]} --no-nanny \ - --nprocs=1 --nthreads=1 \ - --memory-limit=0 --name ${MY_IPADDR[0]}_gpu_$worker_id \ - --local-directory $DASK_LOCAL_DIR/$name" - # the following specifies the location for the log files ... uncomment for debugging - # export logfile="${DASK_LOCAL_DIR}/gpu_worker_${worker_id}_log.txt" - env CUDA_VISIBLE_DEVICES=$devs screen -dmS gpu_worker_$worker_id \ - bash -c "$create_worker" - # bash -c 'script -c "$create_worker" "$logfile"' # uncomment this line for debugging + + if [[ "$DEBUG" = "DEBUG" ]]; then + export create_worker="source activate cudf && \ + cuda-memcheck dask-worker $MASTER_IPADDR:$DASK_SCHED_PORT \ + --host=${MY_IPADDR[0]} --no-nanny \ + --nprocs=1 --nthreads=1 \ + --memory-limit=0 --name ${MY_IPADDR[0]}_gpu_$worker_id \ + --local-directory $DASK_LOCAL_DIR/$name" + export logfile="${DASK_LOCAL_DIR}/gpu_worker_${worker_id}_log.txt" + env CUDA_VISIBLE_DEVICES=$devs screen -dmS gpu_worker_$worker_id \ + bash -c 'script -c "$create_worker" "$logfile"' + else + export create_worker="source activate cudf && \ + dask-worker $MASTER_IPADDR:$DASK_SCHED_PORT \ + --host=${MY_IPADDR[0]} --no-nanny \ + --nprocs=1 --nthreads=1 \ + --memory-limit=0 --name ${MY_IPADDR[0]}_gpu_$worker_id \ + --local-directory $DASK_LOCAL_DIR/$name" + env CUDA_VISIBLE_DEVICES=$devs screen -dmS gpu_worker_$worker_id \ + bash -c "$create_worker" + fi + WIDS[$id]=$! done sleep 5 diff --git a/utils/start-jupyter.sh b/utils/start-jupyter.sh new file mode 100755 index 00000000..cec23064 --- /dev/null +++ b/utils/start-jupyter.sh @@ -0,0 +1,5 @@ +#!/bin/bash +echo -e "\n" +echo "jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token=''" +echo -e "\n" +jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' \ No newline at end of file diff --git a/utils/start_jupyter.sh b/utils/start_jupyter.sh deleted file mode 100755 index 10ee4da2..00000000 --- a/utils/start_jupyter.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' > /dev/null 2>&1 & -echo -e "\n" -echo "nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='rapids' > /dev/null 2>&1 &" -echo -e "\n" diff --git a/utils/stop_jupyter.sh b/utils/stop-jupyter.sh similarity index 100% rename from utils/stop_jupyter.sh rename to utils/stop-jupyter.sh From aa701661d840c2da3d6513e9633ce6b4047ef0ad Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 09:39:10 -0800 Subject: [PATCH 05/21] adding readme --- utils/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 utils/README.md diff --git a/utils/README.md b/utils/README.md new file mode 100644 index 00000000..2e2fb380 --- /dev/null +++ b/utils/README.md @@ -0,0 +1,19 @@ +# Utility Scripts + +## Quick Start + +* `conda-create-cudf.sh` +* `start-jupyter.sh` +* `stop-jupyter.sh` +* `dask-cluster.py` +* `dask-setup.sh` + +## conda-create-cudf + +## start-jupyter + +## stop-jupyter + +## dask-cluster + +## dask-setup From e9f49a3afeb4554a948009ad78d544d7cf91ecce Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 10:20:16 -0800 Subject: [PATCH 06/21] updated NWORKERS_PER_NODE to NWORKERS to reflect heterogeneous cluster setups --- utils/README.md | 10 +++++----- 1 
file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/README.md b/utils/README.md index 2e2fb380..3dbba2fa 100644 --- a/utils/README.md +++ b/utils/README.md @@ -2,11 +2,11 @@ ## Quick Start -* `conda-create-cudf.sh` -* `start-jupyter.sh` -* `stop-jupyter.sh` -* `dask-cluster.py` -* `dask-setup.sh` +* `conda-create-cudf.sh`: creates a conda environment named `cudf` with all of the requisite software dependencies for [cuDF](https://github.com/rapidsai/cudf) +* `start-jupyter.sh`: starts a JupyterLab environment for interacting with, and running, notebooks +* `stop-jupyter.sh`: identifies all process IDs associated with Jupyter and kills them +* `dask-cluster.py`: launches a configured Dask cluster (a set of nodes) for use within a notebook +* `dask-setup.sh`: a low-level script for constructing a set of Dask workers on a single node ## conda-create-cudf From ea52728be4e7853e633647253178316d68d86ac9 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 10:21:19 -0800 Subject: [PATCH 07/21] adding content to readme --- utils/README.md | 77 ++++++++++++++++++++++++++++++++++++++++++- utils/dask-cluster.py | 9 ++--- utils/dask-setup.sh | 16 ++++----- utils/dask.conf | 2 +- 4 files changed, 90 insertions(+), 14 deletions(-) diff --git a/utils/README.md b/utils/README.md index 3dbba2fa..65847bc0 100644 --- a/utils/README.md +++ b/utils/README.md @@ -6,14 +6,89 @@ * `start-jupyter.sh`: starts a JupyterLab environment for interacting with, and running, notebooks * `stop-jupyter.sh`: identifies all process IDs associated with Jupyter and kills them * `dask-cluster.py`: launches a configured Dask cluster (a set of nodes) for use within a notebook -* `dask-setup.sh`: a low-level script for constructing a set of Dask workers on a single node +* `dask-setup.sh`: a low-level script for constructing a set of Dask workers on a single node ... **do not use this script directly** ## conda-create-cudf +Typical output (suppressing package plans) will be of the following form: + +```bash +notebooks$ bash utils/conda-create-cudf.sh + +attempting to remove current conda environment cudf + +Remove all packages in environment /conda/envs/cudf: + +... + +creating dev environment for cudf + +... + +successfully created environment cudf + +``` + +Activate the dev environment with + +```bash +$ source activate cudf +``` + ## start-jupyter +Typical output for `start-jupyter.sh` will be of the following form: + +```bash + +jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' + + +[I 09:58:01.481 LabApp] Writing notebook server cookie secret to /run/user/10060/jupyter/notebook_cookie_secret +[W 09:58:01.928 LabApp] All authentication is disabled. Anyone who can connect to this server will be able to run code. +[I 09:58:01.945 LabApp] JupyterLab extension loaded from /conda/envs/cudf/lib/python3.6/site-packages/jupyterlab +[I 09:58:01.945 LabApp] JupyterLab application directory is /conda/envs/cudf/share/jupyter/lab +[W 09:58:01.946 LabApp] JupyterLab server extension not enabled, manually loading... 
+[I 09:58:01.949 LabApp] JupyterLab extension loaded from /conda/envs/cudf/lib/python3.6/site-packages/jupyterlab +[I 09:58:01.949 LabApp] JupyterLab application directory is /conda/envs/cudf/share/jupyter/lab +[I 09:58:01.950 LabApp] Serving notebooks from local directory: /workspace/notebooks/notebooks +[I 09:58:01.950 LabApp] The Jupyter Notebook is running at: +[I 09:58:01.950 LabApp] http://(dgx15 or 127.0.0.1):8888/ +[I 09:58:01.950 LabApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). +``` + +`jupyter-lab` will expose a JupyterLab server on port `:8888`. Opening a web-browser, and navigating to `http://YOUR.IP.ADDRESS:8888` provides a GUI which can used to edit/run code. + ## stop-jupyter +Sometimes a server needs to be forcibly shut down. Running + +```bash +notebooks$ bash utils/stop-jupyter.sh +``` + +will kill any and all JupyterLab servers running on the machine. + ## dask-cluster +This is a Python script used to launch a Dask cluster. A configuration file is provided at `/path/to/notebooks/utils/dask.conf`. + +```bash +notebooks$ cat utils/dask.conf + +NWORKERS 8 + +12.34.567.890 MASTER + +DASK_SCHED_PORT 8786 +DASK_SCHED_BOKEH_PORT 8787 +DASK_WORKER_BOKEH_PORT 8790 +``` + +* `NWORKERS 8`: a keyword to tell `dask-cluster.py` how many workers to instantiate on the node which called `dask-cluster.py` +* `12.34.567.890 MASTER`: a map of `IP.ADDRESS {WORKER/MASTER}` +* `DASK_SCHED_PORT 8786`: a keyword to tell `dask-cluster.py` which port is assigned to the Dask scheduler +* `DASK_SCHED_BOKEH_PORT 8787`: a keyword to tell `dask-cluster.py` which port is assigned to the scheduler's visual front-end +* `DASK_WORKER_BOKEH_PORT 8790`: a keyword to tell `dask-cluster.py` which port is assigned to the worker's visual front-end + ## dask-setup diff --git a/utils/dask-cluster.py b/utils/dask-cluster.py index fd051e10..d5248667 100644 --- a/utils/dask-cluster.py +++ b/utils/dask-cluster.py @@ -23,16 +23,17 @@ output, error = process.communicate() IPADDR = str(output.decode()).split()[0] -NWORKERS_PER_NODE = None; +NWORKERS = None; DASK_SCHED_PORT = None; DASK_SCHED_BOKEH_PORT = None; DASK_WORKER_BOKEH_PORT = None; MASTER_IPADDR = None; WHOAMI = None +DEBUG = None for line in dask_conf: - if line[0] == "NWORKERS_PER_NODE": - NWORKERS_PER_NODE = line[1] + if line[0] == "NWORKERS": + NWORKERS = line[1] if line[0] == "DASK_SCHED_PORT": DASK_SCHED_PORT = line[1] if line[0] == "DASK_SCHED_BOKEH_PORT": @@ -44,7 +45,7 @@ if line[0] == IPADDR: WHOAMI = line[1] -cmd = "bash ./dask-setup.sh " + str(NWORKERS_PER_NODE) +cmd = "bash ./dask-setup.sh " + str(NWORKERS) cmd = cmd + " " + str(DASK_SCHED_PORT) cmd = cmd + " " + str(DASK_SCHED_BOKEH_PORT) cmd = cmd + " " + str(DASK_WORKER_BOKEH_PORT) diff --git a/utils/dask-setup.sh b/utils/dask-setup.sh index 06cdba22..cce9c9e6 100755 --- a/utils/dask-setup.sh +++ b/utils/dask-setup.sh @@ -5,7 +5,7 @@ export NCCL_P2P_DISABLE=1 export DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False export DASK_DISTRIBUTED__SCHEDULER__BANDWIDTH=1 -NWORKERS_PER_NODE=$1 +NWORKERS=$1 DASK_SCHED_PORT=$2 DASK_SCHED_BOKEH_PORT=$3 DASK_WORKER_BOKEH_PORT=$4 @@ -37,7 +37,7 @@ echo "... cluster shut down" echo -e "\n" -if [[ "0" -lt "$NWORKERS_PER_NODE" ]] && [[ "$NWORKERS_PER_NODE" -le "$NUM_GPUS" ]]; then +if [[ "0" -lt "$NWORKERS" ]] && [[ "$NWORKERS" -le "$NUM_GPUS" ]]; then if [[ "$WHOAMI" = "MASTER" ]]; then echo "initializing dask scheduler..." 
@@ -48,12 +48,12 @@ if [[ "0" -lt "$NWORKERS_PER_NODE" ]] && [[ "$NWORKERS_PER_NODE" -le "$NUM_GPUS" echo -e "\n" - echo "starting $NWORKERS_PER_NODE worker(s)..." + echo "starting $NWORKERS worker(s)..." declare -a WIDS - for worker_id in $(seq 1 $NWORKERS_PER_NODE); + for worker_id in $(seq 1 $NWORKERS); do start=$(( worker_id - 1 )) - end=$(( NWORKERS_PER_NODE - 1 )) + end=$(( NWORKERS - 1 )) other=$(( start - 1 )) devs=$(seq --separator=, $start $end) second=$(seq --separator=, 0 $other) @@ -89,12 +89,12 @@ if [[ "0" -lt "$NWORKERS_PER_NODE" ]] && [[ "$NWORKERS_PER_NODE" -le "$NUM_GPUS" echo -e "\n" - echo "... $NWORKERS_PER_NODE worker(s) successfully started" + echo "... $NWORKERS worker(s) successfully started" echo -e "\n" fi -if [[ "$NWORKERS_PER_NODE" -eq "0" ]]; then +if [[ "$NWORKERS" -eq "0" ]]; then NUM_SCREENS=$(screen -list | grep --only-matching --extended-regexp '[0-9]\ Socket|[0-9]{1,10}\ Sockets' | grep --only-matching --extended-regexp '[0-9]{1,10}') if [[ $NUM_SCREENS == "" ]]; then echo "cluster shut down successfully" @@ -103,7 +103,7 @@ if [[ "$NWORKERS_PER_NODE" -eq "0" ]]; then fi fi -if [[ "0" -lt "$NWORKERS_PER_NODE" ]]; then +if [[ "0" -lt "$NWORKERS" ]]; then echo "printing status ..." echo -e "\n" screen -list diff --git a/utils/dask.conf b/utils/dask.conf index 040f0bfc..e88cbc7c 100644 --- a/utils/dask.conf +++ b/utils/dask.conf @@ -1,4 +1,4 @@ -NWORKERS_PER_NODE 8 +NWORKERS 8 12.34.567.890 MASTER From 8ee2606f65d81402aeb73e0fd3cd7b7fe6a01eb2 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 12:24:35 -0800 Subject: [PATCH 08/21] added dask-setup content to readme --- utils/README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/utils/README.md b/utils/README.md index 65847bc0..534b0fb2 100644 --- a/utils/README.md +++ b/utils/README.md @@ -92,3 +92,25 @@ DASK_WORKER_BOKEH_PORT 8790 * `DASK_WORKER_BOKEH_PORT 8790`: a keyword to tell `dask-cluster.py` which port is assigned to the worker's visual front-end ## dask-setup + +`dask-setup.sh` expect several inputs, and order matters: + +* `NWORKERS`: number of workers to create +* `DASK_SCHED_PORT`: port to assign the scheduler +* `DASK_SCHED_BOKEH_PORT`: port to assign the scheduler's front-end +* `DASK_WORKER_BOKEH_PORT`: port to assign the worker's front-end +* `YOUR.IP.ADDRESS`: machine's IP address +* `{WORKER/MASTER}`: the node's title +* `DEBUG`: log-level (optional, case-sensitive) + +The script is called like follows: + +```bash +notebooks$ bash utils/dask-setup.sh 8 8786 8787 8790 12.34.567.890 MASTER DEBUG +``` + +Note: `DEBUG` is optional. This script was designed to be called by `dask-cluster.py`. 
It is not meant to be called directly by a user other than to kill all present Dask workers: + +```bash +notebooks$ bash utils/dask-setup.sh 0 +``` \ No newline at end of file From 8654978d1ee15a63d6846dbad1e0d074db0bc749 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 12:26:36 -0800 Subject: [PATCH 09/21] changed headers --- utils/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/README.md b/utils/README.md index 534b0fb2..2bd89126 100644 --- a/utils/README.md +++ b/utils/README.md @@ -1,6 +1,6 @@ # Utility Scripts -## Quick Start +## Quick Summary * `conda-create-cudf.sh`: creates a conda environment named `cudf` with all of the requisite software dependencies for [cuDF](https://github.com/rapidsai/cudf) * `start-jupyter.sh`: starts a JupyterLab environment for interacting with, and running, notebooks From 854d6cfd97f87257430a399e693dd2cd29ab4abc Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 12:48:17 -0800 Subject: [PATCH 10/21] changing relative paths in dask-cluster, udpating E2E to use subprocess instead of cell magic --- mortgage/E2E.ipynb | 25 ++++++++++++++----------- utils/dask-cluster.py | 6 +++--- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/mortgage/E2E.ipynb b/mortgage/E2E.ipynb index fdbad63d..4142b802 100644 --- a/mortgage/E2E.ipynb +++ b/mortgage/E2E.ipynb @@ -34,11 +34,20 @@ "metadata": {}, "outputs": [], "source": [ - "%%bash\n", - "IPADDR=($(hostname --all-ip-addresses))\n", - "bash -c \"../utils/dask-setup.sh 0\"\n", - "bash -c \"../utils/dask-setup.sh 8 8786 8787 8790 ${IPADDR[0]} MASTER\"\n", - "# ^------------------------------ this tells the scheduler how many GPU workers you have on your node" + "import subprocess\n", + "\n", + "cmd = \"hostname --all-ip-addresses\"\n", + "process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n", + "output, error = process.communicate()\n", + "IPADDR = str(output.decode()).split()[0]\n", + "\n", + "cmd = 'bash -c \"../utils/dask-setup.sh 0\"'\n", + "process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n", + "output, error = process.communicate()\n", + "\n", + "cmd = 'bash -c \"../utils/dask-setup.sh 8 8786 8787 8790 ${IPADDR[0]} MASTER\"'\n", + "process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n", + "output, error = process.communicate()" ] }, { @@ -47,16 +56,10 @@ "metadata": {}, "outputs": [], "source": [ - "import subprocess\n", - "\n", "import dask\n", "from dask.delayed import delayed\n", "from dask.distributed import Client, wait\n", "\n", - "cmd = \"hostname --all-ip-addresses\"\n", - "process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n", - "output, error = process.communicate()\n", - "IPADDR = str(output.decode()).split()[0]\n", "_client = IPADDR + str(\":8786\")\n", " \n", "client = dask.distributed.Client(_client)\n", diff --git a/utils/dask-cluster.py b/utils/dask-cluster.py index d5248667..53b072a5 100644 --- a/utils/dask-cluster.py +++ b/utils/dask-cluster.py @@ -1,6 +1,6 @@ import subprocess -dask_conf_path = "./dask.conf" +dask_conf_path = "../utils/dask.conf" with open(dask_conf_path, "r") as file: dask_conf = file.read() @@ -11,7 +11,7 @@ if 0 < len(line): dask_conf.append(line) -cmd = "bash ./dask-setup.sh 0" +cmd = "bash ../utils/dask-setup.sh 0" print(cmd) @@ -45,7 +45,7 @@ if line[0] == IPADDR: WHOAMI = line[1] -cmd = "bash ./dask-setup.sh " + str(NWORKERS) +cmd = "bash ../utils/dask-setup.sh " + str(NWORKERS) cmd = cmd + " " + str(DASK_SCHED_PORT) cmd = cmd + " " + 
str(DASK_SCHED_BOKEH_PORT) cmd = cmd + " " + str(DASK_WORKER_BOKEH_PORT) From 1837e8d79b9c31b6943440dd800881de6142964d Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 14:31:05 -0800 Subject: [PATCH 11/21] added ENVNAME keyword to dask-setup --- utils/README.md | 4 ++++ utils/dask-cluster.py | 13 ++++++++----- utils/dask-setup.sh | 21 +++++++++++---------- utils/dask.conf | 2 ++ 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/utils/README.md b/utils/README.md index 2bd89126..a38e0821 100644 --- a/utils/README.md +++ b/utils/README.md @@ -76,6 +76,8 @@ This is a Python script used to launch a Dask cluster. A configuration file is p ```bash notebooks$ cat utils/dask.conf +ENVNAME cudf + NWORKERS 8 12.34.567.890 MASTER @@ -85,6 +87,7 @@ DASK_SCHED_BOKEH_PORT 8787 DASK_WORKER_BOKEH_PORT 8790 ``` +* `ENVNAME cudf`: a keyword to tell `dask-cluster.py` the name of the virtual environment where `cudf` is installed * `NWORKERS 8`: a keyword to tell `dask-cluster.py` how many workers to instantiate on the node which called `dask-cluster.py` * `12.34.567.890 MASTER`: a map of `IP.ADDRESS {WORKER/MASTER}` * `DASK_SCHED_PORT 8786`: a keyword to tell `dask-cluster.py` which port is assigned to the Dask scheduler @@ -95,6 +98,7 @@ DASK_WORKER_BOKEH_PORT 8790 `dask-setup.sh` expect several inputs, and order matters: +* `ENVNAME`: name of the virtual environment where `cudf` is installed * `NWORKERS`: number of workers to create * `DASK_SCHED_PORT`: port to assign the scheduler * `DASK_SCHED_BOKEH_PORT`: port to assign the scheduler's front-end diff --git a/utils/dask-cluster.py b/utils/dask-cluster.py index 53b072a5..5fcc58b1 100644 --- a/utils/dask-cluster.py +++ b/utils/dask-cluster.py @@ -23,15 +23,18 @@ output, error = process.communicate() IPADDR = str(output.decode()).split()[0] -NWORKERS = None; -DASK_SCHED_PORT = None; -DASK_SCHED_BOKEH_PORT = None; -DASK_WORKER_BOKEH_PORT = None; -MASTER_IPADDR = None; +ENVNAME = None +NWORKERS = None +DASK_SCHED_PORT = None +DASK_SCHED_BOKEH_PORT = None +DASK_WORKER_BOKEH_PORT = None +MASTER_IPADDR = None WHOAMI = None DEBUG = None for line in dask_conf: + if line[0] == "ENVNAME": + ENVNAME = line[1] if line[0] == "NWORKERS": NWORKERS = line[1] if line[0] == "DASK_SCHED_PORT": diff --git a/utils/dask-setup.sh b/utils/dask-setup.sh index cce9c9e6..d90507f3 100755 --- a/utils/dask-setup.sh +++ b/utils/dask-setup.sh @@ -5,13 +5,14 @@ export NCCL_P2P_DISABLE=1 export DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False export DASK_DISTRIBUTED__SCHEDULER__BANDWIDTH=1 -NWORKERS=$1 -DASK_SCHED_PORT=$2 -DASK_SCHED_BOKEH_PORT=$3 -DASK_WORKER_BOKEH_PORT=$4 -MASTER_IPADDR=$5 -WHOAMI=$6 -DEBUG=$7 +ENVNAME=$1 +NWORKERS=$2 +DASK_SCHED_PORT=$3 +DASK_SCHED_BOKEH_PORT=$4 +DASK_WORKER_BOKEH_PORT=$5 +MASTER_IPADDR=$6 +WHOAMI=$7 +DEBUG=$8 DASK_LOCAL_DIR=./.dask NUM_GPUS=$(nvidia-smi --list-gpus | wc --lines) @@ -41,7 +42,7 @@ if [[ "0" -lt "$NWORKERS" ]] && [[ "$NWORKERS" -le "$NUM_GPUS" ]]; then if [[ "$WHOAMI" = "MASTER" ]]; then echo "initializing dask scheduler..." - screen -dmS dask_scheduler bash -c "source activate cudf_dev && dask-scheduler" + screen -dmS dask_scheduler bash -c "source activate $ENVNAME && dask-scheduler" sleep 5 echo "... scheduler started" fi @@ -63,7 +64,7 @@ if [[ "0" -lt "$NWORKERS" ]] && [[ "$NWORKERS" -le "$NUM_GPUS" ]]; then echo "... 
starting gpu worker $worker_id" if [[ "$DEBUG" = "DEBUG" ]]; then - export create_worker="source activate cudf && \ + export create_worker="source activate $ENVNAME && \ cuda-memcheck dask-worker $MASTER_IPADDR:$DASK_SCHED_PORT \ --host=${MY_IPADDR[0]} --no-nanny \ --nprocs=1 --nthreads=1 \ @@ -73,7 +74,7 @@ if [[ "0" -lt "$NWORKERS" ]] && [[ "$NWORKERS" -le "$NUM_GPUS" ]]; then env CUDA_VISIBLE_DEVICES=$devs screen -dmS gpu_worker_$worker_id \ bash -c 'script -c "$create_worker" "$logfile"' else - export create_worker="source activate cudf && \ + export create_worker="source activate $ENVNAME && \ dask-worker $MASTER_IPADDR:$DASK_SCHED_PORT \ --host=${MY_IPADDR[0]} --no-nanny \ --nprocs=1 --nthreads=1 \ diff --git a/utils/dask.conf b/utils/dask.conf index e88cbc7c..8c19ab99 100644 --- a/utils/dask.conf +++ b/utils/dask.conf @@ -1,3 +1,5 @@ +ENVNAME cudf + NWORKERS 8 12.34.567.890 MASTER From 90b113385ed354aa59175676358f41d4dc0842bf Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 16:30:59 -0800 Subject: [PATCH 12/21] adding comments to notebook about where to access data --- mortgage/E2E.ipynb | 19 +++++++++++-------- utils/README.md | 3 +++ utils/dask-cluster.py | 7 ++++++- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/mortgage/E2E.ipynb b/mortgage/E2E.ipynb index 4142b802..fbe124fb 100644 --- a/mortgage/E2E.ipynb +++ b/mortgage/E2E.ipynb @@ -41,13 +41,15 @@ "output, error = process.communicate()\n", "IPADDR = str(output.decode()).split()[0]\n", "\n", - "cmd = 'bash -c \"../utils/dask-setup.sh 0\"'\n", + "cmd = \"../utils/dask-setup.sh 0\"\n", "process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n", "output, error = process.communicate()\n", "\n", - "cmd = 'bash -c \"../utils/dask-setup.sh 8 8786 8787 8790 ${IPADDR[0]} MASTER\"'\n", + "cmd = \"../utils/dask-setup.sh cudf 8 8786 8787 8790 \" + str(IPADDR) + \" MASTER\"\n", "process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n", - "output, error = process.communicate()" + "output, error = process.communicate()\n", + "\n", + "print(output.decode())" ] }, { @@ -72,11 +74,12 @@ "metadata": {}, "outputs": [], "source": [ - "acq_data_path = \"/datasets/mortgage/acquisition\"\n", - "perf_data_path = \"/datasets/mortgage/perf_clean_full_split\"\n", - "col_names_path = \"/datasets/mortgage/names.csv\"\n", + "# to download data for this notebook, visit https://rapidsai.github.io/datasets and update the following paths accordingly\n", + "acq_data_path = \"/path/to/mortgage/acq\"\n", + "perf_data_path = \"/path/to/mortgage/perf\"\n", + "col_names_path = \"/path/to/mortgage/names.csv\"\n", "start_year = 2000\n", - "end_year = 2017 # end_year is not inclusive\n", + "end_year = 2016 # end_year is inclusive\n", "part_count = 16 # the number of data files to train against" ] }, @@ -526,7 +529,7 @@ "quarter = 1\n", "year = start_year\n", "count = 0\n", - "while year != end_year:\n", + "while year <= end_year:\n", " for file in glob(os.path.join(perf_data_path + \"/Performance_\" + str(year) + \"Q\" + str(quarter) + \"*\")):\n", " gpu_dfs.append(process_quarter_gpu(year=year, quarter=quarter, perf_file=file))\n", " count += 1\n", diff --git a/utils/README.md b/utils/README.md index a38e0821..64dfe64e 100644 --- a/utils/README.md +++ b/utils/README.md @@ -85,6 +85,8 @@ NWORKERS 8 DASK_SCHED_PORT 8786 DASK_SCHED_BOKEH_PORT 8787 DASK_WORKER_BOKEH_PORT 8790 + +DEBUG ``` * `ENVNAME cudf`: a keyword to tell `dask-cluster.py` the name of the virtual environment where `cudf` is installed @@ -93,6 +95,7 @@ 
DASK_WORKER_BOKEH_PORT 8790
 * `DASK_SCHED_PORT 8786`: a keyword to tell `dask-cluster.py` which port is assigned to the Dask scheduler
 * `DASK_SCHED_BOKEH_PORT 8787`: a keyword to tell `dask-cluster.py` which port is assigned to the scheduler's visual front-end
 * `DASK_WORKER_BOKEH_PORT 8790`: a keyword to tell `dask-cluster.py` which port is assigned to the worker's visual front-end
+* `DEBUG`: a keyword to tell `dask-cluster.py` to launch all Dask workers to set the log-level to DEBUG
 
 ## dask-setup
 
diff --git a/utils/dask-cluster.py b/utils/dask-cluster.py
index 5fcc58b1..93e3731e 100644
--- a/utils/dask-cluster.py
+++ b/utils/dask-cluster.py
@@ -47,13 +47,18 @@
         MASTER_IPADDR = line[0]
     if line[0] == IPADDR:
         WHOAMI = line[1]
+    if line[0] == "DEBUG":
+        DEBUG = "DEBUG"
 
-cmd = "bash ../utils/dask-setup.sh " + str(NWORKERS)
+cmd = "bash ../utils/dask-setup.sh " + str(ENVNAME)
+cmd = cmd + " " + str(NWORKERS)
 cmd = cmd + " " + str(DASK_SCHED_PORT)
 cmd = cmd + " " + str(DASK_SCHED_BOKEH_PORT)
 cmd = cmd + " " + str(DASK_WORKER_BOKEH_PORT)
 cmd = cmd + " " + str(MASTER_IPADDR)
 cmd = cmd + " " + str(WHOAMI)
+if DEBUG != None:
+    cmd = cmd + " " + str(DEBUG)
 
 print(cmd)

From 372ba2ff0733e20db01dcfa084b4f717e74cae0a Mon Sep 17 00:00:00 2001
From: Matthew Jones
Date: Wed, 5 Dec 2018 16:41:47 -0800
Subject: [PATCH 13/21] added information to readme

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7bbeab42..f63c2507 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,6 @@
-# notebooks
-RAPIDS Sample Notebooks
+# RAPIDS Notebooks and Utilities
+
+## Quick Summary
+
+* `mortgage`: contains notebooks which run ETL + ML on the Mortgage Dataset derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html)
+* `utils`: contains a set of useful scripts for interacting with RAPIDS

From c0d8ffbd92a30414d7d1653a5c37040af932cc36 Mon Sep 17 00:00:00 2001
From: Matthew Jones
Date: Wed, 5 Dec 2018 16:42:52 -0800
Subject: [PATCH 14/21] fixed typo

---
 utils/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/README.md b/utils/README.md
index 64dfe64e..40406fec 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -95,7 +95,7 @@ DEBUG
 * `DASK_SCHED_PORT 8786`: a keyword to tell `dask-cluster.py` which port is assigned to the Dask scheduler
 * `DASK_SCHED_BOKEH_PORT 8787`: a keyword to tell `dask-cluster.py` which port is assigned to the scheduler's visual front-end
 * `DASK_WORKER_BOKEH_PORT 8790`: a keyword to tell `dask-cluster.py` which port is assigned to the worker's visual front-end
-* `DEBUG`: a keyword to tell `dask-cluster.py` to launch all Dask workers to set the log-level to DEBUG
+* `DEBUG`: a keyword to tell `dask-cluster.py` to launch all Dask workers with log-level set to DEBUG
 
 ## dask-setup

From 2e8f862934eb555bb150ddd47a452bc32e785590 Mon Sep 17 00:00:00 2001
From: Matthew Jones
Date: Wed, 5 Dec 2018 16:43:42 -0800
Subject: [PATCH 15/21] updated sample dask configuration

---
 utils/dask.conf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/dask.conf b/utils/dask.conf
index 8c19ab99..b1640f47 100644
--- a/utils/dask.conf
+++ b/utils/dask.conf
@@ -6,4 +6,6 @@ NWORKERS 8
 
 DASK_SCHED_PORT 8786
 DASK_SCHED_BOKEH_PORT 8787
-DASK_WORKER_BOKEH_PORT 8790
\ No newline at end of file
+DASK_WORKER_BOKEH_PORT 8790
+
+DEBUG
\ No newline at end of file

From 20f42fd42768e8878d93cdee04a067be6865736e Mon Sep 17 00:00:00 2001
From: Matthew Jones Date: Wed, 5 Dec 2018 16:45:15 -0800 Subject: [PATCH 16/21] updating readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f63c2507..ea8ed839 100644 --- a/README.md +++ b/README.md @@ -2,5 +2,6 @@ ## Quick Summary -* `mortgage`: contains notebooks which run ETL + ML on the Mortgage Dataset derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) +* `mortgage`: contains the notebook which runs ETL + ML on the Mortgage Dataset derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) +** you * `utils`: contains a set of useful scripts for interacting with RAPIDS From f09bc833021f67cffda2670102bd61d89591d2b5 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 16:46:37 -0800 Subject: [PATCH 17/21] fixing typos --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ea8ed839..179d511f 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,5 @@ ## Quick Summary -* `mortgage`: contains the notebook which runs ETL + ML on the Mortgage Dataset derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) -** you +* `mortgage`: contains the notebook which runs ETL + ML on the Mortgage Dataset derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) ... download the mortgage dataset for use with the notebook [here](https://rapidsai.github.io/datasets/) * `utils`: contains a set of useful scripts for interacting with RAPIDS From 1b93cf82518ad1bbd6f1d17b114e9fe06394b9cc Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 17:39:02 -0800 Subject: [PATCH 18/21] removed conda env script... moving it to cudf --- README.md | 4 +--- utils/README.md | 27 -------------------------- utils/conda-create-cudf.sh | 39 -------------------------------------- 3 files changed, 1 insertion(+), 69 deletions(-) delete mode 100755 utils/conda-create-cudf.sh diff --git a/README.md b/README.md index 179d511f..054eaa85 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -# RAPIDS Notebooks and Utilities - -## Quick Summary +# RAPIDS Notebooks and Notebook Utilities * `mortgage`: contains the notebook which runs ETL + ML on the Mortgage Dataset derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) ... download the mortgage dataset for use with the notebook [here](https://rapidsai.github.io/datasets/) * `utils`: contains a set of useful scripts for interacting with RAPIDS diff --git a/utils/README.md b/utils/README.md index 40406fec..16f203b0 100644 --- a/utils/README.md +++ b/utils/README.md @@ -8,33 +8,6 @@ * `dask-cluster.py`: launches a configured Dask cluster (a set of nodes) for use within a notebook * `dask-setup.sh`: a low-level script for constructing a set of Dask workers on a single node ... **do not use this script directly** -## conda-create-cudf - -Typical output (suppressing package plans) will be of the following form: - -```bash -notebooks$ bash utils/conda-create-cudf.sh - -attempting to remove current conda environment cudf - -Remove all packages in environment /conda/envs/cudf: - -... 
- -creating dev environment for cudf - -... - -successfully created environment cudf - -``` - -Activate the dev environment with - -```bash -$ source activate cudf -``` - ## start-jupyter Typical output for `start-jupyter.sh` will be of the following form: diff --git a/utils/conda-create-cudf.sh b/utils/conda-create-cudf.sh deleted file mode 100755 index 0b176402..00000000 --- a/utils/conda-create-cudf.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -export PYTHON_VERSION=3.6 -export NUMBA_VERSION=0.40.0 -export NUMPY_VERSION=1.14.5 -export PANDAS_VERSION=0.20.3 -export PYARROW_VERSION=0.10 - -echo -e "\n" -echo "attempting to remove current conda environment cudf" -conda-env remove --name cudf --quiet --yes -echo "creating dev environment for cudf" -echo -e "\n" -conda update --name base --yes conda && \ -conda install --yes python=$PYTHON_VERSION && \ -conda create --name cudf --yes python=$PYTHON_VERSION && \ -conda install --name cudf --yes --channel conda-forge \ - --channel numba \ - --channel nvidia \ - nvstrings \ - bokeh \ - cmake \ - dask \ - pytest \ - pycparser \ - cffi \ - cython \ - jupyterlab \ - numba=$NUMBA_VERSION \ - numpy=$NUMPY_VERSION \ - numpy-base=$NUMPY_VERSION \ - pandas=$PANDAS_VERSION \ - pyarrow=$PYARROW_VERSION \ - scikit-learn \ - scipy && \ -conda clean --all --yes && \ -echo -e "\n" && \ -echo "successfully created environment cudf" && \ -echo -e "\n" From d418a4fee94175198dc9d0ee04ca15aa6c45f9a8 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 17:41:12 -0800 Subject: [PATCH 19/21] typos --- utils/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/README.md b/utils/README.md index 16f203b0..0cda81a2 100644 --- a/utils/README.md +++ b/utils/README.md @@ -2,7 +2,6 @@ ## Quick Summary -* `conda-create-cudf.sh`: creates a conda environment named `cudf` with all of the requisite software dependencies for [cuDF](https://github.com/rapidsai/cudf) * `start-jupyter.sh`: starts a JupyterLab environment for interacting with, and running, notebooks * `stop-jupyter.sh`: identifies all process IDs associated with Jupyter and kills them * `dask-cluster.py`: launches a configured Dask cluster (a set of nodes) for use within a notebook From fe0b8aa2676cc1506ef3087e8af11c3b1e5c422d Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Wed, 5 Dec 2018 19:01:32 -0800 Subject: [PATCH 20/21] removing quick phrase from readme --- README.md | 2 +- utils/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 054eaa85..77cb42cd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# RAPIDS Notebooks and Notebook Utilities +# RAPIDS Notebooks and Utilities * `mortgage`: contains the notebook which runs ETL + ML on the Mortgage Dataset derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) ... 
download the mortgage dataset for use with the notebook [here](https://rapidsai.github.io/datasets/)
 * `utils`: contains a set of useful scripts for interacting with RAPIDS
diff --git a/utils/README.md b/utils/README.md
index 0cda81a2..12276262 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -1,6 +1,6 @@
 # Utility Scripts
 
-## Quick Summary
+## Summary
 
 * `start-jupyter.sh`: starts a JupyterLab environment for interacting with, and running, notebooks
 * `stop-jupyter.sh`: identifies all process IDs associated with Jupyter and kills them

From 9b06b31f5227c00b029d57474d1372fa17d81be3 Mon Sep 17 00:00:00 2001
From: Matthew Jones
Date: Wed, 5 Dec 2018 19:04:25 -0800
Subject: [PATCH 21/21] corrected typos in readme

---
 utils/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/README.md b/utils/README.md
index 12276262..9963afcb 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -5,7 +5,7 @@
 * `start-jupyter.sh`: starts a JupyterLab environment for interacting with, and running, notebooks
 * `stop-jupyter.sh`: identifies all process IDs associated with Jupyter and kills them
 * `dask-cluster.py`: launches a configured Dask cluster (a set of nodes) for use within a notebook
-* `dask-setup.sh`: a low-level script for constructing a set of Dask workers on a single node ... **do not use this script directly**
+* `dask-setup.sh`: a low-level script for constructing a set of Dask workers on a single node
 
 ## start-jupyter
 
@@ -71,7 +71,7 @@ DEBUG
 
 ## dask-setup
 
-`dask-setup.sh` expect several inputs, and order matters:
+`dask-setup.sh` expects several inputs, and order matters:
 
 * `ENVNAME`: name of the virtual environment where `cudf` is installed
 * `NWORKERS`: number of workers to create
@@ -82,7 +82,7 @@
 * `{WORKER/MASTER}`: the node's title
 * `DEBUG`: log-level (optional, case-sensitive)
 
-The script is called like follows:
+The script is called as follows:
 
 ```bash
 notebooks$ bash utils/dask-setup.sh cudf 8 8786 8787 8790 12.34.567.890 MASTER DEBUG
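 # The two lines below are an illustrative sketch, not part of the original README.
 # They assume the conda environment is named "cudf" (the ENVNAME keyword in dask.conf)
 # and that the scheduler was started on 12.34.567.890. A worker-only node passes WORKER
 # as the node's title while pointing at the same scheduler address and ports; passing 0
 # as the sole argument is how the E2E notebook resets the cluster before relaunching it.
 notebooks$ bash utils/dask-setup.sh cudf 8 8786 8787 8790 12.34.567.890 WORKER DEBUG
 notebooks$ bash utils/dask-setup.sh 0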