% report.tex

% Options for packages loaded elsewhere
% (queued before \documentclass so they take effect whenever hyperref and
% url are actually loaded later in the preamble)
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
%
% Standard article class, no class options.
\documentclass[
]{article}
% Title-block metadata; typeset by \maketitle at the start of the body.
\title{Predicting Group Life Client Mortality during a Pandemic, Final
Report}
\author{Team Outliers}
\date{August 2, 2021}

% Math support: amsmath environments plus the amssymb symbol set.
\usepackage{amsmath,amssymb}
% Latin Modern fonts.
\usepackage{lmodern}
% iftex supplies the engine tests (\ifPDFTeX here, \ifLuaTeX further down).
\usepackage{iftex}
\ifPDFTeX
  % pdfTeX branch: select the T1 font encoding and declare the source as UTF-8.
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
  \usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
  % Unicode-engine branch: load unicode-math and set default font features.
  \usepackage{unicode-math}
  \defaultfontfeatures{Scale=MatchLowercase}
  \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
% microtype is optional: the document still compiles when it is not installed.
\IfFileExists{microtype.sty}{% use microtype if available
  \usepackage[]{microtype}
  \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph layout: separate paragraphs with vertical space instead of a
% first-line indent. Three cases are handled:
%   - non-KOMA class with parskip.sty installed: load parskip;
%   - non-KOMA class without parskip.sty: set \parindent and \parskip by hand;
%   - KOMA class (detected via \KOMAClassName): use the class's own option.
% \makeatletter is needed because \@ifundefined contains an @ in its name.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
  \IfFileExists{parskip.sty}{%
    \usepackage{parskip}
  }{% else
    \setlength{\parindent}{0pt}
    \setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
  \KOMAoptions{parskip=half}}
\makeatother
% Colour support (used by the syntax-highlighting macros defined below).
\usepackage{xcolor}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
% Prefer bookmark when it is installed; otherwise load hyperref directly.
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
% PDF metadata (kept in sync with \title / \author by hand) and link styling.
\hypersetup{
  pdftitle={Predicting Group Life Client Mortality during a Pandemic, Final Report},
  pdfauthor={Team Outliers},
  hidelinks,
  pdfcreator={LaTeX via pandoc}}
\urlstyle{same} % disable monospaced font for URLs
% One-inch margins on every side.
\usepackage[margin=1in]{geometry}
% ---------------------------------------------------------------------------
% Syntax highlighting for the code listings in the body.
% Listings are written as  Shaded > Highlighting  environments whose content
% is a sequence of \...Tok{...} macros, one per highlighted token.
% ---------------------------------------------------------------------------
% NOTE(review): xcolor is already loaded earlier in this preamble, so this
% \usepackage{color} is probably redundant -- confirm before removing.
\usepackage{color}
\usepackage{fancyvrb}
% Helpers for inline verbatim: a literal vertical bar, and a \Verb variant in
% which backslash and braces keep their command meaning.
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
% Verbatim-like environment in which \, { and } stay active, so the \...Tok
% macros below are executed rather than printed literally.
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
% Light grey background box around each listing (framed's snugshade reads
% the colour named shadecolor).
\usepackage{framed}
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% One macro per token class. Each wraps its argument in a colour and/or a
% font style; classes without styling pass the argument through unchanged.
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
% Table support: longtable for the tables in the body, booktabs for
% \toprule/\midrule/\bottomrule, array for the >{...} column prefixes.
\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
% (etoolbox provides \patchcmd; the patch inserts an empty box before the
% first \par in \longtable when \if@noskipsec is true)
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
% (footnotehyper is preferred when installed; footnote is the fallback)
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
% \maxwidth / \maxheight: the image's natural size, capped at \linewidth and
% \textheight respectively. They read graphicx's internal natural-size
% lengths, hence the \makeatletter.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist is called at the top of every itemize in the body; it zeroes
% the spacing between items. \providecommand leaves any existing definition
% untouched.
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{-\maxdimen} % remove section numbering
% \ifLuaTeX comes from the iftex package loaded above.
\ifLuaTeX
  \usepackage{selnolig}  % disable illegal ligatures
\fi

\begin{document}
\maketitle

\hypertarget{final-report}{%
\section{Final report}\label{final-report}}

IMA Math-to-Industry Bootcamp, Securian Financial

Team members:

\begin{itemize}
\tightlist
\item
  Marc Härkönen
\item
  Samara Chamoun
\item
  Shuxian Xu
\item
  Abba Ramadan
\item
  Lei Yang
\item
  Yuchen Cao
\end{itemize}

Mentor:

\begin{itemize}
\tightlist
\item
  Douglas Armstrong
\end{itemize}

Supervisors:

\begin{itemize}
\tightlist
\item
  Daniel Spirn
\item
  Thomas Höft
\end{itemize}

\hypertarget{executive-summary}{%
\section{Executive summary}\label{executive-summary}}

As Babe Ruth once said: ``Yesterday's home runs don't win today's
games!''

In March 2020, with the pandemic starting, the whole world fell into a
state of uncertainty about the future. Like other businesses, the
insurance sector was affected by the COVID-19 outbreak, and the whole
business landscape needs to address the changes that came along.

We are team Outliers from the Securian Financial Department of Data
Science and we think we have the resources, the expertise and the
determination to present the management team with a whole new set of
information that can help their decision making during a pandemic. With
Group Life Insurance being an important part of our company and for our
clients, there is no doubt that we should look closely at how it is
being affected by the recent events. For the past few weeks, we have
been working on a project that aims to predict Group Life mortality for
our clients during a pandemic.

One might ask how the pandemic is exactly affecting group life
insurance. As we know, life insurance guarantees payment of death
benefits. Since the COVID-19 outbreak, our clients have been experiencing a
higher mortality rate than usual, which has resulted in an unprecedented
increase in claims. Our primary function as data scientists is to
correctly forecast the mortality risk, and the way to do that is by
first tracking the claims performance. We classify clients as high-risk
and low-risk by using one of the most popular metrics to track claims
performance, the Actual-to-Expected ratio (AE). Observing the large
shift of the proportion of clients that are classified as high-risk from
2019 to 2020, we hypothesize that the pre-pandemic, historical AE of a
client is no longer a good predictor of the client's performance during
a pandemic.

We aim to replace this historical AE with a predictive one that can
help our management and sales teams gain better insight into the
possibility that a client experiences an adverse mortality event during
an outbreak. We collect data from the zip codes where the companies are
located: poverty percentage, education level, unemployment rate, etc. We
then combine this information with some characteristics of the companies
such as average age of employees and some pandemic-related resources. We
then apply several machine learning models, validate the results and
build the best possible insight for proper risk-management.

We provide our management team with two models: one is long-term and the
other is short-term. Each of these models serves a different purpose and
brings valuable assets to the company. The long-term model can be used at
a specific time and uses the information of some clients to predict what
can happen to other clients in different zip codes. While working on
this model, our goal was to minimize the loss of money for Securian that
can be caused by a long-term adverse mortality event such as a pandemic.
On one hand, we aim at minimizing the number of clients that were
adverse and predicted otherwise. We also wanted to prevent the company
from losing clients that will perform well, so we simultaneously focused
on minimizing the number of clients that are not adverse and predicted
to be so. The strength of this model lies in understanding the
contributions of different predictors in the performance of the clients.
The management team can have better insights and clarity regarding how
each predictor contributes positively or negatively to the
classification. It is worth noting that adding the AE2019 to the list
of predictors for this model does not yield any additional improvement.

As opposed to the long-term model, the short-term model integrates the
time factor and can react to changes during the pandemic. Not only can
the model predict the future performance of existing clients, it can
also do so for potential new clients.

With these two models in hand, the management team
can gain an accurate and deep understanding of old and new clients'
performance during a pandemic. They can use this enhanced understanding
to determine contract renewals, to negotiate with clients and most
importantly to better face the uncertainties of the future.

\hypertarget{data-wrangling}{%
\section{Data wrangling}\label{data-wrangling}}

In this section, we describe our data gathering and tidying process. We
will be making extensive use of the \texttt{tidyverse} family of
packages. A series of scripts are used to generate tibbles, which are
then saved in a \texttt{*.feather} file for fast loading. A full list of
scripts with their dependencies can be viewed in the Appendix.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidyverse)}
\FunctionTok{library}\NormalTok{(feather)}
\FunctionTok{library}\NormalTok{(lubridate)}
\end{Highlighting}
\end{Shaded}

\hypertarget{data-sources}{%
\subsection{Data sources}\label{data-sources}}

Our dataset consists of two parts: publicly obtained data and simulated
clients. Below we describe our publicly obtained datasets.

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.31}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.28}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.41}}@{}}
\toprule
\begin{minipage}[b]{\linewidth}\raggedright
Filename
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Source
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Description
\end{minipage} \\
\midrule
\endhead
\texttt{covid\_deaths\_usafacts.csv} &
\href{https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/}{USAFacts}
& Cumulative weekly COVID-19 deaths by county \\
\texttt{soa\_base\_2017.csv} & (Sent by Douglas Armstrong) & \(q_x\)
values by gender, age, industry \\
\texttt{Population\_Estimates.csv} &
\href{https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/}{USDA
ERS} & Population estimates of the U.S., states and counties, 2019 \\
\texttt{COVID-19\_Vaccinations...}
&
\href{https://catalog.data.gov/dataset/covid-19-vaccinations-in-the-united-statescounty-8204e}{CDC}
& Overall US COVID-19 Vaccine administration and vaccine equity data at
county level \\
\texttt{Education\_Estimates.csv} &
\href{https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/}{USDA
ERS} & Educational attainment for adults age 25 and older for the U.S.,
states and counties, 2015-19 \\
\texttt{Poverty\_Estimates.csv} &
\href{https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/}{USDA
ERS} & Poverty rates in the U.S., states and counties, 2019 \\
\texttt{Unemployment\_Estimates.csv} &
\href{https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/}{USDA
ERS} & Unemployment rates, 2019 and 2020; median household income,
2019. States and counties \\
\texttt{Vaccine\_Hesitancy...}
&
\href{https://catalog.data.gov/dataset/vaccine-hesitancy-for-covid-19-county-and-local-estimates}{CDC}
& Vaccine hesitancy estimates for COVID-19 \\
\texttt{countypres\_2000-2020.csv} &
\href{https://dataverse.harvard.edu/file.xhtml?fileId=4819117\&version=9.0}{MIT
Election Data + Science Lab} & Election data by county (only 2020
used) \\
\texttt{zcta\_county\_rel\_10.txt} &
\href{https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2010.html\#par_textimage_674173622}{US
Census Bureau} & Zip code to county relationship file (2010) \\
\texttt{2020\_12\_23/reference...} &
\href{http://www.healthdata.org/node/8787}{IHME} & COVID-19 projections
\textbf{as of Dec 23 2020} \\
\texttt{state.txt} &
\href{https://www.census.gov/library/reference/code-lists/ansi.html}{US
Census Bureau} & State names and FIPS codes \\
\bottomrule
\end{longtable}

\hypertarget{us-census-bureau}{%
\subsubsection{US Census Bureau}\label{us-census-bureau}}

We used the US Census Bureau's API to obtain the 2019 estimates for
population and density per county from the Census Bureau's Population
Estimates Program (PEP). The \texttt{censusapi} package provides an R
interface to the API. Using the API requires an API key, which can be
obtained from \href{https://api.census.gov/data/key_signup.html}{here}.
The following snippet fetches the data, and saves the tibble into a file
called \texttt{pop\_den.feather}. See also \texttt{data/census.R}.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(}\StringTok{"censusapi"}\NormalTok{)}

\FunctionTok{Sys.setenv}\NormalTok{(}\AttributeTok{CENSUS\_KEY =} \StringTok{"YOUR\_KEY\_HERE"}\NormalTok{)}

\CommentTok{\# date\_code = 12 is an estimate for July 1, 2019}
\CommentTok{\# total population + density}
\NormalTok{pop }\OtherTok{\textless{}{-}} \FunctionTok{getCensus}\NormalTok{(}
                 \AttributeTok{name =} \StringTok{"pep/population"}\NormalTok{,}
                 \AttributeTok{vintage =} \DecValTok{2019}\NormalTok{,}
                 \AttributeTok{region =} \StringTok{"county:*"}\NormalTok{,}
                 \AttributeTok{vars =} \FunctionTok{c}\NormalTok{(}\StringTok{"POP"}\NormalTok{, }\StringTok{"DENSITY"}\NormalTok{),}
                 \AttributeTok{DATE\_CODE =} \DecValTok{12}\NormalTok{)}
\NormalTok{pop }\OtherTok{\textless{}{-}} \FunctionTok{tibble}\NormalTok{(pop) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{DATE\_CODE)}
\FunctionTok{write\_feather}\NormalTok{(pop, }\StringTok{"pop\_den.feather"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{county-to-zip3}{%
\subsection{County to zip3}\label{county-to-zip3}}

So far all of our public data is expressed by US county, but our
clients' locations are given as ZIP3 codes (the first three digits of a
five-digit zip code). The conversion from county to ZIP3 is nontrivial,
as some zip codes span multiple counties and some counties span multiple
zip codes.

To convert data given by county to ZIP3, we first need a ZIP3 to county
relationship table.

The relationship table contains three columns: ZIP3, County, and
Population. Each row corresponds to a pair
\((\text{ZIP3}, \text{county})\), and the Population column contains the
population in the intersection \(\text{ZIP3} \cap \text{county}\). Then,
given county-level data, we compute the corresponding value for any
given ZIP3 by taking a weighted average of all counties intersecting
that ZIP3, and weighting by the population in
\(\text{ZIP3} \cap \text{county}\). This operation looks as follows in
code (suppose \texttt{A} contains some county-level data, e.g.~poverty
levels):

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{A }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{left\_join}\NormalTok{(zip3\_rel, }\AttributeTok{by =} \StringTok{"county"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(zip3) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}\AttributeTok{poverty =} \FunctionTok{weighted.mean}\NormalTok{(poverty, population, }\AttributeTok{na.rm =} \ConstantTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

We note that in practice, the county is represented by a five-digit FIPS
code. The first two digits indicate the state, and the last three digits
indicate the county.

The relationship table is generated by \texttt{zip3\_rel.R} and can be
loaded from \texttt{zip3\_rel.feather}. For an example of how it's used,
see \texttt{wrangling.Rmd} and \texttt{deaths.R}.

\hypertarget{weekly-deaths-ihme-forecasts}{%
\subsection{Weekly deaths \& IHME
forecasts}\label{weekly-deaths-ihme-forecasts}}

In some of our models we use weekly COVID deaths as a predictor. The
file \texttt{covid\_deaths\_usafacts.csv} contains this data for every
day and every county. We convert the county-level information to zip3 as
above, and convert the daily data to weekly. The library
\texttt{lubridate} doesn't contain a type for week; we use the last day
of the week instead (using
\texttt{lubridate::ceiling\_date(date,\ unit\ =\ "week")}).

We will also be using forecasts from the Institute for Health Metrics
and Evaluation (IHME) to assist our models. These forecasts are only
given by state, so we need to convert states to ZIP3. The file
\texttt{data/state.txt} contains the state FIPS code and state name.
Since some ZIP3 codes span several states, we assign a state to each
ZIP3 code by determining which state is most represented among counties
in the ZIP3.

See also \texttt{data/deaths.R} and \texttt{time.Rmd} (line 856
onwards).

\hypertarget{simulated-client-dataset}{%
\subsection{Simulated client dataset}\label{simulated-client-dataset}}

The clients we were tasked to study were simulated by Securian
Financial. The dataset consists of 20 files called
\texttt{data/simulation\_data/experience\_weekly\_\{n\}.RDS} and
\texttt{data/simulation\_data/person\_\{n\}.RDS} for
\(n = 1,\dotsc, 10\). In total, we have 500 clients and 1,382,321
individuals.

The \texttt{person\_\{n\}.RDS} files contain information such as
company, zip code, age, face amount, gender, and collar (blue or white,
but in this dataset every individual was blue collar). The rows in
\texttt{experience\_weekly\_\{n\}.RDS} correspond to individuals and
weeks, and contain a flag \texttt{death} that becomes 1 on the week
they die. In total, these tables contain 170,025,483 rows, but the same
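information can be conveyed in 1,382,231 rows by attaching to each
% NOTE(review): 1,382,231 here vs. 1,382,321 individuals stated earlier in this subsection; confirm which count is correct.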
individual their death date (or \texttt{NA} if they don't die).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{read\_data }\OtherTok{\textless{}{-}} \ControlFlowTok{function}\NormalTok{(n) \{}
\NormalTok{  exp\_name }\OtherTok{\textless{}{-}} \FunctionTok{str\_glue}\NormalTok{(}\StringTok{"simulation\_data/experience\_weekly\_\{n\}.RDS"}\NormalTok{)}
\NormalTok{  per\_name }\OtherTok{\textless{}{-}} \FunctionTok{str\_glue}\NormalTok{(}\StringTok{"simulation\_data/person\_\{n\}.RDS"}\NormalTok{)}
\NormalTok{  exp }\OtherTok{\textless{}{-}} \FunctionTok{read\_rds}\NormalTok{(exp\_name)}
\NormalTok{  per }\OtherTok{\textless{}{-}} \FunctionTok{read\_rds}\NormalTok{(per\_name)}

\NormalTok{  dies }\OtherTok{\textless{}{-}}
\NormalTok{    exp }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{filter}\NormalTok{(death }\SpecialCharTok{\textgreater{}} \DecValTok{0}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{select}\NormalTok{(client, participant, week, month, year)}
\NormalTok{  aug\_per }\OtherTok{\textless{}{-}}
\NormalTok{    per }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{left\_join}\NormalTok{(dies, }\AttributeTok{by =} \FunctionTok{c}\NormalTok{(}\StringTok{"client"}\NormalTok{, }\StringTok{"participant"}\NormalTok{))}

\NormalTok{  aug\_per}
\NormalTok{\}}

\NormalTok{all\_persons }\OtherTok{\textless{}{-}}\NormalTok{ (}\DecValTok{1}\SpecialCharTok{:}\DecValTok{10}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{map\_dfr}\NormalTok{(read\_data)}
\end{Highlighting}
\end{Shaded}

We noticed that some individuals die more than once. The following
removes the multiple deaths.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{all\_persons }\OtherTok{\textless{}{-}}
\NormalTok{  all\_persons }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(client, participant) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{arrange}\NormalTok{(year, week, }\AttributeTok{.by\_group =} \ConstantTok{TRUE}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{slice\_head}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

We finally attach to each individual their yearly \(q_x\) value, and
save the resulting tibble in
\texttt{data/simulation\_data/all\_persons.feather}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{qx\_table }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{"soa\_base\_2017.csv"}\NormalTok{)}

\NormalTok{all\_persons }\OtherTok{\textless{}{-}}
\NormalTok{  all\_persons }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{left\_join}\NormalTok{(qx\_table, }\AttributeTok{by =} \FunctionTok{c}\NormalTok{(}\StringTok{"Age"}\NormalTok{, }\StringTok{"Sex"}\NormalTok{, }\StringTok{"collar"}\NormalTok{)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(qx, }\AttributeTok{.after =}\NormalTok{ collar)}

\FunctionTok{write\_feather}\NormalTok{(all\_persons }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{ungroup}\NormalTok{(), }\StringTok{"simulation\_data/all\_persons.feather"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

The individual-level dataset is then converted to a client-level
dataset. We summarize each client by taking their ZIP3, size (number of
individuals), volume (sum of face amounts), average qx, average age, and
expected amount of claims. We also compute the weekly total amount of
claims.

See also \texttt{data/all\_persons.r}.

\hypertarget{final-cleanup}{%
\subsection{Final cleanup}\label{final-cleanup}}

Some of our clients are located in ZIP3 codes that we cannot deal with
for various reasons. They correspond to the following areas:

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.29}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.71}}@{}}
\toprule
\begin{minipage}[b]{\linewidth}\raggedright
ZIP3
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Area
\end{minipage} \\
\midrule
\endhead
969 & Guam, Palau, Federated States of Micronesia, Northern Mariana
Islands, Marshall Islands \\
093 & Military bases in Iraq and Afghanistan \\
732 & Not in use \\
872 & Not in use \\
004 & Not in use \\
202 & Washington DC, Government 1 \\
753 & Dallas, TX \\
772 & Houston, TX \\
\bottomrule
\end{longtable}

The final two are problematic since they contained no population in
2010: one is used exclusively by a hospital, and the other is used
exclusively by a mall. Additionally, election data is not available in
Washington D.C., so we remove clients located there. In the end, we have
a total of 492 clients to work with.

The data merging is done in the file \texttt{processed\_data.r} which
generates the file \texttt{data/processed\_data\_20\_12\_23.feather}.
The dependency tree is outlined in the Appendix.

After merging, this gives us a final dataset of 492 clients over 118
weeks ranging from Jan 1st 2019 to June 27th 2021. We make two separate
tibbles.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{weekly\_data }\OtherTok{\textless{}{-}}
  \FunctionTok{read\_feather}\NormalTok{(}\StringTok{"data/processed\_data\_20\_12\_23.feather"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{ae\_2021, }\SpecialCharTok{{-}}\NormalTok{ae\_2020, }\SpecialCharTok{{-}}\NormalTok{ae\_2019,}
         \SpecialCharTok{{-}}\NormalTok{actual\_2021, }\SpecialCharTok{{-}}\NormalTok{actual\_2020, }\SpecialCharTok{{-}}\NormalTok{actual\_2019, }\SpecialCharTok{{-}}\NormalTok{adverse,}
         \SpecialCharTok{{-}}\NormalTok{STATE\_NAME, }\SpecialCharTok{{-}}\NormalTok{shrinkage,  }\SpecialCharTok{{-}}\NormalTok{dep\_var) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{arrange}\NormalTok{(client, date)}

\NormalTok{yearly\_data }\OtherTok{\textless{}{-}}
  \FunctionTok{read\_feather}\NormalTok{(}\StringTok{"data/processed\_data\_20\_12\_23.feather"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(client) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{slice}\NormalTok{(}\DecValTok{1}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{date, }\SpecialCharTok{{-}}\NormalTok{claims, }\SpecialCharTok{{-}}\NormalTok{zip\_deaths, }\SpecialCharTok{{-}}\NormalTok{smoothed\_ae, }\SpecialCharTok{{-}}\NormalTok{shrunk\_ae,}
         \SpecialCharTok{{-}}\NormalTok{class, }\SpecialCharTok{{-}}\NormalTok{smoothed\_deaths,}
         \SpecialCharTok{{-}}\NormalTok{hes, }\SpecialCharTok{{-}}\NormalTok{hes\_uns, }\SpecialCharTok{{-}}\NormalTok{str\_hes, }\SpecialCharTok{{-}}\NormalTok{ae, }\SpecialCharTok{{-}}\NormalTok{dep\_var, }\SpecialCharTok{{-}}\NormalTok{shrinkage, }\SpecialCharTok{{-}}\NormalTok{STATE\_NAME, }\SpecialCharTok{{-}}\NormalTok{ihme\_deaths)}
\end{Highlighting}
\end{Shaded}

Each row in \texttt{yearly\_data} corresponds to a client, and it
contains the following variables:

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.43}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.57}}@{}}
\toprule
\begin{minipage}[b]{\linewidth}\raggedright
Variable
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Description
\end{minipage} \\
\midrule
\endhead
\texttt{zip3} & ZIP3 code \\
\texttt{client} & client ID \\
\texttt{size} & number of individuals \\
\texttt{volume} & sum of face values \\
\texttt{avg\_qx} & average \(q_x\) \\
\texttt{avg\_age} & average age \\
\texttt{per\_male} & percentage of males \\
\texttt{per\_blue\_collar} & percentage of blue collar workers \\
\texttt{expected} & expected yearly amount of claims \\
\texttt{actual\_\{2021,\ 2020,\ 2019\}} & actual claims in \{2021, 2020,
2019\} \\
\texttt{ae\_\{2021,\ 2020,\ 2019\}} & actual claims / expected claims in
\{2021, 2020, 2019\} \\
\texttt{nohs} & percentage of zip residents without a high school
diploma \\
\texttt{hs} & percentage of zip residents with only a high school
diploma \\
\texttt{college} & percentage of zip residents with only a community
college or associates degree \\
\texttt{bachelor} & percentage of zip residents with a bachelor's
degree \\
\texttt{R\_birth} & birthrate in zip \\
\texttt{R\_death} & deathrate in zip (pre-covid) \\
\texttt{unemp} & unemployment in zip \\
\texttt{poverty} & percentage of zip residents living in poverty \\
\texttt{per\_dem} & percentage of zip residents who voted Democrat in
2020 \\
\texttt{svi} & Social Vulnerability Index \\
\texttt{cvac} & CVAC level of concern for vaccine rollout \\
\texttt{income} & median household income in zipcode \\
\texttt{POP} & population in zipcode \\
\texttt{density} & zipcode population density \\
\texttt{adverse} & whether or not ae\_2020 \textgreater{} 3 \\
\bottomrule
\end{longtable}

The tibble \texttt{weekly\_data} contains most of the above variables,
but also some that change weekly. Each row corresponds to a pair
\((\text{client}, \text{week})\). We describe the ones not present above:

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.43}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.57}}@{}}
\toprule
\begin{minipage}[b]{\linewidth}\raggedright
Variable
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Description
\end{minipage} \\
\midrule
\endhead
\texttt{date} & the last day of the week
(\texttt{lubridate::ceiling\_date(date,\ unit\ =\ "week")}) \\
\texttt{claims} & claims for that client on that week (\$) \\
\texttt{zip\_deaths} & number of deaths that week in the zipcode \\
\texttt{smoothed\_ae} & smoothed version of actual weekly AE (see the
section on long-term models) \\
\texttt{shrunk\_ae} & shrunk version of smoothed weekly AE (see the
section on long-term models) \\
\texttt{ae} & actual weekly AE \\
\texttt{ihme\_deaths} & IHME Covid death forecasts. \textbf{These are
only available until Apr 4th 2021, and are set to 0 after this date.} \\
\texttt{hes}, \texttt{hes\_uns}, \texttt{str\_hes} & percentage of the
zip population that are vaccine hesitant, hesitant or unsure, and
strongly hesitant, respectively \\
\bottomrule
\end{longtable}

\hypertarget{data-exploration-and-motivation}{%
\section{Data exploration and
motivation}\label{data-exploration-and-motivation}}

Since the pandemic started, our clients' claims increased dramatically.
In normal times, we expect an Actual-to-Expected ratio close to 1. As we
can see below, this doesn't apply in times of pandemic.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{yearly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{transmute}\NormalTok{(}
    \StringTok{\textasciigrave{}}\AttributeTok{2019}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ ae\_2019 }\SpecialCharTok{\textgreater{}} \DecValTok{1}\NormalTok{,}
    \StringTok{\textasciigrave{}}\AttributeTok{2020}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ ae\_2020 }\SpecialCharTok{\textgreater{}} \DecValTok{1}\NormalTok{,}
    \StringTok{\textasciigrave{}}\AttributeTok{2021}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ ae\_2021 }\SpecialCharTok{\textgreater{}} \DecValTok{1}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(}\StringTok{\textasciigrave{}}\AttributeTok{2019}\StringTok{\textasciigrave{}}\SpecialCharTok{:}\StringTok{\textasciigrave{}}\AttributeTok{2021}\StringTok{\textasciigrave{}}\NormalTok{, }\AttributeTok{names\_to =} \StringTok{"year"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"adverse"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{adverse =} \FunctionTok{fct\_rev}\NormalTok{(}\FunctionTok{fct\_recode}\NormalTok{(}\FunctionTok{factor}\NormalTok{(adverse), }\StringTok{\textasciigrave{}}\AttributeTok{AE \textgreater{} 1}\StringTok{\textasciigrave{}} \OtherTok{=} \StringTok{"TRUE"}\NormalTok{, }
    \StringTok{\textasciigrave{}}\AttributeTok{AE \textless{} 1}\StringTok{\textasciigrave{}} \OtherTok{=} \StringTok{"FALSE"}\NormalTok{))) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ year, }\AttributeTok{fill =}\NormalTok{ adverse)) }\SpecialCharTok{+} \FunctionTok{geom\_bar}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{labs}\NormalTok{(}\AttributeTok{x =} \StringTok{"Year"}\NormalTok{, }\AttributeTok{y =} \StringTok{"Count"}\NormalTok{, }\AttributeTok{fill =} \StringTok{"Class"}\NormalTok{, }
    \AttributeTok{title =} \StringTok{"Number of clients experiencing adverse mortality"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-9-1.pdf}

We plot the magnitude of claims. Each dot corresponds to a client. We
see that the expected claims look similar to the actual claims in 2019,
while things change dramatically in 2020 and 2021. Note that the
vertical axis is logarithmic! The change in the claims during a pandemic
differs by orders of magnitude compared to the expected ones.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(ggbeeswarm)}
\FunctionTok{set.seed}\NormalTok{(}\DecValTok{92929292}\NormalTok{)}
\NormalTok{yearly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(expected, actual\_2019, actual\_2020, actual\_2021) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{actual\_Expected =}\NormalTok{ expected) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(}\FunctionTok{everything}\NormalTok{(), }\AttributeTok{names\_to =} \StringTok{"Year"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"Claims"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{Year =} \FunctionTok{str\_sub}\NormalTok{(Year, }\DecValTok{8}\NormalTok{)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(Claims }\SpecialCharTok{\textgreater{}} \DecValTok{0}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(Year, Claims, }\AttributeTok{color =}\NormalTok{ Year)) }\SpecialCharTok{+} \FunctionTok{scale\_y\_log10}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_beeswarm}\NormalTok{(}\AttributeTok{size =} \FloatTok{0.5}\NormalTok{, }\AttributeTok{priority =} \StringTok{"random"}\NormalTok{) }\SpecialCharTok{+}
  \FunctionTok{guides}\NormalTok{(}\AttributeTok{color =} \StringTok{"none"}\NormalTok{) }\SpecialCharTok{+} \FunctionTok{labs}\NormalTok{(}\AttributeTok{title =} \StringTok{"Size of claims"}\NormalTok{) }\SpecialCharTok{+}
  \FunctionTok{scale\_color\_manual}\NormalTok{(}\AttributeTok{values =} \FunctionTok{c}\NormalTok{(}\StringTok{"yellow3"}\NormalTok{, }\StringTok{"deepskyblue"}\NormalTok{, }\StringTok{"black"}\NormalTok{, }\StringTok{"red"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-10-1.pdf}

\hypertarget{long-term-model}{%
\section{Long-term model}\label{long-term-model}}

Our first goal was to create a simple model to classify clients as
high risk or low risk. In this first model, we determine client risk
based on AE in 2020, and we will use data available before the pandemic
as predictors.

Our first task is to determine what ``high risk'' and ``low risk'' mean.
To this end, we define ``AE 2020 \textgreater{} 3'' as ``high risk'',
as this is close to the first quartile of the AE in 2020.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{summary}\NormalTok{(yearly\_data }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{pull}\NormalTok{(ae\_2020))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.896   6.342  14.961  13.595 229.937
\end{verbatim}

This threshold was used to create the column \texttt{adverse} in
\texttt{yearly\_data}.

Throughout this and the following sections, we will make extensive use of the
\texttt{tidymodels} framework. We will explain the commands as they
appear.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidymodels)}
\end{Highlighting}
\end{Shaded}

\hypertarget{feature-engineering}{%
\subsection{Feature engineering}\label{feature-engineering}}

Our mentor's hypothesis was that the AE for 2019 was not a good
predictor for client risk during a pandemic. To test this hypothesis, we
train and test a selection of models, some with 2019 AE as a predictor,
and some without.

We start with a recipe, which defines our model formulas and data
preprocessing steps. We remove all categorical predictors and all
variables that are not available before 2020. We also remove the
correlated variable \texttt{actual\_2019}. We then remove zero-variance
predictors and normalize all predictors.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{with2019 }\OtherTok{\textless{}{-}}
  \FunctionTok{recipe}\NormalTok{(adverse }\SpecialCharTok{\textasciitilde{}}\NormalTok{ ., }\AttributeTok{data =}\NormalTok{ yearly\_data) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(}\FunctionTok{all\_nominal\_predictors}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(ae\_2020, ae\_2021, actual\_2019, actual\_2020, actual\_2021) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_zv}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_normalize}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{())}

\NormalTok{no2019 }\OtherTok{\textless{}{-}}
\NormalTok{  with2019 }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(ae\_2019)}
\end{Highlighting}
\end{Shaded}

Next, we describe our models using \texttt{parsnip} model
specifications. We will try 8 different models: logistic regression,
penalized logistic regression (penalty value chosen by initial tuning),
random forest, tuned random forest, single layer neural network, RBF
support vector machine, polynomial support vector machine, and K nearest
neighbors.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{log\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{logistic\_reg}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"glm"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{tuned\_log\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{logistic\_reg}\NormalTok{(}\AttributeTok{penalty =} \FloatTok{0.00118}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{forest\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{rand\_forest}\NormalTok{(}\AttributeTok{trees =} \DecValTok{1000}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }\AttributeTok{num.threads =} \DecValTok{8}\NormalTok{, }\AttributeTok{importance =} \StringTok{"impurity"}\NormalTok{, }\AttributeTok{seed =} \DecValTok{123}\NormalTok{)}
\NormalTok{tuned\_forest\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{rand\_forest}\NormalTok{(}\AttributeTok{trees =} \DecValTok{1000}\NormalTok{, }\AttributeTok{mtry =} \DecValTok{12}\NormalTok{, }\AttributeTok{min\_n =} \DecValTok{21}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }\AttributeTok{num.threads =} \DecValTok{8}\NormalTok{, }\AttributeTok{importance =} \StringTok{"impurity"}\NormalTok{, }\AttributeTok{seed =} \DecValTok{123}\NormalTok{)}
\NormalTok{sln\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{mlp}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"nnet"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{svm\_rbf\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{svm\_rbf}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kernlab"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{svm\_poly\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{svm\_poly}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kernlab"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{knn\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{nearest\_neighbor}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kknn"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

In \texttt{tidymodels}, the combination of a recipe and a model
specification is called a \textbf{workflow}. Training a workflow trains
both the recipe (i.e.~it will learn the scaling and translation
parameters for the normalization step) and the underlying model. When a
workflow is used to predict, the trained recipe will automatically be
applied to a new set of data, and passed on to the trained model. We can
also combine sets of models and recipes into a \texttt{workflowset}.
This will allow us to easily train and test our models on the same
dataset.

We first split our clients into training and testing sets.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{set.seed}\NormalTok{(}\DecValTok{30308}\NormalTok{)}
\NormalTok{init }\OtherTok{\textless{}{-}} \FunctionTok{initial\_split}\NormalTok{(yearly\_data, }\AttributeTok{strata =}\NormalTok{ adverse)}
\end{Highlighting}
\end{Shaded}

All of our model selection, tuning, etc. will be done using 10-fold CV
on the training set.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{set.seed}\NormalTok{(}\DecValTok{30308}\NormalTok{)}
\NormalTok{crossval }\OtherTok{\textless{}{-}} \FunctionTok{vfold\_cv}\NormalTok{(}\FunctionTok{training}\NormalTok{(init), }\AttributeTok{strata =}\NormalTok{ adverse)}
\end{Highlighting}
\end{Shaded}

Our workflowset will contain the 16 combinations of the 8 model
specifications and 2 recipes. We train each one on the 10
cross-validation splits, and assess the results using the area under the
ROC (\texttt{roc\_auc}).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{models }\OtherTok{\textless{}{-}} \FunctionTok{list}\NormalTok{(}\AttributeTok{Logistic =}\NormalTok{ log\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{Penalized logistic}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ tuned\_log\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{Random forest}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ forest\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{Tuned random forest}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ tuned\_forest\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{Neural net}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ sln\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{RBF SVM}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ svm\_rbf\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{Polynomial SVM}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ svm\_poly\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{KNN}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ knn\_spec)}
\NormalTok{recipes }\OtherTok{\textless{}{-}} \FunctionTok{list}\NormalTok{(}\StringTok{"with2019ae"} \OtherTok{=}\NormalTok{ with2019,}
                \StringTok{"no2019ae"} \OtherTok{=}\NormalTok{ no2019)}
\NormalTok{wflows }\OtherTok{\textless{}{-}} \FunctionTok{workflow\_set}\NormalTok{(recipes, models)}
\NormalTok{fit\_wflows }\OtherTok{\textless{}{-}}
\NormalTok{    wflows }\SpecialCharTok{\%\textgreater{}\%}
      \FunctionTok{workflow\_map}\NormalTok{(}\AttributeTok{fn =} \StringTok{"fit\_resamples"}\NormalTok{,}
                   \AttributeTok{seed =} \DecValTok{30332}\NormalTok{,}
                   \AttributeTok{resamples =}\NormalTok{ crossval,}
                   \AttributeTok{control =} \FunctionTok{control\_resamples}\NormalTok{(}\AttributeTok{save\_pred =} \ConstantTok{TRUE}\NormalTok{),}
                   \AttributeTok{metrics =} \FunctionTok{metric\_set}\NormalTok{(roc\_auc, accuracy))}
\end{Highlighting}
\end{Shaded}

We now look at the results with and without the 2019 AE as a predictor.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fit\_wflows }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{collect\_metrics}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{separate}\NormalTok{(wflow\_id, }\AttributeTok{into =} \FunctionTok{c}\NormalTok{(}\StringTok{"rec"}\NormalTok{, }\StringTok{"mod"}\NormalTok{), }\AttributeTok{sep =} \StringTok{"\_"}\NormalTok{, }\AttributeTok{remove =} \ConstantTok{FALSE}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ rec, }\AttributeTok{y =}\NormalTok{ mean, }\AttributeTok{color =}\NormalTok{ mod, }\AttributeTok{group =}\NormalTok{ mod)) }\SpecialCharTok{+}
  \FunctionTok{geom\_point}\NormalTok{() }\SpecialCharTok{+} \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+} \FunctionTok{facet\_wrap}\NormalTok{(}\SpecialCharTok{\textasciitilde{}} \FunctionTok{factor}\NormalTok{(.metric)) }\SpecialCharTok{+}
  \FunctionTok{labs}\NormalTok{(}\AttributeTok{color =} \StringTok{"Model"}\NormalTok{, }\AttributeTok{x =} \ConstantTok{NULL}\NormalTok{, }\AttributeTok{y =} \StringTok{"Value"}\NormalTok{, }
      \AttributeTok{title =} \StringTok{"Performance of models with/without 2019 data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-18-1.pdf}

The performance with 2019 AE as a predictor is equal to or worse than the
performance without it. Thus, in the following we use the recipe where 2019 AE is
removed. We note that the above analysis was done with models with
default hyperparameters. It is certainly possible that some methods
would have seen benefits from tuning.

\hypertarget{model-selection}{%
\subsection{Model selection}\label{model-selection}}

With our data preprocessing locked in, we turn to model selection next.
We will look at five models, each with 10 different hyperparameter combinations.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tune\_log\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{logistic\_reg}\NormalTok{(}\AttributeTok{penalty =} \FunctionTok{tune}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{tune\_forest\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{rand\_forest}\NormalTok{(}\AttributeTok{trees =} \DecValTok{1000}\NormalTok{, }\AttributeTok{mtry =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{min\_n =} \FunctionTok{tune}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }\AttributeTok{num.threads =} \DecValTok{8}\NormalTok{, }\AttributeTok{importance =} \StringTok{"impurity"}\NormalTok{, }\AttributeTok{seed =} \DecValTok{123}\NormalTok{)}
\NormalTok{tune\_sln\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{mlp}\NormalTok{(}\AttributeTok{hidden\_units =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{penalty =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{epochs =} \FunctionTok{tune}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"nnet"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{tune\_svm\_rbf\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{svm\_rbf}\NormalTok{(}\AttributeTok{cost =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{rbf\_sigma =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{margin =} \FunctionTok{tune}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kernlab"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}
\NormalTok{tune\_knn\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{nearest\_neighbor}\NormalTok{(}\AttributeTok{neighbors =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{dist\_power =} \FunctionTok{tune}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kknn"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}

\NormalTok{models }\OtherTok{\textless{}{-}} \FunctionTok{list}\NormalTok{(}\StringTok{\textasciigrave{}}\AttributeTok{Logistic}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ tune\_log\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{Random forest}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ tune\_forest\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{Neural network}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ tune\_sln\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{SVM RBF}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ tune\_svm\_rbf\_spec,}
               \StringTok{\textasciigrave{}}\AttributeTok{KNN}\StringTok{\textasciigrave{}} \OtherTok{=}\NormalTok{ tune\_knn\_spec)}
\NormalTok{recipes }\OtherTok{\textless{}{-}} \FunctionTok{list}\NormalTok{(no2019)}
\NormalTok{wflows }\OtherTok{\textless{}{-}} \FunctionTok{workflow\_set}\NormalTok{(recipes, models)}
\end{Highlighting}
\end{Shaded}

For each model, the 10 combinations of tuning parameters will be
automatically selected using a Latin hypercube. See the documentation of
\texttt{dials::grid\_latin\_hypercube} for implementation details.
Again, performance will be evaluated by 10-fold cross-validation.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{results }\OtherTok{\textless{}{-}}
\NormalTok{  wflows }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{workflow\_map}\NormalTok{(}\AttributeTok{resamples =}\NormalTok{ crossval,}
               \AttributeTok{grid =} \DecValTok{10}\NormalTok{,}
               \AttributeTok{metrics =} \FunctionTok{metric\_set}\NormalTok{(roc\_auc, accuracy),}
               \AttributeTok{control =} \FunctionTok{control\_grid}\NormalTok{(}\AttributeTok{save\_pred =} \ConstantTok{TRUE}\NormalTok{),}
               \AttributeTok{seed =} \DecValTok{828282}\NormalTok{)}
\DocumentationTok{\#\# i Creating pre{-}processing data to finalize unknown parameter: mtry}
\end{Highlighting}
\end{Shaded}

The results below suggest that the random forest is performing the best,
especially in terms of the area under the ROC. We will thus choose it
for further tuning.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{autoplot}\NormalTok{(results)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-20-1.pdf}

\hypertarget{tuning-a-random-forest}{%
\subsection{Tuning a random forest}\label{tuning-a-random-forest}}

Since we've chosen a random forest, we no longer need to normalize our
predictors. This will make model explanation easier later on. We wrap
the recipe and model specification into a workflow.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forest\_rec }\OtherTok{\textless{}{-}}
  \FunctionTok{recipe}\NormalTok{(adverse }\SpecialCharTok{\textasciitilde{}}\NormalTok{ ., }\AttributeTok{data =}\NormalTok{ yearly\_data) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(}\FunctionTok{all\_nominal\_predictors}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(ae\_2020, ae\_2021, actual\_2019, actual\_2020, actual\_2021) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_zv}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(ae\_2019)}

\NormalTok{forest\_wflow }\OtherTok{\textless{}{-}}
  \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_model}\NormalTok{(tune\_forest\_spec) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_recipe}\NormalTok{(forest\_rec)}
\end{Highlighting}
\end{Shaded}

We have two tunable hyperparameters: \texttt{min\_n}, the minimal number
of datapoints required for a node to split, and \texttt{mtry}, the
number of randomly selected predictors considered at each split. We fix the number
of trees to 1000, and we set the tuning range of \texttt{mtry} to be
between 1 and 20. Tuning will happen on a regular \(10 \times 10\) grid.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forest\_params }\OtherTok{\textless{}{-}}
\NormalTok{  forest\_wflow }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{parameters}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{update}\NormalTok{(}\AttributeTok{mtry =} \FunctionTok{mtry}\NormalTok{(}\FunctionTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{20}\NormalTok{)))}

\NormalTok{forest\_grid }\OtherTok{\textless{}{-}}
  \FunctionTok{grid\_regular}\NormalTok{(forest\_params, }\AttributeTok{levels =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forest\_tune }\OtherTok{\textless{}{-}}
\NormalTok{  forest\_wflow }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{tune\_grid}\NormalTok{(}
      \AttributeTok{resamples =}\NormalTok{ crossval,}
      \AttributeTok{grid =}\NormalTok{ forest\_grid,}
      \AttributeTok{metrics =} \FunctionTok{metric\_set}\NormalTok{(roc\_auc, accuracy)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

The tuning results are below. We choose a set of parameters whose
\texttt{roc\_auc} is high. In this case, we choose \texttt{mtry\ =\ 5},
\texttt{min\_n\ =\ 6}. The command \texttt{finalize\_workflow} applies
these parameters and returns a tuned workflow.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{autoplot}\NormalTok{(forest\_tune)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-23-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{best\_params }\OtherTok{\textless{}{-}} \FunctionTok{list}\NormalTok{(}\AttributeTok{mtry =} \DecValTok{5}\NormalTok{, }\AttributeTok{min\_n =} \DecValTok{6}\NormalTok{)}
\NormalTok{final\_forest }\OtherTok{\textless{}{-}}
\NormalTok{  forest\_wflow }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{finalize\_workflow}\NormalTok{(best\_params)}
\end{Highlighting}
\end{Shaded}

\hypertarget{thresholding}{%
\subsection{Thresholding}\label{thresholding}}

At the moment, our forest classifies each client by predicting the
probability of belonging to the ``high risk'' class. If that probability
is greater than 0.5, the final classification will be ``high risk''; if
not, the final classification will be ``low risk''.

By changing the threshold from 0.5 to something else, we can influence
the number of false positives or false negatives. This is important,
since false positives and false negatives have different financial
impacts for the insurer. For example, a false positive would unfairly
label a customer as high-risk when in reality they are not. Such a
misclassification may lead to loss of profitable clients. On the other
hand, a false negative might lead to mismanagement of risk due to
excessive claims.

We can study the effect of different thresholds using the package
\texttt{probably}. For each of our 10 cross-validation sets, we train a
random forest using the optimal parameters found above, and predict
using 101 threshold values between 0 and 1. The function
\texttt{probably::threshold\_perf} will compute several metrics, but we
plot only sensitivity, specificity, and j-index. These are averaged over
the 10 cross-validation sets.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(probably)}

\NormalTok{forest\_resamples }\OtherTok{\textless{}{-}}
\NormalTok{  final\_forest }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{finalize\_workflow}\NormalTok{(best\_params) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{fit\_resamples}\NormalTok{(}
      \AttributeTok{resamples =}\NormalTok{ crossval,}
      \AttributeTok{control =} \FunctionTok{control\_resamples}\NormalTok{(}\AttributeTok{save\_pred =} \ConstantTok{TRUE}\NormalTok{)}
\NormalTok{  )}

\NormalTok{forest\_resamples }\OtherTok{\textless{}{-}}
\NormalTok{  forest\_resamples }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rowwise}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{thr\_perf =} \FunctionTok{list}\NormalTok{(}\FunctionTok{threshold\_perf}\NormalTok{(.predictions, adverse, }\StringTok{\textasciigrave{}}\AttributeTok{.pred\_ae \textgreater{} 3}\StringTok{\textasciigrave{}}\NormalTok{, }\AttributeTok{thresholds =} \FunctionTok{seq}\NormalTok{(}\FloatTok{0.0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\AttributeTok{by =} \FloatTok{0.01}\NormalTok{))))}

\NormalTok{my\_threshold }\OtherTok{\textless{}{-}} \FloatTok{0.67}

\NormalTok{forest\_resamples }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(thr\_perf, id) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{unnest}\NormalTok{(thr\_perf) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(.threshold, .metric) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}\AttributeTok{estimate =} \FunctionTok{mean}\NormalTok{(.estimate)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(.metric }\SpecialCharTok{!=} \StringTok{"distance"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ .threshold, }\AttributeTok{y =}\NormalTok{ estimate, }\AttributeTok{color =}\NormalTok{ .metric)) }\SpecialCharTok{+} \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_vline}\NormalTok{(}\AttributeTok{xintercept =}\NormalTok{ my\_threshold, }\AttributeTok{linetype =} \StringTok{"dashed"}\NormalTok{) }\SpecialCharTok{+}
  \FunctionTok{labs}\NormalTok{(}\AttributeTok{x =} \StringTok{"Threshold"}\NormalTok{, }\AttributeTok{y =} \StringTok{"Estimate"}\NormalTok{, }\AttributeTok{color =} \StringTok{"Metric"}\NormalTok{, }
      \AttributeTok{title =} \StringTok{"Sensitivity and specificity by threshold"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-24-1.pdf}

Some expertise and business intuition are required in order to determine
the desired threshold value. Due to a lack of time and resources, we
decided to choose a threshold value that would simultaneously optimize
for sensitivity and specificity. To that end, we choose the threshold
value of 0.67, corresponding to the dashed line above.

\hypertarget{final-results}{%
\subsection{Final results}\label{final-results}}

With all the parameters chosen, we can finally train our random forest
on the whole training set, and test it on the test set. We augment the
testing set with our predicted probabilities.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{trained\_forest }\OtherTok{\textless{}{-}}
\NormalTok{  final\_forest }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{fit}\NormalTok{(}\FunctionTok{training}\NormalTok{(init))}

\NormalTok{thresholded\_predictions }\OtherTok{\textless{}{-}}
\NormalTok{  trained\_forest }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{predict}\NormalTok{(}\FunctionTok{testing}\NormalTok{(init), }\AttributeTok{type =} \StringTok{"prob"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{class\_pred =}
            \FunctionTok{make\_two\_class\_pred}\NormalTok{(}
                  \StringTok{\textasciigrave{}}\AttributeTok{.pred\_ae \textgreater{} 3}\StringTok{\textasciigrave{}}\NormalTok{,}
                  \AttributeTok{levels =} \FunctionTok{levels}\NormalTok{(yearly\_data}\SpecialCharTok{$}\NormalTok{adverse),}
                  \AttributeTok{threshold =}\NormalTok{ my\_threshold)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{bind\_cols}\NormalTok{(}\FunctionTok{testing}\NormalTok{(init))}
\end{Highlighting}
\end{Shaded}

We can now compute a confusion matrix and some summary statistics. Note
that we have 124 clients in the testing set, of which 74\% are high risk
(\texttt{ae\ \textgreater{}\ 3}). This is the No Information Rate. We
can see that our model is clearly doing better than just naively
guessing.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{confusion\_matrix }\OtherTok{\textless{}{-}}
\NormalTok{  thresholded\_predictions }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{conf\_mat}\NormalTok{(adverse, class\_pred)}

\NormalTok{confusion\_matrix }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{autoplot}\NormalTok{(}\AttributeTok{type =} \StringTok{"heatmap"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-26-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{confusion\_matrix }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{summary}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary         0.839
##  2 kap                  binary         0.603
##  3 sens                 binary         0.859
##  4 spec                 binary         0.781
##  5 ppv                  binary         0.919
##  6 npv                  binary         0.658
##  7 mcc                  binary         0.607
##  8 j_index              binary         0.640
##  9 bal_accuracy         binary         0.820
## 10 detection_prevalence binary         0.694
## 11 precision            binary         0.919
## 12 recall               binary         0.859
## 13 f_meas               binary         0.888
\end{verbatim}

\hypertarget{model-explanation}{%
\subsection{Model explanation}\label{model-explanation}}

We pick two specific clients as examples to explain our model result. We
choose client 58 who is located in Brooklyn, New York and client 412 who
is located in Asheville, North Carolina. The first one faced adverse
mortality and the second one didn't.

We load the \texttt{DALEX} package to plot break-down plots and to
compute SHAP values.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(DALEX)}
\FunctionTok{library}\NormalTok{(DALEXtra)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fit\_parsnip }\OtherTok{\textless{}{-}}\NormalTok{ trained\_forest }\SpecialCharTok{\%\textgreater{}\%}\NormalTok{ extract\_fit\_parsnip}
\NormalTok{trained\_recipe }\OtherTok{\textless{}{-}}\NormalTok{ trained\_forest }\SpecialCharTok{\%\textgreater{}\%}\NormalTok{ extract\_recipe}
\NormalTok{train }\OtherTok{\textless{}{-}}\NormalTok{ trained\_recipe }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{bake}\NormalTok{(}\FunctionTok{training}\NormalTok{(init))}
\NormalTok{test }\OtherTok{\textless{}{-}}\NormalTok{ trained\_recipe }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{bake}\NormalTok{(}\FunctionTok{testing}\NormalTok{(init))}
\end{Highlighting}
\end{Shaded}

We convert the fitted model to an ``explainer'' object for later plotting.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ex }\OtherTok{\textless{}{-}}
  \FunctionTok{explain}\NormalTok{(}
    \AttributeTok{model =}\NormalTok{ fit\_parsnip,}
    \AttributeTok{data =}\NormalTok{ train)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Preparation of a new explainer is initiated
##   -> model label       :  model_fit  ( default )
##   -> data              :  368  rows  21  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  not specified! ( WARNING )
##   -> predict function  :  yhat.model_fit  will be used ( default )
##   -> predicted values  :  No value for predict function target column. ( default )
##   -> model_info        :  package parsnip , ver. 0.1.7 , task classification ( default ) 
##   -> model_info        :  Model info detected classification task but 'y' is a NULL .  ( WARNING )
##   -> model_info        :  By deafult classification tasks supports only numercical 'y' parameter. 
##   -> model_info        :  Consider changing to numerical vector with 0 and 1 values.
##   -> model_info        :  Otherwise I will not be able to calculate residuals or loss function.
##   -> predicted values  :  numerical, min =  2e-04 , mean =  0.2594435 , max =  0.9503167  
##   -> residual function :  difference between y and yhat ( default )
##  A new explainer has been created! 
\end{verbatim}

Below, we have the break-down plot of the client located in New York.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ex }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{predict\_parts}\NormalTok{(train }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{slice}\NormalTok{(}\DecValTok{343}\NormalTok{)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{plot}\NormalTok{(}\AttributeTok{digits =} \DecValTok{2}\NormalTok{, }\AttributeTok{max\_features =} \DecValTok{5}\NormalTok{, }\AttributeTok{title =} \StringTok{"Client 58, New York, Brooklyn"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-30-1.pdf}

The following presents the SHAP values of the client located in New
York.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{shap }\OtherTok{\textless{}{-}} \FunctionTok{predict\_parts}\NormalTok{(}\AttributeTok{explainer =}\NormalTok{ ex,}
                      \AttributeTok{new\_observation =}\NormalTok{ train }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{slice}\NormalTok{(}\DecValTok{343}\NormalTok{),}
                                 \AttributeTok{type =} \StringTok{"shap"}\NormalTok{,}
                                  \AttributeTok{B =} \DecValTok{25}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{plot}\NormalTok{(shap, }\AttributeTok{show\_boxplots =} \ConstantTok{FALSE}\NormalTok{, }\AttributeTok{title =} \StringTok{"Client 58, New York, Brooklyn"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-31-1.pdf}

This is the break-down plot of the client located in NC.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ex }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{predict\_parts}\NormalTok{(test }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{slice}\NormalTok{(}\DecValTok{80}\NormalTok{)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{plot}\NormalTok{(}\AttributeTok{digits =} \DecValTok{2}\NormalTok{, }\AttributeTok{max\_features =} \DecValTok{5}\NormalTok{, }\AttributeTok{title =} \StringTok{"Client 412, North Carolina, Asheville"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-32-1.pdf}

And these are the SHAP values of the client located in NC.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{shap }\OtherTok{\textless{}{-}} \FunctionTok{predict\_parts}\NormalTok{(}\AttributeTok{explainer =}\NormalTok{ ex,}
                      \AttributeTok{new\_observation =}\NormalTok{ test }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{slice}\NormalTok{(}\DecValTok{80}\NormalTok{),}
                                 \AttributeTok{type =} \StringTok{"shap"}\NormalTok{,}
                                  \AttributeTok{B =} \DecValTok{25}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{plot}\NormalTok{(shap, }\AttributeTok{show\_boxplots =} \ConstantTok{FALSE}\NormalTok{, }\AttributeTok{title =} \StringTok{"Client 412, North Carolina, Asheville"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-33-1.pdf}

We can also determine which predictors contributed the most in our
model.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{trained\_forest }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{extract\_fit\_engine}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{importance}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{as\_tibble\_row}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(}\FunctionTok{everything}\NormalTok{(), }\AttributeTok{names\_to =} \StringTok{"Variable"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"Importance"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{slice\_max}\NormalTok{(Importance, }\AttributeTok{n =} \DecValTok{10}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{y =} \FunctionTok{fct\_reorder}\NormalTok{(}\FunctionTok{factor}\NormalTok{(Variable), Importance), }\AttributeTok{x =}\NormalTok{ Importance, }
      \AttributeTok{fill =} \FunctionTok{fct\_reorder}\NormalTok{(}\FunctionTok{factor}\NormalTok{(Variable), Importance))) }\SpecialCharTok{+}
  \FunctionTok{geom\_col}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{scale\_fill\_brewer}\NormalTok{(}\AttributeTok{palette =} \StringTok{"Spectral"}\NormalTok{) }\SpecialCharTok{+}
  \FunctionTok{guides}\NormalTok{(}\AttributeTok{fill =} \StringTok{"none"}\NormalTok{) }\SpecialCharTok{+} \FunctionTok{labs}\NormalTok{(}\AttributeTok{y =} \StringTok{"Variable"}\NormalTok{, }\AttributeTok{x =} \StringTok{"Importance"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-34-1.pdf}

\hypertarget{short-term-model}{%
\section{Short-term model}\label{short-term-model}}

\hypertarget{introduction}{%
\subsection{Introduction}\label{introduction}}

Now that we have introduced the long-term model and presented its
results, we can move to the next step: adding time-dependent data. To do
this, we will be using \texttt{weekly\_data} throughout this section.

We recall how we obtained \texttt{weekly\_data} by describing some of
the preprocessing steps in \texttt{data/processed\_data.r}. To begin
with, we merged the following 4 data sets:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{data/deaths\_zip3.feather}: daily covid deaths data.
\item
  \texttt{data/simulation\_data/all\_persons.feather}: a tibble
  created in our data wrangling process, containing simulated data for
  each participant from our clients.
\item
  \texttt{data/data.feather}: zip3 data created in our data wrangling
  process.
\item
  \texttt{data/2020\_12\_23/reference\_hospitalization\_all\_locs.csv}:
  data from IHME as of Dec 23, 2020.
\end{enumerate}

Next, we need some kind of a rolling count for AE. We could use the
``true'' weekly AE for each client, but it turns out that that number is
too volatile: there are many weeks without any deaths, which means that
any deaths will lead to huge, momentary spikes. Another quantity that is
quite volatile is the weekly COVID death count.

Thus we smooth the volatile quantities by taking a weighted average over
the 13 weeks prior. The weights come from a Gaussian distribution, and
we weight recent values higher than older ones.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{smoother }\OtherTok{\textless{}{-}} \ControlFlowTok{function}\NormalTok{(x) \{ }
    \FunctionTok{weighted.mean}\NormalTok{(x, }\FunctionTok{dnorm}\NormalTok{(}\FunctionTok{seq}\NormalTok{(}\SpecialCharTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{, }\AttributeTok{length.out =} \FunctionTok{length}\NormalTok{(x)), }\AttributeTok{sd =} \FloatTok{0.33}\NormalTok{)) \}}
\NormalTok{sliding\_smoother }\OtherTok{\textless{}{-}}
\NormalTok{  timetk}\SpecialCharTok{::}\FunctionTok{slidify}\NormalTok{(smoother, }\AttributeTok{.period =} \DecValTok{13}\NormalTok{, }\AttributeTok{.align =} \StringTok{"right"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

The function \texttt{sliding\_smoother} takes a vector and outputs a
vector of smoothed values.

We add smoothed AE and smoothed weekly zip deaths to the tibble
\texttt{weekly\_data}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{weekly\_data }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(client) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{smoothed\_ae =} \FunctionTok{sliding\_smoother}\NormalTok{(ae), }
      \AttributeTok{smoothed\_deaths =} \FunctionTok{sliding\_smoother}\NormalTok{(zip\_deaths), }\AttributeTok{.before =}\NormalTok{ size) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{drop\_na}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

Then we shrink the smoothed AE based on
\(\log(\text{Volume} \cdot \text{average } q_x)\). This gives us some
kind of a measure of client size and mortality. The motivation for this
is that small clients that experience adverse mortality are much less
impactful than large ones. We add the shrunk, smoothed AE to
\texttt{weekly\_data}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{client\_shrinkage }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}\AttributeTok{dep\_var =} \FunctionTok{first}\NormalTok{(volume }\SpecialCharTok{*}\NormalTok{ avg\_qx)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{shrinkage =} \FunctionTok{rescale}\NormalTok{(}\FunctionTok{log}\NormalTok{(dep\_var), }\AttributeTok{to =} \FunctionTok{c}\NormalTok{(}\FloatTok{0.3}\NormalTok{, }\DecValTok{1}\NormalTok{)))}

\NormalTok{weekly\_data }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{left\_join}\NormalTok{(client\_shrinkage, }\AttributeTok{by =} \StringTok{"client"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{shrunk\_ae =}\NormalTok{ smoothed\_ae }\SpecialCharTok{*}\NormalTok{ shrinkage, }\AttributeTok{.after =}\NormalTok{ smoothed\_ae)}
\end{Highlighting}
\end{Shaded}

In order to choose a threshold for high and low risk classification, we
look again at quantiles.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}
      \StringTok{\textasciigrave{}}\AttributeTok{12.5}\StringTok{\textasciigrave{}} \OtherTok{=} \FunctionTok{quantile}\NormalTok{(shrunk\_ae, }\FloatTok{0.125}\NormalTok{),}
      \StringTok{\textasciigrave{}}\AttributeTok{25}\StringTok{\textasciigrave{}} \OtherTok{=} \FunctionTok{quantile}\NormalTok{(shrunk\_ae, }\FloatTok{0.25}\NormalTok{),}
      \StringTok{\textasciigrave{}}\AttributeTok{50}\StringTok{\textasciigrave{}} \OtherTok{=} \FunctionTok{quantile}\NormalTok{(shrunk\_ae, }\FloatTok{0.50}\NormalTok{)}
\NormalTok{  ) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{date, }\AttributeTok{names\_to =} \StringTok{"pth"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"shrunk\_ae"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ shrunk\_ae, }\AttributeTok{color =}\NormalTok{ pth)) }\SpecialCharTok{+}
  \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_hline}\NormalTok{(}\AttributeTok{yintercept =} \FloatTok{2.5}\NormalTok{, }\AttributeTok{linetype =} \StringTok{"dashed"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-38-1.pdf}

Based on this, we choose
\texttt{smoothed\ shrunk\ AE\ \textgreater{}\ 2.5} as ``Adverse'', which
corresponds to the dashed line above. With this choice, we have the
following proportion of adverse clients over time.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}\StringTok{\textasciigrave{}}\AttributeTok{Percentage classified Adverse}\StringTok{\textasciigrave{}} \OtherTok{=} \FunctionTok{sum}\NormalTok{(class }\SpecialCharTok{==} \StringTok{"Adverse"}\NormalTok{) }\SpecialCharTok{/} \FunctionTok{n}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =} \StringTok{\textasciigrave{}}\AttributeTok{Percentage classified Adverse}\StringTok{\textasciigrave{}}\NormalTok{)) }\SpecialCharTok{+} \FunctionTok{geom\_line}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-39-1.pdf}

\hypertarget{model-selection-1}{%
\subsection{Model selection}\label{model-selection-1}}

We start by dividing our timeline into training and testing sets:
we take all the dates up to January 1 2021 as our training set and all
the dates after January 1 2021 up to April 1 2021 as our test set (the
following 3 months). Our goal is to try and use the data from our clients'
performance up to January 1 to predict their performance after this
date.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{train }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textless{}=} \StringTok{"2021{-}01{-}01"}\NormalTok{)}

\NormalTok{test }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}} \StringTok{"2021{-}01{-}01"} \SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=} \StringTok{"2021{-}04{-}01"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

There are two main things that set this model apart from the long-term
model introduced in the first section. First, the AE is updated weekly
as opposed to the long-term model where the AE is taken yearly. Second,
we are adding weekly deaths as one of the predictors in addition to the
variables introduced in the long-term model. Now that we have a clear
understanding of the predictors in the short-term model, the question
that arises is how we can use the weekly deaths in the testing time
(since such information won't be available for us in the ``future''). To
solve this issue, we decided to forecast the deaths for this ``future''
period: so we will use the weekly deaths from March 2020 to January 2021
and forecast the weekly deaths 3 months later. To do so, we will use the
ARIMA forecaster.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(fable)}
\FunctionTok{library}\NormalTok{(tsibble)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forecast }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}=} \StringTok{"2020{-}03{-}15"} \SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=} \StringTok{"2021{-}01{-}01"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{as\_tsibble}\NormalTok{(}\AttributeTok{index =}\NormalTok{ date, }\AttributeTok{key =}\NormalTok{ client) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{model}\NormalTok{(}\AttributeTok{arima =} \FunctionTok{ARIMA}\NormalTok{(smoothed\_deaths)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{forecast}\NormalTok{(}\AttributeTok{h =} \StringTok{"3 months"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

We create a new set called \texttt{forecasted\_test} out of our testing
set where we replace \texttt{smoothed\_deaths} by
\texttt{forecasted\_deaths}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forecasted\_test }\OtherTok{\textless{}{-}}
\NormalTok{  forecast }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{as\_tibble}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(client, date, .mean) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{right\_join}\NormalTok{(test, }\AttributeTok{by =} \FunctionTok{c}\NormalTok{(}\StringTok{"client"}\NormalTok{, }\StringTok{"date"}\NormalTok{)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{smoothed\_deaths) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{smoothed\_deaths =}\NormalTok{ .mean)}
\end{Highlighting}
\end{Shaded}

We are ready to introduce our modeling strategy.

We start by introducing a common recipe that we will use for all
our models. Our target variable is \texttt{class}, and we use all the
predictors in \texttt{weekly\_data} except for \texttt{client},
\texttt{zip3}, \texttt{claims}, \texttt{smoothed\_ae},
\texttt{shrunk\_ae}, \texttt{ae}, \texttt{zip\_deaths},
\texttt{ihme\_deaths} and \texttt{date}. We normalize all predictors and
we apply log to both the volume of the client and the population of the
zip code.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{common\_recipe }\OtherTok{\textless{}{-}}
  \FunctionTok{recipe}\NormalTok{(class }\SpecialCharTok{\textasciitilde{}}\NormalTok{ ., }\AttributeTok{data =}\NormalTok{ weekly\_data) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(client, zip3, claims, smoothed\_ae, shrunk\_ae, ae, zip\_deaths, ihme\_deaths, date) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_zv}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_log}\NormalTok{(volume, POP) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_normalize}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

Now that we have our recipe, we are ready to try out different models
and report the results. Let us introduce the five models and then we
will talk a little bit about each one of them.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forest\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{rand\_forest}\NormalTok{(}\AttributeTok{trees =} \DecValTok{1000}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }\AttributeTok{num.threads =} \DecValTok{8}\NormalTok{, }\AttributeTok{seed =} \DecValTok{123456789}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}

\NormalTok{log\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{logistic\_reg}\NormalTok{(}
  \AttributeTok{mode =} \StringTok{"classification"}\NormalTok{,}
  \AttributeTok{engine =} \StringTok{"glm"}\NormalTok{)}

\NormalTok{knn\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{nearest\_neighbor}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kknn"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}

\NormalTok{sln\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{mlp}\NormalTok{(}\AttributeTok{activation =} \StringTok{"relu"}\NormalTok{, }\AttributeTok{hidden\_units =} \DecValTok{6}\NormalTok{, }\AttributeTok{epochs =} \DecValTok{100}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"keras"}\NormalTok{, }\AttributeTok{verbose=}\DecValTok{0}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{)}


\NormalTok{bt\_spec }\OtherTok{\textless{}{-}} \FunctionTok{boost\_tree}\NormalTok{(}
  \AttributeTok{mode =} \StringTok{"classification"}\NormalTok{,}
  \AttributeTok{engine =} \StringTok{"xgboost"}\NormalTok{,}
  \AttributeTok{trees =} \DecValTok{100}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

We use Random Forest (\texttt{forest}), Logistic Regression
(\texttt{log}), Nearest Neighbor (\texttt{knn}), Neural Network with
single layer (\texttt{sln}) and Boosted Trees (\texttt{bt}) respectively
with default settings. For the Random Forest, we consider 1000 trees and
for the Boosted Trees, we take 100 trees. All of these models use
different engines introduced in \texttt{tidymodels}.

We then create the workflow for the five models mentioned above with the
recipe taken to be the ``common recipe'' and the model taken to be the
ones introduced in the previous chunk.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{bt\_wf }\OtherTok{\textless{}{-}}
  \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_recipe}\NormalTok{(common\_recipe) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_model}\NormalTok{(bt\_spec)}

\NormalTok{log\_wf }\OtherTok{\textless{}{-}}
  \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_recipe}\NormalTok{(common\_recipe) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_model}\NormalTok{(log\_spec)}

\NormalTok{forest\_wf }\OtherTok{\textless{}{-}}
  \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_recipe}\NormalTok{(common\_recipe) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_model}\NormalTok{(forest\_spec)}

\NormalTok{knn\_wf }\OtherTok{\textless{}{-}}
  \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_recipe}\NormalTok{(common\_recipe) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_model}\NormalTok{(knn\_spec)}

\NormalTok{sln\_wf }\OtherTok{\textless{}{-}}
  \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_recipe}\NormalTok{(common\_recipe) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_model}\NormalTok{(sln\_spec)}
\end{Highlighting}
\end{Shaded}

We can take each of these models and evaluate their performance
separately, but we want to find a way where we can compare their
performance through time. So, we create a tibble containing the five
different workflows, we fit them on our training set and we predict on
\texttt{forecasted\_test}. For the prediction, we use
\texttt{class\_predict} to come up with a class (this prediction will be
used to calculate accuracy, sensitivity and specificity). We also use
\texttt{prob\_predict} to come up with a predicted probability (used
to calculate the \texttt{roc\_auc}).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wflows }\OtherTok{\textless{}{-}} \FunctionTok{tribble}\NormalTok{(}\SpecialCharTok{\textasciitilde{}}\NormalTok{wflow,}
\NormalTok{                  sln\_wf,}
\NormalTok{                  knn\_wf, log\_wf, forest\_wf, bt\_wf)}
 

\NormalTok{wflows }\OtherTok{\textless{}{-}}
\NormalTok{  wflows }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{wflows\_fit =} \FunctionTok{map}\NormalTok{(wflow, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{fit}\NormalTok{(.x, train))) }
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wflows }\OtherTok{\textless{}{-}}
\NormalTok{  wflows }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}
    \AttributeTok{class\_predict =} \FunctionTok{map}\NormalTok{(wflows\_fit, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{predict}\NormalTok{(.x, forecasted\_test)),  }
    \AttributeTok{prob\_predict =} \FunctionTok{map}\NormalTok{(wflows\_fit, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{predict}\NormalTok{(.x, forecasted\_test, }\AttributeTok{type =} \StringTok{"prob"}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

Now that we have our prediction as a class and as a probability, we are
ready to compare the metrics for the five models.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wflows }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{bind\_cols}\NormalTok{(}\FunctionTok{tribble}\NormalTok{(}\SpecialCharTok{\textasciitilde{}}\NormalTok{id, }\StringTok{"Neural Network"}\NormalTok{, }\StringTok{"Nearest Neigbor"}\NormalTok{, }\StringTok{"Logistic Regression"}\NormalTok{, }
      \StringTok{" Random Forest"}\NormalTok{, }\StringTok{"Boosted Trees"}\NormalTok{)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{wflow, }\SpecialCharTok{{-}}\NormalTok{wflows\_fit) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{prob\_predict =} \FunctionTok{map}\NormalTok{(prob\_predict, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{bind\_cols}\NormalTok{(.x, test }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{select}\NormalTok{(date, class)))) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{unnest}\NormalTok{(}\FunctionTok{c}\NormalTok{(class\_predict, prob\_predict)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(id, date) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}
            \AttributeTok{sens =} \FunctionTok{sens\_vec}\NormalTok{(class, .pred\_class),}
            \AttributeTok{spec =} \FunctionTok{spec\_vec}\NormalTok{(class, .pred\_class),}
            \AttributeTok{roc\_auc =} \FunctionTok{roc\_auc\_vec}\NormalTok{(class, .pred\_Adverse),}
            \AttributeTok{accuracy =} \FunctionTok{accuracy\_vec}\NormalTok{(class, .pred\_class), }\AttributeTok{.groups =} \StringTok{"keep"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(sens}\SpecialCharTok{:}\NormalTok{accuracy, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ id)) }\SpecialCharTok{+}
  \FunctionTok{geom\_point}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{facet\_wrap}\NormalTok{( }\SpecialCharTok{\textasciitilde{}}\NormalTok{ metric)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-46-1.pdf}

One can wonder how much our models are being affected by the forecasting
of deaths. Let's replace \texttt{forecasted\_test} by \texttt{test} and
see what happens (so now actual deaths are used instead of
forecasted deaths). We see that the difference is not very big and our
forecasting is not affecting the models in a bad way.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wflows\_cheat }\OtherTok{\textless{}{-}}
\NormalTok{  wflows }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}
    \AttributeTok{class\_predict =} \FunctionTok{map}\NormalTok{(wflows\_fit, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{predict}\NormalTok{(.x, test)),  }
    \AttributeTok{prob\_predict =} \FunctionTok{map}\NormalTok{(wflows\_fit, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{predict}\NormalTok{(.x, test, }\AttributeTok{type =} \StringTok{"prob"}\NormalTok{)))}


\NormalTok{wflows\_cheat }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{bind\_cols}\NormalTok{(}\FunctionTok{tribble}\NormalTok{(}\SpecialCharTok{\textasciitilde{}}\NormalTok{id, }\StringTok{"Neural Network"}\NormalTok{, }\StringTok{"Nearest Neighbor"}\NormalTok{, }\StringTok{"Logistic Regression"}\NormalTok{, }
      \StringTok{"Random Forest"}\NormalTok{, }\StringTok{"Boosted Trees"}\NormalTok{)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{wflow, }\SpecialCharTok{{-}}\NormalTok{wflows\_fit) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{prob\_predict =} \FunctionTok{map}\NormalTok{(prob\_predict, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{bind\_cols}\NormalTok{(.x, test }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{select}\NormalTok{(date, class)))) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{unnest}\NormalTok{(}\FunctionTok{c}\NormalTok{(class\_predict, prob\_predict)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(id, date) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}
            \AttributeTok{sens =} \FunctionTok{sens\_vec}\NormalTok{(class, .pred\_class),}
            \AttributeTok{spec =} \FunctionTok{spec\_vec}\NormalTok{(class, .pred\_class),}
            \AttributeTok{roc\_auc =} \FunctionTok{roc\_auc\_vec}\NormalTok{(class, .pred\_Adverse), }
            \AttributeTok{accuracy =} \FunctionTok{accuracy\_vec}\NormalTok{(class, .pred\_class), }\AttributeTok{.groups =} \StringTok{"keep"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(sens}\SpecialCharTok{:}\NormalTok{accuracy, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ id)) }\SpecialCharTok{+}
  \FunctionTok{geom\_point}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{facet\_wrap}\NormalTok{( }\SpecialCharTok{\textasciitilde{}}\NormalTok{ metric)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-47-1.pdf}

\hypertarget{tuning-boosted-trees-model}{%
\subsection{Tuning Boosted Trees
Model}\label{tuning-boosted-trees-model}}

Comparing the models above, we can see that the Boosted Trees model is
the best one. So, for the rest of the project, we use it, tune it, and
report the results.

One of our goals is to have a model that can predict clients it hasn't
seen before. First, we split our clients into training and testing
clients. The training clients are ``known''; they will be what the model
will be trained on and they represent 75\% of our total. The testing
clients are ``unknown''; they will represent brand new clients and they
represent 25\% of our total.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{set.seed}\NormalTok{(}\DecValTok{1213}\NormalTok{)}
\NormalTok{training\_clients }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{nest\_by}\NormalTok{(client) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{slice\_sample}\NormalTok{(}\AttributeTok{prop =} \DecValTok{3}\SpecialCharTok{/}\DecValTok{4}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pull}\NormalTok{(client)}

\NormalTok{testing\_clients }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(}\SpecialCharTok{!}\NormalTok{client }\SpecialCharTok{\%in\%}\NormalTok{ training\_clients) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pull}\NormalTok{(client) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{unique}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

We next divide the dates into training and testing dates. Our training
period includes all dates before January 1 2021, and our testing period
includes the next three months (so up to April 2021).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{start }\OtherTok{\textless{}{-}} \FunctionTok{ceiling\_date}\NormalTok{(}\FunctionTok{ymd}\NormalTok{(}\StringTok{"2021{-}01{-}01"}\NormalTok{), }\AttributeTok{unit =} \StringTok{"week"}\NormalTok{)}
\NormalTok{end }\OtherTok{\textless{}{-}} \FunctionTok{ceiling\_date}\NormalTok{(}\FunctionTok{ymd}\NormalTok{(}\StringTok{"2021{-}04{-}01"}\NormalTok{), }\AttributeTok{unit =} \StringTok{"week"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

The goal of the following is to tune the Boosted Trees to optimize
predictions for ``unknown'' clients three months out. To do so, we divide
the training dates into analysis dates (all dates before October 1, 2020)
and an assessment date (the week of January 1, 2021).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{analys }\OtherTok{\textless{}{-}} \FunctionTok{ceiling\_date}\NormalTok{(}\FunctionTok{ymd}\NormalTok{(}\StringTok{"2020{-}10{-}01"}\NormalTok{), }\AttributeTok{unit =} \StringTok{"week"}\NormalTok{)}
\NormalTok{assess }\OtherTok{\textless{}{-}}\NormalTok{ start}
\end{Highlighting}
\end{Shaded}

We then split the training clients into analysis (75\% of the training
or known clients) and assessment (25\% of the known clients). We will
create an \texttt{rsample} object, which requires knowledge of the row
indices for the analysis and assessment sets.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{set.seed}\NormalTok{(}\DecValTok{123}\NormalTok{)}
\NormalTok{ana\_clients }\OtherTok{\textless{}{-}}
\NormalTok{  training\_clients }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{sample}\NormalTok{(}\FunctionTok{length}\NormalTok{(.) }\SpecialCharTok{*} \DecValTok{3} \SpecialCharTok{/} \DecValTok{4}\NormalTok{)}

\NormalTok{ana\_idx }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rownames\_to\_column}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%}\NormalTok{ ana\_clients }\SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=}\NormalTok{ analys) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pull}\NormalTok{(rowname) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{as.integer}\NormalTok{()}

\NormalTok{ass\_idx }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rownames\_to\_column}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%}\NormalTok{ training\_clients) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(}\SpecialCharTok{!}\NormalTok{client }\SpecialCharTok{\%in\%}\NormalTok{ ana\_clients }\SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{==}\NormalTok{ assess) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pull}\NormalTok{(rowname) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{as.integer}\NormalTok{()}

\NormalTok{spl }\OtherTok{\textless{}{-}} \FunctionTok{make\_splits}\NormalTok{(}\FunctionTok{list}\NormalTok{(}\AttributeTok{analysis =}\NormalTok{ ana\_idx, }\AttributeTok{assessment =}\NormalTok{ ass\_idx), }\AttributeTok{data =}\NormalTok{ weekly\_data)}
\NormalTok{resmpl }\OtherTok{\textless{}{-}} \FunctionTok{manual\_rset}\NormalTok{(}\FunctionTok{list}\NormalTok{(spl), }\FunctionTok{c}\NormalTok{(}\StringTok{"Manual split"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

Now, we define our boosted trees. We remove the following from the list
of predictors: \texttt{zip3}, \texttt{date}, \texttt{client},
\texttt{claims}, \texttt{zip\_deaths}, \texttt{smoothed\_ae},
\texttt{shrunk\_ae}, \texttt{ae}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xgboost\_recipe }\OtherTok{\textless{}{-}}
  \FunctionTok{recipe}\NormalTok{(}\AttributeTok{formula =}\NormalTok{ class }\SpecialCharTok{\textasciitilde{}}\NormalTok{ ., }\AttributeTok{data =}\NormalTok{ weekly\_data) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_rm}\NormalTok{(zip3, date, client, claims, zip\_deaths, smoothed\_ae, shrunk\_ae, ae) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{step\_zv}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{())}

\NormalTok{xgboost\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{boost\_tree}\NormalTok{(}\AttributeTok{trees =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{tree\_depth =} \FunctionTok{tune}\NormalTok{(), }\AttributeTok{learn\_rate =} \FunctionTok{tune}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"classification"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"xgboost"}\NormalTok{, }\AttributeTok{nthread =} \DecValTok{8}\NormalTok{)}

\NormalTok{xgboost\_workflow }\OtherTok{\textless{}{-}}
  \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_recipe}\NormalTok{(xgboost\_recipe) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{add\_model}\NormalTok{(xgboost\_spec)}
\end{Highlighting}
\end{Shaded}

We tuned the model using the following hyperparameters:

\begin{itemize}
\tightlist
\item
  \texttt{trees}: Number of trees contained in the ensemble.
\item
  \texttt{tree\_depth}: An integer for the maximum depth of the tree
  (i.e.~number of splits).
\item
  \texttt{learn\_rate}: A number for the rate at which the boosting
  algorithm adapts from iteration-to-iteration.
\end{itemize}

We start with 10 different sets of parameters, and then fine-tune them
using 20 iterations of simulated annealing. We use simulated annealing
to find a set of parameters that maximizes \texttt{roc\_auc}.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(finetune)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{set.seed}\NormalTok{(}\DecValTok{98324}\NormalTok{)}
\NormalTok{res\_grd }\OtherTok{\textless{}{-}}
\NormalTok{  xgboost\_workflow }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{tune\_grid}\NormalTok{(}
    \AttributeTok{resamples =}\NormalTok{ resmpl,}
    \AttributeTok{grid =} \DecValTok{10}\NormalTok{,}
    \AttributeTok{metrics =} \FunctionTok{metric\_set}\NormalTok{(roc\_auc, sens, spec, j\_index, accuracy),}
    \AttributeTok{control =} \FunctionTok{control\_grid}\NormalTok{(}\AttributeTok{verbose =} \ConstantTok{TRUE}\NormalTok{))}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 1/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 1/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 1/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 2/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 2/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 2/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 3/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 3/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 3/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 4/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 4/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 4/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 5/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 5/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 5/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 6/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 6/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 6/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 7/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 7/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 7/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 8/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 8/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 8/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 9/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 9/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 9/10 (predictions)}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 10/10}
\DocumentationTok{\#\# v Manual split: preprocessor 1/1, model 10/10}
\DocumentationTok{\#\# i Manual split: preprocessor 1/1, model 10/10 (predictions)}

\NormalTok{res }\OtherTok{\textless{}{-}}
\NormalTok{  xgboost\_workflow }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{tune\_sim\_anneal}\NormalTok{(}
      \AttributeTok{resamples =}\NormalTok{ resmpl,}
      \AttributeTok{iter =} \DecValTok{20}\NormalTok{,}
      \AttributeTok{initial =}\NormalTok{ res\_grd,}
      \AttributeTok{metrics =} \FunctionTok{metric\_set}\NormalTok{(roc\_auc, sens, spec, j\_index, accuracy))}
\DocumentationTok{\#\# Optimizing roc\_auc}
\DocumentationTok{\#\# Initial best: 0.78996}
\DocumentationTok{\#\#  1 ( ) accept suboptimal  roc\_auc=0.74385}
\DocumentationTok{\#\#  2 ( ) accept suboptimal  roc\_auc=0.69237}
\DocumentationTok{\#\#  3 ( ) accept suboptimal  roc\_auc=0.66214}
\DocumentationTok{\#\#  4 + better suboptimal  roc\_auc=0.67367}
\DocumentationTok{\#\#  5 ( ) accept suboptimal  roc\_auc=0.67188}
\DocumentationTok{\#\#  6 {-} discard suboptimal roc\_auc=0.6647}
\DocumentationTok{\#\#  7 ( ) accept suboptimal  roc\_auc=0.6583}
\DocumentationTok{\#\#  8 x restart from best  roc\_auc=0.66342}
\DocumentationTok{\#\#  9 ( ) accept suboptimal  roc\_auc=0.76537}
\DocumentationTok{\#\# 10 {-} discard suboptimal roc\_auc=0.74129}
\DocumentationTok{\#\# 11 {-} discard suboptimal roc\_auc=0.71875}
\DocumentationTok{\#\# 12 ( ) accept suboptimal  roc\_auc=0.74027}
\DocumentationTok{\#\# 13 \textless{}3 new best           roc\_auc=0.80328}
\DocumentationTok{\#\# 14 {-} discard suboptimal roc\_auc=0.76383}
\DocumentationTok{\#\# 15 {-} discard suboptimal roc\_auc=0.78151}
\DocumentationTok{\#\# 16 {-} discard suboptimal roc\_auc=0.771}
\DocumentationTok{\#\# 17 {-} discard suboptimal roc\_auc=0.76076}
\DocumentationTok{\#\# 18 {-} discard suboptimal roc\_auc=0.78381}
\DocumentationTok{\#\# 19 \textless{}3 new best           roc\_auc=0.81916}
\DocumentationTok{\#\# 20 {-} discard suboptimal roc\_auc=0.7687}
\end{Highlighting}
\end{Shaded}

The best parameters are then selected, and we apply these to our
workflow.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{res }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{show\_best}\NormalTok{(}\AttributeTok{metric =} \StringTok{"roc\_auc"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 10
##   trees tree_depth learn_rate .metric .estimator  mean     n std_err .config    
##   <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
## 1  1939          2   0.00552  roc_auc binary     0.819     1      NA Iter19     
## 2  1807          4   0.00273  roc_auc binary     0.803     1      NA Iter13     
## 3  1701          7   0.000408 roc_auc binary     0.790     1      NA initial_Pr~
## 4  1703          5   0.000617 roc_auc binary     0.784     1      NA Iter18     
## 5  2000          5   0.000264 roc_auc binary     0.782     1      NA Iter15     
## # ... with 1 more variable: .iter <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{best\_parms }\OtherTok{\textless{}{-}}\NormalTok{ res }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{select\_best}\NormalTok{(}\AttributeTok{metric =} \StringTok{"roc\_auc"}\NormalTok{)}
\NormalTok{final\_wf }\OtherTok{\textless{}{-}}\NormalTok{ xgboost\_workflow }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{finalize\_workflow}\NormalTok{(best\_parms)}
\end{Highlighting}
\end{Shaded}

We now need to forecast 3 months worth of \texttt{smoothed\_deaths}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forecast }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}=} \StringTok{"2020{-}03{-}15"} \SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=}\NormalTok{ start) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{as\_tsibble}\NormalTok{(}\AttributeTok{index =}\NormalTok{ date, }\AttributeTok{key =}\NormalTok{ client) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{model}\NormalTok{(}\AttributeTok{arima =} \FunctionTok{ARIMA}\NormalTok{(smoothed\_deaths)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{forecast}\NormalTok{(}\AttributeTok{h =} \StringTok{"3 months"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

We create a new data set called \texttt{forecast\_data} in which the actual
deaths are replaced by the forecasted deaths. So, now we have
\texttt{weekly\_data}, which contains the actual deaths, and we
have \texttt{forecast\_data}, which contains the
forecasted deaths instead.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{future }\OtherTok{\textless{}{-}}
\NormalTok{  forecast }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{as\_tibble}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(client, date, .mean) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{smoothed\_deaths =}\NormalTok{ .mean)}

\NormalTok{forecast\_data }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rows\_update}\NormalTok{(future, }\AttributeTok{by =} \FunctionTok{c}\NormalTok{(}\StringTok{"date"}\NormalTok{,}\StringTok{"client"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

Now that we have these two data sets available, we can easily define the
training set to include all known clients with dates prior to January 1
(training dates).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{train }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%}\NormalTok{ training\_clients }\SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=}\NormalTok{ start)}
\end{Highlighting}
\end{Shaded}

For the purpose of comparison, we will have four types of testing sets:

\begin{itemize}
\tightlist
\item
  \texttt{test\_known\_true}: includes all known clients with dates in the 3
  months after January 1 (known clients + testing dates) using the
  actual deaths for this 3-month period.
\item
  \texttt{test\_unknown\_true}: includes all unknown clients with dates in the
  3 months after January 1 (unknown clients + testing dates) using the
  actual deaths for this 3-month period.
\item
  \texttt{test\_known\_fore}: includes all known clients with dates in the 3
  months after January 1 (known clients + testing dates) using the
  forecasted deaths for this 3-month period.
\item
  \texttt{test\_unknown\_fore}: includes all unknown clients with dates in the
  3 months after January 1 (unknown clients + testing dates) using the
  forecasted deaths for this 3-month period.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_known\_true }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%}\NormalTok{ training\_clients) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}}\NormalTok{ start }\SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=}\NormalTok{ end)}

\NormalTok{test\_unknown\_true }\OtherTok{\textless{}{-}}
\NormalTok{  weekly\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(}\SpecialCharTok{!}\NormalTok{client }\SpecialCharTok{\%in\%}\NormalTok{ training\_clients) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}}\NormalTok{ start }\SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=}\NormalTok{ end)}

\NormalTok{test\_known\_fore }\OtherTok{\textless{}{-}}
\NormalTok{  forecast\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%}\NormalTok{ training\_clients) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}}\NormalTok{ start }\SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=}\NormalTok{ end)}

\NormalTok{test\_unknown\_fore }\OtherTok{\textless{}{-}}
\NormalTok{  forecast\_data }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(}\SpecialCharTok{!}\NormalTok{client }\SpecialCharTok{\%in\%}\NormalTok{ training\_clients) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}}\NormalTok{ start }\SpecialCharTok{\&}\NormalTok{ date }\SpecialCharTok{\textless{}=}\NormalTok{ end)}
\end{Highlighting}
\end{Shaded}

We train our final workflow.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{trained\_wf }\OtherTok{\textless{}{-}}
\NormalTok{  final\_wf }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{fit}\NormalTok{(train)}
\end{Highlighting}
\end{Shaded}

Now, we can create a tibble \texttt{tests} out of these 4 testing sets.
We compare the performance of our four testing sets. We can see that the
difference is not large, and that our machine learning model still
performs well when using the forecasted deaths.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tests }\OtherTok{\textless{}{-}}
  \FunctionTok{tribble}\NormalTok{(}
    \SpecialCharTok{\textasciitilde{}}\NormalTok{id, }\SpecialCharTok{\textasciitilde{}}\NormalTok{set,}
    \StringTok{"Known clients, true deaths"}\NormalTok{, test\_known\_true,}
    \StringTok{"Unknown clients, true deaths"}\NormalTok{, test\_unknown\_true,}
    \StringTok{"Known clients"}\NormalTok{, test\_known\_fore,}
    \StringTok{"Unknown clients"}\NormalTok{, test\_unknown\_fore)}

\NormalTok{tests }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{set =} \FunctionTok{map}\NormalTok{(set, }\SpecialCharTok{\textasciitilde{}} \FunctionTok{bind\_cols}\NormalTok{(.x, trained\_wf }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{predict}\NormalTok{(.x)))) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{unnest}\NormalTok{(set) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(id, date) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}
      \AttributeTok{Accuracy =}\NormalTok{ yardstick}\SpecialCharTok{::}\FunctionTok{accuracy\_vec}\NormalTok{(class, .pred\_class),}
      \AttributeTok{Sensitivity =}\NormalTok{ yardstick}\SpecialCharTok{::}\FunctionTok{sens\_vec}\NormalTok{(class, .pred\_class),}
      \AttributeTok{Specificity =}\NormalTok{ yardstick}\SpecialCharTok{::}\FunctionTok{spec\_vec}\NormalTok{(class, .pred\_class)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(Accuracy}\SpecialCharTok{:}\NormalTok{Specificity, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(metric }\SpecialCharTok{==} \StringTok{"Accuracy"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ id)) }\SpecialCharTok{+}
  \FunctionTok{geom\_line}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{linetype =} \FunctionTok{str\_detect}\NormalTok{(id, }\StringTok{"true"}\NormalTok{))) }\SpecialCharTok{+}
  \FunctionTok{labs}\NormalTok{(}\AttributeTok{x =} \StringTok{"Date"}\NormalTok{, }\AttributeTok{y =} \StringTok{"Accuracy"}\NormalTok{, }\AttributeTok{color =} \StringTok{""}\NormalTok{) }\SpecialCharTok{+} \FunctionTok{guides}\NormalTok{(}\AttributeTok{linetype =} \StringTok{"none"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-59-1.pdf}

\hypertarget{explaining-outcomes}{%
\subsection{Explaining outcomes}\label{explaining-outcomes}}

One of the most important parts of any model is interpreting the results.
Model interpretability helps extract insight and clarity regarding
how the algorithms are performing. There are several tools that can be
used to increase model transparency. Breakdown plots can be used to
visualize localized variable importance scores. For each client, we can
explain why a case receives its prediction and how each predictor
contributes either positively or negatively to the target variable. The
local interpretability enables us to pinpoint and contrast the impacts
of the factors.

We explain how much each feature contributes to the value of a single
prediction using the following. We will explain the plots after running
the code.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model }\OtherTok{\textless{}{-}}
\NormalTok{  trained\_wf }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{extract\_fit\_parsnip}\NormalTok{()}

\NormalTok{recipe }\OtherTok{\textless{}{-}}
\NormalTok{  trained\_wf }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{extract\_recipe}\NormalTok{(}\AttributeTok{estimated =} \ConstantTok{TRUE}\NormalTok{)}

\NormalTok{exp }\OtherTok{\textless{}{-}} \FunctionTok{explain}\NormalTok{(model, recipe }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{bake}\NormalTok{(train))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Preparation of a new explainer is initiated
##   -> model label       :  model_fit  ( default )
##   -> data              :  34317  rows  26  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  not specified! ( WARNING )
##   -> predict function  :  yhat.model_fit  will be used ( default )
##   -> predicted values  :  No value for predict function target column. ( default )
##   -> model_info        :  package parsnip , ver. 0.1.7 , task classification ( default ) 
##   -> model_info        :  Model info detected classification task but 'y' is a NULL .  ( WARNING )
##   -> model_info        :  By deafult classification tasks supports only numercical 'y' parameter. 
##   -> model_info        :  Consider changing to numerical vector with 0 and 1 values.
##   -> model_info        :  Otherwise I will not be able to calculate residuals or loss function.
##   -> predicted values  :  the predict_function returns an error when executed ( WARNING ) 
##   -> residual function :  difference between y and yhat ( default )
##  A new explainer has been created! 
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_obs }\OtherTok{\textless{}{-}}
\NormalTok{  test\_unknown\_fore }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{==}\NormalTok{ end, client }\SpecialCharTok{==} \DecValTok{397}\NormalTok{)}

\NormalTok{exp }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{predict\_parts}\NormalTok{(recipe }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{bake}\NormalTok{(test\_obs) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{class)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{plot}\NormalTok{(}\AttributeTok{digits =} \DecValTok{2}\NormalTok{, }\AttributeTok{max\_features =} \DecValTok{5}\NormalTok{, }\AttributeTok{title =} \StringTok{"Client 397"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-60-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_obs }\OtherTok{\textless{}{-}}
\NormalTok{  test\_known\_fore }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{==}\NormalTok{ end, client }\SpecialCharTok{==} \DecValTok{405}\NormalTok{)}

\NormalTok{exp }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{predict\_parts}\NormalTok{(recipe }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{bake}\NormalTok{(test\_obs) }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{class)) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{plot}\NormalTok{(}\AttributeTok{digits =} \DecValTok{2}\NormalTok{, }\AttributeTok{max\_features =} \DecValTok{5}\NormalTok{, }\AttributeTok{title =} \StringTok{"Client 405"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-60-2.pdf}

We use client 405 (Not Adverse) from our known clients and client 397
(Adverse) from our unknown clients. The prediction in blue is the
probability that the client is \textbf{not adverse}. For client 405,
since the prediction is 0.53 (\textgreater0.5), this client is
classified as not adverse. For client 397, since the prediction is 0.37
(\textless0.5), this client is classified as adverse. A red bar means
that this predictor has caused more mortality. In contrast, a green bar
means that this predictor has caused less mortality for the client. For
instance, comparing clients 397 and 405, we can see that
\texttt{smoothed\_deaths} for client 397 is so much bigger than the
value for client 405 and hence the contribution of this predictor to the
mortality is larger for client 397 (red bar is bigger).

\hypertarget{other-modelling-attempts}{%
\section{Other modelling attempts}\label{other-modelling-attempts}}

In this section, we introduce some models that we tried but that did not
work as well.

We want to predict the \texttt{AE\ value} for each client for each week
during COVID-19. Since the weekly AE value changes dramatically, we
decide to predict the \texttt{shrunk\ AE}. For details on the shrunk AE,
see the section on the short-term model.

Our main package in this section is \texttt{modeltime}, a framework for
time series models and machine learning. Since we have more than 500
clients, we have more than 500 time series. In practice, we will have
more clients. We create a global machine learning model that forecasts
all clients at once for computational efficiency.

We use data from before Covid-19, based on the zip code where the company is
located (such as poverty, education, and unemployment levels) and
characteristics of the company (such as the average age of its
employees), as our predictors, and we compare results with IHME death
data, with zip death data, and without death data as predictors.

We load the necessary packages.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(modeltime)}
\FunctionTok{library}\NormalTok{(timetk)}
\end{Highlighting}
\end{Shaded}

\hypertarget{read-data-and-pre-processing}{%
\subsection{Read data and
pre-processing}\label{read-data-and-pre-processing}}

Get weekly data from \texttt{2020-03-15} to \texttt{2021-06-27}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clients}\OtherTok{\textless{}{-}}\FunctionTok{read\_feather}\NormalTok{(}\StringTok{"data/processed\_data\_20\_12\_23.feather"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{ae\_2021, }\SpecialCharTok{{-}}\NormalTok{ae\_2020, }\SpecialCharTok{{-}}\NormalTok{ae\_2019,}
         \SpecialCharTok{{-}}\NormalTok{actual\_2021, }\SpecialCharTok{{-}}\NormalTok{actual\_2020, }\SpecialCharTok{{-}}\NormalTok{actual\_2019, }\SpecialCharTok{{-}}\NormalTok{adverse,}
         \SpecialCharTok{{-}}\NormalTok{STATE\_NAME, }\SpecialCharTok{{-}}\NormalTok{dep\_var, }\SpecialCharTok{{-}}\NormalTok{smoothed\_ae)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(date }\SpecialCharTok{\textgreater{}=} \StringTok{"2020{-}03{-}15"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{client =} \FunctionTok{as.factor}\NormalTok{(client))}
\end{Highlighting}
\end{Shaded}

Split our data into two parts: a train set (\texttt{2020-03-15} to
\texttt{2020-12-27}) and a test set (\texttt{2021-01-03} to
\texttt{2021-06-27}).

\begin{Shaded}
\begin{Highlighting}[]
  \FunctionTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)}
\NormalTok{  splits }\OtherTok{\textless{}{-}}\NormalTok{  clients }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{time\_series\_split}\NormalTok{(}\AttributeTok{initial =} \StringTok{"6 months"}\NormalTok{, }\AttributeTok{assess =} \StringTok{"6 months"}\NormalTok{, }\AttributeTok{date\_var =}\NormalTok{ date, }
        \AttributeTok{cumulative =} \ConstantTok{TRUE}\NormalTok{)}
\NormalTok{  train }\OtherTok{=} \FunctionTok{training}\NormalTok{(splits)}
\NormalTok{  test }\OtherTok{=} \FunctionTok{testing}\NormalTok{(splits)}
\end{Highlighting}
\end{Shaded}

We can add feature engineering steps to get our data ready using
recipes. We remove variables that are not useful as predictors:
\texttt{zip3}, \texttt{actual}, \texttt{claims}, \texttt{class},
\texttt{shrinkage}, \texttt{ae}. For variables with very large values,
such as \texttt{population}, \texttt{volume} and
\texttt{expected}, we use \texttt{step\_log()} to apply a logarithmic
transformation during pre-processing. We also use
\texttt{step\_mutate(client\ =\ droplevels(client))} to add an ID variable, and
\texttt{step\_timeseries\_signature()} to create a specification of a
recipe step that converts the date into many features that can aid
machine learning with time-series data.

Here \texttt{rec\_obj} is for model with \texttt{ihme\_deaths},
\texttt{rec\_obj1} with \texttt{zip\_deaths} and \texttt{rec\_obj2}
without any death data.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rec\_obj\_alldata }\OtherTok{\textless{}{-}}
    \FunctionTok{recipe}\NormalTok{(shrunk\_ae }\SpecialCharTok{\textasciitilde{}}\NormalTok{ ., }\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits)) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_rm}\NormalTok{(zip3)}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_rm}\NormalTok{( claims , class, shrinkage, ae)}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_log}\NormalTok{(POP, volume, expected)}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_mutate}\NormalTok{(}\AttributeTok{client =} \FunctionTok{droplevels}\NormalTok{(client)) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_timeseries\_signature}\NormalTok{(date) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_rm}\NormalTok{(date)}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_dummy}\NormalTok{(}\FunctionTok{all\_nominal\_predictors}\NormalTok{(), }\AttributeTok{one\_hot =} \ConstantTok{TRUE}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_zv}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{()) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_normalize}\NormalTok{(}\FunctionTok{all\_predictors}\NormalTok{(), }\SpecialCharTok{{-}}\FunctionTok{all\_nominal}\NormalTok{())}
\CommentTok{\#recipe with ihme death data}
\NormalTok{rec\_obj }\OtherTok{\textless{}{-}}
\NormalTok{    rec\_obj\_alldata}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_rm}\NormalTok{(zip\_deaths, smoothed\_deaths)}
\CommentTok{\#recipe with zip death}
\NormalTok{rec\_obj1 }\OtherTok{\textless{}{-}}
\NormalTok{    rec\_obj\_alldata}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_rm}\NormalTok{( ihme\_deaths, smoothed\_deaths)}
\CommentTok{\#recipe without  death}
\NormalTok{rec\_obj2 }\OtherTok{\textless{}{-}}
\NormalTok{    rec\_obj\_alldata}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{step\_rm}\NormalTok{( smoothed\_deaths, ihme\_deaths, zip\_deaths)}
\end{Highlighting}
\end{Shaded}

Here are the 5 machine learning models we try in this section.
\texttt{forest\_spec} is a random forest. \texttt{tuned\_forest\_spec}
is a tuned random forest. \texttt{svm\_rbf\_spec} is a radial basis
function support vector machine. \texttt{knn\_spec} is K-nearest
neighbors and \texttt{xgboost\_spec} is Xgboost.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{forest\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{rand\_forest}\NormalTok{(}\AttributeTok{trees =} \DecValTok{1000}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }\AttributeTok{num.threads =} \DecValTok{8}\NormalTok{, }\AttributeTok{seed =} \DecValTok{123456789}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }\AttributeTok{num.threads =} \DecValTok{8}\NormalTok{, }\AttributeTok{importance =} \StringTok{"impurity"}\NormalTok{, }\AttributeTok{seed =} \DecValTok{123}\NormalTok{)}
\NormalTok{tuned\_forest\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{rand\_forest}\NormalTok{(}\AttributeTok{trees =} \DecValTok{1000}\NormalTok{, }\AttributeTok{mtry =} \DecValTok{12}\NormalTok{, }\AttributeTok{min\_n =} \DecValTok{21}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{, }\AttributeTok{num.threads =} \DecValTok{8}\NormalTok{, }\AttributeTok{importance =} \StringTok{"impurity"}\NormalTok{, }\AttributeTok{seed =} \DecValTok{123}\NormalTok{)}
\NormalTok{svm\_rbf\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{svm\_rbf}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kernlab"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{)}
\NormalTok{knn\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{nearest\_neighbor}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"kknn"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{)}
\NormalTok{xgboost\_spec }\OtherTok{\textless{}{-}}
  \FunctionTok{boost\_tree}\NormalTok{(}\AttributeTok{trees =} \DecValTok{100}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_engine}\NormalTok{(}\StringTok{"xgboost"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{create-a-workflow}{%
\subsection{Create a workflow}\label{create-a-workflow}}

The workflow is an object that can bundle together our pre-processing,
modeling, and post-processing requests.

Workflow with IHME death data

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wflw\_rf }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        forest\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_tunedrf }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        tuned\_forest\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_svmrbf }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        svm\_rbf\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_knnspec }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        knn\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))  }

\NormalTok{wflw\_xgboost }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        xgboost\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))  }
\CommentTok{\#Create a Modeltime Table (table of model)}
\NormalTok{model\_tbl}\OtherTok{\textless{}{-}} \FunctionTok{modeltime\_table}\NormalTok{(}
\NormalTok{    wflw\_rf,}
\NormalTok{    wflw\_tunedrf,}
\NormalTok{    wflw\_svmrbf,}
\NormalTok{    wflw\_knnspec,}
\NormalTok{    wflw\_xgboost}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Workflow with zip death data

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wflw\_rf1 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        forest\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj1) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_tunedrf1 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        tuned\_forest\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj1) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_svmrbf1 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        svm\_rbf\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj1) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_knnspec1 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        knn\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj1) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_xgboost1 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        xgboost\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj1) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))  }
\CommentTok{\#Create a Modeltime Table}
\NormalTok{model\_tbl1 }\OtherTok{\textless{}{-}} \FunctionTok{modeltime\_table}\NormalTok{(}
\NormalTok{    wflw\_rf1,}
\NormalTok{    wflw\_tunedrf1,}
\NormalTok{    wflw\_svmrbf1,}
\NormalTok{    wflw\_knnspec1,}
\NormalTok{    wflw\_xgboost1}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Workflow without death data

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wflw\_rf2 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        forest\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj2) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_tunedrf2 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        tuned\_forest\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj2) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_svmrbf2 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        svm\_rbf\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj2) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))}

\NormalTok{wflw\_knnspec2 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        knn\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj2) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits)) }

\NormalTok{wflw\_xgboost2 }\OtherTok{\textless{}{-}} \FunctionTok{workflow}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_model}\NormalTok{(}
\NormalTok{        xgboost\_spec}
\NormalTok{    ) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{add\_recipe}\NormalTok{(rec\_obj2) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{fit}\NormalTok{(}\AttributeTok{data =} \FunctionTok{training}\NormalTok{(splits))  }
\CommentTok{\#Create a Modeltime Table}
\NormalTok{model\_tbl2 }\OtherTok{\textless{}{-}} \FunctionTok{modeltime\_table}\NormalTok{(}
\NormalTok{    wflw\_rf2,}
\NormalTok{    wflw\_tunedrf2,}
\NormalTok{    wflw\_svmrbf2,}
\NormalTok{    wflw\_knnspec2,}
\NormalTok{    wflw\_xgboost2}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

To allow a quick knit, we save the fitted model tables here.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#save the model table with IHME death data}
\FunctionTok{saveRDS}\NormalTok{(model\_tbl, }\StringTok{"modelwithIHME.rds"}\NormalTok{)}
\CommentTok{\#save the model table with zip death data}
\FunctionTok{saveRDS}\NormalTok{(model\_tbl1, }\StringTok{"modelwithzipdeath.rds"}\NormalTok{)}
\CommentTok{\#save the model table without death data}
\FunctionTok{saveRDS}\NormalTok{(model\_tbl2, }\StringTok{"modelwithoutdeath.rds"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Read the saved workflow table

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model\_tbl }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"modelwithIHME.rds"}\NormalTok{)}
\NormalTok{model\_tbl1 }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"modelwithzipdeath.rds"}\NormalTok{)}
\NormalTok{model\_tbl2 }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"modelwithoutdeath.rds"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{preparation-for-forecasting}{%
\subsection{Preparation for
forecasting}\label{preparation-for-forecasting}}

Calibrate the models on the testing set. Calibration calculates accuracy
and forecast confidence by computing predictions and residuals for the
testing set.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#with IHME}
\NormalTok{calib\_tbl }\OtherTok{\textless{}{-}}\NormalTok{ model\_tbl }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{modeltime\_calibrate}\NormalTok{(}
      \AttributeTok{new\_data =} \FunctionTok{testing}\NormalTok{(splits), }
      \AttributeTok{id       =} \StringTok{"client"}
\NormalTok{    )}
\CommentTok{\#with zip death}
\NormalTok{calib\_tbl1 }\OtherTok{\textless{}{-}}\NormalTok{ model\_tbl1 }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{modeltime\_calibrate}\NormalTok{(}
      \AttributeTok{new\_data =} \FunctionTok{testing}\NormalTok{(splits), }
      \AttributeTok{id       =} \StringTok{"client"}
\NormalTok{    )}
\CommentTok{\#without death}
\NormalTok{calib\_tbl2 }\OtherTok{\textless{}{-}}\NormalTok{ model\_tbl2 }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{modeltime\_calibrate}\NormalTok{(}
      \AttributeTok{new\_data =} \FunctionTok{testing}\NormalTok{(splits), }
      \AttributeTok{id       =} \StringTok{"client"}
\NormalTok{    )}
\end{Highlighting}
\end{Shaded}

For quick knit, we save the calibration sets here.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#save the calibration table with IHME death data}
\FunctionTok{saveRDS}\NormalTok{(calib\_tbl, }\StringTok{"calibwithIHME.rds"}\NormalTok{)}
\CommentTok{\#save the calibration table with zip death data}
\FunctionTok{saveRDS}\NormalTok{(calib\_tbl1, }\StringTok{"calibwithzipdeath.rds"}\NormalTok{)}
\CommentTok{\#save the calibration table without death data}
\FunctionTok{saveRDS}\NormalTok{(calib\_tbl2, }\StringTok{"calibwithoutdeath.rds"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Read the saved calibration sets.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{calib\_tbl }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"calibwithIHME.rds"}\NormalTok{)}
\NormalTok{calib\_tbl1 }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"calibwithzipdeath.rds"}\NormalTok{)}
\NormalTok{calib\_tbl2 }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"calibwithoutdeath.rds"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Here we present the accuracy results. We can check the global error and
the local error for each client on the testing set for the different
models. The accuracy metrics include:

\texttt{MAE} - Mean absolute error, \texttt{mae()}

\texttt{MAPE} - Mean absolute percentage error, \texttt{mape()}

\texttt{MASE} - Mean absolute scaled error, \texttt{mase()}

\texttt{SMAPE} - Symmetric mean absolute percentage error,
\texttt{smape()}

\texttt{RMSE} - Root mean squared error, \texttt{rmse()}

\texttt{RSQ} - R-squared, \texttt{rsq()}.

Global error

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#with IHME death}
\NormalTok{calib\_tbl }\SpecialCharTok{\%\textgreater{}\%} 
    \FunctionTok{modeltime\_accuracy}\NormalTok{(}\AttributeTok{acc\_by\_id =} \ConstantTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 9
##   .model_id .model_desc .type   mae  mape  mase smape  rmse    rsq
##       <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1         1 RANGER      Test  12.1    Inf 1.51   89.6  31.7 0.114 
## 2         2 RANGER      Test  10.5    Inf 1.31   88.7  20.7 0.179 
## 3         3 KERNLAB     Test   7.41   Inf 0.925  95.1  12.4 0.466 
## 4         4 KKNN        Test  12.4    NaN 1.55  NaN    39.4 0.0762
## 5         5 XGBOOST     Test   7.13   Inf 0.891  85.9  15.3 0.545
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#with zip death}
\NormalTok{calib\_tbl1 }\SpecialCharTok{\%\textgreater{}\%} 
    \FunctionTok{modeltime\_accuracy}\NormalTok{(}\AttributeTok{acc\_by\_id =} \ConstantTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 9
##   .model_id .model_desc .type   mae  mape  mase smape  rmse    rsq
##       <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1         1 RANGER      Test  12.9    Inf 1.61   90.3  32.2 0.130 
## 2         2 RANGER      Test  10.9    Inf 1.37   88.3  21.5 0.185 
## 3         3 KERNLAB     Test   6.76   Inf 0.844  90.2  11.9 0.510 
## 4         4 KKNN        Test  12.4    NaN 1.55  NaN    39.0 0.0801
## 5         5 XGBOOST     Test   6.81   Inf 0.850  91.0  15.5 0.314
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#without death data}
\NormalTok{calib\_tbl2 }\SpecialCharTok{\%\textgreater{}\%} 
    \FunctionTok{modeltime\_accuracy}\NormalTok{(}\AttributeTok{acc\_by\_id =} \ConstantTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 9
##   .model_id .model_desc .type   mae  mape  mase smape  rmse    rsq
##       <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1         1 RANGER      Test  10.5    Inf 1.31   86.0  31.2 0.112 
## 2         2 RANGER      Test   9.25   Inf 1.15   85.2  20.7 0.162 
## 3         3 KERNLAB     Test   6.37   Inf 0.795  91.5  11.9 0.513 
## 4         4 KKNN        Test  13.4    NaN 1.67  NaN    43.1 0.0798
## 5         5 XGBOOST     Test   6.30   Inf 0.787  81.5  12.5 0.450
\end{verbatim}

Local error for each client

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#with IHME death}
\NormalTok{calib\_tbl }\SpecialCharTok{\%\textgreater{}\%} 
    \FunctionTok{modeltime\_accuracy}\NormalTok{(}\AttributeTok{acc\_by\_id =} \ConstantTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 2,460 x 10
##    .model_id .model_desc .type client    mae  mape   mase smape   rmse     rsq
##        <int> <chr>       <chr> <fct>   <dbl> <dbl>  <dbl> <dbl>  <dbl>   <dbl>
##  1         1 RANGER      Test  1        4.92 Inf    15.4  156.    5.30  0.411 
##  2         1 RANGER      Test  10      57.3  333.   17.4  108.   58.6   0.126 
##  3         1 RANGER      Test  100      4.75  38.9   2.51  51.0   6.39  0.384 
##  4         1 RANGER      Test  101      4.30 191.    3.32  81.7   4.71  0.333 
##  5         1 RANGER      Test  102      2.46 Inf   Inf    200     2.50 NA     
##  6         1 RANGER      Test  103      1.30 155.    1.51  64.8   1.75  0.610 
##  7         1 RANGER      Test  104    262.   Inf   125.   173.  310.    0.385 
##  8         1 RANGER      Test  105     30.6  313.   16.4  109.   34.8   0.0498
##  9         1 RANGER      Test  106     19.4   93.9   7.67  56.7  24.2   0.0835
## 10         1 RANGER      Test  107      4.05  49.1   2.12  68.4   6.52  0.0691
## # ... with 2,450 more rows
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#with zip death}
\NormalTok{calib\_tbl1 }\SpecialCharTok{\%\textgreater{}\%} 
    \FunctionTok{modeltime\_accuracy}\NormalTok{(}\AttributeTok{acc\_by\_id =} \ConstantTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 2,460 x 10
##    .model_id .model_desc .type client    mae  mape   mase smape   rmse      rsq
##        <int> <chr>       <chr> <fct>   <dbl> <dbl>  <dbl> <dbl>  <dbl>    <dbl>
##  1         1 RANGER      Test  1       11.5  Inf    36.1  175.   13.8   0.412  
##  2         1 RANGER      Test  10      59.6  339.   18.1  110.   61.4   0.165  
##  3         1 RANGER      Test  100      6.74  68.8   3.56  61.3   8.04  0.0142 
##  4         1 RANGER      Test  101      5.63 219.    4.35  84.9   7.00  0.544  
##  5         1 RANGER      Test  102      2.12 Inf   Inf    200     2.16 NA      
##  6         1 RANGER      Test  103      1.67 198.    1.94  70.5   2.30  0.602  
##  7         1 RANGER      Test  104    240.   Inf   115.   168.  294.    0.333  
##  8         1 RANGER      Test  105     32.3  325.   17.3  113.   36.9   0.0476 
##  9         1 RANGER      Test  106     21.8  107.    8.62  64.3  26.8   0.0622 
## 10         1 RANGER      Test  107      3.98  49.0   2.09  68.3   6.44  0.00413
## # ... with 2,450 more rows
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#without death data}
\NormalTok{calib\_tbl2 }\SpecialCharTok{\%\textgreater{}\%} 
    \FunctionTok{modeltime\_accuracy}\NormalTok{(}\AttributeTok{acc\_by\_id =} \ConstantTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 2,460 x 10
##    .model_id .model_desc .type client    mae  mape   mase smape   rmse      rsq
##        <int> <chr>       <chr> <fct>   <dbl> <dbl>  <dbl> <dbl>  <dbl>    <dbl>
##  1         1 RANGER      Test  1        2.56 Inf     8.03 129.    2.79  0.314  
##  2         1 RANGER      Test  10      63.9  353.   19.4  114.   64.8   0.442  
##  3         1 RANGER      Test  100      5.20  43.4   2.75  58.8   6.74  0.476  
##  4         1 RANGER      Test  101      4.01 148.    3.09  78.3   4.65  0.123  
##  5         1 RANGER      Test  102      3.04 Inf   Inf    200     3.10 NA      
##  6         1 RANGER      Test  103      1.66 175.    1.92  77.5   2.28  0.00315
##  7         1 RANGER      Test  104    249.   Inf   119.   173.  294.    0.398  
##  8         1 RANGER      Test  105     24.2  253.   13.0  102.   26.9   0.124  
##  9         1 RANGER      Test  106     13.3   65.8   5.27  51.1  15.1   0.148  
## 10         1 RANGER      Test  107      4.31  53.5   2.26  79.6   6.56  0.721  
## # ... with 2,450 more rows
\end{verbatim}

\hypertarget{predict}{%
\subsection{Predict}\label{predict}}

We predict the shrunk AE on the testing set. Our models provide the
predicted shrunk AE together with a confidence interval.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#with IHME death}
\NormalTok{result }\OtherTok{\textless{}{-}}\NormalTok{ calib\_tbl }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{modeltime\_forecast}\NormalTok{(}
        \AttributeTok{new\_data    =} \FunctionTok{testing}\NormalTok{(splits),}
        \AttributeTok{actual\_data =} \FunctionTok{bind\_rows}\NormalTok{(}\FunctionTok{training}\NormalTok{(splits), }\FunctionTok{testing}\NormalTok{(splits)),}
        \AttributeTok{conf\_by\_id  =} \ConstantTok{TRUE}
\NormalTok{    )}
\CommentTok{\#with zip death}
\NormalTok{result1 }\OtherTok{\textless{}{-}}\NormalTok{ calib\_tbl1 }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{modeltime\_forecast}\NormalTok{(}
        \AttributeTok{new\_data    =} \FunctionTok{testing}\NormalTok{(splits),}
        \AttributeTok{actual\_data =} \FunctionTok{bind\_rows}\NormalTok{(}\FunctionTok{training}\NormalTok{(splits), }\FunctionTok{testing}\NormalTok{(splits)),}
        \AttributeTok{conf\_by\_id  =} \ConstantTok{TRUE}
\NormalTok{    )}
\CommentTok{\#without death}
\NormalTok{result2 }\OtherTok{\textless{}{-}}\NormalTok{ calib\_tbl2 }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{modeltime\_forecast}\NormalTok{(}
        \AttributeTok{new\_data    =} \FunctionTok{testing}\NormalTok{(splits),}
        \AttributeTok{actual\_data =} \FunctionTok{bind\_rows}\NormalTok{(}\FunctionTok{training}\NormalTok{(splits), }\FunctionTok{testing}\NormalTok{(splits)),}
        \AttributeTok{conf\_by\_id  =} \ConstantTok{TRUE}
\NormalTok{    )}
\end{Highlighting}
\end{Shaded}

For quick knit, we save the results on testing set here.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#result with IHME death data }
\FunctionTok{saveRDS}\NormalTok{(result, }\StringTok{"resultwithIHME.rds"}\NormalTok{)}
\CommentTok{\#result with zip death data}
\FunctionTok{saveRDS}\NormalTok{(result1, }\StringTok{"resultwithzipdeath.rds"}\NormalTok{)}
\CommentTok{\#result without death data}
\FunctionTok{saveRDS}\NormalTok{(result2, }\StringTok{"resultwithoutdeath.rds"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Read the results on testing set.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{result }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"resultwithIHME.rds"}\NormalTok{)}
\NormalTok{result1 }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"resultwithzipdeath.rds"}\NormalTok{)}
\NormalTok{result2 }\OtherTok{\textless{}{-}} \FunctionTok{readRDS}\NormalTok{(}\StringTok{"resultwithoutdeath.rds"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize-clients-forecast}{%
\subsection{Visualize clients
forecast}\label{visualize-clients-forecast}}

We pick clients \texttt{7}, \texttt{10}, \texttt{61}, and \texttt{100} as
examples. We add a solid line at \texttt{threshold\ =\ 2.5} to help us
see whether each client experiences an adverse mortality event. The exact
AE lies within the range of the confidence interval.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{result }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{group\_by}\NormalTok{(client) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%} \FunctionTok{c}\NormalTok{(}\StringTok{"7"}\NormalTok{,}\StringTok{"10"}\NormalTok{, }\StringTok{"61"}\NormalTok{,}\StringTok{"100"}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{plot\_modeltime\_forecast}\NormalTok{(}
        \AttributeTok{.facet\_ncol  =} \DecValTok{2}\NormalTok{,}
        \AttributeTok{.interactive =} \ConstantTok{FALSE}\NormalTok{,}
        \AttributeTok{.title =} \StringTok{"Forecast Plot with IHME death "}\NormalTok{,}
        \AttributeTok{.line\_alpha =} \FloatTok{0.6}\NormalTok{,}
        \AttributeTok{.line\_size =} \DecValTok{1}\NormalTok{,}
        \AttributeTok{.y\_intercept =} \FloatTok{2.5}\NormalTok{,}
        \AttributeTok{.conf\_interval\_show =} \ConstantTok{FALSE}
\NormalTok{    )}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-80-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{result1 }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{group\_by}\NormalTok{(client) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%} \FunctionTok{c}\NormalTok{(  }\StringTok{"7"}\NormalTok{,}\StringTok{"10"}\NormalTok{, }\StringTok{"61"}\NormalTok{,}\StringTok{"100"}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{plot\_modeltime\_forecast}\NormalTok{(}
        \AttributeTok{.facet\_ncol  =} \DecValTok{2}\NormalTok{,}
        \AttributeTok{.interactive =} \ConstantTok{FALSE}\NormalTok{,}
        \AttributeTok{.title =} \StringTok{"Forecast Plot with zip death "}\NormalTok{,}
        \AttributeTok{.line\_alpha =} \FloatTok{0.6}\NormalTok{,}
        \AttributeTok{.line\_size =} \DecValTok{1}\NormalTok{,}
        \AttributeTok{.y\_intercept =} \FloatTok{2.5}\NormalTok{,}
        \AttributeTok{.conf\_interval\_show =} \ConstantTok{FALSE}
\NormalTok{    )}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-80-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{result2 }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{group\_by}\NormalTok{(client) }\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%} \FunctionTok{c}\NormalTok{(  }\StringTok{"7"}\NormalTok{,}\StringTok{"10"}\NormalTok{, }\StringTok{"61"}\NormalTok{,}\StringTok{"100"}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
    \FunctionTok{plot\_modeltime\_forecast}\NormalTok{(}
        \AttributeTok{.facet\_ncol  =} \DecValTok{2}\NormalTok{,}
        \AttributeTok{.interactive =} \ConstantTok{FALSE}\NormalTok{,}
        \AttributeTok{.title =} \StringTok{"Forecast Plot with zip death "}\NormalTok{,}
        \AttributeTok{.line\_alpha =} \FloatTok{0.6}\NormalTok{,}
        \AttributeTok{.line\_size =} \DecValTok{1}\NormalTok{,}
        \AttributeTok{.y\_intercept =} \FloatTok{2.5}\NormalTok{,}
        \AttributeTok{.conf\_interval\_show =} \ConstantTok{FALSE}
\NormalTok{    )}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-80-3.pdf}

\hypertarget{plot-sens-spec-accuracy}{%
\subsection{Plot sens, spec, accuracy}\label{plot-sens-spec-accuracy}}

We classify a client as adverse when
\texttt{shrunk\_ae\ \textgreater{}\ 2.5}, and as not adverse otherwise.

We can get the following conclusions:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Death data provides a slight improvement for all models. The results
  with IHME death data are almost the same as those with zip death data.
\item
  XGBoost and the support vector machine have the best accuracy, around
  \texttt{78\%}, for the 6-month forecast.
\item
  K-nearest neighbors has good sensitivity but poor specificity,
  while the other models show the opposite.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{threshold }\OtherTok{\textless{}{-}} \FloatTok{2.5}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{result }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{.model\_desc, }\SpecialCharTok{{-}}\NormalTok{.conf\_lo, }\SpecialCharTok{{-}}\NormalTok{.conf\_hi, }\SpecialCharTok{{-}}\NormalTok{.key) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{model =}\NormalTok{ .model\_id, }\AttributeTok{value =}\NormalTok{ .value, }\AttributeTok{date=}\NormalTok{ .index)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model, value, }\AttributeTok{.after =}\NormalTok{ client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_wider}\NormalTok{(}\AttributeTok{names\_from =}\NormalTok{ model, }\AttributeTok{values\_from =}\NormalTok{value)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{actual =} \StringTok{"NA"}\NormalTok{, }\StringTok{"Random forest"} \OtherTok{=} \StringTok{"1"}\NormalTok{, }\StringTok{"Tuned random forest"} \OtherTok{=} \StringTok{"2"}\NormalTok{, }
      \StringTok{"Support vector machines"} \OtherTok{=} \StringTok{"3"}\NormalTok{, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=} \StringTok{"4"}\NormalTok{, }\AttributeTok{Xgboost =} \StringTok{"5"}\NormalTok{ )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{drop\_na}\NormalTok{()}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(}\StringTok{"Random forest"}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"model"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"predict"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model,}\AttributeTok{.before =}\NormalTok{ date)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{obs =}\NormalTok{ actual }\SpecialCharTok{\textgreater{}}\NormalTok{ threshold, }\AttributeTok{predict\_class =}\NormalTok{ predict }\SpecialCharTok{\textgreater{}}\NormalTok{ threshold)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{obs =} \FunctionTok{as.factor}\NormalTok{(obs), }\AttributeTok{predict\_class =} \FunctionTok{as.factor}\NormalTok{(predict\_class))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date, model) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}\AttributeTok{Sensitivity =} \FunctionTok{sens\_vec}\NormalTok{(obs, predict\_class),}
            \AttributeTok{Specifity =} \FunctionTok{spec\_vec}\NormalTok{(obs, predict\_class),}
            \AttributeTok{Accuracy =} \FunctionTok{accuracy\_vec}\NormalTok{(obs, predict\_class),}\AttributeTok{.groups =} \StringTok{"keep"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(Sensitivity}\SpecialCharTok{:}\NormalTok{Accuracy, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ model)) }\SpecialCharTok{+}
  \FunctionTok{geom\_point}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{facet\_wrap}\NormalTok{( }\SpecialCharTok{\textasciitilde{}}\NormalTok{ metric)}\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{"With IHME death data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-82-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{result1 }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{.model\_desc, }\SpecialCharTok{{-}}\NormalTok{.conf\_lo, }\SpecialCharTok{{-}}\NormalTok{.conf\_hi, }\SpecialCharTok{{-}}\NormalTok{.key) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{model =}\NormalTok{ .model\_id, }\AttributeTok{value =}\NormalTok{ .value, }\AttributeTok{date=}\NormalTok{ .index)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model, value, }\AttributeTok{.after =}\NormalTok{ client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_wider}\NormalTok{(}\AttributeTok{names\_from =}\NormalTok{ model, }\AttributeTok{values\_from =}\NormalTok{value)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{actual =} \StringTok{"NA"}\NormalTok{, }\StringTok{"Random forest"} \OtherTok{=} \StringTok{"1"}\NormalTok{, }\StringTok{"Tuned random forest"} \OtherTok{=} \StringTok{"2"}\NormalTok{, }
      \StringTok{"Support vector machines"} \OtherTok{=} \StringTok{"3"}\NormalTok{, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=} \StringTok{"4"}\NormalTok{, }\AttributeTok{Xgboost =} \StringTok{"5"}\NormalTok{ )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{drop\_na}\NormalTok{()}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(}\StringTok{"Random forest"}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"model"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"predict"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model,}\AttributeTok{.before =}\NormalTok{ date)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{obs =}\NormalTok{ actual }\SpecialCharTok{\textgreater{}}\NormalTok{ threshold, }\AttributeTok{predict\_class =}\NormalTok{ predict }\SpecialCharTok{\textgreater{}}\NormalTok{ threshold)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{obs =} \FunctionTok{as.factor}\NormalTok{(obs), }\AttributeTok{predict\_class =} \FunctionTok{as.factor}\NormalTok{(predict\_class))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date, model) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}\AttributeTok{Sensitivity =} \FunctionTok{sens\_vec}\NormalTok{(obs, predict\_class),}
            \AttributeTok{Specifity =} \FunctionTok{spec\_vec}\NormalTok{(obs, predict\_class),}
            \AttributeTok{Accuracy =} \FunctionTok{accuracy\_vec}\NormalTok{(obs, predict\_class),}\AttributeTok{.groups =} \StringTok{"keep"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(Sensitivity}\SpecialCharTok{:}\NormalTok{Accuracy, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ model)) }\SpecialCharTok{+}
  \FunctionTok{geom\_point}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{facet\_wrap}\NormalTok{( }\SpecialCharTok{\textasciitilde{}}\NormalTok{ metric)}\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{"With zip death data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-82-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{result2 }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{.model\_desc, }\SpecialCharTok{{-}}\NormalTok{.conf\_lo, }\SpecialCharTok{{-}}\NormalTok{.conf\_hi, }\SpecialCharTok{{-}}\NormalTok{.key) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{model =}\NormalTok{ .model\_id, }\AttributeTok{value =}\NormalTok{ .value, }\AttributeTok{date=}\NormalTok{ .index)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model, value, }\AttributeTok{.after =}\NormalTok{ client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_wider}\NormalTok{(}\AttributeTok{names\_from =}\NormalTok{ model, }\AttributeTok{values\_from =}\NormalTok{value)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{actual =} \StringTok{"NA"}\NormalTok{, }\StringTok{"Random forest"} \OtherTok{=} \StringTok{"1"}\NormalTok{, }\StringTok{"Tuned random forest"} \OtherTok{=} \StringTok{"2"}\NormalTok{, }
      \StringTok{"Support vector machines"} \OtherTok{=} \StringTok{"3"}\NormalTok{, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=} \StringTok{"4"}\NormalTok{, }\AttributeTok{Xgboost =} \StringTok{"5"}\NormalTok{ )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{drop\_na}\NormalTok{()}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(}\StringTok{"Random forest"}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"model"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"predict"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model,}\AttributeTok{.before =}\NormalTok{ date)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{obs =}\NormalTok{ actual }\SpecialCharTok{\textgreater{}}\NormalTok{ threshold, }\AttributeTok{predict\_class =}\NormalTok{ predict }\SpecialCharTok{\textgreater{}}\NormalTok{ threshold)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{obs =} \FunctionTok{as.factor}\NormalTok{(obs), }\AttributeTok{predict\_class =} \FunctionTok{as.factor}\NormalTok{(predict\_class))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date, model) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarize}\NormalTok{(}\AttributeTok{Sensitivity =} \FunctionTok{sens\_vec}\NormalTok{(obs, predict\_class),}
            \AttributeTok{Specifity =} \FunctionTok{spec\_vec}\NormalTok{(obs, predict\_class),}
            \AttributeTok{Accuracy =} \FunctionTok{accuracy\_vec}\NormalTok{(obs, predict\_class),}\AttributeTok{.groups =} \StringTok{"keep"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(Sensitivity}\SpecialCharTok{:}\NormalTok{Accuracy, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ model)) }\SpecialCharTok{+}
  \FunctionTok{geom\_point}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{geom\_line}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{facet\_wrap}\NormalTok{( }\SpecialCharTok{\textasciitilde{}}\NormalTok{ metric)}\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{"Without death data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-82-3.pdf}

\hypertarget{calculate-predicted-claims}{%
\subsection{Calculate predicted claims}\label{calculate-predicted-claims}}

We calculate the predicted weekly claim by:

\[
\text{Predicted Smoothed AE} = \text{Predicted Shrunk AE} / \text{Shrinkage},
\]

\[
\text{Predicted weekly Claim} = \text{Predicted Smoothed AE} \cdot (\text{Expected Yearly Claim} /52.18 )
\]

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{claim }\OtherTok{\textless{}{-}} \FunctionTok{read\_feather}\NormalTok{(}\StringTok{"data/processed\_data\_20\_12\_23.feather"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(date, client, claims, expected, shrinkage,volume)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{predclaim}\OtherTok{\textless{}{-}}\NormalTok{result }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{.model\_desc, }\SpecialCharTok{{-}}\NormalTok{.conf\_lo, }\SpecialCharTok{{-}}\NormalTok{.conf\_hi, }\SpecialCharTok{{-}}\NormalTok{.key) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{model =}\NormalTok{ .model\_id, }\AttributeTok{value =}\NormalTok{ .value, }\AttributeTok{date=}\NormalTok{ .index)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model, value, }\AttributeTok{.after =}\NormalTok{ client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_wider}\NormalTok{(}\AttributeTok{names\_from =}\NormalTok{ model, }\AttributeTok{values\_from =}\NormalTok{value)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{actual =} \StringTok{"NA"}\NormalTok{, }\AttributeTok{rf =} \StringTok{"1"}\NormalTok{, }\AttributeTok{rf\_tuned =} \StringTok{"2"}\NormalTok{, }\AttributeTok{svm\_rbd =} \StringTok{"3"}\NormalTok{, }\AttributeTok{knn =} \StringTok{"4"}\NormalTok{, }\AttributeTok{xgboost =} \StringTok{"5"}\NormalTok{ )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{drop\_na}\NormalTok{()}\SpecialCharTok{\%\textgreater{}\%}
 \FunctionTok{inner\_join}\NormalTok{(claim, }\AttributeTok{by =} \FunctionTok{c}\NormalTok{(}\StringTok{"date"}\NormalTok{, }\StringTok{"client"}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{rf =}\NormalTok{ rf}\SpecialCharTok{/}\NormalTok{shrinkage }\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{rf\_tuned =}\NormalTok{ rf\_tuned}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{svm\_rbd =}\NormalTok{ svm\_rbd }\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{knn =}\NormalTok{ knn}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{xgboost=}\NormalTok{ xgboost}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/} \FloatTok{52.18}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(date, client, claims,expected, rf, rf\_tuned,svm\_rbd,knn,xgboost)}

\NormalTok{predclaim1}\OtherTok{\textless{}{-}}\NormalTok{result1 }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{.model\_desc, }\SpecialCharTok{{-}}\NormalTok{.conf\_lo, }\SpecialCharTok{{-}}\NormalTok{.conf\_hi, }\SpecialCharTok{{-}}\NormalTok{.key) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{model =}\NormalTok{ .model\_id, }\AttributeTok{value =}\NormalTok{ .value, }\AttributeTok{date=}\NormalTok{ .index)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model, value, }\AttributeTok{.after =}\NormalTok{ client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_wider}\NormalTok{(}\AttributeTok{names\_from =}\NormalTok{ model, }\AttributeTok{values\_from =}\NormalTok{value)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{actual =} \StringTok{"NA"}\NormalTok{, }\AttributeTok{rf =} \StringTok{"1"}\NormalTok{, }\AttributeTok{rf\_tuned =} \StringTok{"2"}\NormalTok{, }\AttributeTok{svm\_rbd =} \StringTok{"3"}\NormalTok{, }\AttributeTok{knn =} \StringTok{"4"}\NormalTok{, }\AttributeTok{xgboost =} \StringTok{"5"}\NormalTok{ )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{drop\_na}\NormalTok{()}\SpecialCharTok{\%\textgreater{}\%}
 \FunctionTok{inner\_join}\NormalTok{(claim, }\AttributeTok{by =} \FunctionTok{c}\NormalTok{(}\StringTok{"date"}\NormalTok{, }\StringTok{"client"}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{rf =}\NormalTok{ rf}\SpecialCharTok{/}\NormalTok{shrinkage }\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{rf\_tuned =}\NormalTok{ rf\_tuned}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{svm\_rbd =}\NormalTok{ svm\_rbd }\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{knn =}\NormalTok{ knn}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{xgboost=}\NormalTok{ xgboost}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/} \FloatTok{52.18}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(date, client, claims,expected, rf, rf\_tuned,svm\_rbd,knn,xgboost)}

\NormalTok{predclaim2}\OtherTok{\textless{}{-}}\NormalTok{result2 }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(}\SpecialCharTok{{-}}\NormalTok{.model\_desc, }\SpecialCharTok{{-}}\NormalTok{.conf\_lo, }\SpecialCharTok{{-}}\NormalTok{.conf\_hi, }\SpecialCharTok{{-}}\NormalTok{.key) }\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{model =}\NormalTok{ .model\_id, }\AttributeTok{value =}\NormalTok{ .value, }\AttributeTok{date=}\NormalTok{ .index)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{relocate}\NormalTok{(model, value, }\AttributeTok{.after =}\NormalTok{ client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_wider}\NormalTok{(}\AttributeTok{names\_from =}\NormalTok{ model, }\AttributeTok{values\_from =}\NormalTok{value)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\AttributeTok{actual =} \StringTok{"NA"}\NormalTok{, }\AttributeTok{rf =} \StringTok{"1"}\NormalTok{, }\AttributeTok{rf\_tuned =} \StringTok{"2"}\NormalTok{, }\AttributeTok{svm\_rbd =} \StringTok{"3"}\NormalTok{, }\AttributeTok{knn =} \StringTok{"4"}\NormalTok{, }\AttributeTok{xgboost =} \StringTok{"5"}\NormalTok{ )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{drop\_na}\NormalTok{()}\SpecialCharTok{\%\textgreater{}\%}
 \FunctionTok{inner\_join}\NormalTok{(claim, }\AttributeTok{by =} \FunctionTok{c}\NormalTok{(}\StringTok{"date"}\NormalTok{, }\StringTok{"client"}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{mutate}\NormalTok{(}\AttributeTok{rf =}\NormalTok{ rf}\SpecialCharTok{/}\NormalTok{shrinkage }\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{rf\_tuned =}\NormalTok{ rf\_tuned}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{svm\_rbd =}\NormalTok{ svm\_rbd }\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{knn =}\NormalTok{ knn}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{),}
         \AttributeTok{xgboost=}\NormalTok{ xgboost}\SpecialCharTok{/}\NormalTok{shrinkage}\SpecialCharTok{*}\NormalTok{(expected }\SpecialCharTok{/} \FloatTok{52.18}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{select}\NormalTok{(date, client, claims,expected, rf, rf\_tuned,svm\_rbd,knn,xgboost)}
\end{Highlighting}
\end{Shaded}

Weekly total claims vs predicted claims

We can draw the following conclusions:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  K-nearest neighbors, XGBoost, and support vector machines can capture
  the trend in the weekly total claims.
\item
  Death data improve the results greatly. The zip-code-level death data
  give the best results, since they are the exact deaths for the area.
  The results with IHME death data are also good. Without death data,
  none of the five models can capture the change in claims.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{predclaim}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarise}\NormalTok{(}\AttributeTok{expected =} \FunctionTok{sum}\NormalTok{(expected)}\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{,}
        \AttributeTok{claims =} \FunctionTok{sum}\NormalTok{(claims),}
          \AttributeTok{rf =}\FunctionTok{sum}\NormalTok{(rf) ,}
         \AttributeTok{rf\_tuned =} \FunctionTok{sum}\NormalTok{(rf\_tuned),}
         \AttributeTok{svm\_rbd =} \FunctionTok{sum}\NormalTok{(svm\_rbd) ,}
         \AttributeTok{knn =} \FunctionTok{sum}\NormalTok{(knn),}
         \AttributeTok{xgboost=} \FunctionTok{sum}\NormalTok{(xgboost))}\SpecialCharTok{\%\textgreater{}\%}
   \FunctionTok{rename}\NormalTok{(}\StringTok{"Random forest"} \OtherTok{=}\NormalTok{ rf, }\StringTok{"Tuned random forest"} \OtherTok{=}\NormalTok{ rf\_tuned, }
    \StringTok{"Support vector machines"} \OtherTok{=}\NormalTok{ svm\_rbd, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=}\NormalTok{ knn, }\AttributeTok{Xgboost =}\NormalTok{ xgboost )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(expected}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ metric)) }\SpecialCharTok{+} \FunctionTok{geom\_line}\NormalTok{()}\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{" Weekly total claim with IHME death"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-86-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{predclaim1}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarise}\NormalTok{(}\AttributeTok{expected =} \FunctionTok{sum}\NormalTok{(expected)}\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{,}
        \AttributeTok{claims =} \FunctionTok{sum}\NormalTok{(claims),}
          \AttributeTok{rf =}\FunctionTok{sum}\NormalTok{(rf) ,}
         \AttributeTok{rf\_tuned =} \FunctionTok{sum}\NormalTok{(rf\_tuned),}
         \AttributeTok{svm\_rbd =} \FunctionTok{sum}\NormalTok{(svm\_rbd) ,}
         \AttributeTok{knn =} \FunctionTok{sum}\NormalTok{(knn),}
         \AttributeTok{xgboost=} \FunctionTok{sum}\NormalTok{(xgboost))}\SpecialCharTok{\%\textgreater{}\%}
   \FunctionTok{rename}\NormalTok{(}\StringTok{"Random forest"} \OtherTok{=}\NormalTok{ rf, }\StringTok{"Tuned random forest"} \OtherTok{=}\NormalTok{ rf\_tuned, }
    \StringTok{"Support vector machines"} \OtherTok{=}\NormalTok{ svm\_rbd, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=}\NormalTok{ knn, }\AttributeTok{Xgboost =}\NormalTok{ xgboost )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(expected}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ metric)) }\SpecialCharTok{+} \FunctionTok{geom\_line}\NormalTok{()}\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{" Weekly total claim with zip death"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-86-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{predclaim2}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(date)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarise}\NormalTok{(}\AttributeTok{expected =} \FunctionTok{sum}\NormalTok{(expected)}\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{,}
        \AttributeTok{claims =} \FunctionTok{sum}\NormalTok{(claims),}
          \AttributeTok{rf =}\FunctionTok{sum}\NormalTok{(rf) ,}
         \AttributeTok{rf\_tuned =} \FunctionTok{sum}\NormalTok{(rf\_tuned),}
         \AttributeTok{svm\_rbd =} \FunctionTok{sum}\NormalTok{(svm\_rbd) ,}
         \AttributeTok{knn =} \FunctionTok{sum}\NormalTok{(knn),}
         \AttributeTok{xgboost=} \FunctionTok{sum}\NormalTok{(xgboost))}\SpecialCharTok{\%\textgreater{}\%}
   \FunctionTok{rename}\NormalTok{( }\StringTok{"Random forest"} \OtherTok{=}\NormalTok{ rf, }\StringTok{"Tuned random forest"} \OtherTok{=}\NormalTok{ rf\_tuned, }
    \StringTok{"Support vector machines"} \OtherTok{=}\NormalTok{ svm\_rbd, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=}\NormalTok{ knn, }\AttributeTok{Xgboost =}\NormalTok{ xgboost )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(expected}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ date, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ metric)) }\SpecialCharTok{+} \FunctionTok{geom\_line}\NormalTok{()}\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{" Weekly total claim without death"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-86-3.pdf}

Half year total claims for each client vs predicted total claims

At the level of individual clients, the results are not good, since a
single global model is used for all clients.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{predclaim}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%} \FunctionTok{c}\NormalTok{(}\DecValTok{1}\SpecialCharTok{:}\DecValTok{50}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarise}\NormalTok{(}\AttributeTok{expected =} \FunctionTok{sum}\NormalTok{(expected)}\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{,}
        \AttributeTok{claims =} \FunctionTok{sum}\NormalTok{(claims),}
          \AttributeTok{rf =}\FunctionTok{sum}\NormalTok{(rf) ,}
         \AttributeTok{rf\_tuned =} \FunctionTok{sum}\NormalTok{(rf\_tuned),}
         \AttributeTok{svm\_rbd =} \FunctionTok{sum}\NormalTok{(svm\_rbd) ,}
         \AttributeTok{knn =} \FunctionTok{sum}\NormalTok{(knn),}
         \AttributeTok{xgboost=} \FunctionTok{sum}\NormalTok{(xgboost))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\StringTok{"Random forest"} \OtherTok{=}\NormalTok{ rf, }\StringTok{"Tuned random forest"} \OtherTok{=}\NormalTok{ rf\_tuned, }
    \StringTok{"Support vector machines"} \OtherTok{=}\NormalTok{ svm\_rbd, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=}\NormalTok{ knn, }\AttributeTok{Xgboost =}\NormalTok{ xgboost )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(claims}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ client, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ metric)) }\SpecialCharTok{+} \FunctionTok{geom\_point}\NormalTok{(}\AttributeTok{alpha =} \FloatTok{0.7}\NormalTok{) }\SpecialCharTok{+}
  \FunctionTok{scale\_y\_log10}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{"Total claims for 2021{-}01{-}01 to 2021{-}06{-}01 for each client with IHME death"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-87-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{predclaim1}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%} \FunctionTok{c}\NormalTok{(}\DecValTok{1}\SpecialCharTok{:}\DecValTok{50}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarise}\NormalTok{(}\AttributeTok{expected =} \FunctionTok{sum}\NormalTok{(expected)}\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{,}
        \AttributeTok{claims =} \FunctionTok{sum}\NormalTok{(claims),}
          \AttributeTok{rf =}\FunctionTok{sum}\NormalTok{(rf) ,}
         \AttributeTok{rf\_tuned =} \FunctionTok{sum}\NormalTok{(rf\_tuned),}
         \AttributeTok{svm\_rbd =} \FunctionTok{sum}\NormalTok{(svm\_rbd) ,}
         \AttributeTok{knn =} \FunctionTok{sum}\NormalTok{(knn),}
         \AttributeTok{xgboost=} \FunctionTok{sum}\NormalTok{(xgboost))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\StringTok{"Random forest"} \OtherTok{=}\NormalTok{ rf, }\StringTok{"Tuned random forest"} \OtherTok{=}\NormalTok{ rf\_tuned, }
    \StringTok{"Support vector machines"} \OtherTok{=}\NormalTok{ svm\_rbd, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=}\NormalTok{ knn, }\AttributeTok{Xgboost =}\NormalTok{ xgboost )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(claims}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ client, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ metric)) }\SpecialCharTok{+} \FunctionTok{geom\_point}\NormalTok{(}\AttributeTok{alpha =} \FloatTok{0.7}\NormalTok{)}\SpecialCharTok{+}
  \FunctionTok{scale\_y\_log10}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{"Total claims for 2021{-}01{-}01 to 2021{-}06{-}01 for each client with zip death"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-87-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{predclaim2}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{filter}\NormalTok{(client }\SpecialCharTok{\%in\%} \FunctionTok{c}\NormalTok{(}\DecValTok{1}\SpecialCharTok{:}\DecValTok{50}\NormalTok{))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{group\_by}\NormalTok{(client)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{summarise}\NormalTok{(}\AttributeTok{expected =} \FunctionTok{sum}\NormalTok{(expected)}\SpecialCharTok{/}\FloatTok{52.18}\NormalTok{,}
        \AttributeTok{claims =} \FunctionTok{sum}\NormalTok{(claims),}
          \AttributeTok{rf =}\FunctionTok{sum}\NormalTok{(rf) ,}
         \AttributeTok{rf\_tuned =} \FunctionTok{sum}\NormalTok{(rf\_tuned),}
         \AttributeTok{svm\_rbd =} \FunctionTok{sum}\NormalTok{(svm\_rbd) ,}
         \AttributeTok{knn =} \FunctionTok{sum}\NormalTok{(knn),}
         \AttributeTok{xgboost=} \FunctionTok{sum}\NormalTok{(xgboost))}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{rename}\NormalTok{(}\StringTok{"Random forest"} \OtherTok{=}\NormalTok{ rf, }\StringTok{"Tuned random forest"} \OtherTok{=}\NormalTok{ rf\_tuned, }
    \StringTok{"Support vector machines"} \OtherTok{=}\NormalTok{ svm\_rbd, }\StringTok{"K{-}nearest neighbors"} \OtherTok{=}\NormalTok{ knn, }\AttributeTok{Xgboost =}\NormalTok{ xgboost )}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{pivot\_longer}\NormalTok{(claims}\SpecialCharTok{:}\NormalTok{Xgboost, }\AttributeTok{names\_to =} \StringTok{"metric"}\NormalTok{, }\AttributeTok{values\_to =} \StringTok{"value"}\NormalTok{)}\SpecialCharTok{\%\textgreater{}\%}
  \FunctionTok{ggplot}\NormalTok{(}\FunctionTok{aes}\NormalTok{(}\AttributeTok{x =}\NormalTok{ client, }\AttributeTok{y =}\NormalTok{ value, }\AttributeTok{color =}\NormalTok{ metric)) }\SpecialCharTok{+} \FunctionTok{geom\_point}\NormalTok{(}\AttributeTok{alpha =} \FloatTok{0.7}\NormalTok{)}\SpecialCharTok{+}
  \FunctionTok{scale\_y\_log10}\NormalTok{() }\SpecialCharTok{+}
  \FunctionTok{ggtitle}\NormalTok{(}\StringTok{"Total claims for 2021{-}01{-}01 to 2021{-}06{-}01 for each client without death data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{figures/report/fig-unnamed-chunk-87-3.pdf}

\hypertarget{why-this-model-doesnt-work}{%
\subsection{Why this model doesn't
work}\label{why-this-model-doesnt-work}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  The performance on classifying whether a client is adverse or not
  adverse is worse than that of our long-term and short-term models.
\item
  Even though this model is global for all clients, it takes much more
  time than the others.
\item
  It cannot predict new clients outside of the training set currently.
\end{enumerate}

\hypertarget{possible-improvement-for-this-model}{%
\subsection{Possible improvement for this
model}\label{possible-improvement-for-this-model}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  To improve accuracy, we can add feature engineering and localized
  model selection by time series identifier.
\item
  We can also choose the final predicted value according to the
  confidence intervals to improve our results, since the actual AE lies
  in the overlap of the five models' confidence intervals.
\end{enumerate}

\hypertarget{conclusion}{%
\section{Conclusion}\label{conclusion}}

The aim of this project is to predict Group Life Insurance Mortality
during a pandemic. We first observe that it is not sufficient to use the
pre-pandemic AE to classify clients as low-risk or high-risk
during a pandemic. Hence, we need to provide the management team with
new information that helps their decision making in such unprecedented
times. To serve this purpose, we collect data from the zip codes where
the companies are located along with some characteristics of the
companies. We present two types of models that serve valuable but
different purposes: a long-term model and a short-term model.

After evaluating several metrics of different machine learning models
for the long-term version, we choose the Random Forest and we tune its
hyperparameters. We aim at minimizing the proportion of adverse clients
that are incorrectly predicted as not adverse, since such errors cause
large financial losses for Securian. Simultaneously, we also aim at minimizing
the proportion of not adverse clients that are predicted as adverse,
since this causes a loss of clients and hence, again, a financial loss. After
choosing the best parameters, and choosing a threshold, we are able to
reach a sensitivity of 85\%, a specificity of 78\% and an accuracy of
82\%. We then use the SHAP values to increase the transparency of the
model and understand the contribution of the predictors in the
classification of our clients.

We then include the time factor in the short-term model and we aim at
using some known clients' performance in the past to predict both known
and unknown clients' performance three months in the future. For this
version, we use the rolling AE (updated weekly) that also takes into
account client volume, as opposed to using yearly AE in the long-term
version. We also add the weekly deaths by COVID to the list of
predictors used in the long-term model. One key step in the process is
forecasting the weekly deaths in order to use them in the future. For this
time-dependent version, we choose the Boosted Trees model after
comparing its performance with other known machine learning models.
After tuning it and choosing the best parameter set, we present the
accuracy of the predictions for unknown clients and compare it with the
one for the known clients.

With this work, the management team has a strong, valuable asset that
can be used in their contract renewals, their negotiations and their
risk management.

\hypertarget{future-directions}{%
\section{Future Directions}\label{future-directions}}

Both the long-term and short-term models above performed tremendously well.
Even though the data used to build the models are from trusted sources
as cited in the data wrangling section, the client data is simulated
for privacy reasons. Hence, the natural next step will be to test
the models on real clients. We hope that the models will perform as
well as they did with the simulated clients.

Another direction that this project can take is to consider
infections as a lagged predictor for deaths. Note that high vaccination
rates have helped avert hospitalizations and deaths. See the CDC
\href{https://www.cdc.gov/flu/about/burden-averted/2019-2020.htm}{report}.
At the beginning of the pandemic, the flu vaccination rate can be used as a
proxy for the future COVID vaccination rate. With this in mind, models can be
built to take into account influenza infections and the influenza vaccination
rate. Influenza is a very seasonal infection, so the forecast will of
course be seasonal, but in building the future model other infectious
diseases can be taken into account, be they seasonal or not.

As noted above, a more recently updated model is naturally expected to give
even better outcomes when more pandemic-related predictors are added. To be
precise, consider the relationship between vaccination and case
rates given by the CDC (see the
\href{https://covid.cdc.gov/covid-data-tracker/\#vaccination-case-rate}{link}).
As expected, the higher the vaccination rate, the lower the case rate and,
eventually, the death rate. We therefore expect that these new COVID-19
parameters, among others, will give better death forecasts and thus better
short-term models.

\hypertarget{appendices}{%
\section{Appendices}\label{appendices}}

\hypertarget{data-repository}{%
\subsection{Data repository}\label{data-repository}}

The data is stored in
\href{https://drive.google.com/drive/folders/1GYzZ3FuPWtQwTF8-UPQdenm0QWL_u_HH?usp=sharing}{Google
Drive}. It contains all the files needed to generate the various
datasets and compile the \texttt{Rmd} files. Some datasets have been
pre-generated for convenience. The directory structure is meant to mimic
the one in the \href{https://github.com/haerski/mortality}{GitHub
repository}.

Next, we describe the contents and dependencies of the files in the data
repository.

\hypertarget{r-scripts}{%
\subsubsection{R scripts}\label{r-scripts}}

\texttt{data/zip3\_rel.R}: generates \texttt{data/zip3\_rel.feather}.
Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/zcta\_county\_rel\_10.txt}
\end{itemize}

\texttt{data/deaths.R}: generates \texttt{data/deaths\_zip3.feather}.
Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/zip3\_rel.feather}, generated by
  \texttt{data/zip3\_rel.R}
\item
  \texttt{data/covid\_deaths\_usafacts.csv}
\end{itemize}

\texttt{data/census.R}: generates \texttt{data/pop\_den.feather}.

\begin{itemize}
\tightlist
\item
  Requires an API key
\end{itemize}

\texttt{data/all\_persons.r}: generates
\texttt{data/simulation\_data/all\_persons.feather}. Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/simulation\_data/experience\_weekly\_\{n\}.RDS}, where n
  = 1,\ldots,10
\item
  \texttt{data/simulation\_data/person\_\{n\}.RDS}, where n =
  1,\ldots,10
\item
  \texttt{data/soa\_base\_2017.csv}
\end{itemize}

\texttt{data/wrangling.Rmd}: generates \texttt{data/data.feather}.
Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/Population\_Estimates.csv}
\item
  \texttt{data/COVID-19\_Vaccinations\_in\_the\_United\_States\_County\_data.gov.csv}
\item
  \texttt{data/Provisional\_COVID-19\_Death\_Counts\_in\_the\_United\_States\_by\_County.csv}
\item
  \texttt{data/Education\_Estimates.csv}
\item
  \texttt{data/Poverty\_Estimates.csv}
\item
  \texttt{data/Unemployment\_Estimates.csv}
\item
  \texttt{data/Vaccine\_Hesitancy\_for\_COVID-19\_\_County\_and\_local\_estimates.csv}
\item
  \texttt{data/countypres\_2000-2020.csv}
\item
  \texttt{data/zip3\_rel.feather}, generated by
  \texttt{data/zip3\_rel.R}
\end{itemize}

\texttt{data/processed\_data.r}: generates
\texttt{data/processed\_data\_20\_12\_23.feather}. Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/simulation\_data/all\_persons.feather}, generated by
  \texttt{data/all\_persons.r}
\item
  \texttt{data/data.feather}, generated by \texttt{data/wrangling.Rmd}
\item
  \texttt{data/deaths\_zip3.feather}, generated by
  \texttt{data/deaths.R}
\item
  \texttt{data/state.txt}
\item
  \texttt{data/zcta\_county\_rel\_10.txt}
\item
  \texttt{data/2020\_12\_23/reference\_hospitalization\_all\_locs.csv}
\end{itemize}

\hypertarget{rmd-files}{%
\subsubsection{Rmd files}\label{rmd-files}}

\texttt{report.Rmd}: this file. Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/processed\_data\_20\_12\_23.feather}, generated by
  \texttt{data/processed\_data.r}
\item
  \texttt{calibwithIHME.rds}
\item
  \texttt{calibwithzipdeaths.rds}
\item
  \texttt{calibwithoutdeaths.rds}
\item
  \texttt{resultwithIHME.rds}
\item
  \texttt{resultwithzipdeaths.rds}
\item
  \texttt{resultwithoutdeaths.rds}
\end{itemize}

\texttt{time.Rmd}: work on time-dependent models. Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/simulation\_data/all\_persons.feather}, generated by
  \texttt{data/all\_persons.r}
\item
  \texttt{data/data.feather}, generated by \texttt{data/wrangling.Rmd}
\item
  \texttt{data/deaths\_zip3.feather}, generated by
  \texttt{data/deaths.R}
\item
  \texttt{data/state.txt}
\item
  \texttt{data/zcta\_county\_rel\_10.txt}
\item
  \texttt{data/2020\_12\_23/reference\_hospitalization\_all\_locs.csv}
\item
  \texttt{data/processed\_data\_20\_12\_23.feather}, generated by
  \texttt{data/processed\_data.r}
\end{itemize}

\texttt{baseline\_models.Rmd}: work on time-independent models. Depends
on

\begin{itemize}
\tightlist
\item
  \texttt{data/simulation\_data/all\_persons.feather}, generated by
  \texttt{data/all\_persons.r}
\item
  \texttt{data/data.feather}, generated by \texttt{data/wrangling.Rmd}
\end{itemize}

\texttt{final.Rmd}: final presentation plots. Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/processed\_data\_20\_12\_23.feather}, generated by
  \texttt{data/processed\_data.r}
\end{itemize}

\texttt{presentation\_2.Rmd}: technical presentation plots. Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/simulation\_data/all\_persons.feather}, generated by
  \texttt{data/all\_persons.r}
\item
  \texttt{data/data.feather}, generated by \texttt{data/wrangling.Rmd}
\end{itemize}

\texttt{random\_forest.Rmd}: early work on random forest models. Depends on

\begin{itemize}
\tightlist
\item
  \texttt{data/simulation\_data/all\_persons.feather}, generated by
  \texttt{data/all\_persons.r}
\item
  \texttt{data/data.feather}, generated by \texttt{data/wrangling.Rmd}
\end{itemize}

\end{document}