construct_onlineretail_fixed_pnbd_models.qmd

---
title: "Construct Non-Hierarchical P/NBD Model for Online Retail Transaction Data"
author: "Mick Cooney <mickcooney@gmail.com>"
date: "Last updated: `r format(Sys.time(), '%B %d, %Y')`"
editor: source
execute:
  message: false
  warning: false
  error: false
format:
  html:
    # Light/dark theme pairs must be nested under `theme`; placed directly
    # under `html` they are not recognised as theme settings.
    theme:
      light: superhero
      dark: darkly
    anchor-sections: true
    embed-resources: true
    number-sections: true
    smooth-scroll: true
    toc: true
    toc-depth: 3
    toc-location: left
    code-fold: true
    code-summary: "Show code"
---


```{r import_libraries}
#| echo: FALSE
#| message: FALSE

library(conflicted)
library(tidyverse)
library(scales)
library(cowplot)
library(directlabels)
library(magrittr)
library(rlang)
library(lobstr)
library(rsyslog)
library(fs)
library(purrr)
library(furrr)
library(glue)
library(cmdstanr)
library(brms)
library(posterior)
library(bayesplot)
library(tidybayes)


# Project helper scripts. resolve_conflicts() below is not defined in this
# file, so it presumably comes from one of these -- TODO confirm.
source("lib_utils.R")
source("lib_btyd.R")


# Settle function-name clashes between the listed packages up front
# (assumes resolve_conflicts() wraps the conflicted package -- verify in
# lib_utils.R).
conflict_lst <- resolve_conflicts(
  c("magrittr", "rlang", "dplyr", "readr", "purrr", "ggplot2", "MASS",
    "fitdistrplus")
  )


# Session options: console width, print warnings as they occur (warn = 1),
# and set mc.cores to the number of cores reported as available.
options(
  width = 80L,
  warn  = 1,
  mc.cores = parallelly::availableCores()
  )


# Fix the R RNG seed, the base seed for the Stan fits (incremented before each
# fit further down), and the number of draw ids used when expanding the
# initial transactions.
set.seed(42)
stanfit_seed <- 4501
n_sim        <- 2000


# Default ggplot2 theme, and a multisession plan for the future framework.
theme_set(theme_cowplot())
plan(multisession)
```


In this workbook we construct our non-hierarchical (fixed-prior) P/NBD models
on the online retail transaction data.


# Load and Construct Datasets

We start by loading the online retail transaction data and the derived
datasets we need to fit and validate the P/NBD model.


## Load Online Retail Data

We now want to load the online retail transaction data.


```{r load_online_retail_data}
#| echo: TRUE

# Cohort data for the online retail customers.
customer_cohortdata_tbl <- read_rds("data/onlineretail_cohort_tbl.rds")
glimpse(customer_cohortdata_tbl)

# Transaction-level data.
customer_transactions_tbl <- read_rds("data/onlineretail_transactions_tbl.rds")
glimpse(customer_transactions_tbl)

# Ids of the customer subset used for the assessment steps.
customer_subset_id <- read_rds("data/onlineretail_customer_subset_ids.rds")
glimpse(customer_subset_id)
```


## Load Derived Data

```{r load_derived_data}
#| echo: TRUE

# Pre-computed customer summary statistics.
customer_summarystats_tbl <- read_rds(
  "data/onlineretail_customer_summarystats_tbl.rds"
  )

# Observed data for the fit and validation periods.
obs_fitdata_tbl   <- read_rds("data/onlineretail_obs_fitdata_tbl.rds")
obs_validdata_tbl <- read_rds("data/onlineretail_obs_validdata_tbl.rds")

# The Stan data is built from a column called `x`, so rename the transaction
# count accordingly.
customer_fit_stats_tbl <- rename(obs_fitdata_tbl, x = tnx_count)
```


## Load Subset Data

We also want to construct our data subsets for the purposes of speeding up our
valuations.

```{r construct_customer_subset_data}
#| echo: TRUE

# Fit-period observations restricted to the customer subset.
customer_fit_subset_tbl <- filter(
  obs_fitdata_tbl,
  customer_id %in% customer_subset_id
  )

glimpse(customer_fit_subset_tbl)


# Validation-period observations restricted to the same subset.
customer_valid_subset_tbl <- filter(
  obs_validdata_tbl,
  customer_id %in% customer_subset_id
  )

glimpse(customer_valid_subset_tbl)
```


We now use these datasets to set the start and end dates for our various
validation methods.


```{r set_start_end_dates}
# Stored start/end dates for the fit and validation windows.
dates_lst <- read_rds("data/onlineretail_simulation_dates.rds")

# Fit window.
use_fit_start_date <- dates_lst[["use_fit_start_date"]]
use_fit_end_date   <- dates_lst[["use_fit_end_date"]]

# Validation window.
use_valid_start_date <- dates_lst[["use_valid_start_date"]]
use_valid_end_date   <- dates_lst[["use_valid_end_date"]]
```

We now split out the transaction data into fit and validation datasets.

```{r create_customer_transaction_splits}
#| echo: true

# Subset customers' transactions falling inside the fit window (both bounds
# inclusive).
customer_fit_transactions_tbl <- filter(
  customer_transactions_tbl,
  customer_id %in% customer_subset_id,
  tnx_timestamp >= use_fit_start_date,
  tnx_timestamp <= use_fit_end_date
  )

glimpse(customer_fit_transactions_tbl)


# Same customers, transactions falling inside the validation window.
customer_valid_transactions_tbl <- filter(
  customer_transactions_tbl,
  customer_id %in% customer_subset_id,
  tnx_timestamp >= use_valid_start_date,
  tnx_timestamp <= use_valid_end_date
  )

glimpse(customer_valid_transactions_tbl)
```

Finally, we want to extract the first transaction for each customer, so we
can add this data to assess our models.

```{r extract_customer_first_transaction}
#| echo: true

# Earliest fit-period transaction per customer.
# NOTE(review): slice_min() keeps ties by default, so a customer with several
# transactions sharing the earliest timestamp contributes several rows --
# confirm that is intended.
customer_initial_tnx_tbl <- slice_min(
  customer_fit_transactions_tbl,
  order_by = tnx_timestamp,
  n        = 1,
  by       = customer_id
  )

glimpse(customer_initial_tnx_tbl)
```

We now expand out these initial transactions so that we can append them to
our simulations.

```{r expand_initial_simulation_transactions}
#| echo: true

# Replicate each initial transaction once per draw id (1..n_sim) so the rows
# can be appended to the simulated transactions.
sim_init_tbl <- customer_initial_tnx_tbl |>
  select(customer_id, tnx_timestamp, tnx_amount) |>
  mutate(draw_id = list(seq_len(n_sim)), .after = customer_id) |>
  unnest(draw_id)

glimpse(sim_init_tbl)
```


Before we start on that, we set a few parameters for the workbook to organise
our Stan code.

```{r setup_workbook_parameters}
#| echo: TRUE

# Directory passed as `dir` when compiling and as `output_dir` when sampling.
stan_modeldir <- "stan_models"
# Directory passed as `include_paths` when compiling the Stan model.
stan_codedir  <-   "stan_code"
```


# Fit First P/NBD Model

We now construct our Stan model and prepare to fit it with the online retail
dataset.

We also want to set a number of overall parameters for this workbook.

To start the fit data, we want to use the 1,000 customers. We also need to
calculate the summary statistics for the validation period.


## Compile and Fit Stan Model

We now compile this model using `CmdStanR`.

```{r compile_pnbd_fixed_stanmodel}
#| echo: TRUE
#| results: "hide"

# Compile the fixed-prior P/NBD Stan program.
# The model path is built from `stan_codedir` so that it always agrees with
# `include_paths`, rather than repeating the directory name as a literal.
pnbd_fixed_stanmodel <- cmdstan_model(
  stan_file     = file.path(stan_codedir, "pnbd_fixed.stan"),
  include_paths = stan_codedir,
  pedantic      = TRUE,
  dir           = stan_modeldir
  )
```


We then use this compiled model with our data to produce a fit of the data.


```{r fit_pnbd_onlineretail_fixed1_stanmodel}
#| echo: TRUE

# Label, output prefix, cache file and seed for this fit.
stan_modelname      <- "pnbd_onlineretail_fixed1"
stanfit_prefix      <- str_c("fit_", stan_modelname)
stanfit_object_file <- glue("data/{stanfit_prefix}_stanfit.rds")
stanfit_seed        <- stanfit_seed + 1

# Stan input: per-customer statistics plus the prior settings for lambda and
# mu (the `_mn` / `_cv` pairs).
stan_data_lst <- compose_data(
  select(customer_fit_stats_tbl, customer_id, x, t_x, T_cal),
  lambda_mn = 0.25,
  lambda_cv = 1.00,
  mu_mn     = 0.10,
  mu_cv     = 1.00
  )

# Re-use the cached fit when it exists; otherwise sample and cache the result.
if (file_exists(stanfit_object_file)) {
  pnbd_onlineretail_fixed1_stanfit <- read_rds(stanfit_object_file)
} else {
  pnbd_onlineretail_fixed1_stanfit <- pnbd_fixed_stanmodel$sample(
    data            = stan_data_lst,
    chains          = 4,
    iter_warmup     = 500,
    iter_sampling   = 500,
    seed            = stanfit_seed,
    save_warmup     = TRUE,
    output_dir      = stan_modeldir,
    output_basename = stanfit_prefix
    )

  pnbd_onlineretail_fixed1_stanfit$save_object(
    stanfit_object_file,
    compress = "gzip"
    )
}

pnbd_onlineretail_fixed1_stanfit$print()
```


We have some basic HMC-based validity statistics we can check.

```{r calculate_pnbd_onlineretail_fixed1_hmc_diagnostics}
#| echo: TRUE

# Run CmdStan's diagnose utility on the fixed1 fit.
pnbd_onlineretail_fixed1_stanfit$cmdstan_diagnose()
```


## Visual Diagnostics of the Sample Validity

Now that we have a sample from the posterior distribution we need to create a
few different visualisations of the diagnostics.


```{r plot_pnbd_onlineretail_fixed1_lambda_traceplots_nowarmup}
#| echo: TRUE

# First four elements of lambda and of mu.
parameter_subset <- c(
  "lambda[1]", "lambda[2]", "lambda[3]", "lambda[4]",
  "mu[1]", "mu[2]", "mu[3]", "mu[4]"
  )

# Traceplots of the post-warmup draws for those parameters.
mcmc_trace(
  pnbd_onlineretail_fixed1_stanfit$draws(inc_warmup = FALSE),
  pars = parameter_subset
  ) +
  expand_limits(y = 0) +
  labs(
    title = "Traceplot of Sample of Lambda and Mu Values",
    x     = "Iteration",
    y     = "Value"
    ) +
  theme(axis.text.x = element_text(size = 10))
```


We also check $N_{eff}$ as a quick diagnostic of the fit.


```{r plot_pnbd_onlineretail_fixed1_parameter_neffratio}
#| echo: TRUE

# Effective-sample-size ratios for the lambda and mu parameters.
mcmc_neff(
  neff_ratio(pnbd_onlineretail_fixed1_stanfit, pars = c("lambda", "mu"))
  ) +
  ggtitle("Plot of Parameter Effective Sample Sizes")
```


Finally, we want to check out the energy diagnostic, which is often indicative
of problems with the posterior mixing.

```{r plot_pnbd_onlineretail_fixed1_energy}
#| echo: true

# Energy diagnostic plot built from the sampler parameters of the fixed1 fit.
mcmc_nuts_energy(
  nuts_params(pnbd_onlineretail_fixed1_stanfit),
  binwidth = 50
  )
```


## Assess the Model

As we intend to run the same logic to assess each of our models, we have
combined all this logic into a single function `run_model_assessment`, to 
run the simulations and combine the datasets.

```{r run_pnbd_onlineretail_fixed1_assessment}
#| echo: TRUE

# Attach the fit table's type information to the fit object.
pnbd_stanfit <- recover_types(
  pnbd_onlineretail_fixed1_stanfit,
  customer_fit_stats_tbl
  )

# Run the shared assessment routine for the fixed1 model on the customer
# subset, passing the window boundaries as POSIXct.
pnbd_onlineretail_fixed1_assess_data_lst <- run_model_assessment(
  model_stanfit    = pnbd_stanfit,
  insample_tbl     = customer_fit_subset_tbl,
  fit_label        = "pnbd_onlineretail_fixed1",
  fit_end_dttm     = as.POSIXct(use_fit_end_date),
  valid_start_dttm = as.POSIXct(use_valid_start_date),
  valid_end_dttm   = as.POSIXct(use_valid_end_date),
  sim_seed         = 1010
  )

glimpse(pnbd_onlineretail_fixed1_assess_data_lst)
```


### Check In-Sample Data Validation

We first check the model against the in-sample data.

```{r run_pnbd_onlineretail_fixed1_fit_assessment}
#| echo: TRUE

# Read the in-sample index file, load every simulation file it lists, unnest
# the simulated transactions and append the observed first transactions.
simdata_tbl <- pnbd_onlineretail_fixed1_assess_data_lst |>
  use_series(model_fit_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  bind_rows(sim_init_tbl) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_fit_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```

This fit looks reasonable and appears to capture most of the aspects of the
data used to fit it. Given that this is a synthetic dataset, this is not
surprising, but at least we appreciate that our model is valid.


### Check Out-of-Sample Data Validation

We now repeat for the out-of-sample data.

```{r run_pnbd_onlineretail_fixed1_valid_assessment}
#| echo: TRUE

# Read the out-of-sample index file, load every simulation file it lists and
# unnest the simulated transactions.
simdata_tbl <- pnbd_onlineretail_fixed1_assess_data_lst |>
  use_series(model_valid_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_valid_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```

As for our short time frame data, overall our model is working well.

```{r delete_pnbd_onlineretail_fixed1_valid_assessment}
#| echo: false

# Free the large intermediate objects before the next model. The assessment
# chunks store their plots in `assess_plots_lst`; the previously named
# `insample_plots_lst` / `outsample_plots_lst` are never created.
rm(simdata_tbl)
rm(assess_plots_lst)

gc()
```


# Fit Alternate Prior Model

We want to try an alternate prior model with a smaller coefficient of variation
to see what impact it has on our procedures.


```{r fit_pnbd_onlineretail_fixed2_stanmodel}
#| echo: TRUE

# Label, output prefix, cache file and seed for this fit.
stan_modelname      <- "pnbd_onlineretail_fixed2"
stanfit_prefix      <- str_c("fit_", stan_modelname)
stanfit_object_file <- glue("data/{stanfit_prefix}_stanfit.rds")
stanfit_seed        <- stanfit_seed + 1

# Stan input: per-customer statistics plus the prior settings for lambda and
# mu (the `_mn` / `_cv` pairs); both `_cv` values are 0.50 here.
stan_data_lst <- compose_data(
  select(customer_fit_stats_tbl, customer_id, x, t_x, T_cal),
  lambda_mn = 0.25,
  lambda_cv = 0.50,
  mu_mn     = 0.10,
  mu_cv     = 0.50
  )

# Re-use the cached fit when it exists; otherwise sample and cache the result.
if (file_exists(stanfit_object_file)) {
  pnbd_onlineretail_fixed2_stanfit <- read_rds(stanfit_object_file)
} else {
  pnbd_onlineretail_fixed2_stanfit <- pnbd_fixed_stanmodel$sample(
    data            = stan_data_lst,
    chains          = 4,
    iter_warmup     = 500,
    iter_sampling   = 500,
    seed            = stanfit_seed,
    save_warmup     = TRUE,
    output_dir      = stan_modeldir,
    output_basename = stanfit_prefix
    )

  pnbd_onlineretail_fixed2_stanfit$save_object(
    stanfit_object_file,
    compress = "gzip"
    )
}

pnbd_onlineretail_fixed2_stanfit$print()
```


We have some basic HMC-based validity statistics we can check.

```{r calculate_pnbd_onlineretail_fixed2_hmc_diagnostics}
#| echo: TRUE

# Run CmdStan's diagnose utility on the fixed2 fit.
pnbd_onlineretail_fixed2_stanfit$cmdstan_diagnose()
```


## Visual Diagnostics of the Sample Validity

Now that we have a sample from the posterior distribution we need to create a
few different visualisations of the diagnostics.

```{r plot_pnbd_onlineretail_fixed2_lambda_traceplots}
#| echo: TRUE

# First four elements of lambda and of mu.
parameter_subset <- c(
  "lambda[1]", "lambda[2]", "lambda[3]", "lambda[4]",
  "mu[1]", "mu[2]", "mu[3]", "mu[4]"
  )

# Traceplots of the post-warmup draws for those parameters.
mcmc_trace(
  pnbd_onlineretail_fixed2_stanfit$draws(inc_warmup = FALSE),
  pars = parameter_subset
  ) +
  expand_limits(y = 0) +
  labs(
    title = "Traceplot of Sample of Lambda and Mu Values",
    x     = "Iteration",
    y     = "Value"
    ) +
  theme(axis.text.x = element_text(size = 10))
```


We want to check the $N_{eff}$ statistics also.


```{r plot_pnbd_onlineretail_fixed2_parameter_neffratio}
#| echo: TRUE

# Effective-sample-size ratios for the lambda and mu parameters.
mcmc_neff(
  neff_ratio(pnbd_onlineretail_fixed2_stanfit, pars = c("lambda", "mu"))
  ) +
  ggtitle("Plot of Parameter Effective Sample Sizes")
```


Finally, we want to check out the energy diagnostic, which is often indicative
of problems with the posterior mixing.

```{r plot_pnbd_onlineretail_fixed2_energy}
#| echo: true

# Energy diagnostic plot built from the sampler parameters of the fixed2 fit.
mcmc_nuts_energy(
  nuts_params(pnbd_onlineretail_fixed2_stanfit),
  binwidth = 50
  )
```


## Assess the Model

As we intend to run the same logic to assess each of our models, we have
combined all this logic into a single function `run_model_assessment`, to 
run the simulations and combine the datasets.


```{r run_pnbd_onlineretail_fixed2_assessment}
#| echo: TRUE

# Attach the fit table's type information to the fit object.
pnbd_stanfit <- recover_types(
  pnbd_onlineretail_fixed2_stanfit,
  customer_fit_stats_tbl
  )

# Run the shared assessment routine for the fixed2 model on the customer
# subset, passing the window boundaries as POSIXct.
pnbd_onlineretail_fixed2_assess_data_lst <- run_model_assessment(
  model_stanfit    = pnbd_stanfit,
  insample_tbl     = customer_fit_subset_tbl,
  fit_label        = "pnbd_onlineretail_fixed2",
  fit_end_dttm     = as.POSIXct(use_fit_end_date),
  valid_start_dttm = as.POSIXct(use_valid_start_date),
  valid_end_dttm   = as.POSIXct(use_valid_end_date),
  sim_seed         = 1020
  )

glimpse(pnbd_onlineretail_fixed2_assess_data_lst)
```

### Check In-Sample Data Validation

We first check the model against the in-sample data.

```{r run_pnbd_onlineretail_fixed2_fit_assessment}
#| echo: TRUE

# Read the in-sample index file, load every simulation file it lists, unnest
# the simulated transactions and append the observed first transactions.
simdata_tbl <- pnbd_onlineretail_fixed2_assess_data_lst |>
  use_series(model_fit_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  bind_rows(sim_init_tbl) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_fit_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```

This fit looks reasonable and appears to capture most of the aspects of the
data used to fit it. Given that this is a synthetic dataset, this is not
surprising, but at least we appreciate that our model is valid.


### Check Out-of-Sample Data Validation

We now repeat for the out-of-sample data.

```{r run_pnbd_onlineretail_fixed2_valid_assessment}
#| echo: TRUE

# Read the out-of-sample index file, load every simulation file it lists and
# unnest the simulated transactions.
simdata_tbl <- pnbd_onlineretail_fixed2_assess_data_lst |>
  use_series(model_valid_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_valid_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```


```{r delete_pnbd_onlineretail_fixed2_valid_assessment}
#| echo: false

# Free the large intermediate objects before the next model. The assessment
# chunks store their plots in `assess_plots_lst`; the previously named
# `insample_plots_lst` / `outsample_plots_lst` are never created.
rm(simdata_tbl)
rm(assess_plots_lst)

gc()
```


# Fit Tight-Lifetime Model

We now want to try a model where we use priors with a tighter coefficient of
variation for lifetime but keep the CoV for transaction frequency.


```{r fit_pnbd_onlineretail_fixed3_stanmodel}
#| echo: TRUE

# Label, output prefix, cache file and seed for this fit.
stan_modelname      <- "pnbd_onlineretail_fixed3"
stanfit_prefix      <- str_c("fit_", stan_modelname)
stanfit_object_file <- glue("data/{stanfit_prefix}_stanfit.rds")
stanfit_seed        <- stanfit_seed + 1

# Stan input: per-customer statistics plus the prior settings for lambda and
# mu (the `_mn` / `_cv` pairs); only mu_cv is reduced to 0.50 here.
stan_data_lst <- compose_data(
  select(customer_fit_stats_tbl, customer_id, x, t_x, T_cal),
  lambda_mn = 0.25,
  lambda_cv = 1.00,
  mu_mn     = 0.10,
  mu_cv     = 0.50
  )

# Re-use the cached fit when it exists; otherwise sample and cache the result.
if (file_exists(stanfit_object_file)) {
  pnbd_onlineretail_fixed3_stanfit <- read_rds(stanfit_object_file)
} else {
  pnbd_onlineretail_fixed3_stanfit <- pnbd_fixed_stanmodel$sample(
    data            = stan_data_lst,
    chains          = 4,
    iter_warmup     = 500,
    iter_sampling   = 500,
    seed            = stanfit_seed,
    save_warmup     = TRUE,
    output_dir      = stan_modeldir,
    output_basename = stanfit_prefix
    )

  pnbd_onlineretail_fixed3_stanfit$save_object(
    stanfit_object_file,
    compress = "gzip"
    )
}

pnbd_onlineretail_fixed3_stanfit$print()
```


We have some basic HMC-based validity statistics we can check.

```{r calculate_pnbd_onlineretail_fixed3_hmc_diagnostics}
#| echo: TRUE

# Run CmdStan's diagnose utility on the fixed3 fit.
pnbd_onlineretail_fixed3_stanfit$cmdstan_diagnose()
```


## Visual Diagnostics of the Sample Validity

Now that we have a sample from the posterior distribution we need to create a
few different visualisations of the diagnostics.

```{r plot_pnbd_onlineretail_fixed3_lambda_traceplots}
#| echo: TRUE

# First four elements of lambda and of mu.
parameter_subset <- c(
  "lambda[1]", "lambda[2]", "lambda[3]", "lambda[4]",
  "mu[1]", "mu[2]", "mu[3]", "mu[4]"
  )

# Traceplots of the post-warmup draws for those parameters.
mcmc_trace(
  pnbd_onlineretail_fixed3_stanfit$draws(inc_warmup = FALSE),
  pars = parameter_subset
  ) +
  expand_limits(y = 0) +
  labs(
    title = "Traceplot of Sample of Lambda and Mu Values",
    x     = "Iteration",
    y     = "Value"
    ) +
  theme(axis.text.x = element_text(size = 10))
```


We want to check the $N_{eff}$ statistics also.


```{r plot_pnbd_onlineretail_fixed3_parameter_neffratio}
#| echo: TRUE

# Effective-sample-size ratios for the lambda and mu parameters.
mcmc_neff(
  neff_ratio(pnbd_onlineretail_fixed3_stanfit, pars = c("lambda", "mu"))
  ) +
  ggtitle("Plot of Parameter Effective Sample Sizes")
```


Finally, we want to check out the energy diagnostic, which is often indicative
of problems with the posterior mixing.

```{r plot_pnbd_onlineretail_fixed3_energy}
#| echo: true

# Energy diagnostic plot built from the sampler parameters of the fixed3 fit.
mcmc_nuts_energy(
  nuts_params(pnbd_onlineretail_fixed3_stanfit),
  binwidth = 50
  )
```


## Assess the Model

As we intend to run the same logic to assess each of our models, we have
combined all this logic into a single function `run_model_assessment`, to 
run the simulations and combine the datasets.


```{r run_pnbd_onlineretail_fixed3_assessment}
#| echo: TRUE

# Attach the fit table's type information to the fit object.
pnbd_stanfit <- recover_types(
  pnbd_onlineretail_fixed3_stanfit,
  customer_fit_stats_tbl
  )

# Run the shared assessment routine for the fixed3 model on the customer
# subset, passing the window boundaries as POSIXct.
pnbd_onlineretail_fixed3_assess_data_lst <- run_model_assessment(
  model_stanfit    = pnbd_stanfit,
  insample_tbl     = customer_fit_subset_tbl,
  fit_label        = "pnbd_onlineretail_fixed3",
  fit_end_dttm     = as.POSIXct(use_fit_end_date),
  valid_start_dttm = as.POSIXct(use_valid_start_date),
  valid_end_dttm   = as.POSIXct(use_valid_end_date),
  sim_seed         = 1030
  )

glimpse(pnbd_onlineretail_fixed3_assess_data_lst)
```


### Check In-Sample Data Validation

We first check the model against the in-sample data.

```{r run_pnbd_onlineretail_fixed3_fit_assessment}
#| echo: TRUE

# Read the in-sample index file, load every simulation file it lists, unnest
# the simulated transactions and append the observed first transactions.
simdata_tbl <- pnbd_onlineretail_fixed3_assess_data_lst |>
  use_series(model_fit_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  bind_rows(sim_init_tbl) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_fit_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```

This fit looks reasonable and appears to capture most of the aspects of the
data used to fit it. Given that this is a synthetic dataset, this is not
surprising, but at least we appreciate that our model is valid.


### Check Out-of-Sample Data Validation

We now repeat for the out-of-sample data.

```{r run_pnbd_onlineretail_fixed3_valid_assessment}
#| echo: TRUE

# Read the out-of-sample index file, load every simulation file it lists and
# unnest the simulated transactions.
simdata_tbl <- pnbd_onlineretail_fixed3_assess_data_lst |>
  use_series(model_valid_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_valid_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```


```{r delete_pnbd_onlineretail_fixed3_valid_assessment}
#| echo: false

# Free the large intermediate objects before the next model. The assessment
# chunks store their plots in `assess_plots_lst`; the previously named
# `insample_plots_lst` / `outsample_plots_lst` are never created.
rm(simdata_tbl)
rm(assess_plots_lst)

gc()
```


# Fit Narrow-Short-Lifetime Model

We now want to try a model where the lifetime prior has both a higher mean for
$\mu$ (implying shorter expected lifetimes) and a tighter coefficient of
variation, while keeping the prior for transaction frequency unchanged.


```{r fit_pnbd_onlineretail_fixed4_stanmodel}
#| echo: TRUE

# Label, output prefix, cache file and seed for this fit.
stan_modelname      <- "pnbd_onlineretail_fixed4"
stanfit_prefix      <- str_c("fit_", stan_modelname)
stanfit_object_file <- glue("data/{stanfit_prefix}_stanfit.rds")
stanfit_seed        <- stanfit_seed + 1

# Stan input: per-customer statistics plus the prior settings for lambda and
# mu (the `_mn` / `_cv` pairs); mu_mn is raised to 0.20 and mu_cv cut to 0.30.
stan_data_lst <- compose_data(
  select(customer_fit_stats_tbl, customer_id, x, t_x, T_cal),
  lambda_mn = 0.25,
  lambda_cv = 1.00,
  mu_mn     = 0.20,
  mu_cv     = 0.30
  )

# Re-use the cached fit when it exists; otherwise sample and cache the result.
if (file_exists(stanfit_object_file)) {
  pnbd_onlineretail_fixed4_stanfit <- read_rds(stanfit_object_file)
} else {
  pnbd_onlineretail_fixed4_stanfit <- pnbd_fixed_stanmodel$sample(
    data            = stan_data_lst,
    chains          = 4,
    iter_warmup     = 500,
    iter_sampling   = 500,
    seed            = stanfit_seed,
    save_warmup     = TRUE,
    output_dir      = stan_modeldir,
    output_basename = stanfit_prefix
    )

  pnbd_onlineretail_fixed4_stanfit$save_object(
    stanfit_object_file,
    compress = "gzip"
    )
}

pnbd_onlineretail_fixed4_stanfit$print()
```


We have some basic HMC-based validity statistics we can check.

```{r calculate_pnbd_onlineretail_fixed4_hmc_diagnostics}
#| echo: TRUE

# Run CmdStan's diagnose utility on the fixed4 fit.
pnbd_onlineretail_fixed4_stanfit$cmdstan_diagnose()
```


## Visual Diagnostics of the Sample Validity

Now that we have a sample from the posterior distribution we need to create a
few different visualisations of the diagnostics.

```{r plot_pnbd_onlineretail_fixed4_lambda_traceplots}
#| echo: TRUE

# First four elements of lambda and of mu.
parameter_subset <- c(
  "lambda[1]", "lambda[2]", "lambda[3]", "lambda[4]",
  "mu[1]", "mu[2]", "mu[3]", "mu[4]"
  )

# Traceplots of the post-warmup draws for those parameters.
mcmc_trace(
  pnbd_onlineretail_fixed4_stanfit$draws(inc_warmup = FALSE),
  pars = parameter_subset
  ) +
  expand_limits(y = 0) +
  labs(
    title = "Traceplot of Sample of Lambda and Mu Values",
    x     = "Iteration",
    y     = "Value"
    ) +
  theme(axis.text.x = element_text(size = 10))
```


We want to check the $N_{eff}$ statistics also.


```{r plot_pnbd_onlineretail_fixed4_parameter_neffratio}
#| echo: TRUE

# Effective-sample-size ratios for the lambda and mu parameters.
mcmc_neff(
  neff_ratio(pnbd_onlineretail_fixed4_stanfit, pars = c("lambda", "mu"))
  ) +
  ggtitle("Plot of Parameter Effective Sample Sizes")
```


Finally, we want to check out the energy diagnostic, which is often indicative
of problems with the posterior mixing.

```{r plot_pnbd_onlineretail_fixed4_energy}
#| echo: true

# Energy diagnostic plot built from the sampler parameters of the fixed4 fit.
mcmc_nuts_energy(
  nuts_params(pnbd_onlineretail_fixed4_stanfit),
  binwidth = 50
  )
```


## Assess the Model

As we intend to run the same logic to assess each of our models, we have
combined all this logic into a single function `run_model_assessment`, to 
run the simulations and combine the datasets.


```{r run_pnbd_onlineretail_fixed4_assessment}
#| echo: TRUE

# Attach the fit table's type information to the fit object.
pnbd_stanfit <- recover_types(
  pnbd_onlineretail_fixed4_stanfit,
  customer_fit_stats_tbl
  )

# Run the shared assessment routine for the fixed4 model on the customer
# subset, passing the window boundaries as POSIXct.
pnbd_onlineretail_fixed4_assess_data_lst <- run_model_assessment(
  model_stanfit    = pnbd_stanfit,
  insample_tbl     = customer_fit_subset_tbl,
  fit_label        = "pnbd_onlineretail_fixed4",
  fit_end_dttm     = as.POSIXct(use_fit_end_date),
  valid_start_dttm = as.POSIXct(use_valid_start_date),
  valid_end_dttm   = as.POSIXct(use_valid_end_date),
  sim_seed         = 1040
  )

glimpse(pnbd_onlineretail_fixed4_assess_data_lst)
```


### Check In-Sample Data Validation

We first check the model against the in-sample data.

```{r run_pnbd_onlineretail_fixed4_fit_assessment}
#| echo: TRUE

# Read the in-sample index file, load every simulation file it lists, unnest
# the simulated transactions and append the observed first transactions.
simdata_tbl <- pnbd_onlineretail_fixed4_assess_data_lst |>
  use_series(model_fit_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  bind_rows(sim_init_tbl) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_fit_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```

This fit looks reasonable and appears to capture most of the aspects of the
data used to fit it. Given that this is a synthetic dataset, this is not
surprising, but at least we appreciate that our model is valid.


### Check Out-of-Sample Data Validation

We now repeat for the out-of-sample data.

```{r run_pnbd_onlineretail_fixed4_valid_assessment}
#| echo: TRUE

# Read the out-of-sample index file, load every simulation file it lists and
# unnest the simulated transactions.
simdata_tbl <- pnbd_onlineretail_fixed4_assess_data_lst |>
  use_series(model_valid_index_filepath) |>
  read_rds() |>
  use_series(sim_file) |>
  map_dfr(read_rds) |>
  select(customer_id, draw_id, sim_data) |>
  unnest(sim_data) |>
  arrange(customer_id, draw_id, tnx_timestamp)


assess_plots_lst <- create_model_assessment_plots(
  obsdata_tbl = customer_valid_transactions_tbl,
  simdata_tbl = simdata_tbl
  )

# walk() prints each plot once and returns its input invisibly; map(print)
# returned the list visibly, so it was auto-printed a second time.
assess_plots_lst |> walk(print)
```


```{r delete_pnbd_onlineretail_fixed4_valid_assessment}
#| echo: false

# Free the large intermediate objects before the model comparison. The
# assessment chunks store their plots in `assess_plots_lst`; the previously
# named `insample_plots_lst` / `outsample_plots_lst` are never created.
rm(simdata_tbl)
rm(assess_plots_lst)

gc()
```


# Compare Model Outputs

We have looked at each of the models individually, but it is also worth looking
at each of the models as a group.

We now want to combine both the `fit` and `valid` transaction sets to
calculate the summary statistics for both.

```{r calculate_observed_statistics}
#| echo: true

# Stack the observed fit and validation transactions, labelled by
# `assess_type`, compute the summary statistics per label and reshape to one
# row per (assess_type, statistic).
obs_summstats_tbl <- bind_rows(
    fit   = customer_fit_transactions_tbl,
    valid = customer_valid_transactions_tbl,
    .id   = "assess_type"
    ) |>
  group_by(assess_type) |>
  calculate_transaction_summary_statistics() |>
  pivot_longer(
    cols      = !assess_type,
    names_to  = "label",
    values_to = "obs_value"
    )

glimpse(obs_summstats_tbl)
```


```{r load_model_assessment_data}
#| echo: TRUE

# Locate every fixed-model assessment index file under data/, derive the model
# label and assessment type from the file name, and load the assessment data
# behind each index.
model_assess_transactions_tbl <- dir_ls(
    "data",
    regexp = "pnbd_onlineretail_fixed.*_assess_.*index"
    ) |>
  enframe(name = NULL, value = "file_path") |>
  transmute(
    model_label = str_replace(
      file_path,
      "data/pnbd_onlineretail_(.*?)_assess_.*",
      "\\1"
      ),
    assess_type = if_else(
      str_detect(file_path, "_assess_fit_"),
      "fit",
      "valid"
      ),
    assess_data = map(
      file_path,
      construct_model_assessment_data,
      .progress = "construct_assess_data"
      )
    ) |>
  unnest(assess_data)

glimpse(model_assess_transactions_tbl)
```

We now want to calculate the transaction statistics on this full dataset, for
each separate draw.

```{r calculate_model_assessment_transaction_statistics}
# Summary statistics for each model, assessment type and draw.
model_assess_tbl <- calculate_transaction_summary_statistics(
  group_by(model_assess_transactions_tbl, model_label, assess_type, draw_id)
  )

glimpse(model_assess_tbl)
```

We now combine all this data to create a number of different comparison plots
for the various summary statistics.


```{r construct_model_comparison_plot}
#| echo: TRUE

# Compare observed and model values for three summary statistics. Each call
# takes the statistic's label and the title to display.
create_multiple_model_assessment_plot(
  obs_summstats_tbl, model_assess_tbl,
  "total_count", "Total Transactions"
  )

create_multiple_model_assessment_plot(
  obs_summstats_tbl, model_assess_tbl,
  "mean_count", "Average Transactions per Customer"
  )

create_multiple_model_assessment_plot(
  obs_summstats_tbl, model_assess_tbl,
  "p99", "99th Percentile Count"
  )
```


## Write Assessment Data to Disk

We now want to save the assessment data to disk.

```{r write_model_assessment_data}
#| echo: TRUE

# Persist the per-draw assessment statistics.
write_rds(
  model_assess_tbl,
  "data/assess_data_pnbd_onlineretail_fixed_tbl.rds"
  )
```


# R Environment {.unnumbered}


```{r show_session_info}
#| echo: TRUE
#| message: TRUE

# Widen the console for the session report, then set the width back to 80.
options(width = 120L)
sessioninfo::session_info()
options(width = 80L)
```