exploring_online_retail_transactions.qmd

---
title: "Exploring the Online Retail Transaction Data"
author: "Mick Cooney <mickcooney@gmail.com>"
date: "Last updated: `r format(Sys.time(), '%B %d, %Y')`"
editor: source
execute:
  message: false
  warning: false
  error: false
format:
  html:
    theme:
      light: superhero
      dark: darkly
    anchor-sections: true
    embed-resources: true
    number-sections: true
    smooth-scroll: true
    toc: true
    toc-depth: 3
    toc-location: left
    code-fold: true
    code-summary: "Show code"
---


```{r import_libraries}
#| echo: FALSE
#| message: FALSE

library(conflicted)
library(tidyverse)
library(scales)
library(cowplot)
library(magrittr)
library(rlang)
library(purrr)
library(furrr)
library(vctrs)
library(fs)
library(glue)
library(rsyslog)
library(forcats)
library(snakecase)
library(DataExplorer)
library(lubridate)
library(evir)
library(DT)
library(tidyquant)
library(directlabels)


source("lib_utils.R")
source("lib_btyd.R")


# Resolve function-name clashes between the attached packages.
# NOTE(review): resolve_conflicts() is not defined in this file; presumably it
# comes from one of the sourced lib_*.R scripts - confirm what it returns.
conflict_lst <- resolve_conflicts(
  c("magrittr", "rlang", "dplyr", "readr", "purrr", "ggplot2", "MASS",
    "fitdistrplus")
  )


# Session options: 80-column console output, warnings printed as they occur
# (warn = 1), and mc.cores set to the number of cores detected on this machine.
options(
  width = 80L,
  warn  = 1,
  mc.cores = parallel::detectCores()
  )

# Use the cowplot theme as the default for every ggplot in this document
theme_set(theme_cowplot())

# Fix the RNG seed so random results are reproducible between renders
set.seed(42)

# Open the syslog connection used by the syslog() calls further down
open_syslog("exploring_online_retail_transactions")

# Evaluate futures in separate background R sessions
plan(multisession)
```


```{r custom_functions}
#| echo: FALSE

### Tests whether an object belongs to one of the date/time classes
is_date <- function(x) {
  datetime_classes <- c("POSIXt", "POSIXct", "POSIXlt", "Date", "hms")

  inherits(x, what = datetime_classes)
}


### Classifies a vector as one of "na", "datetime", "discrete", "logical" or
### "continuous"
categorise_datatype <- function(x) {
  # A vector holding nothing but missing values cannot be classified further
  if (all(are_na(x))) {
    return("na")
  }

  if (is_date(x)) {
    return("datetime")
  }

  # Any vector carrying attributes (a factor, for example) or holding
  # character data is treated as discrete
  if (!is_null(attributes(x)) || all(is_character(x))) {
    return("discrete")
  }

  if (all(is_logical(x))) {
    return("logical")
  }

  "continuous"
}


### create_coltype_list() groups the column names of a table by data category
###
### Returns a list with two elements:
###   split   - list keyed by category, each entry the column names in it
###   columns - named character vector mapping column name to category
create_coltype_list <- function(data_tbl) {
  coltypes <- map_chr(data_tbl, categorise_datatype)

  type_labels <- sort(unique(coltypes))

  split_lst <- map(
    type_labels,
    \(type_label) names(coltypes[coltypes %in% type_label])
    )
  names(split_lst) <- type_labels

  list(
    split   = split_lst,
    columns = coltypes
  )
}

```


This workbook was created using the "dataexpks" template:

https://github.com/DublinLearningGroup/dataexpks


# Introduction

This workbook performs the basic data exploration of the dataset.

```{r set_exploration_params}
#| echo: true

# Discrete columns with at least this many distinct values (but fewer than the
# row count) are collected as high-count variables and dropped from the
# exploration table
dataexp_level_exclusion_threshold <- 100

# Number of levels kept by fct_lump() in the ordered categorical barplots
dataexp_cat_level_count <- 40
# Number of bins used by the histograms in the exploration loops
dataexp_hist_bins_count <- 50
```


# Load Data

First we load the dataset as well as some support datasets.

```{r load_dataset}
#| echo: true

# Record the start of the data load in the system log
syslog(glue("Setting up data"), level = "INFO")

# Read the raw transaction table from disk and show its structure
rawdata_tbl <- read_rds("data/rawdata_online_retail_tbl.rds")

glimpse(rawdata_tbl)
```


## Perform Quick Data Cleaning

Some of the dates provided in the dataset are in an irregular format.

```{r clean_names}
#| echo: true

# Copy the raw table and convert every column name to snake_case
data_tbl <- rawdata_tbl
names(data_tbl) <- to_snake_case(names(rawdata_tbl))

glimpse(data_tbl)
```


```{r}
#| echo: FALSE

#knitr::knit_exit()
```


## Create Derived Variables

We now create derived features useful for modelling. These values are
new variables calculated from existing variables in the data.

```{r construct_derived_values}
#| echo: true

data_tbl <- data_tbl |>
  rename(invoice_id = invoice) |>
  mutate(
    # Sequential row identifier; seq_len() (rather than 1:n()) so that an
    # empty table produces no ids instead of the sequence c(1, 0)
    row_id = sprintf("ROW%07d", seq_len(n())),

    .before = 1
    ) |>
  mutate(
    stock_code_upr = stock_code |> str_to_upper(),

    # Invoices whose id starts with "C" are flagged as cancellations
    cancellation   = str_detect(invoice_id, "^C"),

    # Keep the original timestamp before truncating invoice_date to a Date
    invoice_dttm   = invoice_date,
    invoice_date   = invoice_date |> as.Date(),

    # Month and weekday names are factors ordered by calendar position
    invoice_month  = format(invoice_dttm, "%B") |>
      fct_reorder(invoice_dttm |> format("%m") |> as.numeric()),
    invoice_dow    = format(invoice_dttm, "%A") |>
      fct_reorder(invoice_dttm |> format("%u") |> as.numeric()),
    invoice_dom    = format(invoice_dttm, "%d"),
    invoice_hour   = format(invoice_dttm, "%H"),
    invoice_minute = format(invoice_dttm, "%M"),
    invoice_woy    = format(invoice_dttm, "%V"),
    invoice_ym     = format(invoice_dttm, "%Y%m"),

    stock_value    = price * quantity
    ) |>
  group_by(invoice_ym) |>
  mutate(
    # Day of month as a fraction of the largest day of month observed within
    # the same year-month
    invoice_monthprop = as.numeric(invoice_dom) / max(as.numeric(invoice_dom))
    ) |>
  ungroup() |>
  arrange(invoice_dttm)

data_tbl |> glimpse()
```


# Perform Basic Checks on Data

We now want to look at some very high level checks on the data, and we leverage
some of the functionality provided by `DataExplorer`.

## Create High-Level Visualisations

We first want to look at a visualisation of some high-level summaries of the
meta-data on this dataset. This gives us a quick view of the categorical and
numeric values in the dataset, as well as the proportions of missing values.

```{r plot_dataexp_introduce}
#| echo: true

# Overview of column types and missingness for the whole table
plot_intro(
  data_tbl,
  ggtheme = theme_cowplot(),
  title   = "High Level Table Summary"
  )
```


## Check Missing Values

Before we do anything with the data, we first check for missing values
in the dataset. In some cases, missing data is coded by a special
character rather than as a blank, so we first correct for this.

```{r replace_missing_character}
#| echo: true

### _TEMPLATE_
### ADD CODE TO CORRECT FOR DATA ENCODING HERE
```

With missing data properly encoded, we now visualise the missing data in a
number of different ways.

### Univariate Missing Data

```{r plot_univariate_missing_data}
#| echo: true

# Missing-value proportion for every column, banded by severity
plot_missing(
  data_tbl,
  group   = list(Good = 0.05, Acceptable = 0.2, Bad = 0.8, Remove = 1),
  ggtheme = theme_cowplot(),
  title   = "Summary of Data Missingness"
  )
```

We now want to repeat this plot but only for those columns that have some
missing values.

```{r plot_univariate_missing_only_data}
#| echo: true

# Same plot, restricted to the columns that contain missing values
plot_missing(
  data_tbl,
  group        = list(Good = 0.05, Acceptable = 0.2, Bad = 0.8, Remove = 1),
  missing_only = TRUE,
  ggtheme      = theme_cowplot(),
  title        = "Summary of Data Missingness (missing variables only)"
  )
```


### Multivariate Missing Data

It is useful to get an idea of what combinations of variables tend to have
variables with missing values simultaneously, so to construct a visualisation
for this we create a count of all the times given combinations of variables
have missing values, producing a heat map for these combination counts.

```{r missing_data_matrix}
#| echo: true

# Number of most frequent missingness patterns to show in the plot
dataexp_missing_group_count <- 20

row_count <- rawdata_tbl |> nrow()

# Convert a column into an integer flag: 1 where the value is missing, else 0
count_nas <- ~ .x |> are_na() |> vec_cast(integer())

# For each row, build a pattern label by pasting the per-column flags
# together, then keep a single representative row per distinct pattern along
# with the count and proportion of rows sharing it. The result is reshaped to
# one row per (pattern, variable) with the flag stored in `presence`.
missing_vizdata_tbl <- rawdata_tbl |>
  mutate(across(everything(), count_nas)) %>%  # Need %>% for the '.' functionality
  mutate(label = pmap_chr(., str_c)) |>
  group_by(label) |>
  mutate(
    miss_count = n(),
    miss_prop  = miss_count / row_count
    ) |>
  slice_max(order_by = miss_prop, n = 1, with_ties = FALSE) |>
  ungroup() |>
  pivot_longer(
    !c(label, miss_count, miss_prop),
    names_to = "variable_name",
    values_to = "presence"
    ) |>
  mutate(
    prop_label = sprintf("%6.4f", miss_prop)
    )

# Patterns ranked by proportion; despite the name, the number kept is
# dataexp_missing_group_count (ties at the cut-off are retained by slice_max)
top10_data_tbl <- missing_vizdata_tbl |>
  select(label, miss_prop) |>
  distinct() |>
  slice_max(order_by = miss_prop, n = dataexp_missing_group_count)

missing_plot_tbl <- missing_vizdata_tbl |>
  semi_join(top10_data_tbl, by = "label")

# Tile plot: one row per pattern (labelled by its proportion of rows), one
# column per variable, filled by the 0/1 missing flag
ggplot(missing_plot_tbl) +
  geom_tile(aes(x = variable_name, y = prop_label, fill = presence), height = 0.8) +
  scale_fill_continuous() +
  scale_x_discrete(position = "top", labels = abbreviate) +
  xlab("Variable") +
  ylab("Proportion of Rows") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5)
    )
```

This visualisation takes a little explaining.

Each row represents a combination of variables with simultaneous missing
values. For each row in the graphic, the coloured entries show which particular
variables are missing in that combination. The proportion of rows with that
combination is displayed in both the label for the row and the colouring for
the cells in the row.


## Inspect High-level-count Categorical Variables

With the raw data loaded up we now remove obvious unique or near-unique
variables that are not amenable to basic exploration and plotting.

```{r find_highlevelcount_categorical_variables}
#| echo: true

coltype_lst <- data_tbl |> create_coltype_list()

# Number of distinct values found in a column
count_levels <- ~ length(unique(.x))

# One row per discrete column with its distinct-value count, largest first
catvar_valuecount_tbl <- data_tbl |>
  summarise(
    across(coltype_lst$split$discrete, count_levels),

    .groups = "drop"
    ) |>
  pivot_longer(
    everything(),
    names_to  = "var_name",
    values_to = "level_count"
    ) |>
  arrange(desc(level_count))

print(catvar_valuecount_tbl)

row_count <- nrow(data_tbl)

cat(glue("Dataset has {row_count} rows\n"))
```

Now that we have a table of the counts of all the categorical variables we can
automatically exclude unique variables from the exploration, as the level
count will match the row count.

```{r remove_id_variables}
#| echo: true

# Discrete columns whose number of distinct values equals the row count are
# unique identifiers and carry no information for exploration
unique_vars <- catvar_valuecount_tbl |>
  filter(level_count == row_count) |>
  pull(var_name)

print(unique_vars)

# any_of() replaces the superseded one_of(); an empty vector simply leaves
# every column in place
explore_data_tbl <- data_tbl |>
  select(-any_of(unique_vars))
```

Having removed the unique identifier variables from the dataset, we
may also wish to exclude categoricals with high level counts also, so
we create a vector of those variable names.

```{r collect_highcount_variables}
#| echo: true

# Non-unique discrete columns whose level count reaches the exclusion threshold
highcount_vars <- catvar_valuecount_tbl |>
  filter(
    level_count <  row_count,
    level_count >= dataexp_level_exclusion_threshold
    ) |>
  pull(var_name)

highcount_vars |> str_c(collapse = ", ") |> cat()
```

We now can continue doing some basic exploration of the data. We may
also choose to remove some extra columns from the dataset.

```{r drop_variables}
#| echo: true

### You may want to comment out these next few lines to customise which
### categoricals are kept in the exploration.
drop_vars <- c(highcount_vars)

if (length(drop_vars) > 0) {
  # any_of() replaces the superseded one_of() for selecting columns by a
  # character vector of names
  explore_data_tbl <- explore_data_tbl |>
    select(-any_of(drop_vars))

  cat(str_c(drop_vars, collapse = ", "))
}
```


```{r}
#| echo: FALSE

#knitr::knit_exit()
```


# Univariate Data Exploration

Now that we have loaded the data we can prepare it for some basic data
exploration.

```{r create_log_univariate_data_exploration}
#| echo: true

# Log the start of the univariate exploration stage
syslog(glue("Performing univariate data exploration"), level = "INFO")
```


## Quick Univariate Data Summaries

We use a number of summary visualisations provided by `DataExplorer`: a
facet plot across each variable with categorical variables getting bar plots
and numerical variables getting histograms.

We first look at the barplots of categorical variables.

```{r plot_dataexp_bar}
#| echo: true
#| message: TRUE

# Barplots of the discrete columns, laid out on 2 x 2 pages
data_tbl |>
  plot_bar(
    title   = "Barplots of Data",
    ggtheme = theme_cowplot(),
    nrow    = 2,
    ncol    = 2
    )
```


We then have a quick look at histograms of the numeric variables.

```{r plot_dataexp_hist}
#| echo: true
#| message: TRUE

# Histograms of the numeric columns, laid out on 2 x 2 pages
data_tbl |>
  plot_histogram(
    title   = "Histograms of Data",
    ggtheme = theme_cowplot(),
    nrow    = 2,
    ncol    = 2
    )
```


Finally, we split the remaining variables into different categories and then
produce a sequence of plots for each variable.


```{r separate_exploration_cols}
#| echo: true

# Recompute the column categories on the reduced exploration table
coltype_lst <- explore_data_tbl |> create_coltype_list()

coltype_lst |> print()
```


## Logical Variables

Logical variables only take two values: TRUE or FALSE. It is useful to see
missing data as well though, so we also plot the count of those.

```{r create_univariate_logical_plots}
#| echo: true
#| warning: FALSE

logical_vars <- sort(coltype_lst$split$logical)

# One barplot per logical column, with the missing count shown in the title
for (plot_varname in logical_vars) {
  cat("--\n")
  cat(glue("{plot_varname}\n"))

  na_count <- sum(are_na(explore_data_tbl[[plot_varname]]))

  plot_title <- glue("Barplot of Counts for Variable: {plot_varname} ({na_count} missing values)")

  explore_plot <- ggplot(explore_data_tbl, aes(x = .data[[plot_varname]])) +
    geom_bar() +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(explore_plot)
}
```


## Numeric Variables

Numeric variables are usually continuous in nature, though we also have
integer data.

```{r create_univariate_numeric_plots}
#| echo: true
#| warning: FALSE

numeric_vars <- coltype_lst$split$continuous |> sort()

# For each numeric column: print a summary, then draw a 2 x 2 grid holding
# the raw histogram plus log-scale histograms of the positive and negative
# values separately.
for (plot_varname in numeric_vars) {
  cat("--\n")
  cat(glue("{plot_varname}\n"))

  plot_var <- explore_data_tbl |> pull(.data[[plot_varname]])
  na_count <- plot_var |> are_na() |> sum()

  plot_var |> summary() |> print()

  plot_title <- glue("Histogram Plot for Variable: {plot_varname} ({na_count} missing values)")


  # Histogram of all values on the natural scale with mean and median marked.
  # Line thickness uses `linewidth`: ggplot2 3.4.0 deprecated `size` for lines.
  all_plot <- ggplot() +
    geom_histogram(aes(x = plot_var), bins = dataexp_hist_bins_count) +
    geom_vline(xintercept = mean(plot_var, na.rm = TRUE),
               colour = "red", linewidth = 1.5) +
    geom_vline(xintercept = median(plot_var, na.rm = TRUE),
               colour = "green", linewidth = 1.5) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_continuous(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle(
      plot_title,
      subtitle = "(red line is mean, green line is median)"
      )

  # Strictly positive values only: a zero has no position on a log10 axis
  pos_data_tbl <- explore_data_tbl |>
    filter(.data[[plot_varname]] > 0) |>
    mutate(var_val = .data[[plot_varname]])

  pos_log_plot <- ggplot(pos_data_tbl) +
    geom_histogram(aes(x = var_val), bins = dataexp_hist_bins_count) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_log10(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle("Positive Values")


  # Negative values are plotted by magnitude so they fit on a log10 axis
  neg_data_tbl <- explore_data_tbl |>
    filter(.data[[plot_varname]] < 0) |>
    mutate(var_val = abs(.data[[plot_varname]]))

  neg_log_plot <- ggplot(neg_data_tbl) +
    geom_histogram(aes(x = var_val), bins = dataexp_hist_bins_count) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_log10(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle("Negative Values")


  # The NULL entry leaves the top-right cell of the grid empty
  plot_grid(
      all_plot,
      NULL,
      pos_log_plot,
      neg_log_plot,
      nrow = 2
      ) |>
    print()
}
```

## Categorical Variables

Categorical variables only have values from a limited, and usually fixed,
number of possible values.

```{r create_univariate_categorical_plots}
#| echo: true
#| warning: FALSE

categorical_vars <- sort(coltype_lst$split$discrete)

# Two barplots per discrete column: levels in their natural order, then the
# most frequent levels (the rest lumped together) ordered by descending count
for (plot_varname in categorical_vars) {
  cat("--\n")
  cat(glue("{plot_varname}\n"))

  na_count <- sum(are_na(explore_data_tbl[[plot_varname]]))

  plot_title <- glue("Barplot of Counts for Variable: {plot_varname} ({na_count} missing values)")

  standard_plot_tbl <- count(explore_data_tbl, .data[[plot_varname]])

  standard_plot <- ggplot(
      standard_plot_tbl,
      aes(x = .data[[plot_varname]], weight = n)
      ) +
    geom_bar() +
    scale_x_discrete(labels = ~ abbreviate(.x, minlength = 10)) +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(standard_plot)


  # Keep the most frequent levels, count them, and shorten long labels
  desc_plot_tbl <- explore_data_tbl[[plot_varname]] |>
    fct_lump(n = dataexp_cat_level_count) |>
    fct_count() |>
    mutate(f = fct_relabel(f, str_trunc, width = 15))

  desc_plot <- ggplot(desc_plot_tbl, aes(x = fct_reorder(f, -n), weight = n)) +
    geom_bar() +
    scale_x_discrete(labels = abbreviate) +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(desc_plot)
}
```


## Date/Time Variables

Date/Time variables represent calendar or time-based data such as the time of
day, a date, or a timestamp.

```{r create_univariate_datetime_plots}
#| echo: true
#| warning: FALSE

datetime_vars <- sort(coltype_lst$split$datetime)

# Summary and histogram for each date/time column
for (plot_varname in datetime_vars) {
  cat("--\n")
  cat(glue("{plot_varname}\n"))

  plot_var <- explore_data_tbl[[plot_varname]]
  na_count <- sum(are_na(plot_var))

  print(summary(plot_var))

  plot_title <- glue("Barplot of Dates/Times in Variable: {plot_varname} ({na_count} missing values)")


  explore_plot <- ggplot(explore_data_tbl, aes(x = .data[[plot_varname]])) +
    geom_histogram(bins = dataexp_hist_bins_count) +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(plot_title)

  print(explore_plot)
}
```


```{r, echo=FALSE}
#| echo: FALSE

#knitr::knit_exit()
```


# Bivariate Facet Plots

We now move on to looking at bivariate plots of the data set.

```{r create_log_bivariate_facet_plots}
#| echo: true

# Log the start of the bivariate facet plot stage
syslog(glue("Performing bivariate facet plots"), level = "INFO")
```

A natural way to explore relationships in data is to create univariate
visualisations facetted by a categorical value.

```{r bivariate_facet_data}
#| echo: true

# Column used to facet every plot in the bivariate sections below
facet_varname <- "invoice_month"

# NOTE(review): not referenced anywhere in the visible part of this document;
# presumably a template setting - confirm whether it is still needed
dataexp_facet_count_max <- 3
```


## Logical Variables

For logical variables we facet on barplots of the levels, comparing TRUE,
FALSE and missing data.

```{r create_bivariate_logical_plots}
#| echo: true

# The facet variable itself is never plotted against itself
logical_vars <- sort(logical_vars[!(logical_vars %in% facet_varname)])


for (plot_varname in logical_vars) {
  cat("--\n")
  cat(plot_varname)

  # Rows with a missing value for this variable are left out of the plot
  plot_tbl <- filter(data_tbl, !are_na(.data[[plot_varname]]))

  explore_plot <- ggplot(plot_tbl, aes(x = .data[[plot_varname]])) +
    geom_bar() +
    facet_wrap(facet_varname, scales = "free") +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(explore_plot)
}
```


## Numeric Variables

For numeric variables, we facet on histograms of the data.

```{r create_bivariate_numeric_plots}
#| echo: true

# Faceted histogram for each numeric column
for (plot_varname in numeric_vars) {
  cat("--\n")
  cat(plot_varname)

  # Rows with a missing value for this variable are left out of the plot
  plot_tbl <- filter(data_tbl, !are_na(.data[[plot_varname]]))

  explore_plot <- ggplot(plot_tbl, aes(x = .data[[plot_varname]])) +
    geom_histogram(bins = dataexp_hist_bins_count) +
    facet_wrap(facet_varname, scales = "free") +
    scale_x_continuous(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  plot(explore_plot)
}
```

## Categorical Variables

We treat categorical variables like logical variables, faceting the barplots
of the different levels of the data.

```{r create_bivariate_categorical_plots}
#| echo: true

# The facet variable itself is never plotted against itself
categorical_vars <- sort(
  categorical_vars[!(categorical_vars %in% facet_varname)]
  )

for (plot_varname in categorical_vars) {
  cat("--\n")
  cat(plot_varname)

  # Drop missing values and shorten the level labels for display
  plot_tbl <- data_tbl |>
    filter(!are_na(.data[[plot_varname]])) |>
    mutate(
      varname_trunc = .data[[plot_varname]] |>
        fct_relabel(str_trunc, width = 10)
      )

  explore_plot <- ggplot(plot_tbl, aes(x = varname_trunc)) +
    geom_bar() +
    facet_wrap(facet_varname, scales = "free") +
    scale_x_discrete(labels = abbreviate) +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(explore_plot)
}
```


## Date/Time Variables

Like the univariate plots, we facet on histograms of the years in the dates.

```{r create_bivariate_datetime_plots}
#| echo: true

# Faceted histogram for each date/time column
for (plot_varname in datetime_vars) {
  cat("--\n")
  cat(plot_varname)

  # Rows with a missing value for this variable are left out of the plot
  plot_tbl <- filter(data_tbl, !are_na(.data[[plot_varname]]))

  explore_plot <- ggplot(plot_tbl, aes(x = .data[[plot_varname]])) +
    geom_histogram(bins = dataexp_hist_bins_count) +
    facet_wrap(facet_varname, scales = "free") +
    scale_y_continuous(labels = label_comma()) +
    xlab(plot_varname) +
    ylab("Count") +
    ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(explore_plot)
}
```

```{r free_memory_facetplot}
#| echo: FALSE

# Remove the temporary objects left behind by the plotting loops
rm(list = c("plot_var", "plot_tbl"))
```


```{r}
#| echo: true

#knitr::knit_exit()
```


# Custom Explorations

In this section we perform various data explorations.

```{r create_log_custom_explorations}
#| echo: true

# Log the start of the custom exploration stage
syslog(glue("Performing custom data exploration"), level = "INFO")
```

## Custom Checks for Data Integrity

We want to check the transaction data for consistency, so we create a table
of all distinct combinations of stock code and description.

```{r check_stock_codes}
#| echo: true

# Every distinct (upper-cased stock code, description) pair with a non-missing
# description, sorted by code and then description
stock_codes_lookup_tbl <- data_tbl |>
  distinct(stock_code_upr, description) |>
  drop_na(description) |>
  arrange(stock_code_upr, description)

glimpse(stock_codes_lookup_tbl)
```

We now take a look at the first 50 rows of this table to get a sense of any
possible duplication of `stock_code`.

```{r plot_stock_codes_distinct_50}
#| echo: true

# NOTE(review): the surrounding text mentions the first 50 rows, but the whole
# lookup table is passed to the widget - confirm which is intended
datatable(stock_codes_lookup_tbl)
```


### Items per Transactions

As another check on the data, we want to look at how many different objects
are included in each transaction.

```{r plot_histogram_distinct_items}
#| echo: true

# Number of line items with a positive quantity on each invoice
plot_tbl <- data_tbl |>
  filter(quantity > 0) |>
  count(invoice_id, name = "n_items")

ggplot(plot_tbl, aes(x = n_items)) +
  geom_histogram(bins = 40) +
  scale_x_log10(labels = label_comma()) +
  scale_y_continuous(labels = label_comma()) +
  labs(x = "Number of Items", y = "Transaction Count") +
  ggtitle("Histogram of Item Counts per Transactions")
```


## Explore Aggregate Amounts

We now turn our focus to aggregating the data set in various ways and inspect
how these aggregate totals are distributed.


### Invoice-Level Amounts

We first aggregate the data at the invoice level, and inspect how those amounts
are distributed.

```{r explore_invoice_amounts}
#| echo: true

# Total value of each invoice, rounded to two decimal places
invoice_data_tbl <- data_tbl |>
  group_by(invoice_id) |>
  summarise(
    .groups = "drop",

    invoice_amount = sum(price * quantity) |> round(2)
  )

invoice_mean   <- invoice_data_tbl |> pull(invoice_amount) |> mean()   |> round(2)
invoice_median <- invoice_data_tbl |> pull(invoice_amount) |> median() |> round(2)

# The reference lines are passed as plain parameters rather than through
# aes(): inside aes() a constant is recycled over the layer data, so the same
# line would be drawn once per invoice.
ggplot(invoice_data_tbl) +
  geom_histogram(aes(x = invoice_amount), bins = 50) +
  geom_vline(xintercept = invoice_mean,   colour = "black") +
  geom_vline(xintercept = invoice_median, colour = "red") +
  xlab("Invoice Amount") +
  ylab("Count") +
  scale_x_log10(labels = label_comma()) +
  scale_y_continuous(labels = label_comma()) +
  ggtitle(
    label    = "Histogram Plot for Invoice Amount",
    subtitle = glue("Mean is {invoice_mean}, Median is {invoice_median}")
    )
```

We see there is a broad range of different invoice totals, with mean and
median being a few hundred pounds.


### Customer-Level Amounts


```{r explore_customer_amounts}
#| echo: true

# Total spend per customer, rounded to two decimal places.
# NOTE(review): group_by() keeps missing customer_id values as a group of
# their own, so any rows without a customer are summed into a single entry -
# confirm this is intended.
customer_data_tbl <- data_tbl |>
  group_by(customer_id) |>
  summarise(
    customer_spend = round(sum(price * quantity), 2),

    .groups = "drop"
  )

ggplot(customer_data_tbl, aes(x = customer_spend)) +
  geom_histogram(bins = 50) +
  scale_x_log10(labels = label_comma()) +
  scale_y_continuous(labels = label_comma()) +
  xlab("Customer Spend") +
  ylab("Count") +
  ggtitle("Histogram Plot for Customer Spend")
```


```{r customer_spend_hill_plot}
#| echo: true

# Hill plot of the per-customer spend
hill(customer_data_tbl[["customer_spend"]])
```


### Stock Code Investigation

We now want to have a quick look at some high level summary statistics on the
stock codes, so we look at quantities and total value.

```{r stock_code_summary_data}
#| echo: true

# Per stock code: number of rows, net and absolute quantity, and net and
# absolute value, plus the length of the code itself
stock_code_summary_tbl <- data_tbl |>
  group_by(stock_code_upr) |>
  summarise(
    .groups = "drop",

    row_count = n(),

    net_quantity = sum(quantity),
    abs_quantity = abs(quantity) |> sum(),
    net_value    = sum(stock_value),
    # Absolute value is based on stock_value, not quantity
    abs_value    = abs(stock_value) |> sum()
    ) |>
  mutate(
    sc_nchar = nchar(stock_code_upr), .before = "row_count"
    )

stock_code_summary_tbl |> datatable()
```

It appears there are a number of odd stock codes in the dataset, so we look at
those codes that are 4 characters or less and inspect those.


```{r check_short_stock_codes}
#| echo: true

# Stock codes shorter than five characters
short_codes_tbl <- filter(stock_code_summary_tbl, sc_nchar < 5)

# All transaction rows that use one of those short codes
short_stock_codes_tbl <- semi_join(
  data_tbl,
  short_codes_tbl,
  by = "stock_code_upr"
  )

datatable(short_stock_codes_tbl)
```


### Stock Code Price Data

We want to look at the range of different prices assigned to the same
`stock_code` value.

```{r plot_stock_price_counts}
#| echo: true

# Price summary per stock code over every transaction row; range_price is the
# price spread relative to the (rounded) mean price
stock_price_counts_tbl <- data_tbl |>
  group_by(stock_code) |>
  summarise(
    n_prices    = n(),
    min_price   = min(price),
    p25_price   = quantile(price, 0.25),
    mean_price  = round(mean(price), 2),
    p50_price   = median(price),
    p75_price   = quantile(price, 0.75),
    max_price   = max(price),
    range_price = round((max_price - min_price) / mean_price, 4),

    .groups = "drop"
    )

datatable(stock_price_counts_tbl)

# The same summary computed over the distinct prices seen for each stock code
stock_distinct_price_counts_tbl <- data_tbl |>
  distinct(stock_code, price) |>
  group_by(stock_code) |>
  summarise(
    n_prices    = n(),
    min_price   = min(price),
    p25_price   = quantile(price, 0.25),
    mean_price  = round(mean(price), 2),
    p50_price   = median(price),
    p75_price   = quantile(price, 0.75),
    max_price   = max(price),
    range_price = (max_price - min_price) / mean_price,

    .groups = "drop"
    )

datatable(stock_distinct_price_counts_tbl)
```


## Construct Time-Series / Date-Based Data

Another way to look at this data is to combine all the invoice values by
various time period such as daily, weekly and monthly to see how it looks.

As we are going to do a number of aggregations based on various aspects of the
date, we construct a function that takes a table of data and adds a number of
derived columns based on that date: things like day of week, calendar month and
so on.

```{r append_calendar_columns}
#| echo: true

# Adds calendar columns derived from a date or date-time column.
#
# data_tbl: table containing the column given in date_col
# date_col: unquoted name of the date / date-time column to derive from
#
# Returns data_tbl with invoice_date, invoice_month, invoice_dow, invoice_dom,
# invoice_hour, invoice_minute, invoice_woy and invoice_ym added after
# date_col (columns that already exist are overwritten).
append_calendar_columns <- function(data_tbl, date_col) {
  data_tbl |>
    mutate(
      invoice_month  = {{ date_col }} |> format("%B"),
      invoice_dow    = {{ date_col }} |> format("%A"),
      invoice_dom    = {{ date_col }} |> format("%d"),
      invoice_hour   = {{ date_col }} |> format("%H"),
      invoice_minute = {{ date_col }} |> format("%M"),
      invoice_woy    = {{ date_col }} |> format("%V"),
      invoice_ym     = {{ date_col }} |> format("%Y%m"),

      .after = {{ date_col }}
      ) |>
    # invoice_date is derived in a separate, later step: when date_col is
    # itself called invoice_date, truncating it to a Date first would strip
    # the time of day before the hour and minute columns are computed.
    mutate(
      invoice_date = {{ date_col }} |> as.Date(),

      .after = {{ date_col }}
      )
}

# Demonstrate the helper on the full timestamp column: invoice_date has
# already been truncated to a Date, which would make every derived hour and
# minute "00".
data_tbl |>
  select(excel_sheet, invoice_id, invoice_dttm, stock_value) |>
  append_calendar_columns(invoice_dttm) |>
  glimpse()


```


### Create Univariate Time-Series of Amounts

```{r construct_timeseries_data}
#| echo: true

# Attach year-week and year-month labels used to aggregate the series
ts_data_tbl <- mutate(
  data_tbl,
  ts_week  = invoice_dttm |> format("%Y-%U"),
  ts_month = invoice_dttm |> format("%Y-%m")
  )

# Total spend per calendar day
ts_daily_tbl <- ts_data_tbl |>
  group_by(label = format(invoice_date, "%Y-%m-%d")) |>
  summarise(
    period_date = min(invoice_date),
    total_spend = round(sum(price * quantity), 2),

    .groups = "drop"
    )

ggplot(ts_daily_tbl, aes(x = period_date, y = total_spend)) +
  geom_line() +
  expand_limits(y = 0) +
  scale_y_continuous(labels = label_comma()) +
  labs(x = "Date", y = "Total Spend") +
  ggtitle("Lineplot of Total Spend by Day")


# Total spend per year-week, dated by the first invoice date in the week.
# The total is rounded to two decimal places, matching the daily and monthly
# series it is later combined with.
ts_weekly_tbl <- ts_data_tbl |>
  group_by(label = ts_week) |>
  summarise(
    .groups = "drop",

    period_date = min(invoice_date),
    total_spend = sum(price * quantity) |> round(2)
    )

ggplot(ts_weekly_tbl) +
  geom_line(aes(x = period_date, y = total_spend)) +
  expand_limits(y = 0) +
  scale_y_continuous(labels = label_comma()) +
  xlab("Date") +
  ylab("Total Spend") +
  ggtitle("Lineplot of Total Spend by Week")


# Total spend per year-month, dated by the first invoice date in the month
ts_monthly_tbl <- ts_data_tbl |>
  group_by(label = ts_month) |>
  summarise(
    period_date = min(invoice_date),
    total_spend = round(sum(price * quantity), 2),

    .groups = "drop"
    )

ggplot(ts_monthly_tbl, aes(x = period_date, y = total_spend)) +
  geom_line() +
  expand_limits(y = 0) +
  scale_y_continuous(labels = label_comma()) +
  labs(x = "Date", y = "Total Spend") +
  ggtitle("Lineplot of Total Spend by Month")
```

To avoid dealing with multiple files for the time series, we combine them into
a single object.

```{r combine_ts_tbl}
#| echo: true

# Stack the three series into one table; `series` records which one each row
# came from. Note that this replaces the earlier contents of ts_data_tbl.
ts_data_tbl <- bind_rows(
    daily   = ts_daily_tbl,
    weekly  = ts_weekly_tbl,
    monthly = ts_monthly_tbl,

    .id = "series"
    ) |>
  arrange(series, period_date)

glimpse(ts_data_tbl)
```


### Calendar-Based Boxplots

We have aggregated our data across time periods, but it is also worth looking
at both transaction-level and invoice-level amount over time.

```{r lineitem_calendar_boxplot}
#| echo: true

# Distribution of line-item values for each week of the year (log scale)
ggplot(
    data_tbl,
    aes(x = invoice_woy, y = stock_value, group = invoice_woy)
    ) +
  geom_boxplot() +
  scale_y_log10(labels = label_comma()) +
  labs(x = "Week of Year", y = "Transaction Amount") +
  ggtitle("Boxplot of Transaction Sizes by Week of Year") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
```


```{r invoice_calendar_boxplot}
#| echo: true

# Invoice totals, keeping the week of year each invoice falls in
plot_tbl <- data_tbl |>
  group_by(invoice_woy, invoice_id) |>
  summarise(
    invoice_amount = round(sum(stock_value), 2),

    .groups = "drop"
  )

ggplot(
    plot_tbl,
    aes(x = invoice_woy, y = invoice_amount, group = invoice_woy)
    ) +
  geom_boxplot() +
  scale_y_log10(labels = label_comma()) +
  labs(x = "Week of Year", y = "Invoice Amount") +
  ggtitle("Boxplot of Invoice Amounts by Week of Year") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
```


## Check Distribution of Daily Purchases

We now look at individual invoice amounts, and look at how they are distributed
on a daily basis.

```{r plot_daily_amount_distributions}
#| echo: true

# Total value of each invoice, keyed by the date it was raised
daily_distribution_tbl <- data_tbl |>
  group_by(invoice_date, invoice_id) |>
  summarise(
    total_spend = sum(stock_value),

    .groups = "drop"
    )

glimpse(daily_distribution_tbl)
```

We now produce a boxplot for each day so we have a sense of how the
distribution of transaction amounts changes and evolves over time.

```{r plot_daily_distribution_transactions}
#| echo: true

# For each day, the total spend plus the 10/25/50/75/90% quantiles of the
# positive invoice amounts, reshaped to one row per day.
# reframe() always returns an ungrouped table and has no `.groups` argument,
# so none is passed (it would otherwise be stored as a column named `.groups`).
plot_tbl <- daily_distribution_tbl |>
  filter(total_spend > 0) |>
  group_by(invoice_date) |>
  reframe(
    qn   = c("total_spend", "p10", "p25", "p50", "p75", "p90"),
    vals = c(
      sum(total_spend),
      quantile(total_spend, probs = c(0.10, 0.25, 0.50, 0.75, 0.90))
      )
    ) |>
  pivot_wider(
    names_from = qn,
    values_from = vals
  )

# Interquartile range per day, coloured by that day's total spend
ggplot(plot_tbl) +
  geom_errorbar(aes(x = invoice_date, ymin = p25, ymax = p75, colour = total_spend),
                width = 0) +
  scale_y_log10(labels = label_comma()) +
  scale_colour_continuous(labels = label_comma()) +
  labs(x = "Date", y = "Transaction Amount", colour = "Spend") +
  ggtitle("P25-P75 Quartile of Transaction Amounts by Date")
```

It does not look like there is much of a change over time or any particular
patterns that stand out, so we also want to look at the refunds.

```{r plot_daily_distribution_refunds}
#| echo: true

# For each day, the total refund plus the 10/25/50/75/90% quantiles of the
# refund sizes (negative invoice amounts taken as magnitudes), reshaped to one
# row per day.
# reframe() always returns an ungrouped table and has no `.groups` argument,
# so none is passed (it would otherwise be stored as a column named `.groups`).
plot_tbl <- daily_distribution_tbl |>
  filter(total_spend < 0) |>
  mutate(total_refund = abs(total_spend)) |>
  group_by(invoice_date) |>
  reframe(
    qn   = c("total_refund", "p10", "p25", "p50", "p75", "p90"),
    vals = c(
      sum(total_refund),
      quantile(total_refund, probs = c(0.10, 0.25, 0.50, 0.75, 0.90))
      )
    ) |>
  pivot_wider(
    names_from = qn,
    values_from = vals
  )

# Interquartile range per day, coloured by that day's total refund; the axis
# and legend are labelled as refunds to match the plotted quantities
ggplot(plot_tbl) +
  geom_errorbar(aes(x = invoice_date, ymin = p25, ymax = p75, colour = total_refund),
                width = 0) +
  scale_y_log10(labels = label_comma()) +
  scale_colour_continuous(labels = label_comma()) +
  labs(x = "Date", y = "Refund Amount", colour = "Refund") +
  ggtitle("P25-P75 Quartile of Refund Amounts by Date")
```


## Investigate Refunds

This transaction data also includes returns and refunds - adding a level of
complexity to this analysis as we need to account for this when assessing the
data.

One option is to ignore the refunds, at least initially, but this may add
a large source of bias to our analysis and we need to get a feel for this.

To do this, we will look at each line-item entry with a negative quantity and
we will look back in time to the previous time such an item was sold, and we
match on the `customer_id`, `stock_code` and `price`. This is strong evidence
that the previous sale was then returned, and so we can label these entries.

Due to the time snapshot nature of this data, this also means there are likely
to be some returns in our data that do not have a corresponding sale, so we
can also ignore those.


```{r create_matching_returns_data}
#| echo: true
#| cache: TRUE

### Returns the rows of data_tbl with a positive quantity for the given stock
### code and customer whose invoice_dttm is at or before use_dttm.
filter_returns_transactions <- function(use_stock, use_customer,  use_dttm,
                                        data_tbl) {
  data_tbl |>
    filter(
      quantity     >  0,
      invoice_dttm <= use_dttm,
      customer_id  == use_customer,
      stock_code   == use_stock
      )
}

### Keeps the rows of data_tbl whose absolute price equals the absolute return
### price, orders them most recent first, and adds the running quantity total,
### the return quantity, and their sum (remaining).
match_tnx_prices <- function(data_tbl, return_price, return_quantity) {
  same_price_tbl <- data_tbl |>
    filter(abs(price) == abs(return_price)) |>
    select(row_id, invoice_id, stock_code, quantity, price, invoice_dttm) |>
    arrange(desc(invoice_dttm))

  same_price_tbl |>
    mutate(
      cuml_quantity   = cumsum(quantity),
      return_quantity = return_quantity,
      remaining       = cuml_quantity + return_quantity
      )
}


### Selects every row with remaining <= 0 plus the single most recent row with
### remaining > 0, sorted most recent first, with row_id renamed to
### orig_row_id.
determine_return_records <- function(data_tbl) {
  covered_tbl <- filter(data_tbl, remaining <= 0)

  latest_uncovered_tbl <- data_tbl |>
    filter(remaining > 0) |>
    arrange(desc(invoice_dttm)) |>
    head(1)

  bind_rows(covered_tbl, latest_uncovered_tbl) |>
    arrange(desc(invoice_dttm)) |>
    rename(orig_row_id = row_id)
}


# For every negative-quantity line item, attach three list-columns in turn:
#   prev_tnx_data - candidate earlier sales for the same customer and stock
#   price_data    - those candidates restricted to the matching price
#   match_data    - the rows selected as the matching original sales
returns_data_tbl <- data_tbl |>
  filter(quantity < 0) |>
  select(row_id, stock_code, customer_id, quantity, price, invoice_dttm) |>
  mutate(
    prev_tnx_data = future_pmap(
      list(
        use_stock    = stock_code,
        use_customer = customer_id,
        use_dttm     = invoice_dttm
        ),
      filter_returns_transactions,
      data_tbl  = data_tbl,
      .options  = furrr_options(seed = 421),
      .progress = TRUE
      )
    ) |>
  mutate(
    price_data = future_pmap(
      list(
        data_tbl        = prev_tnx_data,
        return_price    = price,
        return_quantity = quantity
        ),
      match_tnx_prices,
      .options  = furrr_options(seed = 422),
      .progress = TRUE
      )
    ) |>
  mutate(match_data = map(price_data, determine_return_records))


glimpse(returns_data_tbl)
```

We can then use this data to get an idea of how often items are returned, and
how much time tends to pass between the purchase and the return.

```{r construct_returns_lookups}
#| echo: true

# Flatten the matched sales into one row per (return, original sale) pair and
# keep, for each return row, the pair with the lowest orig_row_id.
returns_lookup_tbl <- returns_data_tbl |>
  select(
    return_row_id = row_id,
    return_dttm   = invoice_dttm,
    match_data
    ) |>
  unnest(match_data) |>
  select(
    orig_row_id,
    return_row_id,
    return_dttm,
    adjust_quantity = return_quantity
    ) |>
  group_by(return_row_id) |>
  arrange(orig_row_id) |>
  slice_head(n = 1) |>
  ungroup()

glimpse(returns_lookup_tbl)
```

We probably need to do a better job of this, as this current method is
inadequate to properly match this up.

Stepping up this work to properly match up the transactions will be a topic
we return to later.

We now want to look at the distribution of times between the purchase and the
item being returned.

```{r plot_return_times_histogram}
#| echo: true

# Lookup tables giving the timestamp of each row, keyed once as the original
# purchase row and once as the return row.
orig_data_tbl <- data_tbl |>
  select(orig_row_id   = row_id, purchase_dttm = invoice_dttm)

ret_data_tbl <- data_tbl |>
  select(return_row_id = row_id, return_dttm   = invoice_dttm)

return_times_tbl <- returns_lookup_tbl |>
  select(orig_row_id, return_row_id) |>
  left_join(orig_data_tbl, by = "orig_row_id") |>
  left_join(ret_data_tbl,  by = "return_row_id") |>
  mutate(
    return_time = return_dttm - purchase_dttm,
    # Subtracting date-times gives a difftime whose units are picked
    # automatically (secs, mins, hours or days), so ask for days explicitly
    # instead of assuming the stored value is in seconds.
    return_days = as.numeric(return_time, units = "days")
    )

# Shared histogram with a reference line at 90 days; shown below on both a
# linear and a log10 x-axis.
base_plot <- ggplot(return_times_tbl) +
  geom_histogram(aes(x = return_days), bins = 50) +
  geom_vline(aes(xintercept = 90), colour = "red") +
  xlab("Days") +
  ylab("Count") +
  ggtitle("Histogram of Days Between Purchase and Return per Item Transaction")

norm_plot <- base_plot +
  scale_x_continuous(labels = label_comma())

log_plot <- base_plot +
  scale_x_log10(labels = label_comma())

plot_grid(norm_plot, log_plot, nrow = 2)
```


# Indicate Excluded Rows

As we have found a number of issues with this data, we now will indicate
which rows we wish to exclude from further analysis. We will then exclude these
rows at a later point of the analysis.


## Filter Extra Stock Codes

We discovered there are a number of extra entries in this dataset for things
like bad debt management, discounts, gift cards and so on, so we mark those
for removal.

```{r remove_extraneous_stock_code_entries}
#| echo: true

# Stock codes that are screened out by exact match on stock_code_upr.
screen_stock_code <- c(
  "B", "C2", "C3", "D", "M", "S",
  "CRUK", "POST", "DOT", "BANK CHARGES", "AMAZONFEE",
  "ADJUST", "ADJUST2", "TEST001", "TEST002"
  )

# Row ids whose stock code is one of the screened codes.
clean_stock_code_tbl <- data_tbl |>
  filter(stock_code_upr %in% screen_stock_code) |>
  select(row_id)

glimpse(clean_stock_code_tbl)


# Row ids whose stock code contains "GIFT".
clean_gift_tbl <- data_tbl |>
  filter(str_detect(stock_code_upr, "GIFT")) |>
  select(row_id)

glimpse(clean_gift_tbl)
```


## Create Exclusion Indicator

```{r combine_exclusions}
#| echo: true

# Stack the row ids flagged by each screening step into one exclusion table.
# The ids are de-duplicated first: a row_id flagged by more than one step
# would otherwise appear several times and duplicate rows in the join below.
exclusions_tbl <- list(
    clean_stock_code_tbl,
    clean_gift_tbl
    ) |>
  bind_rows() |>
  distinct(row_id) |>
  mutate(exclude = TRUE)

# Attach the indicator to every row; rows with no match are not excluded.
cleaned_data_tbl <- data_tbl |>
  left_join(exclusions_tbl, by = "row_id") |>
  replace_na(list(exclude = FALSE))

cleaned_data_tbl |> glimpse()
```


# Construct Stock Code Lookups

Finally, we want to construct a lookup table that provides some free-text
fields for each `stock_code` value.

```{r dedupe_stock_descs}
#| echo: true

### Collapses descriptions that are identical once trimmed and stripped of
### non-word characters and spaces. For each such set the entry with the
### largest raw character count is kept and returned trimmed and squished.
dedupe_stock_descs <- function(stock_desc) {
  candidates_tbl <- enframe(stock_desc, name = NULL, value = "stock_desc") |>
    mutate(
      desc_len    = nchar(stock_desc),
      desc_dedupe = str_replace_all(str_trim(stock_desc), "[^\\w]| ", ""),
      desc_output = str_squish(str_trim(stock_desc))
      )

  candidates_tbl |>
    group_by(desc_dedupe) |>
    slice_max(order_by = desc_len, n = 1, with_ties = FALSE) |>
    ungroup() |>
    pull(desc_output)
}
```

We use some simple logic to attempt to de-dupe the descriptions as much as
possible.


```{r construct_stock_code_table}
#| echo: true

# One row per upper-cased stock code, with its de-duplicated descriptions
# joined by " : ". Only non-excluded rows with positive quantity and price
# and a non-missing description contribute.
stock_description_tbl <- cleaned_data_tbl |>
  filter(exclude == FALSE, quantity > 0, price > 0, !are_na(description)) |>
  mutate(stock_code = str_to_upper(str_trim(stock_code))) |>
  select(stock_code, description) |>
  drop_na(description) |>
  distinct() |>
  group_by(stock_code) |>
  summarise(
    desc = str_c(dedupe_stock_descs(sort(description)), collapse = " : "),
    .groups = "drop"
    ) |>
  arrange(stock_code)

glimpse(stock_description_tbl)
```

We also look at this output using DT.

```{r view_stock_description}
#| echo: true

datatable(stock_description_tbl)
```


# BTYD Visualisations

Despite our extensive exploration of the data earlier, the concepts around 
BTYD modelling suggest a few more that are worth exploring, so we will look at
those now.

We model the purchase data first, then combine this to create an individual
customer/invoice pairing with the total amount spent as an additional column.

```{r prepare_purchase_data}
#| echo: true

# Purchase line items only: non-excluded rows with positive quantity and
# price, reduced to the columns used downstream.
tnx_purchase_tbl <- cleaned_data_tbl |>
  filter(exclude == FALSE, quantity > 0, price > 0) |>
  select(
    invoice_date,
    invoice_dttm,
    invoice_id,
    stock_code,
    customer_id,
    description,
    quantity,
    price,
    stock_value
    )

glimpse(tnx_purchase_tbl)
```

BTYD models work with the total spend over a period of a day and the
differences between the transaction times. These are calculated internally by
the various BTYD routines, so we keep just the per-invoice spend.

```{r calculate_customer_daily_spend}
#| echo: true

# Spend per invoice for rows with a known customer.
daily_spend_invoice_tbl <- tnx_purchase_tbl |>
  drop_na(customer_id) |>
  group_by(invoice_date, invoice_dttm, customer_id, invoice_id) |>
  summarise(invoice_spend = sum(stock_value), .groups = "drop")

glimpse(daily_spend_invoice_tbl)


# Spend and invoice count per customer per day.
daily_spend_tbl <- daily_spend_invoice_tbl |>
  group_by(invoice_date, customer_id) |>
  summarise(
    total_spend = sum(invoice_spend),
    tnx_count   = n(),
    .groups     = "drop"
    )

glimpse(daily_spend_tbl)
```


To start with, it might be worth understanding a bit more about when customers
are 'born' in the system - that is, the date on which they make their first
purchase. Another important quantity is the time between transactions for a
customer, so we will visualise these.

```{r construct_customer_cohort_data}
#| echo: true

# One row per customer: first transaction date, number of distinct purchase
# days, and quarterly / monthly cohort labels derived from the first date.
customer_cohort_tbl <- daily_spend_tbl |>
  group_by(customer_id) |>
  summarise(
    first_tnx_date  = min(invoice_date),
    total_tnx_count = n(),
    .groups         = "drop"
    ) |>
  mutate(
    cohort_qtr = as.character(as.yearqtr(first_tnx_date)),
    cohort_ym  = format(first_tnx_date, "%Y %m"),
    .after     = "customer_id"
    )


glimpse(customer_cohort_tbl)
```

Now that we have a first date for each customer, we look at the total number
of customers joining at each date.

```{r plot_customer_first_dates}
#| echo: true

# Number of customers whose first transaction falls on each date.
plot_tbl <- count(customer_cohort_tbl, first_tnx_date, name = "n_customer")

ggplot(plot_tbl, aes(x = first_tnx_date, y = n_customer)) +
  geom_line() +
  labs(
    x     = "First Transaction Date",
    y     = "New Customers",
    title = "Plot of Count of New Customer by Date"
    )
```

We now look at how time differences between purchases are distributed.

```{r plot_transaction_time_diffs}
#| echo: true

# Gaps, in weeks, between consecutive purchase days for each customer.
# diff() yields one value per gap, so each customer contributes zero or more
# rows; reframe() is the dplyr verb for such multi-row results (returning
# several rows per group from summarise() is deprecated) and it always
# returns an ungrouped tibble.
customer_tnx_diffs_tbl <- daily_spend_tbl |>
  group_by(customer_id) |>
  reframe(
    time_diff = diff(invoice_date) |> as.numeric() |> divide_by(7)
  )

mean_diff <- customer_tnx_diffs_tbl |> pull(time_diff) |> mean()

# Histogram of the gaps with the overall mean marked in red.
ggplot(customer_tnx_diffs_tbl) +
  geom_histogram(aes(x = time_diff), bins = 50) +
  geom_vline(aes(xintercept = mean_diff), colour = "red") +
  scale_y_continuous(labels = label_comma()) +
  labs(
    x = "Time Difference (weeks)",
    y = "Frequency",
    title = "Histogram of Differences Between Transactions for Customers",
    subtitle = glue(
      "Mean Difference is {mean_diff} weeks", mean_diff = mean_diff |> round(2)
      )
    )
```


We also want to look at a number of customers and make some line plots of their
transactions.

```{r visualise_customer_transactions}
#| echo: true

# Randomly pick 30 customers with more than two purchase days.
keep_customers_tbl <- customer_cohort_tbl |>
  filter(total_tnx_count > 2) |>
  slice_sample(n = 30)

# Daily spend rows belonging to the sampled customers.
plot_tbl <- semi_join(daily_spend_tbl, keep_customers_tbl, by = "customer_id")

# One horizontal track per customer with a point at each purchase day.
ggplot(
    plot_tbl,
    aes(x = invoice_date, y = customer_id, group = customer_id)
    ) +
  geom_line() +
  geom_point() +
  labs(
    x     = "Transaction Date",
    y     = "Customer ID",
    title = "Visualisation of Transaction Times for 30 Customers"
    ) +
  theme(axis.text.y = element_text(size = 12))
```


## Construct Timestamped Transaction Data

We want to construct a table of data containing each individual transaction,
given by a timestamp for the transaction.

```{r construct_customer_transactions}
# Total stock value per (timestamp, invoice, customer) combination, with the
# invoice timestamp exposed as tnx_timestamp.
customer_tnx_data_tbl <- cleaned_data_tbl |>
  rename(tnx_timestamp = invoice_dttm) |>
  count(
    tnx_timestamp, invoice_id, customer_id,
    wt   = stock_value,
    name = "tnx_amount"
    )

glimpse(customer_tnx_data_tbl)
```

This data will be used as an input to the BTYD models.


## Investigate Cohorts

Finally, we want to take a look at the distribution of transaction times based
on various first-transaction cohorts in the data.


```{r plot_distribution_first_transaction}
#| echo: true

# Histogram of each customer's first transaction date.
ggplot(customer_cohort_tbl, aes(x = first_tnx_date)) +
  geom_histogram(bins = 50) +
  labs(
    x     = "Date of First Transaction",
    y     = "Count",
    title = "Histogram of New Customer Start Dates"
    )
```

We also want to get a sense of the total count of customers in each cohort.

```{r construct_qtr_cohort_column_plot}
#| echo: true

# Customer counts per quarterly cohort.
plot_tbl <- customer_cohort_tbl |>
  count(cohort_qtr, name = "customer_count") |>
  mutate(cohort_qtr = as.character(cohort_qtr))

ggplot(plot_tbl, aes(x = cohort_qtr, y = customer_count)) +
  geom_col() +
  scale_y_continuous(labels = label_comma()) +
  labs(
    x     = "Customer Cohort",
    y     = "Customer Count",
    title = "Bar Plot of Customer Quarterly Cohort Sizes"
    )
```

We also want to see the monthly cohorts:

```{r construct_ym_cohort_column_plot}
#| echo: true

# Customer counts per monthly cohort.
plot_tbl <- customer_cohort_tbl |>
  count(cohort_ym, name = "customer_count") |>
  mutate(cohort_ym = as.character(cohort_ym))

ggplot(plot_tbl, aes(x = cohort_ym, y = customer_count)) +
  geom_col() +
  scale_y_continuous(labels = label_comma()) +
  labs(
    x     = "Customer Cohort",
    y     = "Customer Count",
    title = "Bar Plot of Customer Monthly Cohort Sizes"
    ) +
  theme(axis.text.x = element_text(size = 10, angle = 20, vjust = 0.5))
```


For the cohort analysis, we start with a boxplot of the time difference between
transactions by cohort.

```{r plot_cohort_differences_boxplot}
#| echo: true

# Attach each customer's cohort to their between-purchase gaps. This table is
# reused by the cohort plots that follow.
plot_tbl <- customer_cohort_tbl |>
  inner_join(customer_tnx_diffs_tbl, by = "customer_id") |>
  mutate(cohort_qtr = as.character(cohort_qtr))

ggplot(plot_tbl, aes(x = cohort_qtr, y = time_diff)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(
    x     = "Cohort",
    y     = "Time Difference (weeks)",
    title = "Boxplot of Time Differences by Starting Cohort"
    )
```

We also construct a density plot of the time differences for these cohorts.

```{r investigate_cohort_transaction_times}
#| echo: true

# Density of the between-purchase gaps per quarterly cohort, with each curve
# labelled directly instead of using a legend.
ggplot(plot_tbl) +
  geom_line(
    aes(x = time_diff, colour = cohort_qtr),
    stat = "density"
    ) +
  geom_dl(
    aes(x = time_diff, colour = cohort_qtr, label = cohort_qtr),
    method = "top.bumpup",
    stat   = "density"
    ) +
  labs(
    x     = "Time Difference (weeks)",
    y     = "Density",
    title = "Comparison Density Plot for Transaction Time Differences Between Cohorts"
    ) +
  theme(legend.position = "none")
```

And we also look at a facetted histogram.

```{r investigate_cohort_timediffs_facets}
#| echo: true

# One histogram of between-purchase gaps per quarterly cohort, each with its
# own y-axis range.
ggplot(plot_tbl, aes(x = time_diff)) +
  geom_histogram(bins = 50) +
  facet_wrap(vars(cohort_qtr), scales = "free_y") +
  scale_y_continuous(labels = label_comma()) +
  labs(
    x     = "Time Difference (weeks)",
    y     = "Count",
    title = "Facetted Histograms of Time Between Transactions"
    )

```


## Investigate Dropout Rates

We want to plot some visualisations of the lifetime and dropout rate of
customers in each cohort.


```{r estimate_cohort_dropout_rates}
#| echo: true

# For each quarterly cohort, collect the observed lifetimes (weeks between a
# customer's first and final purchase day, keeping only positive values) and
# fit an exponential distribution to them, returning the tidied parameters.
cohort_dropout_est_tbl <- customer_cohort_tbl |>
  select(customer_id, first_tnx_date, cohort_qtr, cohort_ym) |>
  inner_join(daily_spend_tbl, by = "customer_id") |>
  group_by(cohort_qtr, customer_id) |>
  mutate(final_tnx_date = max(invoice_date)) |>
  ungroup() |>
  select(customer_id, cohort_qtr, first_tnx_date, final_tnx_date) |>
  distinct() |>
  mutate(
    obs_lifetime = as.numeric(
      difftime(final_tnx_date, first_tnx_date, units = "week")
      )
    ) |>
  filter(obs_lifetime > 0) |>
  group_by(cohort_qtr) |>
  summarise(lifetimes = list(obs_lifetime), .groups = "drop") |>
  mutate(
    exp_fit    = map(lifetimes, \(x) MASS::fitdistr(x, densfun = "exponential")),
    param_data = map(exp_fit, \(fit) broom::tidy(fit))
    ) |>
  select(cohort_qtr, lifetimes, param_data) |>
  unnest(param_data)

glimpse(cohort_dropout_est_tbl)
```

```{r cohort_dropout_params_dt}
#| echo: true

datatable(cohort_dropout_est_tbl)
```


# Construct BTYD Datasets

We start by modelling the P/NBD model using our synthetic datasets before we
try to model real-life data.

```{r set_start_end_dates}
#| echo: true

# Fitting window.
use_fit_start_date   <- as.Date("2009-12-01")
use_fit_end_date     <- as.Date("2010-12-01")

# Validation window; it starts on the date the fitting window ends.
# NOTE(review): the end date is two years after the start - confirm that
# 2012 is intended here.
use_valid_start_date <- as.Date("2010-12-01")
use_valid_end_date   <- as.Date("2012-12-10")
```


```{r setup_online_retail_transaction_data}
#| echo: true

customer_cohortdata_tbl <- customer_cohort_tbl
glimpse(customer_cohortdata_tbl)

# Per-invoice transactions in the column layout used for the BTYD inputs.
# customer_id becomes a factor whose levels are ordered by each customer's
# earliest transaction timestamp.
customer_transactions_tbl <- daily_spend_invoice_tbl |>
  mutate(
    tnx_timestamp = invoice_dttm,
    customer_id   = fct_reorder(customer_id, tnx_timestamp, min),
    tnx_amount    = invoice_spend
    ) |>
  select(tnx_timestamp, customer_id, invoice_id, tnx_amount)

glimpse(customer_transactions_tbl)
```


## Construct Datasets

Having loaded the synthetic data we need to construct a number of datasets of
derived values.

```{r construct_summary_stats_data}
#| echo: true

# Per-customer summary statistics from calculate_transaction_cbs_data(),
# passing the fit end date (as a date-time) as last_date.
customer_summarystats_tbl <- customer_transactions_tbl |>
  drop_na(customer_id) |>
  calculate_transaction_cbs_data(last_date = as.POSIXct(use_fit_end_date))

glimpse(customer_summarystats_tbl)
```

As before, we construct a number of subsets of the data for use later on with
the modelling and create some data subsets.


```{r select_fit_dataset}
customer_fit_stats_tbl <- customer_summarystats_tbl
glimpse(customer_fit_stats_tbl)


# Per-customer statistics for transactions after the validation start date:
# the transaction count and the time in weeks from the validation start to
# the customer's latest transaction.
customer_valid_stats_tbl <- customer_transactions_tbl |>
  drop_na(customer_id) |>
  filter(tnx_timestamp > as.POSIXct(use_valid_start_date)) |>
  summarise(
    tnx_count         = n(),
    tnx_last_interval = as.numeric(
      difftime(max(tnx_timestamp), use_valid_start_date, units = "weeks")
      ),
    .by = customer_id
    )

glimpse(customer_valid_stats_tbl)
```


```{r construct_fit_valid_datasets}
#| echo: true

obs_fitdata_tbl <- rename(customer_fit_stats_tbl, tnx_count = x)


### Customers in the fit data with no validation-period transactions are
### added to the validation data with a count of zero.
obs_validdata_tbl <- customer_fit_stats_tbl |>
  anti_join(customer_valid_stats_tbl, by = "customer_id") |>
  mutate(tnx_count = 0) |>
  select(customer_id, tnx_count) |>
  bind_rows(customer_valid_stats_tbl) |>
  arrange(customer_id)
```


Finally we want to construct a cleaned transaction dataset consisting only
of purchases from individual customers.

```{r construct_customer_transaction_data}
#| echo: true

# Rebuild the per-invoice spend table, this time keyed by invoice date,
# customer and invoice id only (no invoice timestamp column).
daily_spend_invoice_tbl <- tnx_purchase_tbl |>
  drop_na(customer_id) |>
  group_by(invoice_date, customer_id, invoice_id) |>
  summarise(invoice_spend = sum(stock_value), .groups = "drop")

glimpse(daily_spend_invoice_tbl)
```


## Construct Customer Subsets

Rather than run our validation on the entire dataset, we instead focus
initially on using a subset of the data for validating the model as a whole.

The idea is to fit our models on the whole dataset, but just run our simulation
code on the subset to reduce computation time. Once we are happy with a number
of candidate models we can run the simulations on the full set.

```{r create_customer_subset}
#| echo: true

# Sample 1,000 customers, without replacement, from those whose first
# transaction is on or before 2010-12-01, and keep their ids in sorted order.
onlineretail_customer_subset_ids <- customer_summarystats_tbl |>
  filter(first_tnx_date <= as.Date("2010-12-01")) |>
  slice_sample(n = 1000, replace = FALSE) |>
  arrange(customer_id) |>
  pull(customer_id)

glimpse(onlineretail_customer_subset_ids)
```


# Output Cleaned Data

Finally we output the various datasets we have constructed to disk.


```{r write_to_disk}
#| echo: true

# Customer-level summary statistics.
write_rds(
  customer_summarystats_tbl,
  "data/onlineretail_customer_summarystats_tbl.rds"
  )

# Cleaned data, lookups and transaction tables.
write_rds(ts_data_tbl, "data/onlineretail_timeseries_tbl.rds")
write_rds(
  stock_description_tbl,
  "data/onlineretail_stock_description_tbl.rds"
  )
write_rds(returns_lookup_tbl, "data/onlineretail_returns_lookup_tbl.rds")
write_rds(cleaned_data_tbl, "data/onlineretail_cleaned_tbl.rds")
write_rds(customer_cohort_tbl, "data/onlineretail_cohort_tbl.rds")
write_rds(
  customer_transactions_tbl,
  "data/onlineretail_transactions_tbl.rds"
  )
write_rds(
  daily_spend_invoice_tbl,
  "data/onlineretail_invoice_cleaned_tbl.rds"
  )
write_rds(daily_spend_tbl, "data/onlineretail_daily_spend_tbl.rds")

# Fit and validation datasets.
write_rds(obs_fitdata_tbl, "data/onlineretail_obs_fitdata_tbl.rds")
write_rds(obs_validdata_tbl, "data/onlineretail_obs_validdata_tbl.rds")

# Sampled customer ids.
write_rds(
  onlineretail_customer_subset_ids,
  "data/onlineretail_customer_subset_ids.rds"
  )

# Fit / validation window dates as a named list.
write_rds(
  list(
    use_fit_start_date   = use_fit_start_date,
    use_fit_end_date     = use_fit_end_date,
    use_valid_start_date = use_valid_start_date,
    use_valid_end_date   = use_valid_end_date
    ),
  "data/onlineretail_simulation_dates.rds"
  )
```


# R Environment {.unnumbered}

```{r show_session_info}
#| echo: true
#| message: TRUE

# Print the session details for this render, for reproducibility.
sessioninfo::session_info()
```