01_setup_20240604.qmd

---
title: "Dummy Data"
format: html
editor: visual
editor_options: 
  chunk_output_type: console
---

## Loading packages

```{r}
#| eval: true
#| echo: false 
#| include: false

# Loading packages 
library(gtsummary)
library(gmodels)
library(summarytools)
library(haven)
library(survey)
library(devtools)
library(readstata13)
library(freqtables)
library(srvyr)
library(jtools)
library(marginaleffects)
library(flextable)
library(sjstats)
library(MASS)
library(interactions)
library(SmartEDA)
library(DataExplorer)
library(rddensity)
library(rdd)
library(rdrobust)
library(modelsummary)
library(tibble)
library(fixest)
library(here)
library(estimatr)
library(sandwich)
library(lmtest)
library(janitor)
library(broom)
library(aplot)
library(fastDummies)
library(glue)
library(rdrobust)
library(RDHonest)
library(splines)
library(stats)
library(arrow)
library(tidyverse)
library(ggsurvey)
library(lfe)

```

```{r}
#| eval: true
#| echo: false 
#| include: false

# Load the raw CSV files (claim-line level and patient level).
# NOTE(review): both paths are absolute and machine-specific; the document
# will only render where these exact locations exist.

claims <- as_tibble(
  readr::read_csv("C:/Users/felip/Dropbox (Partners HealthCare)/R Projects/dummy_project/data/01_raw/claims_line_file.csv")
)

people <- as_tibble(
  readr::read_csv("C:/Users/felip/Dropbox (Partners HealthCare)/R Projects/dummy_project/data/01_raw/patient_level_file.csv")
)


```

# Rolling up claims line file to one row per person (similar to SUM statement in proc sql in SAS)

```{r}
#| eval: true
#| echo: false 

# One row per bene_id: total rpm and total cgm across that person's claim lines.
a1 <- claims %>%
  group_by(bene_id) %>%
  summarise(across(c(rpm, cgm), sum)) %>%
  ungroup()

```

# Rolling up claims line file to one row per person (similar to MAX statement in proc sql in SAS)

```{r}
#| eval: true
#| echo: false 

# One row per bene_id: largest rpm and cgm value seen on that person's
# claim lines.
a2 <- claims %>%
  group_by(bene_id) %>%
  summarise(across(c(rpm, cgm), max)) %>%
  ungroup()
```

# Rolling up claims line file to one row per person (similar to sum(diab_type_2_ind=1 and cgm=1) as cgm_diab_ct statement in proc sql in SAS)

```{r}
#| eval: true
#| echo: false 

# One row per bene_id:
#   cgm_diab_ct - number of claim lines with both diab_type_2_ind == 1 and cgm == 1
#   rpm_ct      - sum of rpm over the person's claim lines
a3 <- ungroup(
  summarise(
    group_by(claims, bene_id),
    cgm_diab_ct = sum(diab_type_2_ind == 1 & cgm == 1),
    rpm_ct = sum(rpm)
  )
)

```

# Creating indicator variable for cgm_diab_ct \> 0 (similar to (case when cgm_diab_ct gt 0 then 1 else 0 end) as cgm_diab_any) in SAS proc sql

```{r}
#| eval: true
#| echo: false 

# Turn the per-person counts into 0/1 "any" indicators.
# a3 already has one row per bene_id and case_when() is vectorised, so no
# grouping is needed: grouping by bene_id would only make mutate() evaluate
# the expressions once per person, which is very slow on large files and
# gives the same result.
# A missing count falls through to the TRUE branch and is coded 0.
a4 <- a3 %>%
  mutate(
    cgm_diab_any = case_when(
      cgm_diab_ct > 0 ~ 1,
      TRUE ~ 0
    ),
    rpm_any = case_when(
      rpm_ct > 0 ~ 1,
      TRUE ~ 0
    )
  )
```

# Joining datasets a4 and people by bene_id (similar to proc sql's from a4 as a left join people as b in proc sql in SAS)

```{r}
#| eval: true
#| echo: false 

# Keep every row of a4 and attach the matching patient-level columns,
# then sort by beneficiary id.
a5 <- a4 %>%
  left_join(people, by = "bene_id") %>%
  arrange(bene_id)

```

# Joining datasets a4 and people by bene_id (similar to proc sql's from a4 as a left join people as b in proc sql in SAS) BUT just bringing in one column from a4

```{r}
#| eval: true
#| echo: false 

# Same join as a5, but only bene_id and cgm_diab_any are carried over from a4.
a6 <- a4 %>%
  select(bene_id, cgm_diab_any) %>%
  left_join(people, by = "bene_id") %>%
  arrange(bene_id)
```

# Iterate over multiple variables

```{r}
#| eval: true
#| echo: false 
#| label: tab-dummy-demo
#| tab-cap: "Table of demographic characteristics by gender"

#' Build one frequency table per variable, cross-tabulated against `by`.
#'
#' For every name in `variables`, counts the rows of `data` for each
#' combination of that variable and the `by` column, adds a zero-count row
#' for every combination of observed values that never occurs together, and
#' computes `percent` as each cell's share of its variable level (the
#' percentages of one variable level sum to 100 across the `by` values).
#'
#' @param data Data frame holding the columns named in `variables` and `by`.
#' @param variables Character vector of column names to tabulate.
#' @param by Name of the cross-tabulation column. Defaults to "gender".
#' @return A list named after `variables`; each element is a data frame with
#'   the variable column, the `by` column, `n` and `percent`, sorted by the
#'   variable and then by `by`.
create_frequency_tables <- function(data, variables, by = "gender") {
  variables <- as.character(variables)

  # Fail early with a readable message instead of a tidy-eval error.
  missing_cols <- setdiff(c(variables, by), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  tabulate_one <- function(variable_name) {
    # Every pairing of observed values, so unobserved pairs get n = 0.
    # stringsAsFactors = FALSE keeps character keys as character; the
    # expand.grid() default would turn them into factors and force a
    # type coercion in the join below.
    all_combinations <- expand.grid(
      unique(data[[variable_name]]),
      unique(data[[by]]),
      stringsAsFactors = FALSE
    )
    names(all_combinations) <- c(variable_name, by)

    data %>%
      count(!!sym(variable_name), !!sym(by), .drop = FALSE) %>%
      full_join(all_combinations, by = c(variable_name, by)) %>%
      replace_na(list(n = 0)) %>%
      group_by(!!sym(variable_name)) %>%
      mutate(percent = n / sum(n) * 100) %>%
      ungroup() %>%
      arrange(!!sym(variable_name), !!sym(by))
  }

  tables <- lapply(variables, tabulate_one)
  names(tables) <- variables
  tables
}


# # test coding by hand
# create_frequency_tables <- function(data, variables) {
# tables <- list()
# for (variable in variables) {
# table <- data %>%
#       count({{variable}}, gender, .drop = FALSE) %>%
#       group_by({{variable}}) %>%
#       mutate(percent = n / sum(n) * 100) %>%
#       ungroup() 
# 
#   # Add the table to the list
#     tables[[as.character(variable)]] <- table
# } 
# return(tables)
# }


# Columns of a5 to tabulate
variables <- c("rpm_any", "cgm_diab_any")

# One frequency table per column, returned as a named list
frequency_tables <- create_frequency_tables(data = a5, variables = variables)

# Stack the tables into a single tibble; the list names go into `variable`
flat_frequency_table <- frequency_tables %>%
  bind_rows(.id = "variable")

```

```{r}
#| eval: true
#| echo: false 

# Recode the categorical covariates as factors and store tin as character
a5 <- a5 %>%
  mutate(
    across(c(race, dual_eligible, gender), factor),
    tin = as.character(tin)
  )


#' Fit a sequence of nested linear models for several outcomes.
#'
#' For each outcome, model i regresses the outcome on the first i entries of
#' `covariates` (entries are cumulative), using lfe::felm(). Each fit is
#' tidied with confidence intervals and reduced to the coefficients listed
#' in `keep_terms`.
#'
#' @param outcome_vars Character vector of outcome column names.
#' @param covariates List of character vectors; entry i is added to the
#'   covariates already used by model i - 1.
#' @param data Data frame containing all referenced columns.
#' @param fixed Optional character vector of fixed-effect column names to
#'   project out. NULL (default) fits the models without fixed effects.
#' @param keep_terms Coefficient names to keep from each tidied fit.
#'   Defaults to c("race2", "race3").
#' @return A tibble with one row per kept coefficient and model: the tidy()
#'   columns plus `outcome_name`, `mod_name` ("model<i>") and `model`, the
#'   running index of the fit across all outcomes (as character).
perform_regression <- function(outcome_vars, covariates, data, fixed = NULL,
                               keep_terms = c("race2", "race3")) {
  # Second part of the felm formula: fixed effects to project out, "0" = none.
  # Built from `fixed` so callers can use any column(s), not only `tin`.
  fe_part <- if (is.null(fixed)) "0" else paste(fixed, collapse = " + ")

  results <- vector("list", length(outcome_vars) * length(covariates))
  fit_index <- 0L

  for (outcome_var in outcome_vars) {
    for (i in seq_along(covariates)) {
      covariate_set <- unlist(covariates[seq_len(i)])

      # outcome ~ covariates | fixed effects | no IV | no clustering
      formula_string <- paste(
        outcome_var, "~", paste(covariate_set, collapse = " + "),
        "|", fe_part, "| 0 | 0"
      )

      fit <- felm(
        as.formula(formula_string),
        data = data,
        cmethod = "cgm2",
        exactDOF = TRUE
      )

      # Explicit gc() calls after each fit were dropped: R reclaims memory
      # on its own and forcing a full collection per model only adds time.
      fit_index <- fit_index + 1L
      results[[fit_index]] <- fit %>%
        tidy(conf.int = TRUE) %>%
        filter(term %in% keep_terms) %>%
        mutate(
          outcome_name = outcome_var,
          mod_name = paste0("model", i)
        )
    }
  }

  bind_rows(results, .id = "model")
}

# Age and sex enter together as the last covariate block
age_sex <- c("age", "gender")

# Outcome variables
outcome_vars <- c("rpm_any", "cgm_diab_any")

# Covariate blocks, added cumulatively: model 1 uses race, model 2 adds
# dual_eligible, model 3 adds number_comorbidities, model 4 adds age + gender
covariates <- list("race", "dual_eligible", "number_comorbidities", age_sex)

# Fixed effects
fixed <- c("tin")

# Fit the models without and with fixed effects
regression_results <- perform_regression(outcome_vars, covariates, data = a5)

regression_results2 <- perform_regression(outcome_vars, covariates, a5, fixed = fixed)


# Keep only the coefficients to be plotted.
# perform_regression() returns rows for the terms "race2" and "race3" only,
# so the previous filter on term == "dual_eligible" always produced an empty
# table. Edit `terms_of_interest` to plot a subset of the race contrasts.
terms_of_interest <- c("race2", "race3")

reg_res <- regression_results %>%
  filter(term %in% terms_of_interest)


```

# Plotting estimates and 95% CIs

```{r}
#| eval: true
#| echo: false 


# Mean of each outcome variable over all rows of a5 (missing values ignored).
# vapply() guarantees one numeric value per outcome, so the result can be
# placed straight into a tibble without relying on sapply() simplification.
mean_outcome <- tibble(
  outcome_name = outcome_vars,
  mean_value = vapply(
    outcome_vars,
    function(outcome_var) mean(a5[[outcome_var]], na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
)

# Attach the outcome mean to every coefficient row and express the estimate
# and its normal-approximation 95% interval (estimate +/- 1.96 * SE) as a
# fraction of that baseline mean.
reg_res <- reg_res %>%
  left_join(mean_outcome, by = "outcome_name") %>%
  mutate(
    baseline = mean_value,
    pct_change = estimate / baseline,
    low_ci = (estimate - 1.96 * std.error) / baseline,
    high_ci = (estimate + 1.96 * std.error) / baseline
  )

```

```{r}
#| label: fig-dummy-models
#| fig-cap: "Regression models"

# Plot relative change (with 95% CI) per model, one panel per outcome.
# The x axis uses mod_name ("model1", "model2", ...), which restarts for each
# outcome, and takes its levels from the data. The previous
# factor(model, labels = c("m1", "m2", "m3")) errors unless there are
# exactly three distinct model ids. Grouping by term keeps several
# coefficients of the same model side by side instead of overplotted.
plot <- ggplot(
  reg_res,
  aes(
    x = factor(mod_name, levels = unique(mod_name)),
    y = pct_change,
    group = term
  )
) +
  geom_point(
    aes(colour = factor(outcome_name), shape = term),
    size = 3,
    position = position_dodge(width = 1)
  ) +
  geom_errorbar(
    aes(ymin = low_ci, ymax = high_ci),
    position = position_dodge(width = 1),
    width = .33
  ) +
  geom_text(
    aes(label = format(round(pct_change * 100, 1), nsmall = 1)),
    position = position_dodge(width = 1),
    vjust = -1,
    size = 4
  ) +
  facet_wrap(~factor(outcome_name), nrow = 1) +
  geom_hline(yintercept = 0, color = "dark gray", linetype = "dashed") +
  theme_bw() +
  # Show "model1" as "m1", etc.
  scale_x_discrete(labels = function(lbl) sub("^model", "m", lbl)) +
  scale_y_continuous(labels = scales::percent) +
  theme(
    strip.placement = "outside",
    plot.margin = margin(0.75, 0.75, 0.75, 0.75, "cm"),
    strip.text.y.left = element_text(angle = 0),
    axis.text.y = element_text(size = 11),
    axis.title.y = element_text(size = 14, vjust = +4),
    axis.text.x = element_text(size = 11, angle = 0, vjust = 0.2, hjust = 0.95),
    axis.title.x = element_text(size = 14, vjust = -1.5),
    legend.text = element_text(size = 11)
  ) +
  labs(
    y = "Relative change (% of baseline)",
    x = "",
    colour = "outcome_name",
    shape = "term"
  )
print(plot)
```