diff --git a/data/surveys/tab_survey_output.rds b/data/surveys/tab_survey_output.rds
new file mode 100644
index 00000000..0ef8e1b5
Binary files /dev/null and b/data/surveys/tab_survey_output.rds differ
diff --git a/html_outputs/new_pages/age_pyramid.html b/html_outputs/new_pages/age_pyramid.html
index 2631342b..79f48667 100644
--- a/html_outputs/new_pages/age_pyramid.html
+++ b/html_outputs/new_pages/age_pyramid.html
@@ -2,12 +2,12 @@
-The Epidemiologist R Handbook - 33  Demographic pyramids and Likert-scales
+33  Demographic pyramids and Likert-scales – The Epidemiologist R Handbook

Characteristic     N       OR¹    95% CI¹      p-value
gender             4,167   1.00   0.88, 1.13   >0.9
fever              4,167   1.00   0.85, 1.17   >0.9
chills             4,167   1.03   0.89, 1.21   0.7
cough              4,167   1.15   0.97, 1.37   0.11
aches              4,167   0.93   0.76, 1.14   0.5
vomit              4,167   1.09   0.96, 1.23   0.2
age_cat            4,167
    0-4
    5-9                    0.94   0.77, 1.15   0.5
    10-14                  1.15   0.93, 1.42   0.2
    15-19                  0.99   0.80, 1.24   >0.9
    20-29                  1.03   0.84, 1.26   0.8
    30-49                  1.07   0.85, 1.33   0.6
    50-69                  0.68   0.41, 1.13   0.13
    70+                    0.53   0.07, 3.20   0.5
¹ OR = Odds Ratio, CI = Confidence Interval
+

19.2.1 Cross-tabulation

+

The gtsummary package also allows us to quickly create tables of counts. This can be useful for summarising the data and for putting it in context with the regression we have carried out.

+
+
#Carry out our regression
+univ_tab <- linelist %>% 
+  dplyr::select(explanatory_vars, outcome) %>% ## select variables of interest
+
+  tbl_uvregression(                         ## produce univariate table
+    method = glm,                           ## define regression want to run (generalised linear model)
+    y = outcome,                            ## define outcome variable
+    method.args = list(family = binomial),  ## define what type of glm want to run (logistic)
+    exponentiate = TRUE                     ## exponentiate to produce odds ratios (rather than log odds)
+  )
+
+#Create our cross tabulation
+cross_tab <- linelist %>%
+  dplyr::select(explanatory_vars, outcome) %>%   ## select variables of interest
+     tbl_summary(by = outcome)                   ## create summary table
+
+tbl_merge(tbls = list(cross_tab,
+                      univ_tab),
+          tab_spanner = c("Summary", "Univariate regression"))
+
+
                   Summary                               Univariate regression
Characteristic     0, N = 1,825¹    1, N = 2,342¹        N       OR²    95% CI²      p-value
gender             916 (50%)        1,174 (50%)          4,167   1.00   0.88, 1.13   >0.9
fever              1,485 (81%)      1,906 (81%)          4,167   1.00   0.85, 1.17   >0.9
chills             353 (19%)        465 (20%)            4,167   1.03   0.89, 1.21   0.7
cough              1,553 (85%)      2,033 (87%)          4,167   1.15   0.97, 1.37   0.11
aches              189 (10%)        228 (9.7%)           4,167   0.93   0.76, 1.14   0.5
vomit              894 (49%)        1,198 (51%)          4,167   1.09   0.96, 1.23   0.2
age_cat                                                  4,167
    0-4            338 (19%)        427 (18%)
    5-9            365 (20%)        433 (18%)                    0.94   0.77, 1.15   0.5
    10-14          273 (15%)        396 (17%)                    1.15   0.93, 1.42   0.2
    15-19          238 (13%)        299 (13%)                    0.99   0.80, 1.24   >0.9
    20-29          345 (19%)        448 (19%)                    1.03   0.84, 1.26   0.8
    30-49          228 (12%)        307 (13%)                    1.07   0.85, 1.33   0.6
    50-69          35 (1.9%)        30 (1.3%)                    0.68   0.41, 1.13   0.13
    70+            3 (0.2%)         2 (<0.1%)                    0.53   0.07, 3.20   0.5
¹ n (%)
² OR = Odds Ratio, CI = Confidence Interval
-

As before, we can create a counts table from the linelist for each explanatory variable, bind it to the models, and make a nice table. We begin with the variables and iterate through them with map(), applying a user-defined function that creates a counts table with dplyr functions. The results are then combined and bound to the model results.

+

There are many modifications you can make to this table output, such as adjusting the text labels, bolding rows by their p-value, etc. See tutorials here and elsewhere online.

+ +
+ +
+

19.3 Stratified

+

Here we define stratified regression as the process of carrying out separate regression analyses on different “groups” of data.

+

Sometimes in your analysis, you will want to investigate whether there are different relationships between an outcome and variables across different strata. This could be a difference by gender, age group, or source of infection.

+

To do this, you will want to split your dataset into the strata of interest. For example, creating two separate datasets for gender == "f" and gender == "m" would be done by:

-
## for each explanatory variable
-univ_tab_base <- explanatory_vars %>% 
-  map(.f = 
-    ~{linelist %>%                ## begin with linelist
-        group_by(outcome) %>%     ## group data set by outcome
-        count(.data[[.x]]) %>%    ## produce counts for variable of interest
-        pivot_wider(              ## spread to wide format (as in cross-tabulation)
-          names_from = outcome,
-          values_from = n) %>% 
-        drop_na(.data[[.x]]) %>%         ## drop rows with missings
-        rename("variable" = .x) %>%      ## change variable of interest column to "variable"
-        mutate(variable = as.character(variable))} ## convert to character, else non-dichotomous (categorical) variables come out as factor and cant be merged
-      ) %>% 
-  
-  ## collapse the list of count outputs in to one data frame
-  bind_rows() %>% 
-  
-  ## merge with the outputs of the regression 
-  bind_cols(., models) %>% 
-  
-  ## only keep columns interested in 
-  select(term, 2:3, estimate, conf.low, conf.high, p.value) %>% 
-  
-  ## round decimal places
-  mutate(across(where(is.numeric), round, digits = 2))
+
f_linelist <- linelist %>%
+     filter(gender == 0) %>%                 ## subset to only where the gender == "f"
+  dplyr::select(explanatory_vars, outcome)     ## select variables of interest
+     
+m_linelist <- linelist %>%
+     filter(gender == 1) %>%                 ## subset to only where the gender == "m"
+  dplyr::select(explanatory_vars, outcome)     ## select variables of interest
-

Below is what the data frame looks like. See the page on Tables for presentation for ideas on how to further convert this table to pretty HTML output (e.g. with flextable).

+

Once this has been done, you can carry out your regression in either base R or gtsummary.

+
+

19.3.1 base R

+

To carry this out in base R, you run two separate regressions: one where gender == "f" and one where gender == "m".

-
-
- +
#Run model for f
+f_model <- glm(outcome ~ vomit, family = "binomial", data = f_linelist) %>% 
+     tidy(exponentiate = TRUE, conf.int = TRUE) %>%        # exponentiate and produce CIs
+     mutate(across(where(is.numeric), round, digits = 2)) %>%  # round all numeric columns
+     mutate(gender = "f")                                      # create a column which identifies these results as using the f dataset
+ 
+#Run model for m
+m_model <- glm(outcome ~ vomit, family = "binomial", data = m_linelist) %>% 
+     tidy(exponentiate = TRUE, conf.int = TRUE) %>%        # exponentiate and produce CIs
+     mutate(across(where(is.numeric), round, digits = 2)) %>%  # round all numeric columns
+     mutate(gender = "m")                                      # create a column which identifies these results as using the m dataset
+
+#Combine the results
+rbind(f_model,
+      m_model)
+
+
# A tibble: 4 × 8
+  term        estimate std.error statistic p.value conf.low conf.high gender
+  <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl> <chr> 
+1 (Intercept)     1.25      0.06      3.56    0        1.11      1.42 f     
+2 vomit           1.05      0.09      0.58    0.56     0.89      1.25 f     
+3 (Intercept)     1.21      0.06      3.04    0        1.07      1.36 m     
+4 vomit           1.13      0.09      1.38    0.17     0.95      1.34 m     
-
-
-
-

gtsummary package

-

Below we present the use of tbl_uvregression() from the gtsummary package. Just like in the page on Descriptive tables, gtsummary functions do a good job of running statistics and producing professional-looking outputs. This function produces a table of univariate regression results.

-

We select only the necessary columns from the linelist (explanatory variables and the outcome variable) and pipe them into tbl_uvregression(). We are going to run univariate regression on each of the columns we defined as explanatory_vars in the data Preparation section (gender, fever, chills, cough, aches, vomit, and age_cat).

-

Within the function itself, we provide the method = as glm (no quotes), the y = outcome column (outcome), specify to method.args = that we want to run logistic regression via family = binomial, and we tell it to exponentiate the results.

-

The output is HTML and contains the counts

+
+

19.3.2 gtsummary

+

The same approach is repeated using gtsummary; however, with gtsummary it is easier to produce publication-ready tables and to compare the two tables with the function tbl_merge().

-
univ_tab <- linelist %>% 
-  dplyr::select(explanatory_vars, outcome) %>% ## select variables of interest
-
-  tbl_uvregression(                         ## produce univariate table
-    method = glm,                           ## define regression want to run (generalised linear model)
-    y = outcome,                            ## define outcome variable
-    method.args = list(family = binomial),  ## define what type of glm want to run (logistic)
-    exponentiate = TRUE                     ## exponentiate to produce odds ratios (rather than log odds)
-  )
-
-## view univariate results table 
-univ_tab
+
#Run model for f
+f_model_gt <- f_linelist %>% 
+     dplyr::select(vomit, outcome) %>% ## select variables of interest
+     tbl_uvregression(                         ## produce univariate table
+          method = glm,                           ## define regression want to run (generalised linear model)
+          y = outcome,                            ## define outcome variable
+          method.args = list(family = binomial),  ## define what type of glm want to run (logistic)
+          exponentiate = TRUE                     ## exponentiate to produce odds ratios (rather than log odds)
+     )
+
+#Run model for m
+m_model_gt <- m_linelist %>% 
+     dplyr::select(vomit, outcome) %>% ## select variables of interest
+     tbl_uvregression(                         ## produce univariate table
+          method = glm,                           ## define regression want to run (generalised linear model)
+          y = outcome,                            ## define outcome variable
+          method.args = list(family = binomial),  ## define what type of glm want to run (logistic)
+          exponentiate = TRUE                     ## exponentiate to produce odds ratios (rather than log odds)
+     )
+
+#Combine gtsummary tables
+f_and_m_table <- tbl_merge(
+     tbls = list(f_model_gt,
+                 m_model_gt),
+     tab_spanner = c("Female",
+                     "Male")
+)
+
+#Print
+f_and_m_table
-
@@ -1879,21 +3174,15 @@

gtsummary -

There are many modifications you can make to this table output, such as adjusting the text labels, bolding rows by their p-value, etc. See tutorials here and elsewhere online.

- - -
-

19.3 Stratified

-

Stratified analysis is currently still being worked on for gtsummary, this page will be updated in due course.

19.4 Multivariable

For multivariable analysis, we again present two approaches:

    -
  • glm() and tidy()
    +
  • glm() and tidy().
  • -
  • gtsummary package
  • +
  • gtsummary package.

The workflow is similar for each and only the last step of pulling together a final table is different.

@@ -1901,9 +3190,9 @@

Conduct m

Here we use glm() but add more variables to the right side of the equation, separated by plus symbols (+).

To run the model with all of our explanatory variables we would run:

-
mv_reg <- glm(outcome ~ gender + fever + chills + cough + aches + vomit + age_cat, family = "binomial", data = linelist)
-
-summary(mv_reg)
+
mv_reg <- glm(outcome ~ gender + fever + chills + cough + aches + vomit + age_cat, family = "binomial", data = linelist)
+
+summary(mv_reg)

 Call:
@@ -1936,28 +3225,28 @@ 

Conduct m Number of Fisher Scoring iterations: 4

-

If you want to include two variables and an interaction between them you can separate them with an asterisk * instead of a +. Separate them with a colon : if you are only specifying the interaction. For example:

+

If you want to include two variables and an interaction between them you can separate them with an asterisk * instead of a +. Separate them with a colon : if you are only specifying the interaction. For example:

-
glm(outcome ~ gender + age_cat * fever, family = "binomial", data = linelist)
+
glm(outcome ~ gender + age_cat * fever, family = "binomial", data = linelist)

Optionally, you can use this code to leverage the pre-defined vector of column names and re-create the above command using str_c(). This might be useful if your explanatory variable names are changing, or you don’t want to type them all out again.

-
## run a regression with all variables of interest 
-mv_reg <- explanatory_vars %>%  ## begin with vector of explanatory column names
-  str_c(collapse = "+") %>%     ## combine all names of the variables of interest separated by a plus
-  str_c("outcome ~ ", .) %>%    ## combine the names of variables of interest with outcome in formula style
-  glm(family = "binomial",      ## define type of glm as logistic,
-      data = linelist)          ## define your dataset
+
## run a regression with all variables of interest 
+mv_reg <- explanatory_vars %>%  ## begin with vector of explanatory column names
+  str_c(collapse = "+") %>%     ## combine all names of the variables of interest separated by a plus
+  str_c("outcome ~ ", .) %>%    ## combine the names of variables of interest with outcome in formula style
+  glm(family = "binomial",      ## define type of glm as logistic,
+      data = linelist)          ## define your dataset

Building the model

You can build your model step-by-step, saving various models that include certain explanatory variables. You can compare these models with likelihood-ratio tests using lrtest() from the package lmtest, as below:

NOTE: Using base R anova(model1, model2, test = "Chisq") produces the same results.
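As noted, the base R equivalent is a one-line call. A minimal sketch, assuming the model1 and model2 objects created in the chunk below:

## base R equivalent of lmtest::lrtest() (see NOTE above)
anova(model1, model2, test = "Chisq")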

-
model1 <- glm(outcome ~ age_cat, family = "binomial", data = linelist)
-model2 <- glm(outcome ~ age_cat + gender, family = "binomial", data = linelist)
-
-lmtest::lrtest(model1, model2)
+
model1 <- glm(outcome ~ age_cat, family = "binomial", data = linelist)
+model2 <- glm(outcome ~ age_cat + gender, family = "binomial", data = linelist)
+
+lmtest::lrtest(model1, model2)
Likelihood ratio test
 
@@ -1970,26 +3259,26 @@ 

Building the

Another option is to take the model object and apply the step() function from the stats package. Specify which variable selection direction you want to use when building the model.

-
## choose a model using forward selection based on AIC
-## you can also do "backward" or "both" by adjusting the direction
-final_mv_reg <- mv_reg %>%
-  step(direction = "forward", trace = FALSE)
+
## choose a model using forward selection based on AIC
+## you can also do "backward" or "both" by adjusting the direction
+final_mv_reg <- mv_reg %>%
+  step(direction = "forward", trace = FALSE)

You can also turn off scientific notation in your R session, for clarity:

-
options(scipen=999)
+
options(scipen=999)

As described in the section on univariate analysis, pass the model output to tidy() to exponentiate the log odds and CIs. Finally we round all numeric columns to two decimal places. Scroll through to see all the rows.

-
mv_tab_base <- final_mv_reg %>% 
-  broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>%  ## get a tidy dataframe of estimates 
-  mutate(across(where(is.numeric), round, digits = 2))          ## round 
+
mv_tab_base <- final_mv_reg %>% 
+  broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>%  ## get a tidy dataframe of estimates 
+  mutate(across(where(is.numeric), round, digits = 2))          ## round 

Here is what the resulting data frame looks like:

-
- +
+
@@ -1999,32 +3288,32 @@

Building the

Combine univariate and multivariable

Combine with gtsummary

-

The gtsummary package provides the tbl_regression() function, which will take the outputs from a regression (glm() in this case) and produce an nice summary table.

+

The gtsummary package provides the tbl_regression() function, which will take the outputs from a regression (glm() in this case) and produce a nice summary table.

-
## show results table of final regression 
-mv_tab <- tbl_regression(final_mv_reg, exponentiate = TRUE)
+
## show results table of final regression 
+mv_tab <- tbl_regression(final_mv_reg, exponentiate = TRUE)

Let’s see the table:

-
mv_tab
+
mv_tab
-
- @@ -2588,28 +3877,28 @@

Combine

You can also combine several different output tables produced by gtsummary with the tbl_merge() function. We now combine the multivariable results with the gtsummary univariate results that we created above:

-
## combine with univariate results 
-tbl_merge(
-  tbls = list(univ_tab, mv_tab),                          # combine
-  tab_spanner = c("**Univariate**", "**Multivariable**")) # set header names
+
## combine with univariate results 
+tbl_merge(
+  tbls = list(univ_tab, mv_tab),                          # combine
+  tab_spanner = c("**Univariate**", "**Multivariable**")) # set header names
-
- @@ -3272,30 +4561,30 @@

Combine

Combine with dplyr

An alternative way of combining the glm()/tidy() univariate and multivariable outputs is with the dplyr join functions.

    -
  • Join the univariate results from earlier (univ_tab_base, which contains counts) with the tidied multivariable results mv_tab_base
    +
  • Join the univariate results from earlier (univ_tab_base, which contains counts) with the tidied multivariable results mv_tab_base.
  • -
  • Use select() to keep only the columns we want, specify their order, and re-name them
    +
  • Use select() to keep only the columns we want, specify their order, and re-name them.
  • -
  • Use round() with two decimal places on all the column that are class Double
  • +
  • Use round() with two decimal places on all the columns that are class Double.
-
## combine univariate and multivariable tables 
-left_join(univ_tab_base, mv_tab_base, by = "term") %>% 
-  ## choose columns and rename them
-  select( # new name =  old name
-    "characteristic" = term, 
-    "recovered"      = "0", 
-    "dead"           = "1", 
-    "univ_or"        = estimate.x, 
-    "univ_ci_low"    = conf.low.x, 
-    "univ_ci_high"   = conf.high.x,
-    "univ_pval"      = p.value.x, 
-    "mv_or"          = estimate.y, 
-    "mvv_ci_low"     = conf.low.y, 
-    "mv_ci_high"     = conf.high.y,
-    "mv_pval"        = p.value.y 
-  ) %>% 
-  mutate(across(where(is.double), round, 2))   
+
## combine univariate and multivariable tables 
+left_join(univ_tab_base, mv_tab_base, by = "term") %>% 
+  ## choose columns and rename them
+  select( # new name =  old name
+    "characteristic" = term, 
+    "recovered"      = "0", 
+    "dead"           = "1", 
+    "univ_or"        = estimate.x, 
+    "univ_ci_low"    = conf.low.x, 
+    "univ_ci_high"   = conf.high.x,
+    "univ_pval"      = p.value.x, 
+    "mv_or"          = estimate.y, 
+    "mvv_ci_low"     = conf.low.y, 
+    "mv_ci_high"     = conf.high.y,
+    "mv_pval"        = p.value.y 
+  ) %>% 
+  mutate(across(where(is.double), round, 2))   
# A tibble: 20 × 11
    characteristic recovered  dead univ_or univ_ci_low univ_ci_high univ_pval
@@ -3337,38 +4626,38 @@ 

ggplot2 package

You can build a forest plot with ggplot() by plotting elements of the multivariable regression results. Add the layers of the plots using these “geoms”:

    -
  • estimates with geom_point()
    +
  • estimates with geom_point().
  • -
  • confidence intervals with geom_errorbar()
    +
  • confidence intervals with geom_errorbar().
  • -
  • a vertical line at OR = 1 with geom_vline()
  • +
  • a vertical line at OR = 1 with geom_vline().

Before plotting, you may want to use fct_relevel() from the forcats package to set the order of the variables/levels on the y-axis. ggplot() may display them in alpha-numeric order which would not work well for these age category values (“30” would appear before “5”). See the page on Factors for more details.

-
## remove the intercept term from your multivariable results
-mv_tab_base %>% 
-  
-  #set order of levels to appear along y-axis
-  mutate(term = fct_relevel(
-    term,
-    "vomit", "gender", "fever", "cough", "chills", "aches",
-    "age_cat5-9", "age_cat10-14", "age_cat15-19", "age_cat20-29",
-    "age_cat30-49", "age_cat50-69", "age_cat70+")) %>%
-  
-  # remove "intercept" row from plot
-  filter(term != "(Intercept)") %>% 
-  
-  ## plot with variable on the y axis and estimate (OR) on the x axis
-  ggplot(aes(x = estimate, y = term)) +
-  
-  ## show the estimate as a point
-  geom_point() + 
-  
-  ## add in an error bar for the confidence intervals
-  geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) + 
-  
-  ## show where OR = 1 is for reference as a dashed line
-  geom_vline(xintercept = 1, linetype = "dashed")
+
## remove the intercept term from your multivariable results
+mv_tab_base %>% 
+  
+  #set order of levels to appear along y-axis
+  mutate(term = fct_relevel(
+    term,
+    "vomit", "gender", "fever", "cough", "chills", "aches",
+    "age_cat5-9", "age_cat10-14", "age_cat15-19", "age_cat20-29",
+    "age_cat30-49", "age_cat50-69", "age_cat70+")) %>%
+  
+  # remove "intercept" row from plot
+  filter(term != "(Intercept)") %>% 
+  
+  ## plot with variable on the y axis and estimate (OR) on the x axis
+  ggplot(aes(x = estimate, y = term)) +
+  
+  ## show the estimate as a point
+  geom_point() + 
+  
+  ## add in an error bar for the confidence intervals
+  geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) + 
+  
+  ## show where OR = 1 is for reference as a dashed line
+  geom_vline(xintercept = 1, linetype = "dashed")
@@ -3384,12 +4673,12 @@

easy

An alternative, if you do not want the fine level of control that ggplot2 provides, is to use a combination of easystats packages.

The function model_parameters() from the parameters package does the equivalent of the broom package function tidy(). The see package then accepts those outputs and creates a default forest plot as a ggplot() object.

-
pacman::p_load(easystats)
-
-## remove the intercept term from your multivariable results
-final_mv_reg %>% 
-  model_parameters(exponentiate = TRUE) %>% 
-  plot()
+
pacman::p_load(easystats)
+
+## remove the intercept term from your multivariable results
+final_mv_reg %>% 
+  model_parameters(exponentiate = TRUE) %>% 
+  plot()
@@ -3398,11 +4687,66 @@

easy

-

-
-

19.6 Resources

+
+

19.6 Model performance

+

Once you have built your regression models, you may want to assess how well the model fits the data. There are many different approaches, and many different metrics, for assessing model fit and for comparing it with other model formulations. How you assess your model fit will depend on your model, the data, and the context in which you are conducting your work.

+

While there are many different functions, and many different packages, to assess model fit, one package that nicely combines several different metrics and approaches into a single source is the performance package. This package allows you to check model assumptions (such as linearity, homogeneity of variance, and the presence of influential outliers) and to check how well the model performs (Akaike Information Criterion values, R2, RMSE, etc.) with a few simple functions.

+

Unfortunately, we are unable to use this package with gtsummary, but it readily accepts model objects produced by other packages such as stats, lme4 (lmerMod), and tidymodels. Here we will demonstrate its application using the function glm() for a multivariable regression. To do this we can use the function performance() to assess model fit, and compare_performance() to compare the two models.

+
+
#Load in packages
+pacman::p_load(performance)
+
+#Set up regression models
+regression_one <- linelist %>%
+     select(outcome, gender, fever, chills, cough) %>%
+     glm(formula = outcome ~ .,
+         family = binomial)
+
+regression_two <- linelist %>%
+     select(outcome, days_onset_hosp, aches, vomit, age_years) %>%
+     glm(formula = outcome ~ .,
+         family = binomial)
+
+#Assess model fit
+performance(regression_one)
+
+
# Indices of model performance
+
+AIC      |     AICc |      BIC | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
+-----------------------------------------------------------------------------------------
+5719.746 | 5719.760 | 5751.421 | 6.342e-04 | 0.496 | 1.000 |    0.685 |      -Inf | 0.508
+
+
performance(regression_two)
+
+
# Indices of model performance
+
+AIC      |     AICc |      BIC | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
+-----------------------------------------------------------------------------------------
+5411.752 | 5411.767 | 5443.187 |     0.012 | 0.493 | 1.000 |    0.680 |      -Inf | 0.513
+
+
#Compare model fit
+compare_performance(regression_one,
+                    regression_two)
+
+
When comparing models, please note that probably not all models were fit
+  from same data.
+
+
+
# Comparison of Model Performance Indices
+
+Name           | Model |  AIC (weights) | AICc (weights) |  BIC (weights) | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
+------------------------------------------------------------------------------------------------------------------------------------
+regression_one |   glm | 5719.7 (<.001) | 5719.8 (<.001) | 5751.4 (<.001) | 6.342e-04 | 0.496 | 1.000 |    0.685 |      -Inf | 0.508
+regression_two |   glm | 5411.8 (>.999) | 5411.8 (>.999) | 5443.2 (>.999) |     0.012 | 0.493 | 1.000 |    0.680 |      -Inf | 0.513
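The assumption checks mentioned above (linearity, homogeneity of variance, outliers) can also be inspected visually. A minimal sketch, assuming the regression_one model from above; check_model() draws its plots via the see package, so that package is loaded as well:

#Visually check model assumptions (sketch)
pacman::p_load(performance, see)

check_model(regression_one)      #panel of diagnostic plots (linearity, homogeneity, outliers, etc.)
check_outliers(regression_one)   #flag potentially influential observations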
+
+
+

For further reading on the performance package, and the model tests you can carry out, see their github.

+ +
+
+

19.7 Resources

The content of this page was informed by these resources and vignettes online:

Linear regression in R

gtsummary

@@ -3592,18 +4936,7 @@

diff --git a/html_outputs/new_pages/reportfactory.html b/html_outputs/new_pages/reportfactory.html
index 0ad4d6fb..6849fd30 100644
--- a/html_outputs/new_pages/reportfactory.html
+++ b/html_outputs/new_pages/reportfactory.html
@@ -2,12 +2,12 @@
-The Epidemiologist R Handbook - 41  Organizing routine reports
+41  Organizing routine reports – The Epidemiologist R Handbook

                  Female                                  Male
Characteristic    N       OR¹    95% CI¹      p-value     N       OR¹    95% CI¹      p-value
vomit             2,077   1.05   0.89, 1.25   0.6         2,090   1.13   0.95, 1.34   0.2
¹ OR = Odds Ratio, CI = Confidence Interval
++++++ - - - - + + + + @@ -2027,23 +2096,28 @@

arrived

- + - + - + - + + + + @@ -2052,7 +2126,6 @@

- @@ -2062,16 +2135,16 @@

26.9.1 Survey package

-
ratio <- svyratio(~died, 
-         denominator = ~obstime, 
-         design = base_survey_design)
-
-ci <- confint(ratio)
-
-cbind(
-  ratio$ratio * 10000, 
-  ci * 10000
-)
+
ratio <- svyratio(~died, 
+         denominator = ~obstime, 
+         design = base_survey_design)
+
+ci <- confint(ratio)
+
+cbind(
+  ratio$ratio * 10000, 
+  ci * 10000
+)
      obstime    2.5 %   97.5 %
 died 5.981922 1.194294 10.76955
@@ -2081,14 +2154,14 @@

26.9.2 Srvyr package

-
survey_design %>% 
-  ## survey ratio used to account for observation time 
-  summarise(
-    mortality = survey_ratio(
-      as.numeric(died) * 10000, 
-      obstime, 
-      vartype = "ci")
-    )
+
survey_design %>% 
+  ## survey ratio used to account for observation time 
+  summarise(
+    mortality = survey_ratio(
+      as.numeric(died) * 10000, 
+      obstime, 
+      vartype = "ci")
+    )
# A tibble: 1 × 3
   mortality mortality_low mortality_upp
@@ -2096,171 +2169,1283 @@ 

- -
-

26.10 Resources

-

UCLA stats page

-

Analyze survey data free

-

srvyr packge

-

gtsummary package

-

EPIET survey case studies

+
+

26.10 Weighted regression

+

Another tool we can use to analyse our survey data is weighted regression. This allows us to account for the survey design in our regression, in order to avoid biases that may be introduced by the survey process.

+

To carry out a univariate regression, we can use the survey package for the function svyglm(), and the gtsummary package, which allows us to call svyglm() inside the function tbl_uvregression(). To do this we first use the survey_design object created above. This is then provided to the function tbl_uvregression() as in the Univariate and multivariable regression chapter. We then make one key change: method = glm becomes method = survey::svyglm, in order to carry out our survey-weighted regression.

+

Here we will be using the previously created object survey_design to predict whether the value in the column died is TRUE, using the columns malaria_treatment, bednet, and age_years.

+
+
survey_design %>%
+     tbl_uvregression(                             #Carry out a univariate regression; for a multivariable regression we would instead pipe svyglm() into tbl_regression() (see below)
+          method = survey::svyglm,                 #Set this to survey::svyglm to carry out our weighted regression on the survey data
+          y = died,                                #The column we are trying to predict
+          method.args = list(family = binomial),   #The family, we are carrying out a logistic regression so we want the family as binomial
+          include = c(malaria_treatment,           #These are the columns we want to evaluate
+                      bednet,
+                      age_years),
+          exponentiate = T                         #To transform the log odds to odds ratio for easier interpretation
+     )
+
+
+ + +

Characteristic   Weighted total (N)   Weighted count¹     95% CI²
arrived          1,482,457            761,799 (51%)       41%, 62%
left             1,482,457            701,199 (47%)       39%, 56%
died             1,482,457            76,213 (5.1%)       2.1%, 12%
¹ n (%)
² CI = Confidence Interval

Characteristic       N            OR¹    95% CI¹      p-value
malaria_treatment    1,357,145
    FALSE
    TRUE                          1.12   0.33, 3.82   0.8
bednet               1,482,457
    FALSE
    TRUE                          0.50   0.13, 1.88   0.3
age_years            1,482,457    1.01   1.00, 1.02   0.013
¹ OR = Odds Ratio, CI = Confidence Interval

+ +
+
+
+

If we wanted to carry out a multivariable regression, we would have to first use the function svyglm() and pipe (%>%) the results into the function tbl_regression. Note that we need to specify the formula.

+
+
survey_design %>%
+     svyglm(formula = died ~ malaria_treatment + 
+                 bednet + 
+                 age_years,
+            family = binomial) %>%                   #The family, we are carrying out a logistic regression so we want the family as binomial
+     tbl_regression( 
+          exponentiate = T                           #To transform the log odds to odds ratio for easier interpretation                            
+     )
+
+
Characteristic       OR¹    95% CI¹      p-value
malaria_treatment
    FALSE
    TRUE             1.11   0.28, 4.40   0.9
bednet
    FALSE
    TRUE             0.62   0.12, 3.21   0.5
age_years            1.01   1.00, 1.02   0.018
¹ OR = Odds Ratio, CI = Confidence Interval

+ +
+
+
+ +
+
+

26.11 Resources

+

UCLA stats page

+

Analyze survey data free

+

srvyr package

+

gtsummary package

+

EPIET survey case studies

+

Analyzing Complex Survey Data

diff --git a/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png b/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png
index b8c42585..782bba3d 100644
Binary files a/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png and b/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png differ
diff --git a/html_outputs/new_pages/survival_analysis.html b/html_outputs/new_pages/survival_analysis.html
index 8cab565c..6d89b2c1 100644
--- a/html_outputs/new_pages/survival_analysis.html
+++ b/html_outputs/new_pages/survival_analysis.html
@@ -2,12 +2,12 @@
-The Epidemiologist R Handbook - 27  Survival analysis
+27  Survival analysis – The Epidemiologist R Handbook
+ + + diff --git a/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png b/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png index b8c42585..782bba3d 100644 Binary files a/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png and b/html_outputs/new_pages/survey_analysis_files/figure-html/weighted_age_pyramid-1.png differ diff --git a/html_outputs/new_pages/survival_analysis.html b/html_outputs/new_pages/survival_analysis.html index 8cab565c..6d89b2c1 100644 --- a/html_outputs/new_pages/survival_analysis.html +++ b/html_outputs/new_pages/survival_analysis.html @@ -2,12 +2,12 @@ - + -The Epidemiologist R Handbook - 27  Survival analysis +27  Survival analysis – The Epidemiologist R Handbook

Hospital                               Total cases with    Recovered                               Died
                                       known outcome       Total    % of cases   Median CT values   Total    % of cases   Median CT values
St. Mark's Maternity Hospital (SMMH)   325                 126      38.8%        22                 199      61.2%        22
Central Hospital                       358                 165      46.1%        22                 193      53.9%        22
Other                                  685                 290      42.3%        21                 395      57.7%        22
Military Hospital                      708                 309      43.6%        22                 399      56.4%        21
Missing                                1,125               514      45.7%        21                 611      54.3%        21
Port Hospital                          1,364               579      42.4%        21                 785      57.6%        22
Total                                  3,440               1,469    42.7%        22                 1,971    57.3%        22
This page demonstrates how to convert summary data frames into presentation-ready tables with the flextable package. These tables can be inserted into powerpoint slides, HTML pages, PDF or Word documents, etc.
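Once built, a flextable object (such as the my_table object created further down this page) can be exported directly to these formats. A minimal sketch, with purely illustrative file paths:

# export a finished flextable (file paths below are only examples)
save_as_docx("Outcomes by hospital" = my_table, path = "outcome_table.docx")   # Word
save_as_pptx(my_table, path = "outcome_table.pptx")                            # PowerPoint
save_as_html(my_table, path = "outcome_table.html")                            # standalone HTML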

@@ -746,16 +821,21 @@

Load packages

Import data

-

To begin, we import the cleaned linelist of cases from a simulated Ebola epidemic. If you want to follow along, click to download the “clean” linelist (as .rds file). Import data with the import() function from the rio package (it handles many file types like .xlsx, .csv, .rds - see the Import and export page for details).

+

To begin, we import the cleaned linelist of cases from a simulated Ebola epidemic. If you want to follow along, click to download the “clean” linelist (as .rds file). Import data with the import() function from the rio package (it handles many file types like .xlsx, .csv, .rds - see the Import and export page for details).

+
+
+
Warning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.
+
+
-
# import the linelist
-linelist <- import("linelist_cleaned.rds")
+
# import the linelist
+linelist <- import("linelist_cleaned.rds")

The first 50 rows of the linelist are displayed below.

-
- +
+
@@ -764,42 +844,42 @@

Prepare table

Before beginning to use flextable you will need to create your table as a data frame. See the page on Descriptive tables and Pivoting data to learn how to create a data frame using packages such as janitor and dplyr. You must arrange the content in rows and columns as you want it displayed. Then, the data frame will be passed to flextable to display it with colors, headers, fonts, etc.

Below is an example from the Descriptive tables page of converting the case linelist into a data frame that summarises patient outcomes and CT values by hospital, with a Totals row at the bottom. The output is saved as table.

-
table <- linelist %>% 
-  
-  # Get summary values per hospital-outcome group
-  ###############################################
-  group_by(hospital, outcome) %>%                      # Group data
-  summarise(                                           # Create new summary columns of indicators of interest
-    N = n(),                                            # Number of rows per hospital-outcome group     
-    ct_value = median(ct_blood, na.rm=T)) %>%           # median CT value per group
-  
-  # add totals
-  ############
-  bind_rows(                                           # Bind the previous table with this mini-table of totals
-    linelist %>% 
-      filter(!is.na(outcome) & hospital != "Missing") %>%
-      group_by(outcome) %>%                            # Grouped only by outcome, not by hospital    
-      summarise(
-        N = n(),                                       # Number of rows for whole dataset     
-        ct_value = median(ct_blood, na.rm=T))) %>%     # Median CT for whole dataset
-  
-  # Pivot wider and format
-  ########################
-  mutate(hospital = replace_na(hospital, "Total")) %>% 
-  pivot_wider(                                         # Pivot from long to wide
-    values_from = c(ct_value, N),                       # new values are from ct and count columns
-    names_from = outcome) %>%                           # new column names are from outcomes
-  mutate(                                              # Add new columns
-    N_Known = N_Death + N_Recover,                               # number with known outcome
-    Pct_Death = scales::percent(N_Death / N_Known, 0.1),         # percent cases who died (to 1 decimal)
-    Pct_Recover = scales::percent(N_Recover / N_Known, 0.1)) %>% # percent who recovered (to 1 decimal)
-  select(                                              # Re-order columns
-    hospital, N_Known,                                   # Intro columns
-    N_Recover, Pct_Recover, ct_value_Recover,            # Recovered columns
-    N_Death, Pct_Death, ct_value_Death)  %>%             # Death columns
-  arrange(N_Known)                                    # Arrange rows from lowest to highest (Total row at bottom)
-
-table  # print
+
table <- linelist %>% 
+  
+  # Get summary values per hospital-outcome group
+  ###############################################
+  group_by(hospital, outcome) %>%                      # Group data
+  summarise(                                           # Create new summary columns of indicators of interest
+    N = n(),                                            # Number of rows per hospital-outcome group     
+    ct_value = median(ct_blood, na.rm=T)) %>%           # median CT value per group
+  
+  # add totals
+  ############
+  bind_rows(                                           # Bind the previous table with this mini-table of totals
+    linelist %>% 
+      filter(!is.na(outcome) & hospital != "Missing") %>%
+      group_by(outcome) %>%                            # Grouped only by outcome, not by hospital    
+      summarise(
+        N = n(),                                       # Number of rows for whole dataset     
+        ct_value = median(ct_blood, na.rm=T))) %>%     # Median CT for whole dataset
+  
+  # Pivot wider and format
+  ########################
+  mutate(hospital = replace_na(hospital, "Total")) %>% 
+  pivot_wider(                                         # Pivot from long to wide
+    values_from = c(ct_value, N),                       # new values are from ct and count columns
+    names_from = outcome) %>%                           # new column names are from outcomes
+  mutate(                                              # Add new columns
+    N_Known = N_Death + N_Recover,                               # number with known outcome
+    Pct_Death = scales::percent(N_Death / N_Known, 0.1),         # percent cases who died (to 1 decimal)
+    Pct_Recover = scales::percent(N_Recover / N_Known, 0.1)) %>% # percent who recovered (to 1 decimal)
+  select(                                              # Re-order columns
+    hospital, N_Known,                                   # Intro columns
+    N_Recover, Pct_Recover, ct_value_Recover,            # Recovered columns
+    N_Death, Pct_Death, ct_value_Death)  %>%             # Death columns
+  arrange(N_Known)                                    # Arrange rows from lowest to highest (Total row at bottom)
+
+table  # print
# A tibble: 7 × 8
 # Groups:   hospital [7]
@@ -824,10 +904,10 @@ 

Create a flextable

To create and manage flextable objects, we first pass the data frame through the flextable() function. We save the result as my_table.

-
my_table <- flextable(table) 
-my_table
+
my_table <- flextable(table) 
+my_table
-

hospital                               N_Known   N_Recover   Pct_Recover   ct_value_Recover   N_Death   Pct_Death   ct_value_Death
St. Mark's Maternity Hospital (SMMH)   325       126         38.8%         22                 199       61.2%       22
Central Hospital                       358       165         46.1%         22                 193       53.9%       22
Other                                  685       290         42.3%         21                 395       57.7%       22
Military Hospital                      708       309         43.6%         22                 399       56.4%       21
Missing                                1,125     514         45.7%         21                 611       54.3%       21
Port Hospital                          1,364     579         42.4%         21                 785       57.6%       22
Total                                  3,440     1,469       42.7%         22                 1,971     57.3%       22

22

After doing this, we can progressively pipe the my_table object through more flextable formatting functions.

@@ -849,22 +929,22 @@

Create a fle

Column width

We can use the autofit() function, which nicely stretches out the table so that each cell only has one row of text. The function qflextable() is a convenient shorthand for flextable() and autofit().
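A minimal sketch of the qflextable() shorthand mentioned above, assuming the table data frame created earlier:

# equivalent to flextable(table) %>% autofit()
qflextable(table)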

-
my_table %>% autofit()
+
my_table %>% autofit()
-

hospital

N_Known

N_Recover

Pct_Recover

ct_value_Recover

N_Death

Pct_Death

ct_value_Death

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

+

hospital

N_Known

N_Recover

Pct_Recover

ct_value_Recover

N_Death

Pct_Death

ct_value_Death

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

However, this might not always be appropriate, especially if there are very long values within cells, meaning the table might not fit on the page.

Instead, we can specify widths with the width() function. It can take some playing around to know what width value to put. In the example below, we specify different widths for column 1, column 2, and columns 4 to 8.

-
my_table <- my_table %>% 
-  width(j=1, width = 2.7) %>% 
-  width(j=2, width = 1.5) %>% 
-  width(j=c(4,5,7,8), width = 1)
-
-my_table
+
my_table <- my_table %>% 
+  width(j=1, width = 2.7) %>% 
+  width(j=2, width = 1.5) %>% 
+  width(j=c(4,5,7,8), width = 1)
+
+my_table
-

hospital

N_Known

N_Recover

Pct_Recover

ct_value_Recover

N_Death

Pct_Death

ct_value_Death

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

+

hospital

N_Known

N_Recover

Pct_Recover

ct_value_Recover

N_Death

Pct_Death

ct_value_Death

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

@@ -875,64 +955,64 @@

Column headers

We also rename the header names in the now-second header row in a separate set_header_labels() command.

Finally, to “combine” certain column headers in the top header we use merge_at() to merge the column headers in the top header row.

-
my_table <- my_table %>% 
-  
-  add_header_row(
-    top = TRUE,                # New header goes on top of existing header row
-    values = c("Hospital",     # Header values for each column below
-               "Total cases with known outcome", 
-               "Recovered",    # This will be the top-level header for this and two next columns
-               "",
-               "",
-               "Died",         # This will be the top-level header for this and two next columns
-               "",             # Leave blank, as it will be merged with "Died"
-               "")) %>% 
-    
-  set_header_labels(         # Rename the columns in original header row
-      hospital = "", 
-      N_Known = "",                  
-      N_Recover = "Total",
-      Pct_Recover = "% of cases",
-      ct_value_Recover = "Median CT values",
-      N_Death = "Total",
-      Pct_Death = "% of cases",
-      ct_value_Death = "Median CT values")  %>% 
-  
-  merge_at(i = 1, j = 3:5, part = "header") %>% # Horizontally merge columns 3 to 5 in new header row
-  merge_at(i = 1, j = 6:8, part = "header")     # Horizontally merge columns 6 to 8 in new header row
-
-my_table  # print
+
my_table <- my_table %>% 
+  
+  add_header_row(
+    top = TRUE,                # New header goes on top of existing header row
+    values = c("Hospital",     # Header values for each column below
+               "Total cases with known outcome", 
+               "Recovered",    # This will be the top-level header for this and two next columns
+               "",
+               "",
+               "Died",         # This will be the top-level header for this and two next columns
+               "",             # Leave blank, as it will be merged with "Died"
+               "")) %>% 
+    
+  set_header_labels(         # Rename the columns in original header row
+      hospital = "", 
+      N_Known = "",                  
+      N_Recover = "Total",
+      Pct_Recover = "% of cases",
+      ct_value_Recover = "Median CT values",
+      N_Death = "Total",
+      Pct_Death = "% of cases",
+      ct_value_Death = "Median CT values")  %>% 
+  
+  merge_at(i = 1, j = 3:5, part = "header") %>% # Horizontally merge columns 3 to 5 in new header row
+  merge_at(i = 1, j = 6:8, part = "header")     # Horizontally merge columns 6 to 8 in new header row
+
+my_table  # print
-

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

+

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

Borders and background

-

You can adjust the borders, internal lines, etc. with various flextable functions. It is often easier to start by removing all existing borders with border_remove().

+

You can adjust the borders, internal lines, etc., with various flextable functions. It is often easier to start by removing all existing borders with border_remove().

Then, you can apply default border themes by passing the table to theme_box(), theme_booktabs(), or theme_alafoli().

You can add vertical and horizontal lines with a variety of functions. hline() and vline() add lines to a specified row or column, respectively. Within each, you must specify the part = as either “all”, “body”, or “header”. For vertical lines, specify the column to j =, and for horizontal lines the row to i =. Other functions like vline_right(), vline_left(), hline_top(), and hline_bottom() add lines to the outsides only.

In all of these functions, the actual line style itself must be specified to border = and must be the output of a separate command using the fp_border() function from the officer package. This function helps you define the width and color of the line. You can define this above the table commands, as shown below.

-
# define style for border line
-border_style = officer::fp_border(color="black", width=1)
-
-# add border lines to table
-my_table <- my_table %>% 
-
-  # Remove all existing borders
-  border_remove() %>%  
-  
-  # add horizontal lines via a pre-determined theme setting
-  theme_booktabs() %>% 
-  
-  # add vertical lines to separate Recovered and Died sections
-  vline(part = "all", j = 2, border = border_style) %>%   # at column 2 
-  vline(part = "all", j = 5, border = border_style)       # at column 5
-
-my_table
+
# define style for border line
+border_style = officer::fp_border(color="black", width=1)
+
+# add border lines to table
+my_table <- my_table %>% 
+
+  # Remove all existing borders
+  border_remove() %>%  
+  
+  # add horizontal lines via a pre-determined theme setting
+  theme_booktabs() %>% 
+  
+  # add vertical lines to separate Recovered and Died sections
+  vline(part = "all", j = 2, border = border_style) %>%   # at column 2 
+  vline(part = "all", j = 5, border = border_style)       # at column 5
+
+my_table
-

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

+

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

@@ -940,31 +1020,31 @@

Borders

Font and alignment

We centre-align all columns aside from the left-most column with the hospital names, using the align() function from flextable.

-
my_table <- my_table %>% 
-   flextable::align(align = "center", j = c(2:8), part = "all") 
-my_table
+
my_table <- my_table %>% 
+   flextable::align(align = "center", j = c(2:8), part = "all") 
+my_table
-

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

+

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

Additionally, we can increase the header font size and change the headers to bold. We can also change the total row to bold.

-
my_table <-  my_table %>%  
-  fontsize(i = 1, size = 12, part = "header") %>%   # adjust font size of header
-  bold(i = 1, bold = TRUE, part = "header") %>%     # adjust bold face of header
-  bold(i = 7, bold = TRUE, part = "body")           # adjust bold face of total row (row 7 of body)
-
-my_table
+
my_table <-  my_table %>%  
+  fontsize(i = 1, size = 12, part = "header") %>%   # adjust font size of header
+  bold(i = 1, bold = TRUE, part = "header") %>%     # adjust bold face of header
+  bold(i = 7, bold = TRUE, part = "body")           # adjust bold face of total row (row 7 of body)
+
+my_table
-

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

+

Hospital

Total cases with known outcome

Recovered

Died

Total

% of cases

Median CT values

Total

% of cases

Median CT values

St. Mark's Maternity Hospital (SMMH)

325

126

38.8%

22

199

61.2%

22

Central Hospital

358

165

46.1%

22

193

53.9%

22

Other

685

290

42.3%

21

395

57.7%

22

Military Hospital

708

309

43.6%

22

399

56.4%

21

Missing

1,125

514

45.7%

21

611

54.3%

21

Port Hospital

1,364

579

42.4%

21

785

57.6%

22

Total

3,440

1,469

42.7%

22

1,971

57.3%

22

We can ensure that the proportion columns display only one decimal place using the function colformat_num(). Note that this could also have been done at the data management stage with the round() function.
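As an aside, the data management alternative mentioned in the note would look something like the sketch below, rounding the numeric columns of the underlying data frame before it is passed to flextable(); the column selection here is purely illustrative.

## round numeric columns at the data management stage instead (sketch)
table_rounded <- table %>% 
  dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, digits = 1)))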

-
my_table <- colformat_num(my_table, j = c(4,7), digits = 1)
-my_table
+
my_table <- colformat_num(my_table, j = c(4, 7), digits = 1)
+my_table
-

(The table is re-displayed after applying colformat_num(); the values shown are unchanged from the table above.)
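As mentioned above, the rounding could instead be done before the flextable is created. Below is a minimal sketch of that alternative; the data frame df and its pct column are hypothetical illustration data, not the handbook linelist.

# Minimal sketch of rounding at the data management stage instead of in the flextable.
# `df` and its `pct` column are hypothetical illustration data.
library(dplyr)

df <- data.frame(hospital = c("A", "B"),
                 pct      = c(38.8462, 61.1538))

df %>% 
  mutate(pct = round(pct, digits = 1))   # round to one decimal before building the table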

@@ -972,13 +1052,13 @@


Merge cells

Just as we merged cells horizontally in the header row, we can also merge cells vertically using merge_at(), specifying the rows (i) and the column (j). Here we merge the “Hospital” and “Total cases with known outcome” header cells vertically to give them more space.

my_table <- my_table %>% 
  merge_at(i = 1:2, j = 1, part = "header") %>% 
  merge_at(i = 1:2, j = 2, part = "header")

my_table

(The table is re-displayed with the “Hospital” and “Total cases with known outcome” header cells merged vertically; the values are unchanged from the table above.)
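Related to merge_at(), flextable also provides merge_v(), which merges consecutive body cells sharing the same value. The sketch below is a hedged, stand-alone illustration with made-up data, not part of the handbook table.

# Hedged sketch: merge_v() vertically merges repeated values in a body column.
# The data frame is hypothetical illustration data.
library(flextable)

dat <- data.frame(region = c("North", "North", "South"),
                  site   = c("A", "B", "C"),
                  cases  = c(10, 12, 7))

flextable(dat) %>% 
  merge_v(j = "region")   # merge the repeated "North" cells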

@@ -986,12 +1066,12 @@


Background color

To distinguish the content of the table from the headers, we may want to add additional formatting, e.g. changing the background color. In this example we change the table body to gray.

my_table <- my_table %>% 
    bg(part = "body", bg = "gray95")  

my_table 

(The table is re-displayed with the body shaded light gray; the values are unchanged from the table above.)
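Because bg() also accepts a vector of row indices, the same function can produce alternating row shading. A hedged sketch follows; my_table is assumed to be the flextable built earlier on this page (which has 7 body rows).

# Hedged sketch: alternating ("zebra") shading of the body rows.
# `my_table` is assumed to be the flextable built earlier on this page (7 body rows).
my_table %>% 
  bg(i = seq(1, 7, by = 2), part = "body", bg = "gray90")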

@@ -1001,18 +1081,18 @@


29.3 Conditional formatting

We can highlight all values in a column that meet a certain rule, e.g. where more than 55% of cases died. Simply supply the criterion to the i = (rows) or j = (columns) argument, preceded by a tilde ~. Reference the column as it is named in the underlying data frame, not by its display heading.

my_table %>% 
  bg(j = 7, i = ~ Pct_Death > 55, part = "body", bg = "red") 

(The table is re-displayed with the “% of cases” cells in the Died columns highlighted red wherever more than 55% of cases died; the values are unchanged from the table above.)
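The same formula syntax works with other flextable formatters. For example, here is a hedged sketch using color() to change the font colour of the flagged cells instead of their background; my_table and the Pct_Death column are the objects used earlier on this page.

# Hedged sketch: the same ~ formula, but changing the text colour of the flagged cells.
# `my_table` and `Pct_Death` are the objects used earlier on this page.
my_table %>% 
  color(j = 7, i = ~ Pct_Death > 55, part = "body", color = "red")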

Or, we can highlight an entire row that meets a certain criterion, such as a hospital of interest. To do this we simply remove the column (j) specification so that the criterion applies to all columns.

my_table %>% 
  bg(., i = ~ hospital == "Military Hospital", part = "body", bg = "#91c293") 

(The table is re-displayed with the Military Hospital row highlighted green; the values are unchanged from the table above.)

@@ -1020,96 +1100,96 @@

29.4 All code together

Below we show all the code from the above sections together.

border_style = officer::fp_border(color = "black", width = 1)

pacman::p_load(
  rio,            # import/export
  here,           # file pathways
  flextable,      # make HTML tables 
  officer,        # helper functions for tables
  tidyverse)      # data management, summary, and visualization

table <- linelist %>% 

  # Get summary values per hospital-outcome group
  ###############################################
  group_by(hospital, outcome) %>%                      # Group data
  summarise(                                           # Create new summary columns of indicators of interest
    N = n(),                                           # Number of rows per hospital-outcome group     
    ct_value = median(ct_blood, na.rm = T)) %>%        # Median CT value per group
  
  # add totals
  ############
  bind_rows(                                           # Bind the previous table with this mini-table of totals
    linelist %>% 
      filter(!is.na(outcome) & hospital != "Missing") %>%
      group_by(outcome) %>%                            # Grouped only by outcome, not by hospital    
      summarise(
        N = n(),                                       # Number of rows for whole dataset     
        ct_value = median(ct_blood, na.rm = T))) %>%   # Median CT for whole dataset
  
  # Pivot wider and format
  ########################
  mutate(hospital = replace_na(hospital, "Total")) %>% 
  pivot_wider(                                         # Pivot from long to wide
    values_from = c(ct_value, N),                      # new values are from ct and count columns
    names_from = outcome) %>%                          # new column names are from outcomes
  mutate(                                              # Add new columns
    N_Known = N_Death + N_Recover,                                # number with known outcome
    Pct_Death = scales::percent(N_Death / N_Known, 0.1),          # percent of cases who died (to 1 decimal)
    Pct_Recover = scales::percent(N_Recover / N_Known, 0.1)) %>%  # percent who recovered (to 1 decimal)
  select(                                              # Re-order columns
    hospital, N_Known,                                 # Intro columns
    N_Recover, Pct_Recover, ct_value_Recover,          # Recovered columns
    N_Death, Pct_Death, ct_value_Death) %>%            # Death columns
  arrange(N_Known) %>%                                 # Arrange rows from lowest to highest (Total row at bottom)

  # formatting
  ############
  flextable() %>%              # table is piped in from above
  add_header_row(
    top = TRUE,                # New header goes on top of existing header row
    values = c("Hospital",     # Header values for each column below
               "Total cases with known outcome", 
               "Recovered",    # This will be the top-level header for this and the next two columns
               "",
               "",
               "Died",         # This will be the top-level header for this and the next two columns
               "",             # Leave blank, as it will be merged with "Died"
               "")) %>% 
    set_header_labels(         # Rename the columns in the original header row
      hospital = "", 
      N_Known = "",                  
      N_Recover = "Total",
      Pct_Recover = "% of cases",
      ct_value_Recover = "Median CT values",
      N_Death = "Total",
      Pct_Death = "% of cases",
      ct_value_Death = "Median CT values") %>% 
  merge_at(i = 1, j = 3:5, part = "header") %>%   # Horizontally merge columns 3 to 5 in new header row
  merge_at(i = 1, j = 6:8, part = "header") %>%  
  border_remove() %>%  
  theme_booktabs() %>% 
  vline(part = "all", j = 2, border = border_style) %>%   # at column 2 
  vline(part = "all", j = 5, border = border_style) %>%   # at column 5
  merge_at(i = 1:2, j = 1, part = "header") %>% 
  merge_at(i = 1:2, j = 2, part = "header") %>% 
  width(j = 1, width = 2.7) %>% 
  width(j = 2, width = 1.5) %>% 
  width(j = c(4, 5, 7, 8), width = 1) %>% 
  flextable::align(., align = "center", j = c(2:8), part = "all") %>% 
  bg(., part = "body", bg = "gray95") %>% 
  bg(., j = c(1:8), i = ~ hospital == "Military Hospital", part = "body", bg = "#91c293") %>% 
  colformat_num(., j = c(4, 7), digits = 1) %>%
  bold(i = 1, bold = TRUE, part = "header") %>% 
  bold(i = 7, bold = TRUE, part = "body")

`summarise()` has grouped output by 'hospital'. You can override using the
 `.groups` argument.

table

(The fully formatted table is displayed again; the values are unchanged from the tables above.)

@@ -1131,16 +1211,16 @@


For instance, below we save our table as a Word document. Note the syntax of the first argument - you can simply provide the name of your flextable object, e.g. my_table, or you can give it a “name” as shown below (here the name is “my table”). If named, this will appear as the title of the table in Word. We also demonstrate code to save the table as a PNG image.

# Edit 'my table' as needed for the title of the table.  
save_as_docx("my table" = my_table, path = "file.docx")

save_as_image(my_table, path = "file.png")

Note that the webshot or webshot2 package is required to save a flextable as an image. Images may come out with transparent backgrounds.
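A minimal sketch of installing one of those packages (this only needs to be done once per machine):

# One of these is needed before save_as_image() will work; install either one.
install.packages("webshot2")   # or: install.packages("webshot")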

If you want to view a ‘live’ version of the flextable output in the intended document format, use print() and supply one of the options below to preview =. The document will “pop up” on your computer in the specified software program, but will not be saved. This can be useful to check whether the table fits on one page/slide, or so that you can quickly copy it into another document. Set the preview argument to “pptx” or “docx”, as shown below.

print(my_table, preview = "docx")  # Word document example
print(my_table, preview = "pptx")  # Powerpoint example
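If the PowerPoint preview looks right, flextable can also write the table straight to a .pptx file. This is a hedged sketch reusing my_table from above; the file name is an arbitrary example.

# Hedged sketch: save the flextable directly to a PowerPoint file.
save_as_pptx(my_table, path = "file.pptx")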
@@ -1339,18 +1419,7 @@


Indicateurs pour la province de: Shanghai

Indicateurs                 | Estimation
Mean delay onset-hosp       | 4.0
Percentage of recovery      | 46.7
Median age of the cases     | 67.0

print_indic_prov(table_indic_all, "Jiangsu")

Warning in set_formatter_type(tab_print, fmt_double = "%.2f", na_str = "-"):
Use `colformat_*()` instead.

Indicateurs pour la province de: Jiangsu

Indicateurs                 | Estimation
Mean delay onset-hosp       | 6.0
Percentage of recovery      | 71.4
Median age of the cases     | 55.0
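The warning above indicates that set_formatter_type() is deprecated in flextable. As a hedged, stand-alone illustration of the suggested colformat_*() replacement (made-up data, not the handbook's table_indic_all object):

# Hedged sketch: colformat_double() as the suggested replacement for set_formatter_type().
# The data frame below is hypothetical illustration data.
library(flextable)

flextable(data.frame(indicateur = c("Mean delay onset-hosp", "Median age of the cases"),
                     estimation = c(4.04567, 67))) %>% 
  colformat_double(j = "estimation", digits = 2, na_str = "-")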

@@ -1210,8 +1348,8 @@

Naming and syntax

  • Avoid choosing a function name that is likely already taken by another function in your environment.

  • It is recommended for the function name to be short and straightforward for another reader to understand.

  • It is preferred to use verbs as the function name and nouns for the argument names (see the short sketch below).
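A minimal sketch of these conventions; the function and its arguments are hypothetical, not from the handbook.

# Hypothetical example of the naming conventions above: a verb-style function name,
# noun-style argument names, and a name unlikely to mask an existing function.
count_by_group <- function(data, group_col) {
  as.data.frame(table(data[[group_col]]), responseName = "n")
}

count_by_group(data = iris, group_col = "Species")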

@@ -1220,13 +1358,13 @@

tidyverse programming guidance. Among the topics covered are tidy evaluation and use of the embrace {{ }} (“double braces”) operator.

For example, here is a quick skeleton template taken from the tutorial page mentioned just above:

var_summary <- function(data, var) {
  data %>%
    summarise(n = n(), min = min({{ var }}), max = max({{ var }}))
}

mtcars %>% 
  group_by(cyl) %>% 
  var_summary(mpg)
@@ -1236,22 +1374,22 @@

Test
  • It is strongly recommended to check whether an argument has been supplied using missing(argument). This simple check returns a TRUE or FALSE value.
    contain_covid19_missing <- function(barrier_gest, wear_mask, get_vaccine){
      
      if (missing(barrier_gest)) (print("please provide arg1"))
      if (missing(wear_mask)) print("please provide arg2")
      if (missing(get_vaccine)) print("please provide arg3")
    
      if (!barrier_gest == "yes" | wear_mask == "yes" | get_vaccine == "yes") 
           
           return("you can do better")
      
      else("please make sure all are yes, this pandemic has to end!")
    }
    
    contain_covid19_missing(get_vaccine = "yes")
    [1] "please provide arg1"
     [1] "please provide arg2"
    @@ -1264,19 +1402,19 @@

    Test
  • Use stop() for more detectable errors.
    contain_covid19_stop <- function(barrier_gest, wear_mask, get_vaccine){
      
      if (!is.character(barrier_gest)) (stop("arg1 should be a character, please enter the value with `yes`, `no` or `sometimes"))
      
      if (barrier_gest == "yes" & wear_mask == "yes" & get_vaccine == "yes") 
           
           return("success")
      
      else("please make sure all are yes, this pandemic has to end!")
    }
    
    contain_covid19_stop(barrier_gest = 1, wear_mask = "yes", get_vaccine = "no")
    Error in contain_covid19_stop(barrier_gest = 1, wear_mask = "yes", get_vaccine = "no"): arg1 should be a character, please enter the value with `yes`, `no` or `sometimes
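    Base R's stopifnot() can express the same kind of argument check more tersely. The re-write below is a hypothetical sketch, not the handbook's code (named conditions used as error messages require R >= 4.0).

    # Hypothetical sketch: the same character-type check written with stopifnot().
    # Named conditions as custom error messages require R >= 4.0.
    check_gesture <- function(barrier_gest) {
      stopifnot("barrier_gest should be a character value" = is.character(barrier_gest))
      "input looks valid"
    }
    
    check_gesture("yes")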
    @@ -1287,7 +1425,7 @@

    Test

    We can verify this by first running mean() as a plain function, and then running it wrapped with safely().

    map(linelist, mean)
    $case_id
     [1] NA
    @@ -1381,9 +1519,9 @@ 

    Test

    safe_mean <- safely(mean)
    linelist %>% 
      map(safe_mean)
    $case_id
     $case_id$result
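    A related purrr helper is possibly(), which returns a default value instead of a result/error list. The sketch below is a hypothetical illustration, not part of the handbook code.

    # Hypothetical sketch: purrr::possibly() returns a default value instead of an error.
    library(purrr)
    
    possible_log <- possibly(log, otherwise = NA_real_)
    possible_log("not a number")   # returns NA instead of throwing an error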
    @@ -1819,18 +1957,7 @@ 

    19  Univariate and multivariable regression" @@ -1704,7 +1704,7 @@ "href": "new_pages/regression.html#univariate", "title": "19  Univariate and multivariable regression", "section": "19.2 Univariate", - "text": "19.2 Univariate\nJust like in the page on Descriptive tables, your use case will determine which R package you use. We present two options for doing univariate analysis:\n\nUse functions available in base R to quickly print results to the console. Use the broom package to tidy up the outputs.\n\nUse the gtsummary package to model and get publication-ready outputs.\n\n\n\nbase R\n\nLinear regression\nThe base R function lm() perform linear regression, assessing the relationship between numeric response and explanatory variables that are assumed to have a linear relationship.\nProvide the equation as a formula, with the response and explanatory column names separated by a tilde ~. Also, specify the dataset to data =. Define the model results as an R object, to use later.\n\nlm_results <- lm(ht_cm ~ age, data = linelist)\n\nYou can then run summary() on the model results to see the coefficients (Estimates), P-value, residuals, and other measures.\n\nsummary(lm_results)\n\n\nCall:\nlm(formula = ht_cm ~ age, data = linelist)\n\nResiduals:\n Min 1Q Median 3Q Max \n-128.579 -15.854 1.177 15.887 175.483 \n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|) \n(Intercept) 69.9051 0.5979 116.9 <2e-16 ***\nage 3.4354 0.0293 117.2 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 23.75 on 4165 degrees of freedom\nMultiple R-squared: 0.7675, Adjusted R-squared: 0.7674 \nF-statistic: 1.375e+04 on 1 and 4165 DF, p-value: < 2.2e-16\n\n\nAlternatively you can use the tidy() function from the broom package to pull the results in to a table. What the results tell us is that for each year increase in age the height increases by 3.5 cm and this is statistically significant.\n\ntidy(lm_results)\n\n# A tibble: 2 × 5\n term estimate std.error statistic p.value\n <chr> <dbl> <dbl> <dbl> <dbl>\n1 (Intercept) 69.9 0.598 117. 0\n2 age 3.44 0.0293 117. 0\n\n\nYou can then also use this regression to add it to a ggplot, to do this we first pull the points for the observed data and the fitted line in to one data frame using the augment() function from broom.\n\n## pull the regression points and observed data in to one dataset\npoints <- augment(lm_results)\n\n## plot the data using age as the x-axis \nggplot(points, aes(x = age)) + \n ## add points for height \n geom_point(aes(y = ht_cm)) + \n ## add your regression line \n geom_line(aes(y = .fitted), colour = \"red\")\n\n\n\n\n\n\n\n\nIt is also possible to add a simple linear regression straight straight in ggplot using the geom_smooth() function.\n\n## add your data to a plot \n ggplot(linelist, aes(x = age, y = ht_cm)) + \n ## show points\n geom_point() + \n ## add a linear regression \n geom_smooth(method = \"lm\", se = FALSE)\n\n`geom_smooth()` using formula = 'y ~ x'\n\n\n\n\n\n\n\n\n\nSee the Resource section at the end of this chapter for more detailed tutorials.\n\n\nLogistic regression\nThe function glm() from the stats package (part of base R) is used to fit Generalized Linear Models (GLM).\nglm() can be used for univariate and multivariable logistic regression (e.g. to get Odds Ratios). 
Here are the core parts:\n\n# arguments for glm()\nglm(formula, family, data, weights, subset, ...)\n\n\nformula = The model is provided to glm() as an equation, with the outcome on the left and explanatory variables on the right of a tilde ~.\n\nfamily = This determines the type of model to run. For logistic regression, use family = \"binomial\", for poisson use family = \"poisson\". Other examples are in the table below.\n\ndata = Specify your data frame.\n\nIf necessary, you can also specify the link function via the syntax family = familytype(link = \"linkfunction\")). You can read more in the documentation about other families and optional arguments such as weights = and subset = (?glm).\n\n\n\nFamily\nDefault link function\n\n\n\n\n\"binomial\"\n(link = \"logit\")\n\n\n\"gaussian\"\n(link = \"identity\")\n\n\n\"Gamma\"\n(link = \"inverse\")\n\n\n\"inverse.gaussian\"\n(link = \"1/mu^2\")\n\n\n\"poisson\"\n(link = \"log\")\n\n\n\"quasi\"\n(link = \"identity\", variance = \"constant\")\n\n\n\"quasibinomial\"\n(link = \"logit\")\n\n\n\"quasipoisson\"\n(link = \"log\")\n\n\n\nWhen running glm() it is most common to save the results as a named R object. Then you can print the results to your console using summary() as shown below, or perform other operations on the results (e.g. exponentiate).\nIf you need to run a negative binomial regression you can use the MASS package; the glm.nb() uses the same syntax as glm(). For a walk-through of different regressions, see the UCLA stats page.\n\n\nUnivariate glm()\nIn this example we are assessing the association between different age categories and the outcome of death (coded as 1 in the Preparation section). Below is a univariate model of outcome by age_cat. We save the model output as model and then print it with summary() to the console. Note the estimates provided are the log odds and that the baseline level is the first factor level of age_cat (“0-4”).\n\nmodel <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist)\nsummary(model)\n\n\nCall:\nglm(formula = outcome ~ age_cat, family = \"binomial\", data = linelist)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 0.233738 0.072805 3.210 0.00133 **\nage_cat5-9 -0.062898 0.101733 -0.618 0.53640 \nage_cat10-14 0.138204 0.107186 1.289 0.19726 \nage_cat15-19 -0.005565 0.113343 -0.049 0.96084 \nage_cat20-29 0.027511 0.102133 0.269 0.78765 \nage_cat30-49 0.063764 0.113771 0.560 0.57517 \nage_cat50-69 -0.387889 0.259240 -1.496 0.13459 \nage_cat70+ -0.639203 0.915770 -0.698 0.48518 \n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5705.1 on 4159 degrees of freedom\nAIC: 5721.1\n\nNumber of Fisher Scoring iterations: 4\n\n\nTo alter the baseline level of a given variable, ensure the column is class Factor and move the desired level to the first position with fct_relevel() (see page on Factors). For example, below we take column age_cat and set “20-29” as the baseline before piping the modified data frame into glm().\n\nlinelist %>% \n mutate(age_cat = fct_relevel(age_cat, \"20-29\", after = 0)) %>% \n glm(formula = outcome ~ age_cat, family = \"binomial\") %>% \n summary()\n\n\nCall:\nglm(formula = outcome ~ age_cat, family = \"binomial\", data = .)\n\nCoefficients:\n Estimate Std. 
Error z value Pr(>|z|) \n(Intercept) 0.26125 0.07163 3.647 0.000265 ***\nage_cat0-4 -0.02751 0.10213 -0.269 0.787652 \nage_cat5-9 -0.09041 0.10090 -0.896 0.370220 \nage_cat10-14 0.11069 0.10639 1.040 0.298133 \nage_cat15-19 -0.03308 0.11259 -0.294 0.768934 \nage_cat30-49 0.03625 0.11302 0.321 0.748390 \nage_cat50-69 -0.41540 0.25891 -1.604 0.108625 \nage_cat70+ -0.66671 0.91568 -0.728 0.466546 \n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5705.1 on 4159 degrees of freedom\nAIC: 5721.1\n\nNumber of Fisher Scoring iterations: 4\n\n\n\n\nPrinting results\nFor most uses, several modifications must be made to the above outputs. The function tidy() from the package broom is convenient for making the model results presentable.\nHere we demonstrate how to combine model outputs with a table of counts.\n\nGet the exponentiated log odds ratio estimates and confidence intervals by passing the model to tidy() and setting exponentiate = TRUE and conf.int = TRUE.\n\n\nmodel <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist) %>% \n tidy(exponentiate = TRUE, conf.int = TRUE) %>% # exponentiate and produce CIs\n mutate(across(where(is.numeric), round, digits = 2)) # round all numeric columns\n\nBelow is the outputted tibble model:\n\n\n\n\n\n\n\nCombine these model results with a table of counts. Below, we create the a counts cross-table with the tabyl() function from janitor, as covered in the Descriptive tables page.\n\n\ncounts_table <- linelist %>% \n janitor::tabyl(age_cat, outcome)\n\n\n\n\n\n\n\n\n\n\n\nHere is what this counts_table data frame looks like:\n\n\n\n\n\n\nNow we can bind the counts_table and the model results together horizontally with bind_cols() (dplyr). Remember that with bind_cols() the rows in the two data frames must be aligned perfectly. In this code, because we are binding within a pipe chain, we use . to represent the piped object counts_table as we bind it to model. To finish the process, we use select() to pick the desired columns and their order, and finally apply the base R round() function across all numeric columns to specify 2 decimal places.\n\ncombined <- counts_table %>% # begin with table of counts\n bind_cols(., model) %>% # combine with the outputs of the regression \n select(term, 2:3, estimate, # select and re-order cols\n conf.low, conf.high, p.value) %>% \n mutate(across(where(is.numeric), round, digits = 2)) ## round to 2 decimal places\n\nHere is what the combined data frame looks like, printed nicely as an image with a function from flextable. The Tables for presentation explains how to customize such tables with flextable, or or you can use numerous other packages such as knitr or GT.\n\ncombined <- combined %>% \n flextable::qflextable()\n\n\n\nLooping multiple univariate models\nBelow we present a method using glm() and tidy() for a more simple approach, see the section on gtsummary.\nTo run the models on several exposure variables to produce univariate odds ratios (i.e. not controlling for each other), you can use the approach below. It uses str_c() from stringr to create univariate formulas (see Characters and strings), runs the glm() regression on each formula, passes each glm() output to tidy() and finally collapses all the model outputs together with bind_rows() from tidyr. 
This approach uses map() from the package purrr to iterate - see the page on Iteration, loops, and lists for more information on this tool.\n\nCreate a vector of column names of the explanatory variables. We already have this as explanatory_vars from the Preparation section of this page.\nUse str_c() to create multiple string formulas, with outcome on the left, and a column name from explanatory_vars on the right. The period . substitutes for the column name in explanatory_vars.\n\n\nexplanatory_vars %>% str_c(\"outcome ~ \", .)\n\n[1] \"outcome ~ gender\" \"outcome ~ fever\" \"outcome ~ chills\" \n[4] \"outcome ~ cough\" \"outcome ~ aches\" \"outcome ~ vomit\" \n[7] \"outcome ~ age_cat\"\n\n\n\nPass these string formulas to map() and set ~glm() as the function to apply to each input. Within glm(), set the regression formula as as.formula(.x) where .x will be replaced by the string formula defined in the step above. map() will loop over each of the string formulas, running regressions for each one.\nThe outputs of this first map() are passed to a second map() command, which applies tidy() to the regression outputs.\nFinally the output of the second map() (a list of tidied data frames) is condensed with bind_rows(), resulting in one data frame with all the univariate results.\n\n\nmodels <- explanatory_vars %>% # begin with variables of interest\n str_c(\"outcome ~ \", .) %>% # combine each variable into formula (\"outcome ~ variable of interest\")\n \n # iterate through each univariate formula\n map( \n .f = ~glm( # pass the formulas one-by-one to glm()\n formula = as.formula(.x), # within glm(), the string formula is .x\n family = \"binomial\", # specify type of glm (logistic)\n data = linelist)) %>% # dataset\n \n # tidy up each of the glm regression outputs from above\n map(\n .f = ~tidy(\n .x, \n exponentiate = TRUE, # exponentiate \n conf.int = TRUE)) %>% # return confidence intervals\n \n # collapse the list of regression outputs in to one data frame\n bind_rows() %>% \n \n # round all numeric columns\n mutate(across(where(is.numeric), round, digits = 2))\n\nThis time, the end object models is longer because it now represents the combined results of several univariate regressions. Click through to see all the rows of model.\n\n\n\n\n\n\nAs before, we can create a counts table from the linelist for each explanatory variable, bind it to models, and make a nice table. We begin with the variables, and iterate through them with map(). We iterate through a user-defined function which involves creating a counts table with dplyr functions. 
Then the results are combined and bound with the models model results.\n\n## for each explanatory variable\nuniv_tab_base <- explanatory_vars %>% \n map(.f = \n ~{linelist %>% ## begin with linelist\n group_by(outcome) %>% ## group data set by outcome\n count(.data[[.x]]) %>% ## produce counts for variable of interest\n pivot_wider( ## spread to wide format (as in cross-tabulation)\n names_from = outcome,\n values_from = n) %>% \n drop_na(.data[[.x]]) %>% ## drop rows with missings\n rename(\"variable\" = .x) %>% ## change variable of interest column to \"variable\"\n mutate(variable = as.character(variable))} ## convert to character, else non-dichotomous (categorical) variables come out as factor and cant be merged\n ) %>% \n \n ## collapse the list of count outputs in to one data frame\n bind_rows() %>% \n \n ## merge with the outputs of the regression \n bind_cols(., models) %>% \n \n ## only keep columns interested in \n select(term, 2:3, estimate, conf.low, conf.high, p.value) %>% \n \n ## round decimal places\n mutate(across(where(is.numeric), round, digits = 2))\n\nBelow is what the data frame looks like. See the page on Tables for presentation for ideas on how to further convert this table to pretty HTML output (e.g. with flextable).\n\n\n\n\n\n\n\n\n\n\ngtsummary package\nBelow we present the use of tbl_uvregression() from the gtsummary package. Just like in the page on Descriptive tables, gtsummary functions do a good job of running statistics and producing professional-looking outputs. This function produces a table of univariate regression results.\nWe select only the necessary columns from the linelist (explanatory variables and the outcome variable) and pipe them into tbl_uvregression(). We are going to run univariate regression on each of the columns we defined as explanatory_vars in the data Preparation section (gender, fever, chills, cough, aches, vomit, and age_cat).\nWithin the function itself, we provide the method = as glm (no quotes), the y = outcome column (outcome), specify to method.args = that we want to run logistic regression via family = binomial, and we tell it to exponentiate the results.\nThe output is HTML and contains the counts\n\nuniv_tab <- linelist %>% \n dplyr::select(explanatory_vars, outcome) %>% ## select variables of interest\n\n tbl_uvregression( ## produce univariate table\n method = glm, ## define regression want to run (generalised linear model)\n y = outcome, ## define outcome variable\n method.args = list(family = binomial), ## define what type of glm want to run (logistic)\n exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)\n )\n\n## view univariate results table \nuniv_tab\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\nN\nOR\n1\n95% CI\n1\np-value\n\n\n\n\ngender\n4,167\n1.00\n0.88, 1.13\n>0.9\n\n\nfever\n4,167\n1.00\n0.85, 1.17\n>0.9\n\n\nchills\n4,167\n1.03\n0.89, 1.21\n0.7\n\n\ncough\n4,167\n1.15\n0.97, 1.37\n0.11\n\n\naches\n4,167\n0.93\n0.76, 1.14\n0.5\n\n\nvomit\n4,167\n1.09\n0.96, 1.23\n0.2\n\n\nage_cat\n4,167\n\n\n\n\n\n\n\n\n    0-4\n\n\n—\n—\n\n\n\n\n    5-9\n\n\n0.94\n0.77, 1.15\n0.5\n\n\n    10-14\n\n\n1.15\n0.93, 1.42\n0.2\n\n\n    15-19\n\n\n0.99\n0.80, 1.24\n>0.9\n\n\n    20-29\n\n\n1.03\n0.84, 1.26\n0.8\n\n\n    30-49\n\n\n1.07\n0.85, 1.33\n0.6\n\n\n    50-69\n\n\n0.68\n0.41, 1.13\n0.13\n\n\n    70+\n\n\n0.53\n0.07, 3.20\n0.5\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\nThere are many modifications you can make to this table output, such as adjusting the text labels, 
bolding rows by their p-value, etc. See tutorials here and elsewhere online.", + "text": "19.2 Univariate\nJust like in the page on Descriptive tables, your use case will determine which R package you use. We present two options for doing univariate analysis:\n\nUse functions available in base R to quickly print results to the console. Use the broom package to tidy up the outputs.\n\nUse the gtsummary package to model and get publication-ready outputs.\n\n\n\nbase R\n\nLinear regression\nThe base R function lm() perform linear regression, assessing the relationship between numeric response and explanatory variables that are assumed to have a linear relationship.\nProvide the equation as a formula, with the response and explanatory column names separated by a tilde ~. Also, specify the dataset to data =. Define the model results as an R object, to use later.\n\nlm_results <- lm(ht_cm ~ age, data = linelist)\n\nYou can then run summary() on the model results to see the coefficients (Estimates), P-value, residuals, and other measures.\n\nsummary(lm_results)\n\n\nCall:\nlm(formula = ht_cm ~ age, data = linelist)\n\nResiduals:\n Min 1Q Median 3Q Max \n-128.579 -15.854 1.177 15.887 175.483 \n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|) \n(Intercept) 69.9051 0.5979 116.9 <2e-16 ***\nage 3.4354 0.0293 117.2 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 23.75 on 4165 degrees of freedom\nMultiple R-squared: 0.7675, Adjusted R-squared: 0.7674 \nF-statistic: 1.375e+04 on 1 and 4165 DF, p-value: < 2.2e-16\n\n\nAlternatively you can use the tidy() function from the broom package to pull the results in to a table. What the results tell us is that for each year increase in age the height increases by 3.5 cm and this is statistically significant.\n\ntidy(lm_results)\n\n# A tibble: 2 × 5\n term estimate std.error statistic p.value\n <chr> <dbl> <dbl> <dbl> <dbl>\n1 (Intercept) 69.9 0.598 117. 0\n2 age 3.44 0.0293 117. 0\n\n\nYou can then also use this regression to add it to a ggplot, to do this we first pull the points for the observed data and the fitted line in to one data frame using the augment() function from broom.\n\n## pull the regression points and observed data in to one dataset\npoints <- augment(lm_results)\n\n## plot the data using age as the x-axis \nggplot(points, aes(x = age)) + \n ## add points for height \n geom_point(aes(y = ht_cm)) + \n ## add your regression line \n geom_line(aes(y = .fitted), colour = \"red\")\n\n\n\n\n\n\n\n\nIt is also possible to add a simple linear regression straight straight in ggplot using the geom_smooth() function.\n\n## add your data to a plot \n ggplot(linelist, aes(x = age, y = ht_cm)) + \n ## show points\n geom_point() + \n ## add a linear regression \n geom_smooth(method = \"lm\", se = FALSE)\n\n`geom_smooth()` using formula = 'y ~ x'\n\n\n\n\n\n\n\n\n\nSee the Resource section at the end of this chapter for more detailed tutorials.\n\n\nLogistic regression\nThe function glm() from the stats package (part of base R) is used to fit Generalized Linear Models (GLM).\nglm() can be used for univariate and multivariable logistic regression (e.g. to get Odds Ratios). Here are the core parts:\n\n# arguments for glm()\nglm(formula, family, data, weights, subset, ...)\n\n\nformula = The model is provided to glm() as an equation, with the outcome on the left and explanatory variables on the right of a tilde ~.\n\nfamily = This determines the type of model to run. 
For logistic regression, use family = \"binomial\", for poisson use family = \"poisson\". Other examples are in the table below.\n\ndata = Specify your data frame.\n\nIf necessary, you can also specify the link function via the syntax family = familytype(link = \"linkfunction\")). You can read more in the documentation about other families and optional arguments such as weights = and subset = (?glm).\n\n\n\nFamily\nDefault link function\n\n\n\n\n\"binomial\"\n(link = \"logit\")\n\n\n\"gaussian\"\n(link = \"identity\")\n\n\n\"Gamma\"\n(link = \"inverse\")\n\n\n\"inverse.gaussian\"\n(link = \"1/mu^2\")\n\n\n\"poisson\"\n(link = \"log\")\n\n\n\"quasi\"\n(link = \"identity\", variance = \"constant\")\n\n\n\"quasibinomial\"\n(link = \"logit\")\n\n\n\"quasipoisson\"\n(link = \"log\")\n\n\n\nWhen running glm() it is most common to save the results as a named R object. Then you can print the results to your console using summary() as shown below, or perform other operations on the results (e.g. exponentiate).\nIf you need to run a negative binomial regression you can use the MASS package; the glm.nb() uses the same syntax as glm(). For a walk-through of different regressions, see the UCLA stats page.\n\n\nUnivariate glm()\nIn this example we are assessing the association between different age categories and the outcome of death (coded as 1 in the Preparation section). Below is a univariate model of outcome by age_cat. We save the model output as model and then print it with summary() to the console. Note the estimates provided are the log odds and that the baseline level is the first factor level of age_cat (“0-4”).\n\nmodel <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist)\nsummary(model)\n\n\nCall:\nglm(formula = outcome ~ age_cat, family = \"binomial\", data = linelist)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 0.233738 0.072805 3.210 0.00133 **\nage_cat5-9 -0.062898 0.101733 -0.618 0.53640 \nage_cat10-14 0.138204 0.107186 1.289 0.19726 \nage_cat15-19 -0.005565 0.113343 -0.049 0.96084 \nage_cat20-29 0.027511 0.102133 0.269 0.78765 \nage_cat30-49 0.063764 0.113771 0.560 0.57517 \nage_cat50-69 -0.387889 0.259240 -1.496 0.13459 \nage_cat70+ -0.639203 0.915770 -0.698 0.48518 \n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5705.1 on 4159 degrees of freedom\nAIC: 5721.1\n\nNumber of Fisher Scoring iterations: 4\n\n\nTo alter the baseline level of a given variable, ensure the column is class Factor and move the desired level to the first position with fct_relevel() (see page on Factors). For example, below we take column age_cat and set “20-29” as the baseline before piping the modified data frame into glm().\n\nlinelist %>% \n mutate(age_cat = fct_relevel(age_cat, \"20-29\", after = 0)) %>% \n glm(formula = outcome ~ age_cat, family = \"binomial\") %>% \n summary()\n\n\nCall:\nglm(formula = outcome ~ age_cat, family = \"binomial\", data = .)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 0.26125 0.07163 3.647 0.000265 ***\nage_cat0-4 -0.02751 0.10213 -0.269 0.787652 \nage_cat5-9 -0.09041 0.10090 -0.896 0.370220 \nage_cat10-14 0.11069 0.10639 1.040 0.298133 \nage_cat15-19 -0.03308 0.11259 -0.294 0.768934 \nage_cat30-49 0.03625 0.11302 0.321 0.748390 \nage_cat50-69 -0.41540 0.25891 -1.604 0.108625 \nage_cat70+ -0.66671 0.91568 -0.728 0.466546 \n---\nSignif. 
codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5705.1 on 4159 degrees of freedom\nAIC: 5721.1\n\nNumber of Fisher Scoring iterations: 4\n\n\n\n\nPrinting results\nFor most uses, several modifications must be made to the above outputs. The function tidy() from the package broom is convenient for making the model results presentable.\nHere we demonstrate how to combine model outputs with a table of counts.\n\nGet the exponentiated log odds ratio estimates and confidence intervals by passing the model to tidy() and setting exponentiate = TRUE and conf.int = TRUE.\n\n\nmodel <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist) %>% \n tidy(exponentiate = TRUE, conf.int = TRUE) %>% # exponentiate and produce CIs\n mutate(across(where(is.numeric), round, digits = 2)) # round all numeric columns\n\nBelow is the outputted tibble model:\n\n\n\n\n\n\n\nCombine these model results with a table of counts. Below, we create the a counts cross-table with the tabyl() function from janitor, as covered in the Descriptive tables page.\n\n\ncounts_table <- linelist %>% \n janitor::tabyl(age_cat, outcome)\n\n\n\n\n\n\n\n\n\n\n\nHere is what this counts_table data frame looks like:\n\n\n\n\n\n\nNow we can bind the counts_table and the model results together horizontally with bind_cols() from dplyr. Remember that with bind_cols() the rows in the two data frames must be aligned perfectly. In this code, because we are binding within a pipe chain, we use . to represent the piped object counts_table as we bind it to model. To finish the process, we use select() to pick the desired columns and their order, and finally apply the base R round() function across all numeric columns to specify 2 decimal places.\n\ncombined <- counts_table %>% # begin with table of counts\n bind_cols(., model) %>% # combine with the outputs of the regression \n select(term, 2:3, estimate, # select and re-order cols\n conf.low, conf.high, p.value) %>% \n mutate(across(where(is.numeric), round, digits = 2)) ## round to 2 decimal places\n\nHere is what the combined data frame looks like, printed nicely as an image with a function from flextable. The Tables for presentation explains how to customize such tables with flextable, or or you can use numerous other packages such as knitr or GT.\n\ncombined <- combined %>% \n flextable::qflextable()\n\n\n\nLooping multiple univariate models\nBelow we present a method using glm() and tidy() for a more simple approach, see the section on gtsummary.\nTo run the models on several exposure variables to produce univariate odds ratios (i.e. not controlling for each other), you can use the approach below. It uses str_c() from stringr to create univariate formulas (see Characters and strings), runs the glm() regression on each formula, passes each glm() output to tidy() and finally collapses all the model outputs together with bind_rows() from tidyr. This approach uses map() from the package purrr to iterate - see the page on Iteration, loops, and lists for more information on this tool.\n\nCreate a vector of column names of the explanatory variables. We already have this as explanatory_vars from the Preparation section of this page.\nUse str_c() to create multiple string formulas, with outcome on the left, and a column name from explanatory_vars on the right. The period . 
substitutes for the column name in explanatory_vars.\n\n\nexplanatory_vars %>% str_c(\"outcome ~ \", .)\n\n[1] \"outcome ~ gender\" \"outcome ~ fever\" \"outcome ~ chills\" \n[4] \"outcome ~ cough\" \"outcome ~ aches\" \"outcome ~ vomit\" \n[7] \"outcome ~ age_cat\"\n\n\n\nPass these string formulas to map() and set ~glm() as the function to apply to each input. Within glm(), set the regression formula as as.formula(.x) where .x will be replaced by the string formula defined in the step above. map() will loop over each of the string formulas, running regressions for each one.\nThe outputs of this first map() are passed to a second map() command, which applies tidy() to the regression outputs.\nFinally the output of the second map() (a list of tidied data frames) is condensed with bind_rows(), resulting in one data frame with all the univariate results.\n\n\nmodels <- explanatory_vars %>% # begin with variables of interest\n str_c(\"outcome ~ \", .) %>% # combine each variable into formula (\"outcome ~ variable of interest\")\n \n # iterate through each univariate formula\n map( \n .f = ~glm( # pass the formulas one-by-one to glm()\n formula = as.formula(.x), # within glm(), the string formula is .x\n family = \"binomial\", # specify type of glm (logistic)\n data = linelist)) %>% # dataset\n \n # tidy up each of the glm regression outputs from above\n map(\n .f = ~tidy(\n .x, \n exponentiate = TRUE, # exponentiate \n conf.int = TRUE)) %>% # return confidence intervals\n \n # collapse the list of regression outputs in to one data frame\n bind_rows() %>% \n \n # round all numeric columns\n mutate(across(where(is.numeric), round, digits = 2))\n\nThis time, the end object models is longer because it now represents the combined results of several univariate regressions. Click through to see all the rows of model.\n\n\n\n\n\n\nAs before, we can create a counts table from the linelist for each explanatory variable, bind it to models, and make a nice table. We begin with the variables, and iterate through them with map(). We iterate through a user-defined function which involves creating a counts table with dplyr functions. Then the results are combined and bound with the models model results.\n\n## for each explanatory variable\nuniv_tab_base <- explanatory_vars %>% \n map(.f = \n ~{linelist %>% ## begin with linelist\n group_by(outcome) %>% ## group data set by outcome\n count(.data[[.x]]) %>% ## produce counts for variable of interest\n pivot_wider( ## spread to wide format (as in cross-tabulation)\n names_from = outcome,\n values_from = n) %>% \n drop_na(.data[[.x]]) %>% ## drop rows with missings\n rename(\"variable\" = .x) %>% ## change variable of interest column to \"variable\"\n mutate(variable = as.character(variable))} ## convert to character, else non-dichotomous (categorical) variables come out as factor and cant be merged\n ) %>% \n \n ## collapse the list of count outputs in to one data frame\n bind_rows() %>% \n \n ## merge with the outputs of the regression \n bind_cols(., models) %>% \n \n ## only keep columns interested in \n select(term, 2:3, estimate, conf.low, conf.high, p.value) %>% \n \n ## round decimal places\n mutate(across(where(is.numeric), round, digits = 2))\n\nBelow is what the data frame looks like. See the page on Tables for presentation for ideas on how to further convert this table to pretty HTML output (e.g. with flextable).\n\n\n\n\n\n\n\n\n\n\ngtsummary package\nBelow we present the use of tbl_uvregression() from the gtsummary package. 
Just like in the page on Descriptive tables, gtsummary functions do a good job of running statistics and producing professional-looking outputs. This function produces a table of univariate regression results.\nWe select only the necessary columns from the linelist (explanatory variables and the outcome variable) and pipe them into tbl_uvregression(). We are going to run univariate regression on each of the columns we defined as explanatory_vars in the data Preperation section (gender, fever, chills, cough, aches, vomit, and age_cat).\nWithin the function itself, we provide the method = as glm (no quotes), the y = outcome column (outcome), specify to method.args = that we want to run logistic regression via family = binomial, and we tell it to exponentiate the results.\nThe output is HTML and contains the counts:\n\nuniv_tab <- linelist %>% \n dplyr::select(explanatory_vars, outcome) %>% ## select variables of interest\n\n tbl_uvregression( ## produce univariate table\n method = glm, ## define regression want to run (generalised linear model)\n y = outcome, ## define outcome variable\n method.args = list(family = binomial), ## define what type of glm want to run (logistic)\n exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)\n )\n\n## view univariate results table \nuniv_tab\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\nN\nOR\n1\n95% CI\n1\np-value\n\n\n\n\ngender\n4,167\n1.00\n0.88, 1.13\n>0.9\n\n\nfever\n4,167\n1.00\n0.85, 1.17\n>0.9\n\n\nchills\n4,167\n1.03\n0.89, 1.21\n0.7\n\n\ncough\n4,167\n1.15\n0.97, 1.37\n0.11\n\n\naches\n4,167\n0.93\n0.76, 1.14\n0.5\n\n\nvomit\n4,167\n1.09\n0.96, 1.23\n0.2\n\n\nage_cat\n4,167\n\n\n\n\n\n\n\n\n    0-4\n\n\n—\n—\n\n\n\n\n    5-9\n\n\n0.94\n0.77, 1.15\n0.5\n\n\n    10-14\n\n\n1.15\n0.93, 1.42\n0.2\n\n\n    15-19\n\n\n0.99\n0.80, 1.24\n>0.9\n\n\n    20-29\n\n\n1.03\n0.84, 1.26\n0.8\n\n\n    30-49\n\n\n1.07\n0.85, 1.33\n0.6\n\n\n    50-69\n\n\n0.68\n0.41, 1.13\n0.13\n\n\n    70+\n\n\n0.53\n0.07, 3.20\n0.5\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\n\n\n19.2.1 Cross-tabulation\nThe gtsummary package also allows us to quickly and easily create tables of counts. 
This can be useful for quickly summarising the data, and putting it in context with the regression we have carried out.\n\n#Carry out our regression\nuniv_tab <- linelist %>% \n dplyr::select(explanatory_vars, outcome) %>% ## select variables of interest\n\n tbl_uvregression( ## produce univariate table\n method = glm, ## define regression want to run (generalised linear model)\n y = outcome, ## define outcome variable\n method.args = list(family = binomial), ## define what type of glm want to run (logistic)\n exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)\n )\n\n#Create our cross tabulation\ncross_tab <- linelist %>%\n dplyr::select(explanatory_vars, outcome) %>% ## select variables of interest\n tbl_summary(by = outcome) ## create summary table\n\ntbl_merge(tbls = list(cross_tab,\n univ_tab),\n tab_spanner = c(\"Summary\", \"Univariate regression\"))\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\n\nSummary\n\n\nUnivariate regression\n\n\n\n0\nN = 1,825\n1\n1\nN = 2,342\n1\nN\nOR\n2\n95% CI\n2\np-value\n\n\n\n\ngender\n916 (50%)\n1,174 (50%)\n4,167\n1.00\n0.88, 1.13\n>0.9\n\n\nfever\n1,485 (81%)\n1,906 (81%)\n4,167\n1.00\n0.85, 1.17\n>0.9\n\n\nchills\n353 (19%)\n465 (20%)\n4,167\n1.03\n0.89, 1.21\n0.7\n\n\ncough\n1,553 (85%)\n2,033 (87%)\n4,167\n1.15\n0.97, 1.37\n0.11\n\n\naches\n189 (10%)\n228 (9.7%)\n4,167\n0.93\n0.76, 1.14\n0.5\n\n\nvomit\n894 (49%)\n1,198 (51%)\n4,167\n1.09\n0.96, 1.23\n0.2\n\n\nage_cat\n\n\n\n\n4,167\n\n\n\n\n\n\n\n\n    0-4\n338 (19%)\n427 (18%)\n\n\n—\n—\n\n\n\n\n    5-9\n365 (20%)\n433 (18%)\n\n\n0.94\n0.77, 1.15\n0.5\n\n\n    10-14\n273 (15%)\n396 (17%)\n\n\n1.15\n0.93, 1.42\n0.2\n\n\n    15-19\n238 (13%)\n299 (13%)\n\n\n0.99\n0.80, 1.24\n>0.9\n\n\n    20-29\n345 (19%)\n448 (19%)\n\n\n1.03\n0.84, 1.26\n0.8\n\n\n    30-49\n228 (12%)\n307 (13%)\n\n\n1.07\n0.85, 1.33\n0.6\n\n\n    50-69\n35 (1.9%)\n30 (1.3%)\n\n\n0.68\n0.41, 1.13\n0.13\n\n\n    70+\n3 (0.2%)\n2 (<0.1%)\n\n\n0.53\n0.07, 3.20\n0.5\n\n\n\n1\nn (%)\n\n\n2\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\nThere are many modifications you can make to this table output, such as adjusting the text labels, bolding rows by their p-value, etc. See tutorials here and elsewhere online.", "crumbs": [ "Analysis", "19  Univariate and multivariable regression" @@ -1715,7 +1715,7 @@ "href": "new_pages/regression.html#stratified", "title": "19  Univariate and multivariable regression", "section": "19.3 Stratified", - "text": "19.3 Stratified\nStratified analysis is currently still being worked on for gtsummary, this page will be updated in due course.", + "text": "19.3 Stratified\nHere we define stratified regression as the process of carrying out separate regression analyses on different “groups” of data.\nSometimes in your analysis, you will want to investigate whether or not there are different relationships between an outcome and variables, by different strata. This could be something like, a difference in gender, age group, or source of infection.\nTo do this, you will want to split your dataset into the strata of interest. 
For example, creating two separate datasets, one where gender == \"f\" and one where gender == \"m\", would be done by:\n\nf_linelist <- linelist %>%\n  filter(gender == 0) %>%      ## subset to only where the gender == \"f\"\n  dplyr::select(explanatory_vars, outcome)   ## select variables of interest\n  \nm_linelist <- linelist %>%\n  filter(gender == 1) %>%      ## subset to only where the gender == \"m\"\n  dplyr::select(explanatory_vars, outcome)   ## select variables of interest\n\nOnce this has been done, you can carry out your regression in either base R or gtsummary.\n\n19.3.1 base R\nTo carry this out in base R, you run two separate regressions, one where gender == \"f\" and one where gender == \"m\".\n\n#Run model for f\nf_model <- glm(outcome ~ vomit, family = \"binomial\", data = f_linelist) %>% \n  tidy(exponentiate = TRUE, conf.int = TRUE) %>%        # exponentiate and produce CIs\n  mutate(across(where(is.numeric), round, digits = 2)) %>%  # round all numeric columns\n  mutate(gender = \"f\")   # create a column which identifies these results as using the f dataset\n  \n#Run model for m\nm_model <- glm(outcome ~ vomit, family = \"binomial\", data = m_linelist) %>% \n  tidy(exponentiate = TRUE, conf.int = TRUE) %>%        # exponentiate and produce CIs\n  mutate(across(where(is.numeric), round, digits = 2)) %>%  # round all numeric columns\n  mutate(gender = \"m\")   # create a column which identifies these results as using the m dataset\n\n#Combine the results\nrbind(f_model,\n      m_model)\n\n# A tibble: 4 × 8\n  term        estimate std.error statistic p.value conf.low conf.high gender\n  <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl> <chr> \n1 (Intercept)     1.25      0.06      3.56    0        1.11      1.42 f     \n2 vomit           1.05      0.09      0.58    0.56     0.89      1.25 f     \n3 (Intercept)     1.21      0.06      3.04    0        1.07      1.36 m     \n4 vomit           1.13      0.09      1.38    0.17     0.95      1.34 m     \n\n\n\n\n19.3.2 gtsummary\nThe same approach is repeated using gtsummary; however, it is easier to produce publication-ready tables with gtsummary and to compare the two tables with the function tbl_merge().\n\n#Run model for f\nf_model_gt <- f_linelist %>% \n  dplyr::select(vomit, outcome) %>% ## select variables of interest\n  tbl_uvregression(                         ## produce univariate table\n    method = glm,                           ## define regression want to run (generalised linear model)\n    y = outcome,                            ## define outcome variable\n    method.args = list(family = binomial),  ## define what type of glm want to run (logistic)\n    exponentiate = TRUE                     ## exponentiate to produce odds ratios (rather than log odds)\n  )\n\n#Run model for m\nm_model_gt <- m_linelist %>% \n  dplyr::select(vomit, outcome) %>% ## select variables of interest\n  tbl_uvregression(                         ## produce univariate table\n    method = glm,                           ## define regression want to run (generalised linear model)\n    y = outcome,                            ## define outcome variable\n    method.args = list(family = binomial),  ## define what type of glm want to run (logistic)\n    exponentiate = TRUE                     ## exponentiate to produce odds ratios (rather than log odds)\n  )\n\n#Combine gtsummary tables\nf_and_m_table <- tbl_merge(\n  tbls = list(f_model_gt,\n              m_model_gt),\n  tab_spanner = c(\"Female\",\n                  \"Male\")\n)\n\n#Print\nf_and_m_table\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\n\nFemale\n\n\nMale\n\n\n\nN\nOR\n1\n95% CI\n1\np-value\nN\nOR\n1\n95% CI\n1\np-value\n\n\n\n\nvomit\n2,077\n1.05\n0.89, 1.25\n0.6\n2,090\n1.13\n0.95, 1.34\n0.2\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval",
    "crumbs": [
      "Analysis",
      "19  Univariate and multivariable regression"
    ]
  },
  {
    "objectID": "new_pages/regression.html#multivariable",
    "href": "new_pages/regression.html#multivariable",
    "title": "19  Univariate and multivariable regression",
    "section": 
"19.4 Multivariable", - "text": "19.4 Multivariable\nFor multivariable analysis, we again present two approaches:\n\nglm() and tidy()\n\ngtsummary package\n\nThe workflow is similar for each and only the last step of pulling together a final table is different.\n\nConduct multivariable\nHere we use glm() but add more variables to the right side of the equation, separated by plus symbols (+).\nTo run the model with all of our explanatory variables we would run:\n\nmv_reg <- glm(outcome ~ gender + fever + chills + cough + aches + vomit + age_cat, family = \"binomial\", data = linelist)\n\nsummary(mv_reg)\n\n\nCall:\nglm(formula = outcome ~ gender + fever + chills + cough + aches + \n vomit + age_cat, family = \"binomial\", data = linelist)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|)\n(Intercept) 0.069054 0.131726 0.524 0.600\ngender 0.002448 0.065133 0.038 0.970\nfever 0.004309 0.080522 0.054 0.957\nchills 0.034112 0.078924 0.432 0.666\ncough 0.138584 0.089909 1.541 0.123\naches -0.070705 0.104078 -0.679 0.497\nvomit 0.086098 0.062618 1.375 0.169\nage_cat5-9 -0.063562 0.101851 -0.624 0.533\nage_cat10-14 0.136372 0.107275 1.271 0.204\nage_cat15-19 -0.011074 0.113640 -0.097 0.922\nage_cat20-29 0.026552 0.102780 0.258 0.796\nage_cat30-49 0.059569 0.116402 0.512 0.609\nage_cat50-69 -0.388964 0.262384 -1.482 0.138\nage_cat70+ -0.647443 0.917375 -0.706 0.480\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5700.2 on 4153 degrees of freedom\nAIC: 5728.2\n\nNumber of Fisher Scoring iterations: 4\n\n\nIf you want to include two variables and an interaction between them you can separate them with an asterisk * instead of a +. Separate them with a colon : if you are only specifying the interaction. For example:\n\nglm(outcome ~ gender + age_cat * fever, family = \"binomial\", data = linelist)\n\nOptionally, you can use this code to leverage the pre-defined vector of column names and re-create the above command using str_c(). This might be useful if your explanatory variable names are changing, or you don’t want to type them all out again.\n\n## run a regression with all variables of interest \nmv_reg <- explanatory_vars %>% ## begin with vector of explanatory column names\n str_c(collapse = \"+\") %>% ## combine all names of the variables of interest separated by a plus\n str_c(\"outcome ~ \", .) %>% ## combine the names of variables of interest with outcome in formula style\n glm(family = \"binomial\", ## define type of glm as logistic,\n data = linelist) ## define your dataset\n\n\nBuilding the model\nYou can build your model step-by-step, saving various models that include certain explanatory variables. You can compare these models with likelihood-ratio tests using lrtest() from the package lmtest, as below:\nNOTE: Using base anova(model1, model2, test = \"Chisq) produces the same results \n\nmodel1 <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist)\nmodel2 <- glm(outcome ~ age_cat + gender, family = \"binomial\", data = linelist)\n\nlmtest::lrtest(model1, model2)\n\nLikelihood ratio test\n\nModel 1: outcome ~ age_cat\nModel 2: outcome ~ age_cat + gender\n #Df LogLik Df Chisq Pr(>Chisq)\n1 8 -2852.6 \n2 9 -2852.6 1 2e-04 0.9883\n\n\nAnother option is to take the model object and apply the step() function from the stats package. 
Specify which variable selection direction you want use when building the model.\n\n## choose a model using forward selection based on AIC\n## you can also do \"backward\" or \"both\" by adjusting the direction\nfinal_mv_reg <- mv_reg %>%\n step(direction = \"forward\", trace = FALSE)\n\nYou can also turn off scientific notation in your R session, for clarity:\n\noptions(scipen=999)\n\nAs described in the section on univariate analysis, pass the model output to tidy() to exponentiate the log odds and CIs. Finally we round all numeric columns to two decimal places. Scroll through to see all the rows.\n\nmv_tab_base <- final_mv_reg %>% \n broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>% ## get a tidy dataframe of estimates \n mutate(across(where(is.numeric), round, digits = 2)) ## round \n\nHere is what the resulting data frame looks like:\n\n\n\n\n\n\n\n\n\n\nCombine univariate and multivariable\n\nCombine with gtsummary\nThe gtsummary package provides the tbl_regression() function, which will take the outputs from a regression (glm() in this case) and produce an nice summary table.\n\n## show results table of final regression \nmv_tab <- tbl_regression(final_mv_reg, exponentiate = TRUE)\n\nLet’s see the table:\n\nmv_tab\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\nOR\n1\n95% CI\n1\np-value\n\n\n\n\ngender\n1.00\n0.88, 1.14\n>0.9\n\n\nfever\n1.00\n0.86, 1.18\n>0.9\n\n\nchills\n1.03\n0.89, 1.21\n0.7\n\n\ncough\n1.15\n0.96, 1.37\n0.12\n\n\naches\n0.93\n0.76, 1.14\n0.5\n\n\nvomit\n1.09\n0.96, 1.23\n0.2\n\n\nage_cat\n\n\n\n\n\n\n\n\n    0-4\n—\n—\n\n\n\n\n    5-9\n0.94\n0.77, 1.15\n0.5\n\n\n    10-14\n1.15\n0.93, 1.41\n0.2\n\n\n    15-19\n0.99\n0.79, 1.24\n>0.9\n\n\n    20-29\n1.03\n0.84, 1.26\n0.8\n\n\n    30-49\n1.06\n0.85, 1.33\n0.6\n\n\n    50-69\n0.68\n0.40, 1.13\n0.14\n\n\n    70+\n0.52\n0.07, 3.19\n0.5\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\nYou can also combine several different output tables produced by gtsummary with the tbl_merge() function. 
We now combine the multivariable results with the gtsummary univariate results that we created above:\n\n## combine with univariate results \ntbl_merge(\n tbls = list(univ_tab, mv_tab), # combine\n tab_spanner = c(\"**Univariate**\", \"**Multivariable**\")) # set header names\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\n\nUnivariate\n\n\nMultivariable\n\n\n\nN\nOR\n1\n95% CI\n1\np-value\nOR\n1\n95% CI\n1\np-value\n\n\n\n\ngender\n4,167\n1.00\n0.88, 1.13\n>0.9\n1.00\n0.88, 1.14\n>0.9\n\n\nfever\n4,167\n1.00\n0.85, 1.17\n>0.9\n1.00\n0.86, 1.18\n>0.9\n\n\nchills\n4,167\n1.03\n0.89, 1.21\n0.7\n1.03\n0.89, 1.21\n0.7\n\n\ncough\n4,167\n1.15\n0.97, 1.37\n0.11\n1.15\n0.96, 1.37\n0.12\n\n\naches\n4,167\n0.93\n0.76, 1.14\n0.5\n0.93\n0.76, 1.14\n0.5\n\n\nvomit\n4,167\n1.09\n0.96, 1.23\n0.2\n1.09\n0.96, 1.23\n0.2\n\n\nage_cat\n4,167\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    0-4\n\n\n—\n—\n\n\n—\n—\n\n\n\n\n    5-9\n\n\n0.94\n0.77, 1.15\n0.5\n0.94\n0.77, 1.15\n0.5\n\n\n    10-14\n\n\n1.15\n0.93, 1.42\n0.2\n1.15\n0.93, 1.41\n0.2\n\n\n    15-19\n\n\n0.99\n0.80, 1.24\n>0.9\n0.99\n0.79, 1.24\n>0.9\n\n\n    20-29\n\n\n1.03\n0.84, 1.26\n0.8\n1.03\n0.84, 1.26\n0.8\n\n\n    30-49\n\n\n1.07\n0.85, 1.33\n0.6\n1.06\n0.85, 1.33\n0.6\n\n\n    50-69\n\n\n0.68\n0.41, 1.13\n0.13\n0.68\n0.40, 1.13\n0.14\n\n\n    70+\n\n\n0.53\n0.07, 3.20\n0.5\n0.52\n0.07, 3.19\n0.5\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\n\n\nCombine with dplyr\nAn alternative way of combining the glm()/tidy() univariate and multivariable outputs is with the dplyr join functions.\n\nJoin the univariate results from earlier (univ_tab_base, which contains counts) with the tidied multivariable results mv_tab_base\n\nUse select() to keep only the columns we want, specify their order, and re-name them\n\nUse round() with two decimal places on all the column that are class Double\n\n\n## combine univariate and multivariable tables \nleft_join(univ_tab_base, mv_tab_base, by = \"term\") %>% \n ## choose columns and rename them\n select( # new name = old name\n \"characteristic\" = term, \n \"recovered\" = \"0\", \n \"dead\" = \"1\", \n \"univ_or\" = estimate.x, \n \"univ_ci_low\" = conf.low.x, \n \"univ_ci_high\" = conf.high.x,\n \"univ_pval\" = p.value.x, \n \"mv_or\" = estimate.y, \n \"mvv_ci_low\" = conf.low.y, \n \"mv_ci_high\" = conf.high.y,\n \"mv_pval\" = p.value.y \n ) %>% \n mutate(across(where(is.double), round, 2)) \n\n# A tibble: 20 × 11\n characteristic recovered dead univ_or univ_ci_low univ_ci_high univ_pval\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n 1 (Intercept) 909 1168 1.28 1.18 1.4 0 \n 2 gender 916 1174 1 0.88 1.13 0.97\n 3 (Intercept) 340 436 1.28 1.11 1.48 0 \n 4 fever 1485 1906 1 0.85 1.17 0.99\n 5 (Intercept) 1472 1877 1.28 1.19 1.37 0 \n 6 chills 353 465 1.03 0.89 1.21 0.68\n 7 (Intercept) 272 309 1.14 0.97 1.34 0.13\n 8 cough 1553 2033 1.15 0.97 1.37 0.11\n 9 (Intercept) 1636 2114 1.29 1.21 1.38 0 \n10 aches 189 228 0.93 0.76 1.14 0.51\n11 (Intercept) 931 1144 1.23 1.13 1.34 0 \n12 vomit 894 1198 1.09 0.96 1.23 0.17\n13 (Intercept) 338 427 1.26 1.1 1.46 0 \n14 age_cat5-9 365 433 0.94 0.77 1.15 0.54\n15 age_cat10-14 273 396 1.15 0.93 1.42 0.2 \n16 age_cat15-19 238 299 0.99 0.8 1.24 0.96\n17 age_cat20-29 345 448 1.03 0.84 1.26 0.79\n18 age_cat30-49 228 307 1.07 0.85 1.33 0.58\n19 age_cat50-69 35 30 0.68 0.41 1.13 0.13\n20 age_cat70+ 3 2 0.53 0.07 3.2 0.49\n# ℹ 4 more variables: mv_or <dbl>, mvv_ci_low <dbl>, mv_ci_high <dbl>,\n# mv_pval <dbl>", + "text": "19.4 Multivariable\nFor multivariable analysis, we 
again present two approaches:\n\nglm() and tidy().\n\ngtsummary package.\n\nThe workflow is similar for each and only the last step of pulling together a final table is different.\n\nConduct multivariable\nHere we use glm() but add more variables to the right side of the equation, separated by plus symbols (+).\nTo run the model with all of our explanatory variables we would run:\n\nmv_reg <- glm(outcome ~ gender + fever + chills + cough + aches + vomit + age_cat, family = \"binomial\", data = linelist)\n\nsummary(mv_reg)\n\n\nCall:\nglm(formula = outcome ~ gender + fever + chills + cough + aches + \n vomit + age_cat, family = \"binomial\", data = linelist)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|)\n(Intercept) 0.069054 0.131726 0.524 0.600\ngender 0.002448 0.065133 0.038 0.970\nfever 0.004309 0.080522 0.054 0.957\nchills 0.034112 0.078924 0.432 0.666\ncough 0.138584 0.089909 1.541 0.123\naches -0.070705 0.104078 -0.679 0.497\nvomit 0.086098 0.062618 1.375 0.169\nage_cat5-9 -0.063562 0.101851 -0.624 0.533\nage_cat10-14 0.136372 0.107275 1.271 0.204\nage_cat15-19 -0.011074 0.113640 -0.097 0.922\nage_cat20-29 0.026552 0.102780 0.258 0.796\nage_cat30-49 0.059569 0.116402 0.512 0.609\nage_cat50-69 -0.388964 0.262384 -1.482 0.138\nage_cat70+ -0.647443 0.917375 -0.706 0.480\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5700.2 on 4153 degrees of freedom\nAIC: 5728.2\n\nNumber of Fisher Scoring iterations: 4\n\n\nIf you want to include two variables and an interaction between them you can separate them with an asterisk * instead of a +. Separate them with a colon : if you are only specifying the interaction. For example:\n\nglm(outcome ~ gender + age_cat * fever, family = \"binomial\", data = linelist)\n\nOptionally, you can use this code to leverage the pre-defined vector of column names and re-create the above command using str_c(). This might be useful if your explanatory variable names are changing, or you don’t want to type them all out again.\n\n## run a regression with all variables of interest \nmv_reg <- explanatory_vars %>% ## begin with vector of explanatory column names\n str_c(collapse = \"+\") %>% ## combine all names of the variables of interest separated by a plus\n str_c(\"outcome ~ \", .) %>% ## combine the names of variables of interest with outcome in formula style\n glm(family = \"binomial\", ## define type of glm as logistic,\n data = linelist) ## define your dataset\n\n\nBuilding the model\nYou can build your model step-by-step, saving various models that include certain explanatory variables. You can compare these models with likelihood-ratio tests using lrtest() from the package lmtest, as below:\nNOTE: Using base anova(model1, model2, test = \"Chisq) produces the same results \n\nmodel1 <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist)\nmodel2 <- glm(outcome ~ age_cat + gender, family = \"binomial\", data = linelist)\n\nlmtest::lrtest(model1, model2)\n\nLikelihood ratio test\n\nModel 1: outcome ~ age_cat\nModel 2: outcome ~ age_cat + gender\n #Df LogLik Df Chisq Pr(>Chisq)\n1 8 -2852.6 \n2 9 -2852.6 1 2e-04 0.9883\n\n\nAnother option is to take the model object and apply the step() function from the stats package. 
Specify which variable selection direction you want use when building the model.\n\n## choose a model using forward selection based on AIC\n## you can also do \"backward\" or \"both\" by adjusting the direction\nfinal_mv_reg <- mv_reg %>%\n step(direction = \"forward\", trace = FALSE)\n\nYou can also turn off scientific notation in your R session, for clarity:\n\noptions(scipen=999)\n\nAs described in the section on univariate analysis, pass the model output to tidy() to exponentiate the log odds and CIs. Finally we round all numeric columns to two decimal places. Scroll through to see all the rows.\n\nmv_tab_base <- final_mv_reg %>% \n broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>% ## get a tidy dataframe of estimates \n mutate(across(where(is.numeric), round, digits = 2)) ## round \n\nHere is what the resulting data frame looks like:\n\n\n\n\n\n\n\n\n\n\nCombine univariate and multivariable\n\nCombine with gtsummary\nThe gtsummary package provides the tbl_regression() function, which will take the outputs from a regression (glm() in this case) and produce a nice summary table.\n\n## show results table of final regression \nmv_tab <- tbl_regression(final_mv_reg, exponentiate = TRUE)\n\nLet’s see the table:\n\nmv_tab\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\nOR\n1\n95% CI\n1\np-value\n\n\n\n\ngender\n1.00\n0.88, 1.14\n>0.9\n\n\nfever\n1.00\n0.86, 1.18\n>0.9\n\n\nchills\n1.03\n0.89, 1.21\n0.7\n\n\ncough\n1.15\n0.96, 1.37\n0.12\n\n\naches\n0.93\n0.76, 1.14\n0.5\n\n\nvomit\n1.09\n0.96, 1.23\n0.2\n\n\nage_cat\n\n\n\n\n\n\n\n\n    0-4\n—\n—\n\n\n\n\n    5-9\n0.94\n0.77, 1.15\n0.5\n\n\n    10-14\n1.15\n0.93, 1.41\n0.2\n\n\n    15-19\n0.99\n0.79, 1.24\n>0.9\n\n\n    20-29\n1.03\n0.84, 1.26\n0.8\n\n\n    30-49\n1.06\n0.85, 1.33\n0.6\n\n\n    50-69\n0.68\n0.40, 1.13\n0.14\n\n\n    70+\n0.52\n0.07, 3.19\n0.5\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\nYou can also combine several different output tables produced by gtsummary with the tbl_merge() function. 
We now combine the multivariable results with the gtsummary univariate results that we created above:\n\n## combine with univariate results \ntbl_merge(\n tbls = list(univ_tab, mv_tab), # combine\n tab_spanner = c(\"**Univariate**\", \"**Multivariable**\")) # set header names\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\n\nUnivariate\n\n\nMultivariable\n\n\n\nN\nOR\n1\n95% CI\n1\np-value\nOR\n1\n95% CI\n1\np-value\n\n\n\n\ngender\n4,167\n1.00\n0.88, 1.13\n>0.9\n1.00\n0.88, 1.14\n>0.9\n\n\nfever\n4,167\n1.00\n0.85, 1.17\n>0.9\n1.00\n0.86, 1.18\n>0.9\n\n\nchills\n4,167\n1.03\n0.89, 1.21\n0.7\n1.03\n0.89, 1.21\n0.7\n\n\ncough\n4,167\n1.15\n0.97, 1.37\n0.11\n1.15\n0.96, 1.37\n0.12\n\n\naches\n4,167\n0.93\n0.76, 1.14\n0.5\n0.93\n0.76, 1.14\n0.5\n\n\nvomit\n4,167\n1.09\n0.96, 1.23\n0.2\n1.09\n0.96, 1.23\n0.2\n\n\nage_cat\n4,167\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    0-4\n\n\n—\n—\n\n\n—\n—\n\n\n\n\n    5-9\n\n\n0.94\n0.77, 1.15\n0.5\n0.94\n0.77, 1.15\n0.5\n\n\n    10-14\n\n\n1.15\n0.93, 1.42\n0.2\n1.15\n0.93, 1.41\n0.2\n\n\n    15-19\n\n\n0.99\n0.80, 1.24\n>0.9\n0.99\n0.79, 1.24\n>0.9\n\n\n    20-29\n\n\n1.03\n0.84, 1.26\n0.8\n1.03\n0.84, 1.26\n0.8\n\n\n    30-49\n\n\n1.07\n0.85, 1.33\n0.6\n1.06\n0.85, 1.33\n0.6\n\n\n    50-69\n\n\n0.68\n0.41, 1.13\n0.13\n0.68\n0.40, 1.13\n0.14\n\n\n    70+\n\n\n0.53\n0.07, 3.20\n0.5\n0.52\n0.07, 3.19\n0.5\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\n\n\nCombine with dplyr\nAn alternative way of combining the glm()/tidy() univariate and multivariable outputs is with the dplyr join functions.\n\nJoin the univariate results from earlier (univ_tab_base, which contains counts) with the tidied multivariable results mv_tab_base.\n\nUse select() to keep only the columns we want, specify their order, and re-name them.\n\nUse round() with two decimal places on all the column that are class Double.\n\n\n## combine univariate and multivariable tables \nleft_join(univ_tab_base, mv_tab_base, by = \"term\") %>% \n ## choose columns and rename them\n select( # new name = old name\n \"characteristic\" = term, \n \"recovered\" = \"0\", \n \"dead\" = \"1\", \n \"univ_or\" = estimate.x, \n \"univ_ci_low\" = conf.low.x, \n \"univ_ci_high\" = conf.high.x,\n \"univ_pval\" = p.value.x, \n \"mv_or\" = estimate.y, \n \"mvv_ci_low\" = conf.low.y, \n \"mv_ci_high\" = conf.high.y,\n \"mv_pval\" = p.value.y \n ) %>% \n mutate(across(where(is.double), round, 2)) \n\n# A tibble: 20 × 11\n characteristic recovered dead univ_or univ_ci_low univ_ci_high univ_pval\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n 1 (Intercept) 909 1168 1.28 1.18 1.4 0 \n 2 gender 916 1174 1 0.88 1.13 0.97\n 3 (Intercept) 340 436 1.28 1.11 1.48 0 \n 4 fever 1485 1906 1 0.85 1.17 0.99\n 5 (Intercept) 1472 1877 1.28 1.19 1.37 0 \n 6 chills 353 465 1.03 0.89 1.21 0.68\n 7 (Intercept) 272 309 1.14 0.97 1.34 0.13\n 8 cough 1553 2033 1.15 0.97 1.37 0.11\n 9 (Intercept) 1636 2114 1.29 1.21 1.38 0 \n10 aches 189 228 0.93 0.76 1.14 0.51\n11 (Intercept) 931 1144 1.23 1.13 1.34 0 \n12 vomit 894 1198 1.09 0.96 1.23 0.17\n13 (Intercept) 338 427 1.26 1.1 1.46 0 \n14 age_cat5-9 365 433 0.94 0.77 1.15 0.54\n15 age_cat10-14 273 396 1.15 0.93 1.42 0.2 \n16 age_cat15-19 238 299 0.99 0.8 1.24 0.96\n17 age_cat20-29 345 448 1.03 0.84 1.26 0.79\n18 age_cat30-49 228 307 1.07 0.85 1.33 0.58\n19 age_cat50-69 35 30 0.68 0.41 1.13 0.13\n20 age_cat70+ 3 2 0.53 0.07 3.2 0.49\n# ℹ 4 more variables: mv_or <dbl>, mvv_ci_low <dbl>, mv_ci_high <dbl>,\n# mv_pval <dbl>", "crumbs": [ "Analysis", "19  Univariate and multivariable 
regression" @@ -1737,7 +1737,7 @@ "href": "new_pages/regression.html#forest-plot", "title": "19  Univariate and multivariable regression", "section": "19.5 Forest plot", - "text": "19.5 Forest plot\nThis section shows how to produce a plot with the outputs of your regression. There are two options, you can build a plot yourself using ggplot2 or use a meta-package called easystats (a package that includes many packages).\nSee the page on ggplot basics if you are unfamiliar with the ggplot2 plotting package.\n\n\nggplot2 package\nYou can build a forest plot with ggplot() by plotting elements of the multivariable regression results. Add the layers of the plots using these “geoms”:\n\nestimates with geom_point()\n\nconfidence intervals with geom_errorbar()\n\na vertical line at OR = 1 with geom_vline()\n\nBefore plotting, you may want to use fct_relevel() from the forcats package to set the order of the variables/levels on the y-axis. ggplot() may display them in alpha-numeric order which would not work well for these age category values (“30” would appear before “5”). See the page on Factors for more details.\n\n## remove the intercept term from your multivariable results\nmv_tab_base %>% \n \n #set order of levels to appear along y-axis\n mutate(term = fct_relevel(\n term,\n \"vomit\", \"gender\", \"fever\", \"cough\", \"chills\", \"aches\",\n \"age_cat5-9\", \"age_cat10-14\", \"age_cat15-19\", \"age_cat20-29\",\n \"age_cat30-49\", \"age_cat50-69\", \"age_cat70+\")) %>%\n \n # remove \"intercept\" row from plot\n filter(term != \"(Intercept)\") %>% \n \n ## plot with variable on the y axis and estimate (OR) on the x axis\n ggplot(aes(x = estimate, y = term)) +\n \n ## show the estimate as a point\n geom_point() + \n \n ## add in an error bar for the confidence intervals\n geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) + \n \n ## show where OR = 1 is for reference as a dashed line\n geom_vline(xintercept = 1, linetype = \"dashed\")\n\n\n\n\n\n\n\n\n\n\n\neasystats packages\nAn alternative, if you do not want to the fine level of control that ggplot2 provides, is to use a combination of easystats packages.\nThe function model_parameters() from the parameters package does the equivalent of the broom package function tidy(). The see package then accepts those outputs and creates a default forest plot as a ggplot() object.\n\npacman::p_load(easystats)\n\n## remove the intercept term from your multivariable results\nfinal_mv_reg %>% \n model_parameters(exponentiate = TRUE) %>% \n plot()", + "text": "19.5 Forest plot\nThis section shows how to produce a plot with the outputs of your regression. There are two options, you can build a plot yourself using ggplot2 or use a meta-package called easystats (a package that includes many packages).\nSee the page on ggplot basics if you are unfamiliar with the ggplot2 plotting package.\n\n\nggplot2 package\nYou can build a forest plot with ggplot() by plotting elements of the multivariable regression results. Add the layers of the plots using these “geoms”:\n\nestimates with geom_point().\n\nconfidence intervals with geom_errorbar().\n\na vertical line at OR = 1 with geom_vline().\n\nBefore plotting, you may want to use fct_relevel() from the forcats package to set the order of the variables/levels on the y-axis. ggplot() may display them in alpha-numeric order which would not work well for these age category values (“30” would appear before “5”). 
See the page on Factors for more details.\n\n## remove the intercept term from your multivariable results\nmv_tab_base %>% \n \n #set order of levels to appear along y-axis\n mutate(term = fct_relevel(\n term,\n \"vomit\", \"gender\", \"fever\", \"cough\", \"chills\", \"aches\",\n \"age_cat5-9\", \"age_cat10-14\", \"age_cat15-19\", \"age_cat20-29\",\n \"age_cat30-49\", \"age_cat50-69\", \"age_cat70+\")) %>%\n \n # remove \"intercept\" row from plot\n filter(term != \"(Intercept)\") %>% \n \n ## plot with variable on the y axis and estimate (OR) on the x axis\n ggplot(aes(x = estimate, y = term)) +\n \n ## show the estimate as a point\n geom_point() + \n \n ## add in an error bar for the confidence intervals\n geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) + \n \n ## show where OR = 1 is for reference as a dashed line\n geom_vline(xintercept = 1, linetype = \"dashed\")\n\n\n\n\n\n\n\n\n\n\n\neasystats packages\nAn alternative, if you do not want to the fine level of control that ggplot2 provides, is to use a combination of easystats packages.\nThe function model_parameters() from the parameters package does the equivalent of the broom package function tidy(). The see package then accepts those outputs and creates a default forest plot as a ggplot() object.\n\npacman::p_load(easystats)\n\n## remove the intercept term from your multivariable results\nfinal_mv_reg %>% \n model_parameters(exponentiate = TRUE) %>% \n plot()", "crumbs": [ "Analysis", "19  Univariate and multivariable regression" @@ -1747,8 +1747,8 @@ "objectID": "new_pages/regression.html#resources", "href": "new_pages/regression.html#resources", "title": "19  Univariate and multivariable regression", - "section": "19.6 Resources", - "text": "19.6 Resources\nThe content of this page was informed by these resources and vignettes online:\nLinear regression in R\ngtsummary\nUCLA stats page\nsthda stepwise regression", + "section": "19.7 Resources", + "text": "19.7 Resources\nThe content of this page was informed by these resources and vignettes online:\nLinear regression in R\ngtsummary\nUCLA stats page\nsthda stepwise regression", "crumbs": [ "Analysis", "19  Univariate and multivariable regression" @@ -1770,7 +1770,7 @@ "href": "new_pages/missing_data.html#preparation", "title": "20  Missing data", "section": "", - "text": "Load packages\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize p_load() from pacman, which installs the package if necessary and loads it for use. You can also load installed packages with library() from base R. See the page on R basics for more information on R packages.\n\npacman::p_load(\n rio, # import/export\n tidyverse, # data mgmt and viz\n naniar, # assess and visualize missingness\n mice # missing data imputation\n)\n\n\n\nImport data\nWe import the dataset of cases from a simulated Ebola epidemic. If you want to follow along, click to download the “clean” linelist (as .rds file). Import your data with the import() function from the rio package (it accepts many file types like .xlsx, .rds, .csv - see the Import and export page for details).\n\n# import the linelist\nlinelist <- import(\"linelist_cleaned.rds\")\n\nThe first 50 rows of the linelist are displayed below.\n\n\n\n\n\n\n\n\nConvert missing on import\nWhen importing your data, be aware of values that should be classified as missing. For example, 99, 999, “Missing”, blank cells (““), or cells with an empty space (” “). 
You can convert these to NA (R’s version of missing data) during the data import command.\nSee the page on importing page section on Missing data for details, as the exact syntax varies by file type.", + "text": "Load packages\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize p_load() from pacman, which installs the package if necessary and loads it for use. You can also load installed packages with library() from base R. See the page on R basics for more information on R packages.\n\npacman::p_load(\n rio, # import/export\n tidyverse, # data mgmt and viz\n naniar, # assess and visualize missingness\n mice # missing data imputation\n)\n\n\n\nImport data\nWe import the dataset of cases from a simulated Ebola epidemic. If you want to follow along, click to download the “clean” linelist (as .rds file). Import your data with the import() function from the rio package (it accepts many file types like .xlsx, .rds, .csv - see the Import and export page for details).\n\n\nWarning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.\n\n\n\n# import the linelist\nlinelist <- import(\"linelist_cleaned.rds\")\n\nThe first 50 rows of the linelist are displayed below.\n\n\n\n\n\n\n\n\nConvert missing on import\nWhen importing your data, be aware of values that should be classified as missing. For example, 99, 999, “Missing”, blank cells (““), or cells with an empty space (” “). You can convert these to NA (R’s version of missing data) during the data import command.\nSee the page on importing page section on Missing data for details, as the exact syntax varies by file type.", "crumbs": [ "Analysis", "20  Missing data" @@ -1781,7 +1781,7 @@ "href": "new_pages/missing_data.html#missing-values-in-r", "title": "20  Missing data", "section": "20.2 Missing values in R", - "text": "20.2 Missing values in R\nBelow we explore ways that missingness is presented and assessed in R, along with some adjacent values and functions.\n\nNA\nIn R, missing values are represented by a reserved (special) value - NA. Note that this is typed without quotes. “NA” is different and is just a normal character value (also a Beatles lyric from the song Hey Jude).\nYour data may have other ways of representing missingness, such as “99”, or “Missing”, or “Unknown” - you may even have empty character value “” which looks “blank”, or a single space ” “. Be aware of these and consider whether to convert them to NA during import or during data cleaning with na_if().\nIn your data cleaning, you may also want to convert the other way - changing all NA to “Missing” or similar with replace_na() or with fct_explicit_na() for factors.\n\n\nVersions of NA\nMost of the time, NA represents a missing value and everything works fine. However, in some circumstances you may encounter the need for variations of NA specific to an object class (character, numeric, etc). This will be rare, but you should be aware.\nThe typical scenario for this is when creating a new column with the dplyr function case_when(). As described in the Cleaning data and core functions page, this function evaluates every row in the data frame, assess whether the rows meets specified logical criteria (right side of the code), and assigns the correct new value (left side of the code). 
Importantly: all values on the right side must be the same class.\n\nlinelist <- linelist %>% \n \n # Create new \"age_years\" column from \"age\" column\n mutate(age_years = case_when(\n age_unit == \"years\" ~ age, # if age is given in years, assign original value\n age_unit == \"months\" ~ age/12, # if age is given in months, divide by 12\n is.na(age_unit) ~ age, # if age UNIT is missing, assume years\n TRUE ~ NA_real_)) # any other circumstance, assign missing\n\nIf you want NA on the right side, you may need to specify one of the special NA options listed below. If the other right side values are character, consider using “Missing” instead or otherwise use NA_character_. If they are all numeric, use NA_real_. If they are all dates or logical, you can use NA.\n\nNA - use for dates or logical TRUE/FALSE\nNA_character_ - use for characters\n\nNA_real_ - use for numeric\n\nAgain, it is not likely you will encounter these variations unless you are using case_when() to create a new column. See the R documentation on NA for more information.\n\n\nNULL\nNULL is another reserved value in R. It is the logical representation of a statement that is neither true nor false. It is returned by expressions or functions whose values are undefined. Generally do not assign NULL as a value, unless writing functions or perhaps writing a shiny app to return NULL in specific scenarios.\nNull-ness can be assessed using is.null() and conversion can made with as.null().\nSee this blog post on the difference between NULL and NA.\n\n\nNaN\nImpossible values are represented by the special value NaN. An example of this is when you force R to divide 0 by 0. You can assess this with is.nan(). You may also encounter complementary functions including is.infinite() and is.finite().\n\n\nInf\nInf represents an infinite value, such as when you divide a number by 0.\nAs an example of how this might impact your work: let’s say you have a vector/column z that contains these values: z <- c(1, 22, NA, Inf, NaN, 5)\nIf you want to use max() on the column to find the highest value, you can use the na.rm = TRUE to remove the NA from the calculation, but the Inf and NaN remain and Inf will be returned. To resolve this, you can use brackets [ ] and is.finite() to subset such that only finite values are used for the calculation: max(z[is.finite(z)]).\n\nz <- c(1, 22, NA, Inf, NaN, 5)\nmax(z) # returns NA\nmax(z, na.rm=T) # returns Inf\nmax(z[is.finite(z)]) # returns 22\n\n\n\nExamples\n\n\n\n\n\n\n\nR command\nOutcome\n\n\n\n\n5 / 0\nInf\n\n\n0 / 0\nNaN\n\n\n5 / NA\nNA\n\n\n5 / Inf |0NA - 5|NAInf / 5|Infclass(NA)| \"logical\"class(NaN)| \"numeric\"class(Inf)| \"numeric\"class(NULL)`\n“NULL”\n\n\n\n“NAs introduced by coercion” is a common warning message. This can happen if you attempt to make an illegal conversion like inserting a character value into a vector that is otherwise numeric.\n\nas.numeric(c(\"10\", \"20\", \"thirty\", \"40\"))\n\nWarning: NAs introduced by coercion\n\n\n[1] 10 20 NA 40\n\n\nNULL is ignored in a vector.\n\nmy_vector <- c(25, NA, 10, NULL) # define\nmy_vector # print\n\n[1] 25 NA 10\n\n\nVariance of one number results in NA.\n\nvar(22)\n\n[1] NA", + "text": "20.2 Missing values in R\nBelow we explore ways that missingness is presented and assessed in R, along with some adjacent values and functions.\n\nNA\nIn R, missing values are represented by a reserved (special) value - NA. Note that this is typed without quotes. 
“NA” is different and is just a normal character value (also a Beatles lyric from the song Hey Jude).\nYour data may have other ways of representing missingness, such as “99”, or “Missing”, or “Unknown” - you may even have empty character value “” which looks “blank”, or a single space ” “. Be aware of these and consider whether to convert them to NA during import or during data cleaning with na_if().\nIn your data cleaning, you may also want to convert the other way - changing all NA to “Missing” or similar with replace_na() or with fct_explicit_na() for factors.\n\n\nVersions of NA\nMost of the time, NA represents a missing value and everything works fine. However, in some circumstances you may encounter the need for variations of NA specific to an object class (character, numeric, etc). This will be rare, but you should be aware.\nThe typical scenario for this is when creating a new column with the dplyr function case_when(). As described in the Cleaning data and core functions page, this function evaluates every row in the data frame, assess whether the rows meets specified logical criteria (right side of the code), and assigns the correct new value (left side of the code). Importantly: all values on the right side must be the same class.\n\nlinelist <- linelist %>% \n \n # Create new \"age_years\" column from \"age\" column\n mutate(age_years = case_when(\n age_unit == \"years\" ~ age, # if age is given in years, assign original value\n age_unit == \"months\" ~ age/12, # if age is given in months, divide by 12\n is.na(age_unit) ~ age, # if age UNIT is missing, assume years\n TRUE ~ NA_real_)) # any other circumstance, assign missing\n\nIf you want NA on the right side, you may need to specify one of the special NA options listed below. If the other right side values are character, consider using “Missing” instead or otherwise use NA_character_. If they are all numeric, use NA_real_. If they are all dates or logical, you can use NA.\n\nNA - use for dates or logical TRUE/FALSE\nNA_character_ - use for characters\n\nNA_real_ - use for numeric\n\nAgain, it is not likely you will encounter these variations unless you are using case_when() to create a new column. See the R documentation on NA for more information.\n\n\nNULL\nNULL is another reserved value in R. It is the logical representation of a statement that is neither true nor false. It is returned by expressions or functions whose values are undefined. Generally do not assign NULL as a value, unless writing functions or perhaps writing a shiny app to return NULL in specific scenarios.\nNull-ness can be assessed using is.null() and conversion can made with as.null().\nSee this blog post on the difference between NULL and NA.\n\n\nNaN\nImpossible values are represented by the special value NaN. An example of this is when you force R to divide 0 by 0. You can assess this with is.nan(). You may also encounter complementary functions including is.infinite() and is.finite().\n\n\nInf\nInf represents an infinite value, such as when you divide a number by 0.\nAs an example of how this might impact your work: let’s say you have a vector/column z that contains these values: z <- c(1, 22, NA, Inf, NaN, 5).\nIf you want to use max() on the column to find the highest value, you can use the na.rm = TRUE to remove the NA from the calculation, but the Inf and NaN remain and Inf will be returned. 
To resolve this, you can use brackets [ ] and is.finite() to subset such that only finite values are used for the calculation: max(z[is.finite(z)]).\n\nz <- c(1, 22, NA, Inf, NaN, 5)\nmax(z)                           # returns NA\nmax(z, na.rm=T)                  # returns Inf\nmax(z[is.finite(z)])             # returns 22\n\n\n\nExamples\n\n\n\n\n\n\n\nR command\nOutcome\n\n\n\n\n5 / 0\nInf\n\n\n0 / 0\nNaN\n\n\n5 / NA\nNA\n\n\n5 / Inf\n0\n\n\nNA - 5\nNA\n\n\nInf / 5\nInf\n\n\nclass(NA)\n\"logical\"\n\n\nclass(NaN)\n\"numeric\"\n\n\nclass(Inf)\n\"numeric\"\n\n\nclass(NULL)\n\"NULL\"\n\n\n\n“NAs introduced by coercion” is a common warning message. This can happen if you attempt to make an illegal conversion like inserting a character value into a vector that is otherwise numeric.\n\nas.numeric(c(\"10\", \"20\", \"thirty\", \"40\"))\n\nWarning: NAs introduced by coercion\n\n\n[1] 10 20 NA 40\n\n\nNULL is ignored in a vector.\n\nmy_vector <- c(25, NA, 10, NULL) # define\nmy_vector                        # print\n\n[1] 25 NA 10\n\n\nVariance of one number results in NA.\n\nvar(22)\n\n[1] NA",
    "crumbs": [
      "Analysis",
      "20  Missing data"
    ]
  },
  {
    "objectID": "new_pages/missing_data.html#assess-missingness-in-a-data-frame",
    "href": "new_pages/missing_data.html#assess-missingness-in-a-data-frame",
    "title": "20  Missing data",
    "section": "20.4 Assess missingness in a data frame",
    "text": "20.4 Assess missingness in a data frame\nYou can use the package naniar to assess and visualize missingness in the data frame linelist.\n\n# install and/or load package\npacman::p_load(naniar)\n\n\nQuantifying missingness\nTo find the percent of all values that are missing use pct_miss(). Use n_miss() to get the number of missing values.\n\n# percent of ALL data frame values that are missing\npct_miss(linelist)\n\n[1] 6.688745\n\n\nThe two functions below return the percent of rows with any missing value, or that are entirely complete, respectively. Remember that NA means missing, and that `\"\" or \" \" will not be counted as missing.\n\n# Percent of rows with any value missing\npct_miss_case(linelist)   # use n_complete() for counts\n\n[1] 69.12364\n\n\n\n# Percent of rows that are complete (no values missing) \npct_complete_case(linelist) # use n_complete() for counts\n\n[1] 30.87636\n\n\n\n\nVisualizing missingness\nThe gg_miss_var() function will show you the number (or %) of missing values in each column. A few nuances:\n\nYou can add a column name (not in quote) to the argument facet = to see the plot by groups\n\nBy default, counts are shown instead of percents, change this with show_pct = TRUE\n\nYou can add axis and title labels as for a normal ggplot() with + labs(...)\n\n\ngg_miss_var(linelist, show_pct = TRUE)\n\n\n\n\n\n\n\n\nHere the data are piped %>% into the function. The facet = argument is also used to split the data.\n\nlinelist %>% \n  gg_miss_var(show_pct = TRUE, facet = outcome)\n\n\n\n\n\n\n\n\nYou can use vis_miss() to visualize the data frame as a heatmap, showing whether each value is missing or not. You can also select() certain columns from the data frame and provide only those columns to the function.\n\n# Heatplot of missingness across the entire data frame  \nvis_miss(linelist)\n\n\n\n\n\n\n\n\n\n\nExplore and visualize missingness relationships\nHow do you visualize something that is not there??? By default, ggplot() removes points with missing values from plots.\nnaniar offers a solution via geom_miss_point(). 
When creating a scatterplot of two columns, records with one of the values missing and the other value present are shown by setting the missing values to 10% lower than the lowest value in the column, and coloring them distinctly.\nIn the scatterplot below, the red dots are records where the value for one column is present but the value for the other column is missing. This allows you to see the distribution of missing values in relation to the non-missing values.\n\nggplot(\n data = linelist,\n mapping = aes(x = age_years, y = temp)) + \n geom_miss_point()\n\n\n\n\n\n\n\n\nTo assess missingness in the data frame stratified by another column, consider gg_miss_fct(), which returns a heatmap of percent missingness in the data frame by a factor/categorical (or date) column:\n\ngg_miss_fct(linelist, age_cat5)\n\nWarning: There was 1 warning in `mutate()`.\nℹ In argument: `age_cat5 = (function (x) ...`.\nCaused by warning:\n! `fct_explicit_na()` was deprecated in forcats 1.0.0.\nℹ Please use `fct_na_value_to_level()` instead.\nℹ The deprecated feature was likely used in the naniar package.\n Please report the issue at <https://github.com/njtierney/naniar/issues>.\n\n\n\n\n\n\n\n\n\nThis function can also be used with a date column to see how missingness has changed over time:\n\ngg_miss_fct(linelist, date_onset)\n\nWarning: Removed 29 rows containing missing values or values outside the scale range\n(`geom_tile()`).\n\n\n\n\n\n\n\n\n\n\n\n“Shadow” columns\nAnother way to visualize missingness in one column by values in a second column is using the “shadow” that naniar can create. bind_shadow() creates a binary NA/not NA column for every existing column, and binds all these new columns to the original dataset with the appendix “_NA”. This doubles the number of columns - see below:\n\nshadowed_linelist <- linelist %>% \n bind_shadow()\n\nnames(shadowed_linelist)\n\n [1] \"case_id\" \"generation\" \n [3] \"date_infection\" \"date_onset\" \n [5] \"date_hospitalisation\" \"date_outcome\" \n [7] \"outcome\" \"gender\" \n [9] \"age\" \"age_unit\" \n[11] \"age_years\" \"age_cat\" \n[13] \"age_cat5\" \"hospital\" \n[15] \"lon\" \"lat\" \n[17] \"infector\" \"source\" \n[19] \"wt_kg\" \"ht_cm\" \n[21] \"ct_blood\" \"fever\" \n[23] \"chills\" \"cough\" \n[25] \"aches\" \"vomit\" \n[27] \"temp\" \"time_admission\" \n[29] \"bmi\" \"days_onset_hosp\" \n[31] \"case_id_NA\" \"generation_NA\" \n[33] \"date_infection_NA\" \"date_onset_NA\" \n[35] \"date_hospitalisation_NA\" \"date_outcome_NA\" \n[37] \"outcome_NA\" \"gender_NA\" \n[39] \"age_NA\" \"age_unit_NA\" \n[41] \"age_years_NA\" \"age_cat_NA\" \n[43] \"age_cat5_NA\" \"hospital_NA\" \n[45] \"lon_NA\" \"lat_NA\" \n[47] \"infector_NA\" \"source_NA\" \n[49] \"wt_kg_NA\" \"ht_cm_NA\" \n[51] \"ct_blood_NA\" \"fever_NA\" \n[53] \"chills_NA\" \"cough_NA\" \n[55] \"aches_NA\" \"vomit_NA\" \n[57] \"temp_NA\" \"time_admission_NA\" \n[59] \"bmi_NA\" \"days_onset_hosp_NA\" \n\n\nThese “shadow” columns can be used to plot the proportion of values that are missing, by any another column.\nFor example, the plot below shows the proportion of records missing days_onset_hosp (number of days from symptom onset to hospitalisation), by that record’s value in date_hospitalisation. Essentially, you are plotting the density of the x-axis column, but stratifying the results (color =) by a shadow column of interest. 
This analysis works best if the x-axis is a numeric or date column.\n\nggplot(data = shadowed_linelist, # data frame with shadow columns\n mapping = aes(x = date_hospitalisation, # numeric or date column\n colour = age_years_NA)) + # shadow column of interest\n geom_density() # plots the density curves\n\n\n\n\n\n\n\n\nYou can also use these “shadow” columns to stratify a statistical summary, as shown below:\n\nlinelist %>%\n bind_shadow() %>% # create the shows cols\n group_by(date_outcome_NA) %>% # shadow col for stratifying\n summarise(across(\n .cols = age_years, # variable of interest for calculations\n .fns = list(\"mean\" = mean, # stats to calculate\n \"sd\" = sd,\n \"var\" = var,\n \"min\" = min,\n \"max\" = max), \n na.rm = TRUE)) # other arguments for the stat calculations\n\nWarning: There was 1 warning in `summarise()`.\nℹ In argument: `across(...)`.\nℹ In group 1: `date_outcome_NA = !NA`.\nCaused by warning:\n! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.\nSupply arguments directly to `.fns` through an anonymous function instead.\n\n # Previously\n across(a:b, mean, na.rm = TRUE)\n\n # Now\n across(a:b, \\(x) mean(x, na.rm = TRUE))\n\n\n# A tibble: 2 × 6\n date_outcome_NA age_years_mean age_years_sd age_years_var age_years_min\n <fct> <dbl> <dbl> <dbl> <dbl>\n1 !NA 16.0 12.6 158. 0\n2 NA 16.2 12.9 167. 0\n# ℹ 1 more variable: age_years_max <dbl>\n\n\nAn alternative way to plot the proportion of a column’s values that are missing over time is shown below. It does not involve naniar. This example shows percent of weekly observations that are missing).\n\nAggregate the data into a useful time unit (days, weeks, etc.), summarizing the proportion of observations with NA (and any other values of interest)\n\nPlot the proportion missing as a line using ggplot()\n\nBelow, we take the linelist, add a new column for week, group the data by week, and then calculate the percent of that week’s records where the value is missing. (note: if you want % of 7 days the calculation would be slightly different).\n\noutcome_missing <- linelist %>%\n mutate(week = lubridate::floor_date(date_onset, \"week\")) %>% # create new week column\n group_by(week) %>% # group the rows by week\n summarise( # summarize each week\n n_obs = n(), # number of records\n \n outcome_missing = sum(is.na(outcome) | outcome == \"\"), # number of records missing the value\n outcome_p_miss = outcome_missing / n_obs, # proportion of records missing the value\n \n outcome_dead = sum(outcome == \"Death\", na.rm=T), # number of records as dead\n outcome_p_dead = outcome_dead / n_obs) %>% # proportion of records as dead\n \n tidyr::pivot_longer(-week, names_to = \"statistic\") %>% # pivot all columns except week, to long format for ggplot\n filter(stringr::str_detect(statistic, \"_p_\")) # keep only the proportion values\n\nThen we plot the proportion missing as a line, by week. 
The ggplot basics page if you are unfamiliar with the ggplot2 plotting package.\n\nggplot(data = outcome_missing)+\n geom_line(\n mapping = aes(x = week, y = value, group = statistic, color = statistic),\n size = 2,\n stat = \"identity\")+\n labs(title = \"Weekly outcomes\",\n x = \"Week\",\n y = \"Proportion of weekly records\") + \n scale_color_discrete(\n name = \"\",\n labels = c(\"Died\", \"Missing outcome\"))+\n scale_y_continuous(breaks = c(seq(0,1,0.1)))+\n theme_minimal()+\n theme(legend.position = \"bottom\")", + "text": "20.4 Assess missingness in a data frame\nYou can use the package naniar to assess and visualize missingness in the data frame linelist.\n\n# install and/or load package\npacman::p_load(naniar)\n\n\nQuantifying missingness\nTo find the percent of all values that are missing use pct_miss(). Use n_miss() to get the number of missing values.\n\n# percent of ALL data frame values that are missing\npct_miss(linelist)\n\n[1] 6.688745\n\n\nThe two functions below return the percent of rows with any missing value, or that are entirely complete, respectively. Remember that NA means missing, and that `\"\" or \" \" will not be counted as missing.\n\n# Percent of rows with any value missing\npct_miss_case(linelist) # use n_complete() for counts\n\n[1] 69.12364\n\n\n\n# Percent of rows that are complete (no values missing) \npct_complete_case(linelist) # use n_complete() for counts\n\n[1] 30.87636\n\n\n\n\nVisualizing missingness\nThe gg_miss_var() function will show you the number (or %) of missing values in each column. A few nuances:\n\nYou can add a column name (not in quote) to the argument facet = to see the plot by groups.\n\nBy default, counts are shown instead of percents, change this with show_pct = TRUE.\n\nYou can add axis and title labels as for a normal ggplot() with + labs(...).\n\n\ngg_miss_var(linelist, show_pct = TRUE)\n\n\n\n\n\n\n\n\nHere the data are piped %>% into the function. The facet = argument is also used to split the data.\n\nlinelist %>% \n gg_miss_var(show_pct = TRUE, facet = outcome)\n\n\n\n\n\n\n\n\nYou can use vis_miss() to visualize the data frame as a heatmap, showing whether each value is missing or not. You can also select() certain columns from the data frame and provide only those columns to the function.\n\n# Heatplot of missingness across the entire data frame \nvis_miss(linelist)\n\n\n\n\n\n\n\n\n\n\nExplore and visualize missingness relationships\nHow do you visualize something that is not there? By default, ggplot() removes points with missing values from plots.\nnaniar offers a solution via geom_miss_point(). When creating a scatterplot of two columns, records with one of the values missing and the other value present are shown by setting the missing values to 10% lower than the lowest value in the column, and coloring them distinctly.\nIn the scatterplot below, the red dots are records where the value for one column is present but the value for the other column is missing. 
This allows you to see the distribution of missing values in relation to the non-missing values.\n\nggplot(\n data = linelist,\n mapping = aes(x = age_years, y = temp)\n ) + \n geom_miss_point()\n\n\n\n\n\n\n\n\nTo assess missingness in the data frame stratified by another column, consider gg_miss_fct(), which returns a heatmap of percent missingness in the data frame by a factor/categorical (or date) column:\n\ngg_miss_fct(linelist, age_cat5)\n\n\n\n\n\n\n\n\nThis function can also be used with a date column to see how missingness has changed over time:\n\ngg_miss_fct(linelist, date_onset)\n\nWarning: Removed 29 rows containing missing values or values outside the scale range\n(`geom_tile()`).\n\n\n\n\n\n\n\n\n\n\n\n“Shadow” columns\nAnother way to visualize missingness in one column by values in a second column is using the “shadow” that naniar can create. bind_shadow() creates a binary NA/not NA column for every existing column, and binds all these new columns to the original dataset with the appendix “_NA”. This doubles the number of columns - see below:\n\nshadowed_linelist <- linelist %>% \n bind_shadow()\n\nnames(shadowed_linelist)\n\n [1] \"case_id\" \"generation\" \n [3] \"date_infection\" \"date_onset\" \n [5] \"date_hospitalisation\" \"date_outcome\" \n [7] \"outcome\" \"gender\" \n [9] \"age\" \"age_unit\" \n[11] \"age_years\" \"age_cat\" \n[13] \"age_cat5\" \"hospital\" \n[15] \"lon\" \"lat\" \n[17] \"infector\" \"source\" \n[19] \"wt_kg\" \"ht_cm\" \n[21] \"ct_blood\" \"fever\" \n[23] \"chills\" \"cough\" \n[25] \"aches\" \"vomit\" \n[27] \"temp\" \"time_admission\" \n[29] \"bmi\" \"days_onset_hosp\" \n[31] \"case_id_NA\" \"generation_NA\" \n[33] \"date_infection_NA\" \"date_onset_NA\" \n[35] \"date_hospitalisation_NA\" \"date_outcome_NA\" \n[37] \"outcome_NA\" \"gender_NA\" \n[39] \"age_NA\" \"age_unit_NA\" \n[41] \"age_years_NA\" \"age_cat_NA\" \n[43] \"age_cat5_NA\" \"hospital_NA\" \n[45] \"lon_NA\" \"lat_NA\" \n[47] \"infector_NA\" \"source_NA\" \n[49] \"wt_kg_NA\" \"ht_cm_NA\" \n[51] \"ct_blood_NA\" \"fever_NA\" \n[53] \"chills_NA\" \"cough_NA\" \n[55] \"aches_NA\" \"vomit_NA\" \n[57] \"temp_NA\" \"time_admission_NA\" \n[59] \"bmi_NA\" \"days_onset_hosp_NA\" \n\n\nThese “shadow” columns can be used to plot the proportion of values that are missing, by any another column.\nFor example, the plot below shows the proportion of records missing days_onset_hosp (number of days from symptom onset to hospitalisation), by that record’s value in date_hospitalisation. Essentially, you are plotting the density of the x-axis column, but stratifying the results (color =) by a shadow column of interest. 
This analysis works best if the x-axis is a numeric or date column.\n\nggplot(data = shadowed_linelist, # data frame with shadow columns\n mapping = aes(x = date_hospitalisation, # numeric or date column\n colour = age_years_NA)) + # shadow column of interest\n geom_density() # plots the density curves\n\n\n\n\n\n\n\n\nYou can also use these “shadow” columns to stratify a statistical summary, as shown below:\n\nlinelist %>%\n bind_shadow() %>% # create the shows cols\n group_by(date_outcome_NA) %>% # shadow col for stratifying\n summarise(across(\n .cols = age_years, # variable of interest for calculations\n .fns = list(\"mean\" = mean, # stats to calculate\n \"sd\" = sd,\n \"var\" = var,\n \"min\" = min,\n \"max\" = max), \n na.rm = TRUE)) # other arguments for the stat calculations\n\nWarning: There was 1 warning in `summarise()`.\nℹ In argument: `across(...)`.\nℹ In group 1: `date_outcome_NA = !NA`.\nCaused by warning:\n! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.\nSupply arguments directly to `.fns` through an anonymous function instead.\n\n # Previously\n across(a:b, mean, na.rm = TRUE)\n\n # Now\n across(a:b, \\(x) mean(x, na.rm = TRUE))\n\n\n# A tibble: 2 × 6\n date_outcome_NA age_years_mean age_years_sd age_years_var age_years_min\n <fct> <dbl> <dbl> <dbl> <dbl>\n1 !NA 16.0 12.6 158. 0\n2 NA 16.2 12.9 167. 0\n# ℹ 1 more variable: age_years_max <dbl>\n\n\nAn alternative way to plot the proportion of a column’s values that are missing over time is shown below. It does not involve naniar. This example shows percent of weekly observations that are missing).\n\nAggregate the data into a useful time unit (days, weeks, etc.), summarizing the proportion of observations with NA (and any other values of interest).\n\nPlot the proportion missing as a line using ggplot().\n\nBelow, we take the linelist, add a new column for week, group the data by week, and then calculate the percent of that week’s records where the value is missing. (note: if you want % of 7 days the calculation would be slightly different).\n\noutcome_missing <- linelist %>%\n mutate(week = lubridate::floor_date(date_onset, \"week\")) %>% # create new week column\n group_by(week) %>% # group the rows by week\n summarise( # summarize each week\n n_obs = n(), # number of records\n \n outcome_missing = sum(is.na(outcome) | outcome == \"\"), # number of records missing the value\n outcome_p_miss = outcome_missing / n_obs, # proportion of records missing the value\n \n outcome_dead = sum(outcome == \"Death\", na.rm=T), # number of records as dead\n outcome_p_dead = outcome_dead / n_obs) %>% # proportion of records as dead\n \n tidyr::pivot_longer(-week, names_to = \"statistic\") %>% # pivot all columns except week, to long format for ggplot\n filter(stringr::str_detect(statistic, \"_p_\")) # keep only the proportion values\n\nThen we plot the proportion missing as a line, by week. 
The ggplot basics page if you are unfamiliar with the ggplot2 plotting package.\n\nggplot(data = outcome_missing)+\n geom_line(\n mapping = aes(x = week, y = value, group = statistic, color = statistic),\n size = 2,\n stat = \"identity\")+\n labs(title = \"Weekly outcomes\",\n x = \"Week\",\n y = \"Proportion of weekly records\") + \n scale_color_discrete(\n name = \"\",\n labels = c(\"Died\", \"Missing outcome\"))+\n scale_y_continuous(breaks = c(seq(0,1,0.1)))+\n theme_minimal()+\n theme(legend.position = \"bottom\")", "crumbs": [ "Analysis", "20  Missing data" @@ -1814,7 +1814,7 @@ "href": "new_pages/missing_data.html#using-data-with-missing-values", "title": "20  Missing data", "section": "20.5 Using data with missing values", - "text": "20.5 Using data with missing values\n\nFilter out rows with missing values\nTo quickly remove rows with missing values, use the dplyr function drop_na().\nThe original linelist has nrow(linelist) rows. The adjusted number of rows is shown below:\n\nlinelist %>% \n drop_na() %>% # remove rows with ANY missing values\n nrow()\n\n[1] 1818\n\n\nYou can specify to drop rows with missingness in certain columns:\n\nlinelist %>% \n drop_na(date_onset) %>% # remove rows missing date_onset \n nrow()\n\n[1] 5632\n\n\nYou can list columns one after the other, or use “tidyselect” helper functions:\n\nlinelist %>% \n drop_na(contains(\"date\")) %>% # remove rows missing values in any \"date\" column \n nrow()\n\n[1] 3029\n\n\n\n\n\nHandling NA in ggplot()\nIt is often wise to report the number of values excluded from a plot in a caption. Below is an example:\nIn ggplot(), you can add labs() and within it a caption =. In the caption, you can use str_glue() from stringr package to paste values together into a sentence dynamically so they will adjust to the data. An example is below:\n\nNote the use of \\n for a new line.\n\nNote that if multiple column would contribute to values not being plotted (e.g. age or sex if those are reflected in the plot), then you must filter on those columns as well to correctly calculate the number not shown.\n\n\nlabs(\n title = \"\",\n y = \"\",\n x = \"\",\n caption = stringr::str_glue(\n \"n = {nrow(central_data)} from Central Hospital;\n {nrow(central_data %>% filter(is.na(date_onset)))} cases missing date of onset and not shown.\")) \n\nSometimes, it can be easier to save the string as an object in commands prior to the ggplot() command, and simply reference the named string object within the str_glue().\n\n\n\nNA in factors\nIf your column of interest is a factor, use fct_explicit_na() from the forcats package to convert NA values to a character value. See more detail in the Factors page. By default, the new value is “(Missing)” but this can be adjusted via the na_level = argument.\n\npacman::p_load(forcats) # load package\n\nlinelist <- linelist %>% \n mutate(gender = fct_explicit_na(gender, na_level = \"Missing\"))\n\nlevels(linelist$gender)\n\n[1] \"f\" \"m\" \"Missing\"", + "text": "20.5 Using data with missing values\n\nFilter out rows with missing values\nTo quickly remove rows with missing values, use the dplyr function drop_na().\nThe original linelist has nrow(linelist) rows. 
The adjusted number of rows is shown below:\n\nlinelist %>% \n drop_na() %>% # remove rows with ANY missing values\n nrow()\n\n[1] 1818\n\n\nYou can specify to drop rows with missingness in certain columns:\n\nlinelist %>% \n drop_na(date_onset) %>% # remove rows missing date_onset \n nrow()\n\n[1] 5632\n\n\nYou can list columns one after the other, or use “tidyselect” helper functions:\n\nlinelist %>% \n drop_na(contains(\"date\")) %>% # remove rows missing values in any \"date\" column \n nrow()\n\n[1] 3029\n\n\n\n\n\nHandling NA in ggplot()\nIt is often wise to report the number of values excluded from a plot in a caption. Below is an example:\nIn ggplot(), you can add labs() and within it a caption =. In the caption, you can use str_glue() from stringr package to paste values together into a sentence dynamically so they will adjust to the data. An example is below:\n\nNote the use of \\n for a new line.\n\nNote that if multiple column would contribute to values not being plotted (e.g. age or sex if those are reflected in the plot), then you must filter on those columns as well to correctly calculate the number not shown.\n\n\nlabs(\n title = \"\",\n y = \"\",\n x = \"\",\n caption = stringr::str_glue(\n \"n = {nrow(central_data)} from Central Hospital;\n {nrow(central_data %>% filter(is.na(date_onset)))} cases missing date of onset and not shown.\")\n ) \n\nSometimes, it can be easier to save the string as an object in commands prior to the ggplot() command, and simply reference the named string object within the str_glue().\n\n\n\nNA in factors\nIf your column of interest is a factor, use fct_explicit_na() from the forcats package to convert NA values to a character value. See more detail in the Factors page. By default, the new value is “(Missing)” but this can be adjusted via the na_level = argument.\n\npacman::p_load(forcats) # load package\n\nlinelist <- linelist %>% \n mutate(gender = fct_explicit_na(gender, na_level = \"Missing\"))\n\nWarning: There was 1 warning in `mutate()`.\nℹ In argument: `gender = fct_explicit_na(gender, na_level = \"Missing\")`.\nCaused by warning:\n! `fct_explicit_na()` was deprecated in forcats 1.0.0.\nℹ Please use `fct_na_value_to_level()` instead.\n\nlevels(linelist$gender)\n\n[1] \"f\" \"m\" \"Missing\"", "crumbs": [ "Analysis", "20  Missing data" @@ -1825,7 +1825,7 @@ "href": "new_pages/missing_data.html#imputation", "title": "20  Missing data", "section": "20.6 Imputation", - "text": "20.6 Imputation\nSometimes, when analyzing your data, it will be important to “fill in the gaps” and impute missing data While you can always simply analyze a dataset after removing all missing values, this can cause problems in many ways. Here are two examples:\n\nBy removing all observations with missing values or variables with a large amount of missing data, you might reduce your power or ability to do some types of analysis. For example, as we discovered earlier, only a small fraction of the observations in our linelist dataset have no missing data across all of our variables. If we removed the majority of our dataset we’d be losing a lot of information! And, most of our variables have some amount of missing data–for most analysis it’s probably not reasonable to drop every variable that has a lot of missing data either.\nDepending on why your data is missing, analysis of only non-missing data might lead to biased or misleading results. 
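On the forcats warning shown in the "NA in factors" output above: fct_explicit_na() was deprecated in forcats 1.0.0 in favour of fct_na_value_to_level(). A hedged sketch of the replacement call, using the same linelist and gender column as on this page:

pacman::p_load(forcats, dplyr)

linelist <- linelist %>%
  mutate(gender = fct_na_value_to_level(as.factor(gender), level = "Missing"))  # replaces fct_explicit_na()

levels(linelist$gender)   # "f" "m" "Missing", as before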
For example, as we learned earlier we are missing data for some patients about whether they’ve had some important symptoms like fever or cough. But, as one possibility, maybe that information wasn’t recorded for people that just obviously weren’t very sick. In that case, if we just removed these observations we’d be excluding some of the healthiest people in our dataset and that might really bias any results.\n\nIt’s important to think about why your data might be missing in addition to seeing how much is missing. Doing this can help you decide how important it might be to impute missing data, and also which method of imputing missing data might be best in your situation.\n\nTypes of missing data\nHere are three general types of missing data:\n\nMissing Completely at Random (MCAR). This means that there is no relationship between the probability of data being missing and any of the other variables in your data. The probability of being missing is the same for all cases This is a rare situation. But, if you have strong reason to believe your data is MCAR analyzing only non-missing data without imputing won’t bias your results (although you may lose some power). [TODO: consider discussing statistical tests for MCAR]\nMissing at Random (MAR). This name is actually a bit misleading as MAR means that your data is missing in a systematic, predictable way based on the other information you have. For example, maybe every observation in our dataset with a missing value for fever was actually not recorded because every patient with chills and and aches was just assumed to have a fever so their temperature was never taken. If true, we could easily predict that every missing observation with chills and aches has a fever as well and use this information to impute our missing data. In practice, this is more of a spectrum. Maybe if a patient had both chills and aches they were more likely to have a fever as well if they didn’t have their temperature taken, but not always. This is still predictable even if it isn’t perfectly predictable. This is a common type of missing data\nMissing not at Random (MNAR). Sometimes, this is also called Not Missing at Random (NMAR). This assumes that the probability of a value being missing is NOT systematic or predictable using the other information we have but also isn’t missing randomly. In this situation data is missing for unknown reasons or for reasons you don’t have any information about. For example, in our dataset maybe information on age is missing because some very elderly patients either don’t know or refuse to say how old they are. In this situation, missing data on age is related to the value itself (and thus isn’t random) and isn’t predictable based on the other information we have. MNAR is complex and often the best way of dealing with this is to try to collect more data or information about why the data is missing rather than attempt to impute it.\n\nIn general, imputing MCAR data is often fairly simple, while MNAR is very challenging if not impossible. Many of the common data imputation methods assume MAR.\n\n\nUseful packages\nSome useful packages for imputing missing data are Mmisc, missForest (which uses random forests to impute missing data), and mice (Multivariate Imputation by Chained Equations). For this section we’ll just use the mice package, which implements a variety of techniques. 
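Since the point above is that you should look at how much data is missing as well as why, here is a small naniar sketch (assuming the same linelist used throughout this page) that quantifies per-variable missingness before you choose an imputation strategy:

pacman::p_load(naniar)

miss_var_summary(linelist)   # number and % missing for every column
gg_miss_var(linelist)        # the same information as a quick plot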
The maintainer of the mice package has published an online book about imputing missing data that goes into more detail here (https://stefvanbuuren.name/fimd/).\nHere is the code to load the mice package:\n\npacman::p_load(mice)\n\n\n\nMean Imputation\nSometimes if you are doing a simple analysis or you have strong reason to think you can assume MCAR, you can simply set missing numerical values to the mean of that variable. Perhaps we can assume that missing temperature measurements in our dataset were either MCAR or were just normal values. Here is the code to create a new variable that replaces missing temperature values with the mean temperature value in our dataset. However, in many situations replacing data with the mean can lead to bias, so be careful.\n\nlinelist <- linelist %>%\n mutate(temp_replace_na_with_mean = replace_na(temp, mean(temp, na.rm = T)))\n\nYou could also do a similar process for replacing categorical data with a specific value. For our dataset, imagine you knew that all observations with a missing value for their outcome (which can be “Death” or “Recover”) were actually people that died (note: this is not actually true for this dataset):\n\nlinelist <- linelist %>%\n mutate(outcome_replace_na_with_death = replace_na(outcome, \"Death\"))\n\n\n\nRegression imputation\nA somewhat more advanced method is to use some sort of statistical model to predict what a missing value is likely to be and replace it with the predicted value. Here is an example of creating predicted values for all the observations where temperature is missing, but age and fever are not, using simple linear regression using fever status and age in years as predictors. In practice you’d want to use a better model than this sort of simple approach.\n\nsimple_temperature_model_fit <- lm(temp ~ fever + age_years, data = linelist)\n\n#using our simple temperature model to predict values just for the observations where temp is missing\npredictions_for_missing_temps <- predict(simple_temperature_model_fit,\n newdata = linelist %>% filter(is.na(temp))) \n\nOr, using the same modeling approach through the mice package to create imputed values for the missing temperature observations:\n\nmodel_dataset <- linelist %>%\n select(temp, fever, age_years) \n\ntemp_imputed <- mice(model_dataset,\n method = \"norm.predict\",\n seed = 1,\n m = 1,\n print = F)\n\nWarning: Number of logged events: 1\n\ntemp_imputed_values <- temp_imputed$imp$temp\n\nThis is the same type of approach by some more advanced methods like using the missForest package to replace missing data with predicted values. In that case, the prediction model is a random forest instead of a linear regression. You can use other types of models to do this as well. However, while this approach works well under MCAR you should be a bit careful if you believe MAR or MNAR more accurately describes your situation. The quality of your imputation will depend on how good your prediction model is and even with a very good model the variability of your imputed data may be underestimated.\n\n\nLOCF and BOCF\nLast observation carried forward (LOCF) and baseline observation carried forward (BOCF) are imputation methods for time series/longitudinal data. The idea is to take the previous observed value as a replacement for the missing data. 
When multiple values are missing in succession, the method searches for the last observed value.\nThe fill() function from the tidyr package can be used for both LOCF and BOCF imputation (however, other packages such as HMISC, zoo, and data.table also include methods for doing this). To show the fill() syntax we’ll make up a simple time series dataset containing the number of cases of a disease for each quarter of the years 2000 and 2001. However, the year value for subsequent quarters after Q1 are missing so we’ll need to impute them. The fill() junction is also demonstrated in the Pivoting data page.\n\n#creating our simple dataset\ndisease <- tibble::tribble(\n ~quarter, ~year, ~cases,\n \"Q1\", 2000, 66013,\n \"Q2\", NA, 69182,\n \"Q3\", NA, 53175,\n \"Q4\", NA, 21001,\n \"Q1\", 2001, 46036,\n \"Q2\", NA, 58842,\n \"Q3\", NA, 44568,\n \"Q4\", NA, 50197)\n\n#imputing the missing year values:\ndisease %>% fill(year)\n\n# A tibble: 8 × 3\n quarter year cases\n <chr> <dbl> <dbl>\n1 Q1 2000 66013\n2 Q2 2000 69182\n3 Q3 2000 53175\n4 Q4 2000 21001\n5 Q1 2001 46036\n6 Q2 2001 58842\n7 Q3 2001 44568\n8 Q4 2001 50197\n\n\nNote: make sure your data are sorted correctly before using the fill() function. fill() defaults to filling “down” but you can also impute values in different directions by changing the .direction parameter. We can make a similar dataset where the year value is recorded only at the end of the year and missing for earlier quarters:\n\n#creating our slightly different dataset\ndisease <- tibble::tribble(\n ~quarter, ~year, ~cases,\n \"Q1\", NA, 66013,\n \"Q2\", NA, 69182,\n \"Q3\", NA, 53175,\n \"Q4\", 2000, 21001,\n \"Q1\", NA, 46036,\n \"Q2\", NA, 58842,\n \"Q3\", NA, 44568,\n \"Q4\", 2001, 50197)\n\n#imputing the missing year values in the \"up\" direction:\ndisease %>% fill(year, .direction = \"up\")\n\n# A tibble: 8 × 3\n quarter year cases\n <chr> <dbl> <dbl>\n1 Q1 2000 66013\n2 Q2 2000 69182\n3 Q3 2000 53175\n4 Q4 2000 21001\n5 Q1 2001 46036\n6 Q2 2001 58842\n7 Q3 2001 44568\n8 Q4 2001 50197\n\n\nIn this example, LOCF and BOCF are clearly the right things to do, but in more complicated situations it may be harder to decide if these methods are appropriate. For example, you may have missing laboratory values for a hospital patient after the first day. Sometimes, this can mean the lab values didn’t change…but it could also mean the patient recovered and their values would be very different after the first day! Use these methods with caution.\n\n\nMultiple Imputation\nThe online book we mentioned earlier by the author of the mice package (https://stefvanbuuren.name/fimd/) contains a detailed explanation of multiple imputation and why you’d want to use it. But, here is a basic explanation of the method:\nWhen you do multiple imputation, you create multiple datasets with the missing values imputed to plausible data values (depending on your research data you might want to create more or less of these imputed datasets, but the mice package sets the default number to 5). The difference is that rather than a single, specific value each imputed value is drawn from an estimated distribution (so it includes some randomness). As a result, each of these datasets will have slightly different different imputed values (however, the non-missing data will be the same in each of these imputed datasets). 
You still use some sort of predictive model to do the imputation in each of these new datasets (mice has many options for prediction methods including Predictive Mean Matching, logistic regression, and random forest) but the mice package can take care of many of the modeling details.\nThen, once you have created these new imputed datasets, you can apply then apply whatever statistical model or analysis you were planning to do for each of these new imputed datasets and pool the results of these models together. This works very well to reduce bias in both MCAR and many MAR settings and often results in more accurate standard error estimates.\nHere is an example of applying the Multiple Imputation process to predict temperature in our linelist dataset using a age and fever status (our simplified model_dataset from above):\n\n# imputing missing values for all variables in our model_dataset, and creating 10 new imputed datasets\nmultiple_imputation = mice(\n model_dataset,\n seed = 1,\n m = 10,\n print = FALSE) \n\nWarning: Number of logged events: 1\n\nmodel_fit <- with(multiple_imputation, lm(temp ~ age_years + fever))\n\nbase::summary(mice::pool(model_fit))\n\n term estimate std.error statistic df p.value\n1 (Intercept) 3.703143e+01 0.0270863456 1.367162e+03 26.83673 1.583113e-66\n2 age_years 3.867829e-05 0.0006090202 6.350905e-02 171.44363 9.494351e-01\n3 feveryes 1.978044e+00 0.0193587115 1.021785e+02 176.51325 5.666771e-159\n\n\nHere we used the mice default method of imputation, which is Predictive Mean Matching. We then used these imputed datasets to separately estimate and then pool results from simple linear regressions on each of these datasets. There are many details we’ve glossed over and many settings you can adjust during the Multiple Imputation process while using the mice package. For example, you won’t always have numerical data and might need to use other imputation methods (you can still use the mice package for many other types of data and methods). But, for a more robust analysis when missing data is a significant concern, Multiple Imputation is good solution that isn’t always much more work than doing a complete case analysis.", + "text": "20.6 Imputation\nSometimes, when analyzing your data, it will be important to “fill in the gaps” and impute missing data While you can always simply analyze a dataset after removing all missing values, this can cause problems in many ways. Here are two examples:\n\nBy removing all observations with missing values or variables with a large amount of missing data, you might reduce your power or ability to do some types of analysis. For example, as we discovered earlier, only a small fraction of the observations in our linelist dataset have no missing data across all of our variables. If we removed the majority of our dataset we’d be losing a lot of information! And, most of our variables have some amount of missing data–for most analysis it’s probably not reasonable to drop every variable that has a lot of missing data either.\nDepending on why your data is missing, analysis of only non-missing data might lead to biased or misleading results. For example, as we learned earlier we are missing data for some patients about whether they’ve had some important symptoms like fever or cough. But, as one possibility, maybe that information wasn’t recorded for people that just obviously weren’t very sick. 
In that case, if we just removed these observations we’d be excluding some of the healthiest people in our dataset and that might really bias any results.\n\nIt’s important to think about why your data might be missing in addition to seeing how much is missing. Doing this can help you decide how important it might be to impute missing data, and also which method of imputing missing data might be best in your situation.\n\nTypes of missing data\nHere are three general types of missing data:\n\nMissing Completely at Random (MCAR). This means that there is no relationship between the probability of data being missing and any of the other variables in your data. The probability of being missing is the same for all cases This is a rare situation. But, if you have strong reason to believe your data is MCAR analyzing only non-missing data without imputing won’t bias your results (although you may lose some power).\n\nTo test if your data is MCAR, you can use mcar_test from the naniar package. Note this will only work with numerical values, so you may need to convert your data before running the test.\nIf the p-value is less than or equal to 0.05, then your data is not missing completely at random.\n\n## define variables of interest \nexplanatory_vars <- c(\"gender\", \"fever\", \"chills\", \"cough\", \"aches\", \"vomit\")\n\n#Recode linelist to replace characters with numeric\nlinelist <- linelist %>% \n mutate(across( \n .cols = all_of(c(explanatory_vars, \"outcome\")), # for each column listed and \"outcome\"\n .fns = ~case_when( \n . %in% c(\"m\", \"yes\", \"Death\") ~ 1, # recode male, yes and death to 1\n . %in% c(\"f\", \"no\", \"Recover\") ~ 0, # female, no and recover to 0\n TRUE ~ NA_real_) # otherwise set to missing\n )\n )\n\nlinelist %>%\n select(explanatory_vars, \"outcome\") %>%\n mcar_test()\n\nWarning: Using an external vector in selections was deprecated in tidyselect 1.1.0.\nℹ Please use `all_of()` or `any_of()` instead.\n # Was:\n data %>% select(explanatory_vars)\n\n # Now:\n data %>% select(all_of(explanatory_vars))\n\nSee <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.\n\n\n# A tibble: 1 × 4\n statistic df p.value missing.patterns\n <dbl> <dbl> <dbl> <int>\n1 27.9 21 0.144 7\n\n\nAs the p-value is greater than 0.05, we have failed to reject the null hypothesis (that the data is MCAR), and so no patterns exist in the missing data.\n\nMissing at Random (MAR). This name is actually a bit misleading as MAR means that your data is missing in a systematic, predictable way based on the other information you have. For example, maybe every observation in our dataset with a missing value for fever was actually not recorded because every patient with chills and and aches was just assumed to have a fever so their temperature was never taken. If true, we could easily predict that every missing observation with chills and aches has a fever as well and use this information to impute our missing data. In practice, this is more of a spectrum. Maybe if a patient had both chills and aches they were more likely to have a fever as well if they didn’t have their temperature taken, but not always. This is still predictable even if it isn’t perfectly predictable. This is a common type of missing data.\nMissing not at Random (MNAR). Sometimes, this is also called Not Missing at Random (NMAR). This assumes that the probability of a value being missing is NOT systematic or predictable using the other information we have but also isn’t missing randomly. 
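Returning briefly to the mcar_test() chunk above: the tidyselect warning it prints comes from passing an external character vector straight to select(). A sketch of the warning-free form, assuming the same explanatory_vars and recoded linelist:

pacman::p_load(dplyr, naniar)

linelist %>%
  select(all_of(explanatory_vars), outcome) %>%   # all_of() makes the external vector explicit
  mcar_test()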
In this situation data is missing for unknown reasons or for reasons you don’t have any information about. For example, in our dataset maybe information on age is missing because some very elderly patients either don’t know or refuse to say how old they are. In this situation, missing data on age is related to the value itself (and thus isn’t random) and isn’t predictable based on the other information we have. MNAR is complex and often the best way of dealing with this is to try to collect more data or information about why the data is missing rather than attempt to impute it.\n\nIn general, imputing MCAR data is often fairly simple, while MNAR is very challenging if not impossible. Many of the common data imputation methods assume MAR.\n\n\nUseful packages\nSome useful packages for imputing missing data are Mmisc, missForest (which uses random forests to impute missing data), and mice (Multivariate Imputation by Chained Equations). For this section we’ll just use the mice package, which implements a variety of techniques. The maintainer of the mice package has published an online book about imputing missing data that goes into more detail here (https://stefvanbuuren.name/fimd/).\nHere is the code to load the mice package:\n\npacman::p_load(mice)\n\n\n\nMean Imputation\nSometimes if you are doing a simple analysis or you have strong reason to think you can assume MCAR, you can simply set missing numerical values to the mean of that variable. Perhaps we can assume that missing temperature measurements in our dataset were either MCAR or were just normal values. Here is the code to create a new variable that replaces missing temperature values with the mean temperature value in our dataset. However, in many situations replacing data with the mean can lead to bias, so be careful.\n\nlinelist <- linelist %>%\n mutate(temp_replace_na_with_mean = replace_na(temp, mean(temp, na.rm = T)))\n\nYou could also do a similar process for replacing categorical data with a specific value. For our dataset, imagine you knew that all observations with a missing value for their outcome (which can be “Death” or “Recover”) were actually people that died (note: this is not actually true for this dataset):\n\nlinelist <- linelist %>%\n mutate(outcome_replace_na_with_death = replace_na(outcome, 1))\n\n\n\nRegression imputation\nA somewhat more advanced method is to use some sort of statistical model to predict what a missing value is likely to be and replace it with the predicted value. Here is an example of creating predicted values for all the observations where temperature is missing, but age and fever are not, using simple linear regression using fever status and age in years as predictors. 
In practice you’d want to use a better model than this sort of simple approach.\n\nsimple_temperature_model_fit <- lm(temp ~ fever + age_years, data = linelist)\n\n#using our simple temperature model to predict values just for the observations where temp is missing\npredictions_for_missing_temps <- predict(simple_temperature_model_fit,\n newdata = linelist %>% filter(is.na(temp))) \n\nOr, using the same modeling approach through the mice package to create imputed values for the missing temperature observations:\n\nmodel_dataset <- linelist %>%\n select(temp, fever, age_years) \n\ntemp_imputed <- mice(model_dataset,\n method = \"norm.predict\",\n seed = 1,\n m = 1,\n print = F)\n\ntemp_imputed_values <- temp_imputed$imp$temp\n\nThis is the same type of approach by some more advanced methods like using the missForest package to replace missing data with predicted values. In that case, the prediction model is a random forest instead of a linear regression. You can use other types of models to do this as well. However, while this approach works well under MCAR you should be a bit careful if you believe MAR or MNAR more accurately describes your situation. The quality of your imputation will depend on how good your prediction model is and even with a very good model the variability of your imputed data may be underestimated.\n\n\nLOCF and BOCF\nLast observation carried forward (LOCF) and baseline observation carried forward (BOCF) are imputation methods for time series/longitudinal data. The idea is to take the previous observed value as a replacement for the missing data. When multiple values are missing in succession, the method searches for the last observed value.\nThe fill() function from the tidyr package can be used for both LOCF and BOCF imputation (however, other packages such as HMISC, zoo, and data.table also include methods for doing this). To show the fill() syntax we’ll make up a simple time series dataset containing the number of cases of a disease for each quarter of the years 2000 and 2001. However, the year value for subsequent quarters after Q1 are missing so we’ll need to impute them. The fill() junction is also demonstrated in the Pivoting data page.\n\n#creating our simple dataset\ndisease <- tibble::tribble(\n ~quarter, ~year, ~cases,\n \"Q1\", 2000, 66013,\n \"Q2\", NA, 69182,\n \"Q3\", NA, 53175,\n \"Q4\", NA, 21001,\n \"Q1\", 2001, 46036,\n \"Q2\", NA, 58842,\n \"Q3\", NA, 44568,\n \"Q4\", NA, 50197)\n\n#imputing the missing year values:\ndisease %>% fill(year)\n\n# A tibble: 8 × 3\n quarter year cases\n <chr> <dbl> <dbl>\n1 Q1 2000 66013\n2 Q2 2000 69182\n3 Q3 2000 53175\n4 Q4 2000 21001\n5 Q1 2001 46036\n6 Q2 2001 58842\n7 Q3 2001 44568\n8 Q4 2001 50197\n\n\nNote: make sure your data are sorted correctly before using the fill() function. fill() defaults to filling “down” but you can also impute values in different directions by changing the .direction parameter. 
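As the sorting note above suggests, grouping also matters for fill(). A small stand-alone sketch with hypothetical toy data (the region names and counts are made up, not from this page) showing why you should group before filling when several units are stacked in one data frame:

pacman::p_load(dplyr, tidyr, tibble)

two_regions <- tribble(
  ~region,  ~quarter, ~year, ~cases,
  "North",  "Q1",      2000,     10,
  "North",  "Q2",        NA,     12,
  "South",  "Q1",        NA,     20,
  "South",  "Q2",      2001,     25)

two_regions %>%
  group_by(region) %>%                   # group first so years are not carried across regions
  fill(year, .direction = "downup") %>%  # fill down, then up, within each region
  ungroup()

Without the group_by(), a plain fill(year) would carry 2000 from "North" into the first "South" row.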
We can make a similar dataset where the year value is recorded only at the end of the year and missing for earlier quarters:\n\n#creating our slightly different dataset\ndisease <- tibble::tribble(\n ~quarter, ~year, ~cases,\n \"Q1\", NA, 66013,\n \"Q2\", NA, 69182,\n \"Q3\", NA, 53175,\n \"Q4\", 2000, 21001,\n \"Q1\", NA, 46036,\n \"Q2\", NA, 58842,\n \"Q3\", NA, 44568,\n \"Q4\", 2001, 50197)\n\n#imputing the missing year values in the \"up\" direction:\ndisease %>% fill(year, .direction = \"up\")\n\n# A tibble: 8 × 3\n quarter year cases\n <chr> <dbl> <dbl>\n1 Q1 2000 66013\n2 Q2 2000 69182\n3 Q3 2000 53175\n4 Q4 2000 21001\n5 Q1 2001 46036\n6 Q2 2001 58842\n7 Q3 2001 44568\n8 Q4 2001 50197\n\n\nIn this example, LOCF and BOCF are clearly the right things to do, but in more complicated situations it may be harder to decide if these methods are appropriate. For example, you may have missing laboratory values for a hospital patient after the first day. Sometimes, this can mean the lab values didn’t change…but it could also mean the patient recovered and their values would be very different after the first day!\nUse these methods with caution.\n\n\nMultiple Imputation\nThe online book we mentioned earlier by the author of the mice package (https://stefvanbuuren.name/fimd/) contains a detailed explanation of multiple imputation and why you’d want to use it. But, here is a basic explanation of the method:\nWhen you do multiple imputation, you create multiple datasets with the missing values imputed to plausible data values (depending on your research data you might want to create more or less of these imputed datasets, but the mice package sets the default number to 5). The difference is that rather than a single, specific value each imputed value is drawn from an estimated distribution (so it includes some randomness). As a result, each of these datasets will have slightly different different imputed values (however, the non-missing data will be the same in each of these imputed datasets). You still use some sort of predictive model to do the imputation in each of these new datasets (mice has many options for prediction methods including Predictive Mean Matching, logistic regression, and random forest) but the mice package can take care of many of the modeling details.\nThen, once you have created these new imputed datasets, you can apply then apply whatever statistical model or analysis you were planning to do for each of these new imputed datasets and pool the results of these models together. This works very well to reduce bias in both MCAR and many MAR settings and often results in more accurate standard error estimates.\nHere is an example of applying the Multiple Imputation process to predict temperature in our linelist dataset using a age and fever status (our simplified model_dataset from above):\n\n# imputing missing values for all variables in our model_dataset, and creating 10 new imputed datasets\nmultiple_imputation = mice(\n model_dataset,\n seed = 1,\n m = 10,\n print = FALSE\n ) \n\nmodel_fit <- with(multiple_imputation, lm(temp ~ age_years + fever))\n\nbase::summary(mice::pool(model_fit))\n\n term estimate std.error statistic df p.value\n1 (Intercept) 3.696900e+01 0.0154162569 2398.0527445 2176.7305 0.0000000\n2 age_years 4.481642e-04 0.0005023468 0.8921411 1438.8255 0.3724665\n3 fever 2.044250e+00 0.0153006140 133.6057671 953.2349 0.0000000\n\n\nHere we used the mice default method of imputation, which is Predictive Mean Matching. 
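If you want to look at the imputed datasets sitting behind the pooled model above, complete() from mice stacks them into one data frame. A sketch, assuming the multiple_imputation object created above:

imputed_long <- mice::complete(multiple_imputation, action = "long")

head(imputed_long)          # .imp gives the imputation number (1-10), .id the original row
table(imputed_long$.imp)    # each of the 10 imputed datasets contains every row

This can be useful for checking that the imputed temperature values look plausible before trusting the pooled estimates.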
We then used these imputed datasets to separately estimate and then pool results from simple linear regressions on each of these datasets. There are many details we’ve glossed over and many settings you can adjust during the Multiple Imputation process while using the mice package. For example, you won’t always have numerical data and might need to use other imputation methods (you can still use the mice package for many other types of data and methods). But, for a more robust analysis when missing data is a significant concern, Multiple Imputation is good solution that isn’t always much more work than doing a complete case analysis.", "crumbs": [ "Analysis", "20  Missing data" @@ -1869,7 +1869,7 @@ "href": "new_pages/standardization.html#preparation", "title": "21  Standardised rates", "section": "21.2 Preparation", - "text": "21.2 Preparation\nTo show how standardization is done, we will use fictitious population counts and death counts from country A and country B, by age (in 5 year categories) and sex (female, male). To make the datasets ready for use, we will perform the following preparation steps:\n\nLoad packages\n\nLoad datasets\n\nJoin the population and death data from the two countries\nPivot longer so there is one row per age-sex stratum\nClean the reference population (world standard population) and join it to the country data\n\nIn your scenario, your data may come in a different format. Perhaps your data are by province, city, or other catchment area. You may have one row for each death and information on age and sex for each (or a significant proportion) of these deaths. In this case, see the pages on Grouping data, Pivoting data, and Descriptive tables to create a dataset with event and population counts per age-sex stratum.\nWe also need a reference population, the standard population. For the purposes of this exercise we will use the world_standard_population_by_sex. The World standard population is based on the populations of 46 countries and was developed in 1960. There are many “standard” populations - as one example, the website of NHS Scotland is quite informative on the European Standard Population, World Standard Population and Scotland Standard Population.\n\n\nLoad packages\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize p_load() from pacman, which installs the package if necessary and loads it for use. You can also load installed packages with library() from base R. See the page on R basics for more information on R packages.\n\npacman::p_load(\n rio, # import/export data\n here, # locate files\n stringr, # cleaning characters and strings\n frailtypack, # needed for dsr, for frailty models\n dsr, # standardise rates\n PHEindicatormethods, # alternative for rate standardisation\n tidyverse) # data management and visualization\n\nCAUTION: If you have a newer version of R, the dsr package cannot be directly downloaded from CRAN. However, it is still available from the CRAN archive. You can install and use this one. 
\nFor non-Mac users:\n\npackageurl <- \"https://cran.r-project.org/src/contrib/Archive/dsr/dsr_0.2.2.tar.gz\"\ninstall.packages(packageurl, repos=NULL, type=\"source\")\n\n\n# Other solution that may work\nrequire(devtools)\ndevtools::install_version(\"dsr\", version=\"0.2.2\", repos=\"http:/cran.us.r.project.org\")\n\nFor Mac users:\n\nrequire(devtools)\ndevtools::install_version(\"dsr\", version=\"0.2.2\", repos=\"https://mac.R-project.org\")\n\n\n\nLoad population data\nSee the Download handbook and data page for instructions on how to download all the example data in the handbook. You can import the Standardisation page data directly into R from our Github repository by running the following import() commands:\n\n# import demographics for country A directly from Github\nA_demo <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/country_demographics.csv\")\n\n# import deaths for country A directly from Github\nA_deaths <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/deaths_countryA.csv\")\n\n# import demographics for country B directly from Github\nB_demo <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/country_demographics_2.csv\")\n\n# import deaths for country B directly from Github\nB_deaths <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/deaths_countryB.csv\")\n\n# import demographics for country B directly from Github\nstandard_pop_data <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/world_standard_population_by_sex.csv\")\n\nFirst we load the demographic data (counts of males and females by 5-year age category) for the two countries that we will be comparing, “Country A” and “Country B”.\n\n# Country A\nA_demo <- import(\"country_demographics.csv\")\n\n\n\n\n\n\n\n\n# Country B\nB_demo <- import(\"country_demographics_2.csv\")\n\n\n\n\n\n\n\n\n\nLoad death counts\nConveniently, we also have the counts of deaths during the time period of interest, by age and sex. Each country’s counts are in a separate file, shown below.\nDeaths in Country A\n\n\n\n\n\n\nDeaths in Country B\n\n\n\n\n\n\n\n\nClean populations and deaths\nWe need to join and transform these data in the following ways:\n\nCombine country populations into one dataset and pivot “long” so that each age-sex stratum is one row\n\nCombine country death counts into one dataset and pivot “long” so each age-sex stratum is one row\n\nJoin the deaths to the populations\n\nFirst, we combine the country populations datasets, pivot longer, and do minor cleaning. 
See the page on Pivoting data for more detail.\n\npop_countries <- A_demo %>% # begin with country A dataset\n bind_rows(B_demo) %>% # bind rows, because cols are identically named\n pivot_longer( # pivot longer\n cols = c(m, f), # columns to combine into one\n names_to = \"Sex\", # name for new column containing the category (\"m\" or \"f\") \n values_to = \"Population\") %>% # name for new column containing the numeric values pivoted\n mutate(Sex = recode(Sex, # re-code values for clarity\n \"m\" = \"Male\",\n \"f\" = \"Female\"))\n\nThe combined population data now look like this (click through to see countries A and B):\n\n\n\n\n\n\nAnd now we perform similar operations on the two deaths datasets.\n\ndeaths_countries <- A_deaths %>% # begin with country A deaths dataset\n bind_rows(B_deaths) %>% # bind rows with B dataset, because cols are identically named\n pivot_longer( # pivot longer\n cols = c(Male, Female), # column to transform into one\n names_to = \"Sex\", # name for new column containing the category (\"m\" or \"f\") \n values_to = \"Deaths\") %>% # name for new column containing the numeric values pivoted\n rename(age_cat5 = AgeCat) # rename for clarity\n\nThe deaths data now look like this, and contain data from both countries:\n\n\n\n\n\n\nWe now join the deaths and population data based on common columns Country, age_cat5, and Sex. This adds the column Deaths.\n\ncountry_data <- pop_countries %>% \n left_join(deaths_countries, by = c(\"Country\", \"age_cat5\", \"Sex\"))\n\nWe can now classify Sex, age_cat5, and Country as factors and set the level order using fct_relevel() function from the forcats package, as described in the page on Factors. Note, classifying the factor levels doesn’t visibly change the data, but the arrange() command does sort it by Country, age category, and sex.\n\ncountry_data <- country_data %>% \n mutate(\n Country = fct_relevel(Country, \"A\", \"B\"),\n \n Sex = fct_relevel(Sex, \"Male\", \"Female\"),\n \n age_cat5 = fct_relevel(\n age_cat5,\n \"0-4\", \"5-9\", \"10-14\", \"15-19\",\n \"20-24\", \"25-29\", \"30-34\", \"35-39\",\n \"40-44\", \"45-49\", \"50-54\", \"55-59\",\n \"60-64\", \"65-69\", \"70-74\",\n \"75-79\", \"80-84\", \"85\")) %>% \n \n arrange(Country, age_cat5, Sex)\n\n\n\n\n\n\n\nCAUTION: If you have few deaths per stratum, consider using 10-, or 15-year categories, instead of 5-year categories for age.\n\n\nLoad reference population\nLastly, for the direct standardisation, we import the reference population (world “standard population” by sex)\n\n# Reference population\nstandard_pop_data <- import(\"world_standard_population_by_sex.csv\")\n\n\n\n\n\n\n\n\n\n\nClean reference population\nThe age category values in the country_data and standard_pop_data data frames will need to be aligned.\nCurrently, the values of the column age_cat5 from the standard_pop_data data frame contain the word “years” and “plus”, while those of the country_data data frame do not. We will have to make the age category values match. We use str_replace_all() from the stringr package, as described in the page on Characters and strings, to replace these patterns with no space \"\".\nFurthermore, the package dsr expects that in the standard population, the column containing counts will be called \"pop\". 
So we rename that column accordingly.\n\n# Remove specific string from column values\nstandard_pop_clean <- standard_pop_data %>%\n mutate(\n age_cat5 = str_replace_all(age_cat5, \"years\", \"\"), # remove \"year\"\n age_cat5 = str_replace_all(age_cat5, \"plus\", \"\"), # remove \"plus\"\n age_cat5 = str_replace_all(age_cat5, \" \", \"\")) %>% # remove \" \" space\n \n rename(pop = WorldStandardPopulation) # change col name to \"pop\", as this is expected by dsr package\n\nCAUTION: If you try to use str_replace_all() to remove a plus symbol, it won’t work because it is a special symbol. “Escape” the specialnes by putting two back slashes in front, as in str_replace_call(column, \"\\\\+\", \"\"). \n\n\nCreate dataset with standard population\nFinally, the package PHEindicatormethods, detailed below, expects the standard populations joined to the country event and population counts. So, we will create a dataset all_data for that purpose.\n\nall_data <- left_join(country_data, standard_pop_clean, by=c(\"age_cat5\", \"Sex\"))\n\nThis complete dataset looks like this:", + "text": "21.2 Preparation\nTo show how standardization is done, we will use fictitious population counts and death counts from country A and country B, by age (in 5 year categories) and sex (female, male). To make the datasets ready for use, we will perform the following preparation steps:\n\nLoad packages.\nImport datasets.\n\nJoin the population and death data from the two countries.\nPivot longer so there is one row per age-sex stratum.\nClean the reference population (world standard population) and join it to the country data.\n\nIn your scenario, your data may come in a different format. Perhaps your data are by province, city, or other catchment area. You may have one row for each death and information on age and sex for each (or a significant proportion) of these deaths. In this case, see the pages on Grouping data, Pivoting data, and Descriptive tables to create a dataset with event and population counts per age-sex stratum.\nWe also need a reference population, the standard population. For the purposes of this exercise we will use the world_standard_population_by_sex. The World standard population is based on the populations of 46 countries and was developed in 1960. There are many “standard” populations - as one example, the website of NHS Scotland is quite informative on the European Standard Population, World Standard Population and Scotland Standard Population.\n\n\nLoad packages\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize p_load() from pacman, which installs the package if necessary and loads it for use. You can also load installed packages with library() from base R. See the page on R basics for more information on R packages.\n\npacman::p_load(\n rio, # import/export data\n here, # locate files\n stringr, # cleaning characters and strings\n PHEindicatormethods, # alternative for rate standardisation\n tidyverse # data management and visualization\n)\n\n\n\nLoad population data\nSee the Download handbook and data page for instructions on how to download all the example data in the handbook. 
You can import the Standardisation page data directly into R from our Github repository by running the following import() commands:\n\n# import demographics for country A directly from Github\nA_demo <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/country_demographics.csv\")\n\n# import deaths for country A directly from Github\nA_deaths <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/deaths_countryA.csv\")\n\n# import demographics for country B directly from Github\nB_demo <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/country_demographics_2.csv\")\n\n# import deaths for country B directly from Github\nB_deaths <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/deaths_countryB.csv\")\n\n# import demographics for country B directly from Github\nstandard_pop_data <- import(\"https://github.com/appliedepi/epirhandbook_eng/raw/master/data/standardization/world_standard_population_by_sex.csv\")\n\nFirst we load the demographic data (counts of males and females by 5-year age category) for the two countries that we will be comparing, “Country A” and “Country B”.\n\n# Country A\nA_demo <- import(\"country_demographics.csv\")\n\n\n\n\n\n\n\n\n# Country B\nB_demo <- import(\"country_demographics_2.csv\")\n\n\n\n\n\n\n\n\n\nLoad death counts\nConveniently, we also have the counts of deaths during the time period of interest, by age and sex. Each country’s counts are in a separate file, shown below.\nDeaths in Country A\n\n\n\n\n\n\nDeaths in Country B\n\n\n\n\n\n\n\n\nClean populations and deaths\nWe need to join and transform these data in the following ways:\n\nCombine country populations into one dataset and pivot “long” so that each age-sex stratum is one row.\n\nCombine country death counts into one dataset and pivot “long” so each age-sex stratum is one row.\n\nJoin the deaths to the populations.\n\nFirst, we combine the country populations datasets, pivot longer, and do minor cleaning. 
See the page on Pivoting data for more detail.\n\npop_countries <- A_demo %>% # begin with country A dataset\n mutate(Country = \"A\") %>% # add in a country identifier\n bind_rows(B_demo %>% # bind rows, because cols are identically named\n mutate(Country = \"B\")) %>% # add in country identifier\n pivot_longer( # pivot longer\n cols = c(m, f), # columns to combine into one\n names_to = \"Sex\", # name for new column containing the category (\"m\" or \"f\") \n values_to = \"Population\") %>% # name for new column containing the numeric values pivoted\n mutate(Sex = recode(Sex, # re-code values for clarity\n \"m\" = \"Male\",\n \"f\" = \"Female\"))\n\nThe combined population data now look like this (click through to see countries A and B):\n\n\n\n\n\n\nAnd now we perform similar operations on the two deaths datasets.\n\ndeaths_countries <- A_deaths %>% # begin with country A deaths dataset\n bind_rows(B_deaths) %>% # bind rows with B dataset, because cols are identically named\n pivot_longer( # pivot longer\n cols = c(Male, Female), # column to transform into one\n names_to = \"Sex\", # name for new column containing the category (\"m\" or \"f\") \n values_to = \"Deaths\") %>% # name for new column containing the numeric values pivoted\n rename(age_cat5 = AgeCat) # rename for clarity\n\nThe deaths data now look like this, and contain data from both countries:\n\n\n\n\n\n\nWe now join the deaths and population data based on common columns Country, age_cat5, and Sex. This adds the column Deaths.\n\ncountry_data <- pop_countries %>% \n left_join(deaths_countries, by = c(\"Country\", \"age_cat5\", \"Sex\"))\n\nWe can now classify Sex, age_cat5, and Country as factors and set the level order using fct_relevel() function from the forcats package, as described in the page on Factors. Note, classifying the factor levels doesn’t visibly change the data, but the arrange() command does sort it by Country, age category, and sex.\n\ncountry_data <- country_data %>% \n mutate(\n Country = fct_relevel(Country, \"A\", \"B\"),\n \n Sex = fct_relevel(Sex, \"Male\", \"Female\"),\n \n age_cat5 = fct_relevel(\n age_cat5,\n \"0-4\", \"5-9\", \"10-14\", \"15-19\",\n \"20-24\", \"25-29\", \"30-34\", \"35-39\",\n \"40-44\", \"45-49\", \"50-54\", \"55-59\",\n \"60-64\", \"65-69\", \"70-74\",\n \"75-79\", \"80-84\", \"85\")) %>% \n \n arrange(Country, age_cat5, Sex)\n\n\n\n\n\n\n\nCAUTION: If you have few deaths per stratum, consider using 10-, or 15-year categories, instead of 5-year categories for age.\n\n\nLoad reference population\nLastly, for the direct standardisation, we import the reference population (world “standard population” by sex):\n\n# Reference population\nstandard_pop_data <- import(\"world_standard_population_by_sex.csv\")\n\n\n\n\n\n\n\n\n\n\nClean reference population\nThe age category values in the country_data and standard_pop_data data frames will need to be aligned.\nCurrently, the values of the column age_cat5 from the standard_pop_data data frame contain the word “years” and “plus”, while those of the country_data data frame do not. We will have to make the age category values match. 
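One way to see the mismatch just described, before cleaning it, is a quick anti_join(); this is a sketch (not part of the original page) using the country_data and standard_pop_data objects created above:

pacman::p_load(dplyr)

country_data %>%
  distinct(age_cat5) %>%
  mutate(age_cat5 = as.character(age_cat5)) %>%     # factor -> character for the join
  anti_join(distinct(standard_pop_data, age_cat5),
            by = "age_cat5")                        # age categories with no match in the reference data

Any rows returned are age categories that would end up with NA standard-population values in the later left_join().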
We use str_replace_all() from the stringr package, as described in the page on Characters and strings, to replace these patterns with no space \"\".\n\n# Remove specific string from column values\nstandard_pop_clean <- standard_pop_data %>%\n mutate(\n age_cat5 = str_replace_all(age_cat5, \"years\", \"\"), # remove \"year\"\n age_cat5 = str_replace_all(age_cat5, \"plus\", \"\"), # remove \"plus\"\n age_cat5 = str_replace_all(age_cat5, \" \", \"\")) %>% # remove \" \" space\n \n rename(pop = WorldStandardPopulation) # change col name to \"pop\"\n\nCAUTION: If you try to use str_replace_all() to remove a plus symbol, it won’t work because it is a special symbol. “Escape” the specialnes by putting two back slashes in front, as in str_replace_call(column, \"\\\\+\", \"\"). \n\n\nCreate dataset with standard population\nFinally, the package PHEindicatormethods, detailed below, expects the standard populations joined to the country event and population counts. So, we will create a dataset all_data for that purpose.\n\nall_data <- left_join(country_data, \n standard_pop_clean, \n by = c(\"age_cat5\", \"Sex\"))\n\nThis complete dataset looks like this:", "crumbs": [ "Analysis", "21  Standardised rates" @@ -1890,8 +1890,8 @@ "objectID": "new_pages/standardization.html#standard_phe", "href": "new_pages/standardization.html#standard_phe", "title": "21  Standardised rates", - "section": "21.4 PHEindicatormethods package", - "text": "21.4 PHEindicatormethods package\nAnother way of calculating standardized rates is with the PHEindicatormethods package. This package allows you to calculate directly as well as indirectly standardized rates. We will show both.\nThis section will use the all_data data frame created at the end of the Preparation section. This data frame includes the country populations, death events, and the world standard reference population. You can view it here.\n\n\nDirectly standardized rates\nBelow, we first group the data by Country and then pass it to the function phe_dsr() to get directly standardized rates per country.\nOf note - the reference (standard) population can be provided as a column within the country-specific data frame or as a separate vector. If provided within the country-specific data frame, you have to set stdpoptype = \"field\". If provided as a vector, set stdpoptype = \"vector\". In the latter case, you have to make sure the ordering of rows by strata is similar in both the country-specific data frame and the reference population, as records will be matched by position. 
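To illustrate the CAUTION above (note the function name is str_replace_all()): the plus sign is a regex metacharacter, so either escape it or wrap it in fixed() to match it literally. A small stand-alone sketch with a made-up vector:

pacman::p_load(stringr)

x <- c("85plus", "85+")

str_replace_all(x, "\\+", "")        # escaped regex: returns "85plus" "85"
str_replace_all(x, fixed("+"), "")   # fixed() matches the literal "+" character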
In our example below, we provided the reference population as a column within the country-specific data frame.\nSee the help with ?phr_dsr or the links in the References section for more information.\n\n# Calculate rates per country directly standardized for age and sex\nmortality_ds_rate_phe <- all_data %>%\n group_by(Country) %>%\n PHEindicatormethods::phe_dsr(\n x = Deaths, # column with observed number of events\n n = Population, # column with non-standard pops for each stratum\n stdpop = pop, # standard populations for each stratum\n stdpoptype = \"field\") # either \"vector\" for a standalone vector or \"field\" meaning std populations are in the data \n\n# Print table\nknitr::kable(mortality_ds_rate_phe)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCountry\ntotal_count\ntotal_pop\nvalue\nlowercl\nuppercl\nconfidence\nstatistic\nmethod\n\n\n\n\nA\n11344\n86790567\n23.56686\n23.08107\n24.05944\n95%\ndsr per 100000\nDobson\n\n\nB\n9955\n52898281\n19.32549\n18.45516\n20.20882\n95%\ndsr per 100000\nDobson\n\n\n\n\n\n\n\n\nIndirectly standardized rates\nFor indirect standardization, you need a reference population with the number of deaths and number of population per stratum. In this example, we will be calculating rates for country A using country B as the reference population, as the standard_pop_clean reference population does not include number of deaths per stratum.\nBelow, we first create the reference population from country B. Then, we pass mortality and population data for country A, combine it with the reference population, and pass it to the function calculate_ISRate(), to get indirectly standardized rates. Of course, you can do it also vice versa.\nOf note - in our example below, the reference population is provided as a separate data frame. In this case, we make sure that x =, n =, x_ref = and n_ref = vectors are all ordered by the same standardization category (stratum) values as that in our country-specific data frame, as records will be matched by position.\nSee the help with ?phr_isr or the links in the References section for more information.\n\n# Create reference population\nrefpopCountryB <- country_data %>% \n filter(Country == \"B\") \n\n# Calculate rates for country A indirectly standardized by age and sex\nmortality_is_rate_phe_A <- country_data %>%\n filter(Country == \"A\") %>%\n PHEindicatormethods::calculate_ISRate(\n x = Deaths, # column with observed number of events\n n = Population, # column with non-standard pops for each stratum\n x_ref = refpopCountryB$Deaths, # reference number of deaths for each stratum\n n_ref = refpopCountryB$Population) # reference population for each stratum\n\n# Print table\nknitr::kable(mortality_is_rate_phe_A)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nobserved\nexpected\nref_rate\nvalue\nlowercl\nuppercl\nconfidence\nstatistic\nmethod\n\n\n\n\n11344\n15847.42\n18.81914\n13.47123\n13.22446\n13.72145\n95%\nindirectly standardised rate per 100000\nByars", + "section": "21.3 PHEindicatormethods package", + "text": "21.3 PHEindicatormethods package\nAnother way of calculating standardized rates is with the PHEindicatormethods package. This package allows you to calculate directly as well as indirectly standardized rates. We will show both.\nThis section will use the all_data data frame created at the end of the Preparation section. This data frame includes the country populations, death events, and the world standard reference population. 
You can view it here.\n\n\nDirectly standardized rates\nBelow, we first group the data by Country and then pass it to the function phe_dsr() to get directly standardized rates per country.\nOf note - the reference (standard) population can be provided as a column within the country-specific data frame or as a separate vector. If provided within the country-specific data frame, you have to set stdpoptype = \"field\". If provided as a vector, set stdpoptype = \"vector\". In the latter case, you have to make sure the ordering of rows by strata is similar in both the country-specific data frame and the reference population, as records will be matched by position. In our example below, we provided the reference population as a column within the country-specific data frame.\nSee the help with ?phr_dsr or the links in the References section for more information.\n\n# Calculate rates per country directly standardized for age and sex\nmortality_ds_rate_phe <- all_data %>%\n group_by(Country) %>%\n PHEindicatormethods::phe_dsr(\n x = Deaths, # column with observed number of events\n n = Population, # column with non-standard pops for each stratum\n stdpop = pop, # standard populations for each stratum\n stdpoptype = \"field\") # either \"vector\" for a standalone vector or \"field\" meaning std populations are in the data \n\n# Print table\nknitr::kable(mortality_ds_rate_phe)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCountry\ntotal_count\ntotal_pop\nvalue\nlowercl\nuppercl\nconfidence\nstatistic\nmethod\n\n\n\n\nA\n11344\n86790567\n23.56686\n23.08107\n24.05944\n95%\ndsr per 100000\nDobson\n\n\nB\n9955\n52898281\n19.32549\n18.45516\n20.20882\n95%\ndsr per 100000\nDobson\n\n\n\n\n\n\n\n\nIndirectly standardized rates\nFor indirect standardization, you need a reference population with the number of deaths and number of population per stratum. In this example, we will be calculating rates for country A using country B as the reference population, as the standard_pop_clean reference population does not include number of deaths per stratum.\nBelow, we first create the reference population from country B. Then, we pass mortality and population data for country A, combine it with the reference population, and pass it to the function calculate_ISRate(), to get indirectly standardized rates. Of course, you can do it also vice versa.\nOf note - in our example below, the reference population is provided as a separate data frame. 
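For context alongside the directly standardised rates in the table above, it can help to show the crude (unstandardised) rates as well. A sketch, assuming the country_data frame built in the Preparation section:

pacman::p_load(dplyr)

country_data %>%
  group_by(Country) %>%
  summarise(
    deaths     = sum(Deaths, na.rm = TRUE),
    population = sum(Population, na.rm = TRUE),
    crude_rate = deaths / population * 100000)   # crude mortality rate per 100,000

Comparing crude and standardised rates shows how much of the difference between countries is driven by their age-sex structures.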
In this case, we make sure that x =, n =, x_ref = and n_ref = vectors are all ordered by the same standardization category (stratum) values as that in our country-specific data frame, as records will be matched by position.\nSee the help with ?phr_isr or the links in the References section for more information.\n\n# Create reference population\nrefpopCountryB <- country_data %>% \n filter(Country == \"B\") \n\n# Calculate rates for country A indirectly standardized by age and sex\nmortality_is_rate_phe_A <- country_data %>%\n filter(Country == \"A\") %>%\n PHEindicatormethods::calculate_ISRate(\n x = Deaths, # column with observed number of events\n n = Population, # column with non-standard pops for each stratum\n x_ref = refpopCountryB$Deaths, # reference number of deaths for each stratum\n n_ref = refpopCountryB$Population) # reference population for each stratum\n\n# Print table\nknitr::kable(mortality_is_rate_phe_A)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nobserved\nexpected\nref_rate\nvalue\nlowercl\nuppercl\nconfidence\nstatistic\nmethod\n\n\n\n\n11344\n15847.42\n18.81914\n13.47123\n13.22446\n13.72145\n95%\nindirectly standardised rate per 100000\nByars", "crumbs": [ "Analysis", "21  Standardised rates" @@ -1901,8 +1901,8 @@ "objectID": "new_pages/standardization.html#resources", "href": "new_pages/standardization.html#resources", "title": "21  Standardised rates", - "section": "21.5 Resources", - "text": "21.5 Resources\nIf you would like to see another reproducible example using dsr please see this vignette\nFor another example using PHEindicatormethods, please go to this website\nSee the PHEindicatormethods reference pdf file", + "section": "21.4 Resources", + "text": "21.4 Resources\nFor another example using PHEindicatormethods, please go to this website.\nSee the PHEindicatormethods reference pdf file.", "crumbs": [ "Analysis", "21  Standardised rates" @@ -1979,7 +1979,7 @@ "href": "new_pages/time_series.html#overview", "title": "23  Time series and outbreak detection", "section": "", - "text": "Time series data\nDescriptive analysis\nFitting regressions\nRelation of two time series\nOutbreak detection\nInterrupted time series", + "text": "Time series data.\nDescriptive analysis.\nFitting regressions.\nRelation of two time series.\nOutbreak detection.\nInterrupted time series.", "crumbs": [ "Analysis", "23  Time series and outbreak detection" @@ -1990,7 +1990,7 @@ "href": "new_pages/time_series.html#preparation", "title": "23  Time series and outbreak detection", "section": "23.2 Preparation", - "text": "23.2 Preparation\n\nPackages\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize p_load() from pacman, which installs the package if necessary and loads it for use. You can also load packages with library() from base R. 
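One small addition to the indirect standardisation output shown above: the ratio of observed to expected deaths is the standardised mortality ratio (SMR). Using the values printed in that table:

observed <- 11344        # observed deaths in country A (from the table above)
expected <- 15847.42     # deaths expected if country B's stratum-specific rates applied
observed / expected      # SMR of roughly 0.72, i.e. about 28% fewer deaths than expected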
See the page on R basics for more information on R packages.\n\npacman::p_load(rio, # File import\n here, # File locator\n tsibble, # handle time series datasets\n slider, # for calculating moving averages\n imputeTS, # for filling in missing values\n feasts, # for time series decomposition and autocorrelation\n forecast, # fit sin and cosin terms to data (note: must load after feasts)\n trending, # fit and assess models \n tmaptools, # for getting geocoordinates (lon/lat) based on place names\n ecmwfr, # for interacting with copernicus sateliate CDS API\n stars, # for reading in .nc (climate data) files\n units, # for defining units of measurement (climate data)\n yardstick, # for looking at model accuracy\n surveillance, # for aberration detection\n tidyverse # data management + ggplot2 graphics\n )\n\n\n\nLoad data\nYou can download all the data used in this handbook via the instructions in the Download handbook and data page.\nThe example dataset used in this section is weekly counts of campylobacter cases reported in Germany between 2001 and 2011. You can click here to download this data file (.xlsx).\nThis dataset is a reduced version of the dataset available in the surveillance package. (for details load the surveillance package and see ?campyDE)\nImport these data with the import() function from the rio package (it handles many file types like .xlsx, .csv, .rds - see the Import and export page for details).\n\n# import the counts into R\ncounts <- rio::import(\"campylobacter_germany.xlsx\")\n\nThe first 10 rows of the counts are displayed below.\n\n\n\n\n\n\n\n\nClean data\nThe code below makes sure that the date column is in the appropriate format. For this tab we will be using the tsibble package and so the yearweek function will be used to create a calendar week variable. There are several other ways of doing this (see the Working with dates page for details), however for time series its best to keep within one framework (tsibble).\n\n## ensure the date column is in the appropriate format\ncounts$date <- as.Date(counts$date)\n\n## create a calendar week variable \n## fitting ISO definitons of weeks starting on a monday\ncounts <- counts %>% \n mutate(epiweek = yearweek(date, week_start = 1))\n\n\n\nDownload climate data\nIn the relation of two time series section of this page, we will be comparing campylobacter case counts to climate data.\nClimate data for anywhere in the world can be downloaded from the EU’s Copernicus Satellite. These are not exact measurements, but based on a model (similar to interpolation), however the benefit is global hourly coverage as well as forecasts.\nYou can download each of these climate data files from the Download handbook and data page.\nFor purposes of demonstration here, we will show R code to use the ecmwfr package to pull these data from the Copernicus climate data store. You will need to create a free account in order for this to work. The package website has a useful walkthrough of how to do this. Below is example code of how to go about doing this, once you have the appropriate API keys. You have to replace the X’s below with your account IDs. You will need to download one year of data at a time otherwise the server times-out.\nIf you are not sure of the coordinates for a location you want to download data for, you can use the tmaptools package to pull the coordinates off open street maps. 
An alternative option is the photon package, however this has not been released on to CRAN yet; the nice thing about photon is that it provides more contextual data for when there are several matches for your search.\n\n## retrieve location coordinates\ncoords <- geocode_OSM(\"Germany\", geometry = \"point\")\n\n## pull together long/lats in format for ERA-5 querying (bounding box) \n## (as just want a single point can repeat coords)\nrequest_coords <- str_glue_data(coords$coords, \"{y}/{x}/{y}/{x}\")\n\n\n## Pulling data modelled from copernicus satellite (ERA-5 reanalysis)\n## https://cds.climate.copernicus.eu/cdsapp#!/software/app-era5-explorer?tab=app\n## https://github.com/bluegreen-labs/ecmwfr\n\n## set up key for weather data \nwf_set_key(user = \"XXXXX\",\n key = \"XXXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXX\",\n service = \"cds\") \n\n## run for each year of interest (otherwise server times out)\nfor (i in 2002:2011) {\n \n ## pull together a query \n ## see here for how to do: https://bluegreen-labs.github.io/ecmwfr/articles/cds_vignette.html#the-request-syntax\n ## change request to a list using addin button above (python to list)\n ## Target is the name of the output file!!\n request <- request <- list(\n product_type = \"reanalysis\",\n format = \"netcdf\",\n variable = c(\"2m_temperature\", \"total_precipitation\"),\n year = c(i),\n month = c(\"01\", \"02\", \"03\", \"04\", \"05\", \"06\", \"07\", \"08\", \"09\", \"10\", \"11\", \"12\"),\n day = c(\"01\", \"02\", \"03\", \"04\", \"05\", \"06\", \"07\", \"08\", \"09\", \"10\", \"11\", \"12\",\n \"13\", \"14\", \"15\", \"16\", \"17\", \"18\", \"19\", \"20\", \"21\", \"22\", \"23\", \"24\",\n \"25\", \"26\", \"27\", \"28\", \"29\", \"30\", \"31\"),\n time = c(\"00:00\", \"01:00\", \"02:00\", \"03:00\", \"04:00\", \"05:00\", \"06:00\", \"07:00\",\n \"08:00\", \"09:00\", \"10:00\", \"11:00\", \"12:00\", \"13:00\", \"14:00\", \"15:00\",\n \"16:00\", \"17:00\", \"18:00\", \"19:00\", \"20:00\", \"21:00\", \"22:00\", \"23:00\"),\n area = request_coords,\n dataset_short_name = \"reanalysis-era5-single-levels\",\n target = paste0(\"germany_weather\", i, \".nc\")\n )\n \n ## download the file and store it in the current working directory\n file <- wf_request(user = \"XXXXX\", # user ID (for authentication)\n request = request, # the request\n transfer = TRUE, # download the file\n path = here::here(\"data\", \"Weather\")) ## path to save the data\n }\n\n\n\nLoad climate data\nWhether you downloaded the climate data via our handbook, or used the code above, you now should have 10 years of “.nc” climate data files stored in the same folder on your computer.\nUse the code below to import these files into R with the stars package.\n\n## define path to weather folder \nfile_paths <- list.files(\n here::here(\"data\", \"time_series\", \"weather\"), # replace with your own file path \n full.names = TRUE)\n\n## only keep those with the current name of interest \nfile_paths <- file_paths[str_detect(file_paths, \"germany\")]\n\n## read in all the files as a stars object \ndata <- stars::read_stars(file_paths)\n\nt2m, tp, \nt2m, tp, \nt2m, tp, \nt2m, tp, \nt2m, tp, \nt2m, tp, \nt2m, tp, \nt2m, tp, \nt2m, tp, \nt2m, tp, \n\n\nOnce these files have been imported as the object data, we will convert them to a data frame.\n\n## change to a data frame \ntemp_data <- as_tibble(data) %>% \n ## add in variables and correct units\n mutate(\n ## create an calendar week variable \n epiweek = tsibble::yearweek(time), \n ## create a date variable (start of calendar 
week)\n date = as.Date(epiweek),\n ## change temperature from kelvin to celsius\n t2m = set_units(t2m, celsius), \n ## change precipitation from metres to millimetres \n tp = set_units(tp, mm)) %>% \n ## group by week (keep the date too though)\n group_by(epiweek, date) %>% \n ## get the average per week\n summarise(t2m = as.numeric(mean(t2m)), \n tp = as.numeric(mean(tp)))\n\n`summarise()` has grouped output by 'epiweek'. You can override using the\n`.groups` argument.", + "text": "23.2 Preparation\n\nPackages\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize p_load() from pacman, which installs the package if necessary and loads it for use. You can also load packages with library() from base R. See the page on R basics for more information on R packages.\n\npacman::p_load(\n rio, # File import\n here, # File locator\n tsibble, # handle time series datasets\n slider, # for calculating moving averages\n imputeTS, # for filling in missing values\n feasts, # for time series decomposition and autocorrelation\n forecast, # fit sin and cosin terms to data (note: must load after feasts)\n trending, # fit and assess models \n tmaptools, # for getting geocoordinates (lon/lat) based on place names\n ecmwfr, # for interacting with copernicus sateliate CDS API\n stars, # for reading in .nc (climate data) files\n units, # for defining units of measurement (climate data)\n yardstick, # for looking at model accuracy\n surveillance, # for aberration detection\n tidyverse # data management + ggplot2 graphics\n )\n\n\n\nLoad data\nYou can download all the data used in this handbook via the instructions in the Download handbook and data page.\nThe example dataset used in this section is weekly counts of campylobacter cases reported in Germany between 2001 and 2011. You can click here to download this data file (.xlsx).\nThis dataset is a reduced version of the dataset available in the surveillance package. (for details load the surveillance package and see ?campyDE)\nImport these data with the import() function from the rio package (it handles many file types like .xlsx, .csv, .rds - see the Import and export page for details).\n\n# import the counts into R\ncounts <- rio::import(\"campylobacter_germany.xlsx\")\n\nThe first 10 rows of the counts are displayed below.\n\n\n\n\n\n\n\n\nClean data\nThe code below makes sure that the date column is in the appropriate format. For this tab we will be using the tsibble package and so the yearweek function will be used to create a calendar week variable. There are several other ways of doing this (see the Working with dates page for details), however for time series its best to keep within one framework (tsibble).\n\n## ensure the date column is in the appropriate format\ncounts$date <- as.Date(counts$date)\n\n## create a calendar week variable \n## fitting ISO definitons of weeks starting on a monday\ncounts <- counts %>% \n mutate(epiweek = yearweek(date, week_start = 1))\n\n\n\nDownload climate data\nIn the relation of two time series section of this page, we will be comparing campylobacter case counts to climate data.\nClimate data for anywhere in the world can be downloaded from the EU’s Copernicus Satellite. 
These are not exact measurements, but based on a model (similar to interpolation), however the benefit is global hourly coverage as well as forecasts.\nYou can download each of these climate data files from the Download handbook and data page.\nFor purposes of demonstration here, we will show R code to use the ecmwfr package to pull these data from the Copernicus climate data store. You will need to create a free account in order for this to work. The package website has a useful walkthrough of how to do this. Below is example code of how to go about doing this, once you have the appropriate API keys. You have to replace the X’s below with your account IDs. You will need to download one year of data at a time otherwise the server times-out.\nIf you are not sure of the coordinates for a location you want to download data for, you can use the tmaptools package to pull the coordinates off open street maps. An alternative option is the photon package, however this has not been released on to CRAN yet; the nice thing about photon is that it provides more contextual data for when there are several matches for your search.\n\n## retrieve location coordinates\ncoords <- geocode_OSM(\"Germany\", geometry = \"point\")\n\n## pull together long/lats in format for ERA-5 querying (bounding box) \n## (as just want a single point can repeat coords)\nrequest_coords <- str_glue_data(coords$coords, \"{y}/{x}/{y}/{x}\")\n\n\n## Pulling data modelled from copernicus satellite (ERA-5 reanalysis)\n## https://cds-beta.climate.copernicus.eu/datasets/reanalysis-era5-land?tab=overview\n## https://github.com/bluegreen-labs/ecmwfr\n\n## set up key for weather data \nwf_set_key(key = \"XXXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXX\") \n\n## run for each year of interest (otherwise server times out)\nfor (i in 2002:2011) {\n \n ## pull together a query \n ## see here for how to do: https://bluegreen-labs.github.io/ecmwfr/articles/cds_vignette.html#the-request-syntax\n ## change request to a list using addin button above (python to list)\n ## Target is the name of the output file!!\n request <- request <- list(\n product_type = \"reanalysis\",\n format = \"netcdf\",\n variable = c(\"2m_temperature\", \"total_precipitation\"),\n year = c(i),\n month = c(\"01\", \"02\", \"03\", \"04\", \"05\", \"06\", \"07\", \"08\", \"09\", \"10\", \"11\", \"12\"),\n day = c(\"01\", \"02\", \"03\", \"04\", \"05\", \"06\", \"07\", \"08\", \"09\", \"10\", \"11\", \"12\",\n \"13\", \"14\", \"15\", \"16\", \"17\", \"18\", \"19\", \"20\", \"21\", \"22\", \"23\", \"24\",\n \"25\", \"26\", \"27\", \"28\", \"29\", \"30\", \"31\"),\n time = c(\"00:00\", \"01:00\", \"02:00\", \"03:00\", \"04:00\", \"05:00\", \"06:00\", \"07:00\",\n \"08:00\", \"09:00\", \"10:00\", \"11:00\", \"12:00\", \"13:00\", \"14:00\", \"15:00\",\n \"16:00\", \"17:00\", \"18:00\", \"19:00\", \"20:00\", \"21:00\", \"22:00\", \"23:00\"),\n area = request_coords,\n dataset_short_name = \"reanalysis-era5-single-levels\",\n target = paste0(\"germany_weather\", i, \".nc\")\n )\n \n ## download the file and store it in the current working directory\n file <- wf_request(user = \"XXXXX\", # user ID (for authentication)\n request = request, # the request\n transfer = TRUE, # download the file\n path = here::here(\"data\", \"Weather\")) ## path to save the data\n }\n\n\n\nLoad climate data\nWhether you downloaded the climate data via our handbook, or used the code above, you now should have 10 years of “.nc” climate data files stored in the same folder on your computer.\nUse the code below to 
import these files into R with the stars package.\n\n## define path to weather folder \nfile_paths <- list.files(\n here::here(\"data\", \"time_series\", \"weather\"), # replace with your own file path \n full.names = TRUE)\n\n## only keep those with the current name of interest \nfile_paths <- file_paths[str_detect(file_paths, \"germany\")]\n\n## read in all the files as a stars object \ndata <- stars::read_stars(file_paths)\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\n\nt2m, \n\n\ntp, \n\n\nOnce these files have been imported as the object data, we will convert them to a data frame.\n\n## change to a data frame \ntemp_data <- as_tibble(data) %>% \n ## add in variables and correct units\n mutate(\n ## create an calendar week variable \n epiweek = tsibble::yearweek(time), \n ## create a date variable (start of calendar week)\n date = as.Date(epiweek),\n ## change temperature from kelvin to celsius\n t2m = set_units(t2m, celsius), \n ## change precipitation from metres to millimetres \n tp = set_units(tp, mm)) %>% \n ## group by week (keep the date too though)\n group_by(epiweek, date) %>% \n ## get the average per week\n summarise(t2m = as.numeric(mean(t2m)), \n tp = as.numeric(mean(tp)))\n\n`summarise()` has grouped output by 'epiweek'. You can override using the\n`.groups` argument.", "crumbs": [ "Analysis", "23  Time series and outbreak detection" @@ -2012,7 +2012,7 @@ "href": "new_pages/time_series.html#descriptive-analysis", "title": "23  Time series and outbreak detection", "section": "23.4 Descriptive analysis", - "text": "23.4 Descriptive analysis\n\n\nMoving averages\nIf data is very noisy (counts jumping up and down) then it can be helpful to calculate a moving average. In the example below, for each week we calculate the average number of cases from the four previous weeks. This smooths the data, to make it more interpretable. In our case this does not really add much, so we willstick to the interpolated data for further analysis. See the Moving averages page for more detail.\n\n## create a moving average variable (deals with missings)\ncounts <- counts %>% \n ## create the ma_4w variable \n ## slide over each row of the case variable\n mutate(ma_4wk = slider::slide_dbl(case, \n ## for each row calculate the name\n ~ mean(.x, na.rm = TRUE),\n ## use the four previous weeks\n .before = 4))\n\n## make a quick visualisation of the difference \nggplot(counts, aes(x = epiweek)) + \n geom_line(aes(y = case)) + \n geom_line(aes(y = ma_4wk), colour = \"red\")\n\n\n\n\n\n\n\n\n\n\n\nPeriodicity\nBelow we define a custom function to create a periodogram. See the Writing functions page for information about how to write functions in R.\nFirst, the function is defined. Its arguments include a dataset with a column counts, start_week = which is the first week of the dataset, a number to indicate how many periods per year (e.g. 
52, 12), and lastly the output style (see details in the code below).\n\n## Function arguments\n#####################\n## x is a dataset\n## counts is variable with count data or rates within x \n## start_week is the first week in your dataset\n## period is how many units in a year \n## output is whether you want return spectral periodogram or the peak weeks\n ## \"periodogram\" or \"weeks\"\n\n# Define function\nperiodogram <- function(x, \n counts, \n start_week = c(2002, 1), \n period = 52, \n output = \"weeks\") {\n \n\n ## make sure is not a tsibble, filter to project and only keep columns of interest\n prepare_data <- dplyr::as_tibble(x)\n \n # prepare_data <- prepare_data[prepare_data[[strata]] == j, ]\n prepare_data <- dplyr::select(prepare_data, {{counts}})\n \n ## create an intermediate \"zoo\" time series to be able to use with spec.pgram\n zoo_cases <- zoo::zooreg(prepare_data, \n start = start_week, frequency = period)\n \n ## get a spectral periodogram not using fast fourier transform \n periodo <- spec.pgram(zoo_cases, fast = FALSE, plot = FALSE)\n \n ## return the peak weeks \n periodo_weeks <- 1 / periodo$freq[order(-periodo$spec)] * period\n \n if (output == \"weeks\") {\n periodo_weeks\n } else {\n periodo\n }\n \n}\n\n## get spectral periodogram for extracting weeks with the highest frequencies \n## (checking of seasonality) \nperiodo <- periodogram(counts, \n case_int, \n start_week = c(2002, 1),\n output = \"periodogram\")\n\n## pull spectrum and frequence in to a dataframe for plotting\nperiodo <- data.frame(periodo$freq, periodo$spec)\n\n## plot a periodogram showing the most frequently occuring periodicity \nggplot(data = periodo, \n aes(x = 1/(periodo.freq/52), y = log(periodo.spec))) + \n geom_line() + \n labs(x = \"Period (Weeks)\", y = \"Log(density)\")\n\n\n\n\n\n\n\n## get a vector weeks in ascending order \npeak_weeks <- periodogram(counts, \n case_int, \n start_week = c(2002, 1), \n output = \"weeks\")\n\nNOTE: It is possible to use the above weeks to add them to sin and cosine terms, however we will use a function to generate these terms (see regression section below) \n\n\n\nDecomposition\nClassical decomposition is used to break a time series down several parts, which when taken together make up for the pattern you see. These different parts are:\n\nThe trend-cycle (the long-term direction of the data)\n\nThe seasonality (repeating patterns)\n\nThe random (what is left after removing trend and season)\n\n\n## decompose the counts dataset \ncounts %>% \n # using an additive classical decomposition model\n model(classical_decomposition(case_int, type = \"additive\")) %>% \n ## extract the important information from the model\n components() %>% \n ## generate a plot \n autoplot()\n\n\n\n\n\n\n\n\n\n\n\nAutocorrelation\nAutocorrelation tells you about the relation between the counts of each week and the weeks before it (called lags).\nUsing the ACF() function, we can produce a plot which shows us a number of lines for the relation at different lags. Where the lag is 0 (x = 0), this line would always be 1 as it shows the relation between an observation and itself (not shown here). 
The first line shown here (x = 1) shows the relation between each observation and the observation before it (lag of 1), the second shows the relation between each observation and the observation before last (lag of 2) and so on until lag of 52 which shows the relation between each observation and the observation from 1 year (52 weeks before).\nUsing the PACF() function (for partial autocorrelation) shows the same type of relation but adjusted for all other weeks between. This is less informative for determining periodicity.\n\n## using the counts dataset\ncounts %>% \n ## calculate autocorrelation using a full years worth of lags\n ACF(case_int, lag_max = 52) %>% \n ## show a plot\n autoplot()\n\n\n\n\n\n\n\n## using the counts data set \ncounts %>% \n ## calculate the partial autocorrelation using a full years worth of lags\n PACF(case_int, lag_max = 52) %>% \n ## show a plot\n autoplot()\n\n\n\n\n\n\n\n\nYou can formally test the null hypothesis of independence in a time series (i.e.  that it is not autocorrelated) using the Ljung-Box test (in the stats package). A significant p-value suggests that there is autocorrelation in the data.\n\n## test for independance \nBox.test(counts$case_int, type = \"Ljung-Box\")\n\n\n Box-Ljung test\n\ndata: counts$case_int\nX-squared = 462.65, df = 1, p-value < 2.2e-16", + "text": "23.4 Descriptive analysis\n\n\nMoving averages\nIf data is very noisy (counts jumping up and down) then it can be helpful to calculate a moving average. In the example below, for each week we calculate the average number of cases from the four previous weeks. This smooths the data, to make it more interpretable. In our case this does not really add much, so we willstick to the interpolated data for further analysis. See the Moving averages page for more detail.\n\n## create a moving average variable (deals with missings)\ncounts <- counts %>% \n ## create the ma_4w variable \n ## slide over each row of the case variable\n mutate(ma_4wk = slider::slide_dbl(case, \n ## for each row calculate the name\n ~ mean(.x, na.rm = TRUE),\n ## use the four previous weeks\n .before = 4))\n\n## make a quick visualisation of the difference \nggplot(counts, aes(x = epiweek)) + \n geom_line(aes(y = case)) + \n geom_line(aes(y = ma_4wk), colour = \"red\")\n\n\n\n\n\n\n\n\n\n\n\nPeriodicity\nBelow we define a custom function to create a periodogram. See the Writing functions page for information about how to write functions in R.\nFirst, the function is defined. Its arguments include a dataset with a column counts, start_week = which is the first week of the dataset, a number to indicate how many periods per year (e.g. 
52, 12), and lastly the output style (see details in the code below).\n\n## Function arguments\n#####################\n## x is a dataset\n## counts is variable with count data or rates within x \n## start_week is the first week in your dataset\n## period is how many units in a year \n## output is whether you want return spectral periodogram or the peak weeks\n ## \"periodogram\" or \"weeks\"\n\n# Define function\nperiodogram <- function(x, \n counts, \n start_week = c(2002, 1), \n period = 52, \n output = \"weeks\") {\n \n\n ## make sure is not a tsibble, filter to project and only keep columns of interest\n prepare_data <- dplyr::as_tibble(x)\n \n # prepare_data <- prepare_data[prepare_data[[strata]] == j, ]\n prepare_data <- dplyr::select(prepare_data, {{counts}})\n \n ## create an intermediate \"zoo\" time series to be able to use with spec.pgram\n zoo_cases <- zoo::zooreg(prepare_data, \n start = start_week, frequency = period)\n \n ## get a spectral periodogram not using fast fourier transform \n periodo <- spec.pgram(zoo_cases, fast = FALSE, plot = FALSE)\n \n ## return the peak weeks \n periodo_weeks <- 1 / periodo$freq[order(-periodo$spec)] * period\n \n if (output == \"weeks\") {\n periodo_weeks\n } else {\n periodo\n }\n \n}\n\n## get spectral periodogram for extracting weeks with the highest frequencies \n## (checking of seasonality) \nperiodo <- periodogram(counts, \n case_int, \n start_week = c(2002, 1),\n output = \"periodogram\")\n\n## pull spectrum and frequence in to a dataframe for plotting\nperiodo <- data.frame(periodo$freq, periodo$spec)\n\n## plot a periodogram showing the most frequently occuring periodicity \nggplot(data = periodo, \n aes(x = 1/(periodo.freq/52), y = log(periodo.spec))) + \n geom_line() + \n labs(x = \"Period (Weeks)\", y = \"Log(density)\")\n\n\n\n\n\n\n\n## get a vector weeks in ascending order \npeak_weeks <- periodogram(counts, \n case_int, \n start_week = c(2002, 1), \n output = \"weeks\")\n\nNOTE: It is possible to use the above weeks to add them to sin and cosine terms, however we will use a function to generate these terms (see regression section below) \n\n\n\nDecomposition\nClassical decomposition is used to break a time series down several parts, which when taken together make up for the pattern you see. These different parts are:\n\nThe trend-cycle (the long-term direction of the data).\nThe seasonality (repeating patterns).\n\nThe random (what is left after removing trend and season).\n\n\n## decompose the counts dataset \ncounts %>% \n # using an additive classical decomposition model\n model(classical_decomposition(case_int, type = \"additive\")) %>% \n ## extract the important information from the model\n components() %>% \n ## generate a plot \n autoplot()\n\n\n\n\n\n\n\n\n\n\n\nAutocorrelation\nAutocorrelation tells you about the relation between the counts of each week and the weeks before it (called lags).\nUsing the ACF() function, we can produce a plot which shows us a number of lines for the relation at different lags. Where the lag is 0 (x = 0), this line would always be 1 as it shows the relation between an observation and itself (not shown here). 
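As a rough numerical illustration of what a single lag represents, the lag-1 value is close to the correlation between the series and a copy of itself shifted back by one week; a minimal base R sketch (assuming case_int has no missing values at this point, which is the case after interpolation):

x <- counts$case_int

## correlation of each week with the week before it (lag 1)
cor(x[-1], x[-length(x)])

ACF() uses the standard autocorrelation formula, so its lag-1 value will be very similar, though not identical, to this simple correlation.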
The first line shown here (x = 1) shows the relation between each observation and the observation before it (lag of 1), the second shows the relation between each observation and the observation before last (lag of 2) and so on until lag of 52 which shows the relation between each observation and the observation from 1 year (52 weeks before).\nUsing the PACF() function (for partial autocorrelation) shows the same type of relation but adjusted for all other weeks between. This is less informative for determining periodicity.\n\n## using the counts dataset\ncounts %>% \n ## calculate autocorrelation using a full years worth of lags\n ACF(case_int, lag_max = 52) %>% \n ## show a plot\n autoplot()\n\n\n\n\n\n\n\n## using the counts data set \ncounts %>% \n ## calculate the partial autocorrelation using a full years worth of lags\n PACF(case_int, lag_max = 52) %>% \n ## show a plot\n autoplot()\n\n\n\n\n\n\n\n\nYou can formally test the null hypothesis of independence in a time series (i.e.  that it is not autocorrelated) using the Ljung-Box test (in the stats package). A significant p-value suggests that there is autocorrelation in the data.\n\n## test for independance \nBox.test(counts$case_int, type = \"Ljung-Box\")\n\n\n Box-Ljung test\n\ndata: counts$case_int\nX-squared = 462.65, df = 1, p-value < 2.2e-16", "crumbs": [ "Analysis", "23  Time series and outbreak detection" @@ -2023,7 +2023,7 @@ "href": "new_pages/time_series.html#fitting-regressions", "title": "23  Time series and outbreak detection", "section": "23.5 Fitting regressions", - "text": "23.5 Fitting regressions\nIt is possible to fit a large number of different regressions to a time series, however, here we will demonstrate how to fit a negative binomial regression - as this is often the most appropriate for counts data in infectious diseases.\n\n\nFourier terms\nFourier terms are the equivalent of sin and cosin curves. The difference is that these are fit based on finding the most appropriate combination of curves to explain your data.\nIf only fitting one fourier term, this would be the equivalent of fitting a sin and a cosin for your most frequently occurring lag seen in your periodogram (in our case 52 weeks). We use the fourier() function from the forecast package.\nIn the below code we assign using the $, as fourier() returns two columns (one for sin one for cosin) and so these are added to the dataset as a list, called “fourier” - but this list can then be used as a normal variable in regression.\n\n## add in fourier terms using the epiweek and case_int variabless\ncounts$fourier <- select(counts, epiweek, case_int) %>% \n fourier(K = 1)\n\n\n\n\nNegative binomial\nIt is possible to fit regressions using base stats or MASS functions (e.g. lm(), glm() and glm.nb()). However we will be using those from the trending package, as this allows for calculating appropriate confidence and prediction intervals (which are otherwise not available). The syntax is the same, and you specify an outcome variable then a tilde (~) and then add your various exposure variables of interest separated by a plus (+).\nThe other difference is that we first define the model and then fit() it to the data. This is useful because it allows for comparing multiple different models with the same syntax.\nTIP: If you wanted to use rates, rather than counts you could include the population variable as a logarithmic offset term, by adding offset(log(population). 
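A minimal sketch of what such a rate model could look like, assuming counts contained a hypothetical population column of weekly denominators and that the formula interface passes offset() through to glm.nb() as usual:

## negative binomial model for rates rather than counts
rate_model <- glm_nb_model(
  case_int ~ 
    epiweek +                  ## trend
    fourier +                  ## seasonality
    offset(log(population))    ## log-population offset converts counts to rates
)

rate_fit <- trending::fit(rate_model, data.frame(counts))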
You would then need to set population to be 1, before using predict() in order to produce a rate. \nTIP: For fitting more complex models such as ARIMA or prophet, see the fable package.\n\n## define the model you want to fit (negative binomial) \nmodel <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the fourier terms to account for seasonality\n fourier)\n\n## fit your model using the counts dataset\nfitted_model <- trending::fit(model, data.frame(counts))\n\n## calculate confidence intervals and prediction intervals \nobserved <- predict(fitted_model, simulate_pi = FALSE)\n\nestimate_res <- data.frame(observed$result)\n\n## plot your regression \nggplot(data = estimate_res, aes(x = epiweek)) + \n ## add in a line for the model estimate\n geom_line(aes(y = estimate),\n col = \"Red\") + \n ## add in a band for the prediction intervals \n geom_ribbon(aes(ymin = lower_pi, \n ymax = upper_pi), \n alpha = 0.25) + \n ## add in a line for your observed case counts\n geom_line(aes(y = case_int), \n col = \"black\") + \n ## make a traditional plot (with black axes and white background)\n theme_classic()\n\n\n\n\n\n\n\n\n\n\n\nResiduals\nTo see how well our model fits the observed data we need to look at the residuals. The residuals are the difference between the observed counts and the counts estimated from the model. We could calculate this simply by using case_int - estimate, but the residuals() function extracts this directly from the regression for us.\nWhat we see from the below, is that we are not explaining all of the variation that we could with the model. It might be that we should fit more fourier terms, and address the amplitude. However for this example we will leave it as is. The plots show that our model does worse in the peaks and troughs (when counts are at their highest and lowest) and that it might be more likely to underestimate the observed counts.\n\n## calculate the residuals \nestimate_res <- estimate_res %>% \n mutate(resid = fitted_model$result[[1]]$residuals)\n\n## are the residuals fairly constant over time (if not: outbreaks? change in practice?)\nestimate_res %>%\n ggplot(aes(x = epiweek, y = resid)) +\n geom_line() +\n geom_point() + \n labs(x = \"epiweek\", y = \"Residuals\")\n\n\n\n\n\n\n\n## is there autocorelation in the residuals (is there a pattern to the error?) \nestimate_res %>% \n as_tsibble(index = epiweek) %>% \n ACF(resid, lag_max = 52) %>% \n autoplot()\n\n\n\n\n\n\n\n## are residuals normally distributed (are under or over estimating?) \nestimate_res %>%\n ggplot(aes(x = resid)) +\n geom_histogram(binwidth = 100) +\n geom_rug() +\n labs(y = \"count\") \n\n\n\n\n\n\n\n## compare observed counts to their residuals \n ## should also be no pattern \nestimate_res %>%\n ggplot(aes(x = estimate, y = resid)) +\n geom_point() +\n labs(x = \"Fitted\", y = \"Residuals\")\n\n\n\n\n\n\n\n## formally test autocorrelation of the residuals\n## H0 is that residuals are from a white-noise series (i.e. 
random)\n## test for independence \n## if p value significant then non-random\nBox.test(estimate_res$resid, type = \"Ljung-Box\")\n\n\n Box-Ljung test\n\ndata: estimate_res$resid\nX-squared = 336.25, df = 1, p-value < 2.2e-16", + "text": "23.5 Fitting regressions\nIt is possible to fit a large number of different regressions to a time series, however, here we will demonstrate how to fit a negative binomial regression - as this is often the most appropriate for counts data in infectious diseases.\n\n\nFourier terms\nFourier terms are the equivalent of sine and cosine curves. The difference is that these are fit based on finding the most appropriate combination of curves to explain your data.\nIf only fitting one fourier term, this would be the equivalent of fitting a sine and a cosine for your most frequently occurring lag seen in your periodogram (in our case 52 weeks). We use the fourier() function from the forecast package.\nIn the below code we assign using the $, as fourier() returns two columns (one for sin one for cosin) and so these are added to the dataset as a list, called “fourier” - but this list can then be used as a normal variable in regression.\n\n## add in fourier terms using the epiweek and case_int variabless\ncounts$fourier <- select(counts, epiweek, case_int) %>% \n fourier(K = 1)\n\n\n\n\nNegative binomial\nIt is possible to fit regressions using base stats or MASS functions (e.g. lm(), glm() and glm.nb()). However we will be using those from the trending package, as this allows for calculating appropriate confidence and prediction intervals (which are otherwise not available). The syntax is the same, and you specify an outcome variable then a tilde (~) and then add your various exposure variables of interest separated by a plus (+).\nThe other difference is that we first define the model and then fit() it to the data. This is useful because it allows for comparing multiple different models with the same syntax.\nTIP: If you wanted to use rates, rather than counts you could include the population variable as a logarithmic offset term, by adding offset(log(population). You would then need to set population to be 1, before using predict() in order to produce a rate. \nTIP: For fitting more complex models such as ARIMA or prophet, see the fable package.\n\n## define the model you want to fit (negative binomial) \nmodel <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the fourier terms to account for seasonality\n fourier)\n\n## fit your model using the counts dataset\nfitted_model <- trending::fit(model, data.frame(counts))\n\n## calculate confidence intervals and prediction intervals \nobserved <- predict(fitted_model, simulate_pi = FALSE)\n\nestimate_res <- data.frame(observed$result)\n\n## plot your regression \nggplot(data = estimate_res, aes(x = epiweek)) + \n ## add in a line for the model estimate\n geom_line(aes(y = estimate),\n col = \"Red\") + \n ## add in a band for the prediction intervals \n geom_ribbon(aes(ymin = lower_pi, \n ymax = upper_pi), \n alpha = 0.25) + \n ## add in a line for your observed case counts\n geom_line(aes(y = case_int), \n col = \"black\") + \n ## make a traditional plot (with black axes and white background)\n theme_classic()\n\n\n\n\n\n\n\n\n\n\n\nResiduals\nTo see how well our model fits the observed data we need to look at the residuals. The residuals are the difference between the observed counts and the counts estimated from the model. 
We could calculate this simply by using case_int - estimate, but the residuals() function extracts this directly from the regression for us.\nWhat we see from the below, is that we are not explaining all of the variation that we could with the model. It might be that we should fit more fourier terms, and address the amplitude. However for this example we will leave it as is. The plots show that our model does worse in the peaks and troughs (when counts are at their highest and lowest) and that it might be more likely to underestimate the observed counts.\n\n## calculate the residuals \nestimate_res <- estimate_res %>% \n mutate(resid = fitted_model$result[[1]]$residuals)\n\n## are the residuals fairly constant over time (if not: outbreaks? change in practice?)\nestimate_res %>%\n ggplot(aes(x = epiweek, y = resid)) +\n geom_line() +\n geom_point() + \n labs(x = \"epiweek\", y = \"Residuals\")\n\n\n\n\n\n\n\n## is there autocorelation in the residuals (is there a pattern to the error?) \nestimate_res %>% \n as_tsibble(index = epiweek) %>% \n ACF(resid, lag_max = 52) %>% \n autoplot()\n\n\n\n\n\n\n\n## are residuals normally distributed (are under or over estimating?) \nestimate_res %>%\n ggplot(aes(x = resid)) +\n geom_histogram(binwidth = 100) +\n geom_rug() +\n labs(y = \"count\") \n\n\n\n\n\n\n\n## compare observed counts to their residuals \n ## should also be no pattern \nestimate_res %>%\n ggplot(aes(x = estimate, y = resid)) +\n geom_point() +\n labs(x = \"Fitted\", y = \"Residuals\")\n\n\n\n\n\n\n\n## formally test autocorrelation of the residuals\n## H0 is that residuals are from a white-noise series (i.e. random)\n## test for independence \n## if p value significant then non-random\nBox.test(estimate_res$resid, type = \"Ljung-Box\")\n\n\n Box-Ljung test\n\ndata: estimate_res$resid\nX-squared = 336.25, df = 1, p-value < 2.2e-16", "crumbs": [ "Analysis", "23  Time series and outbreak detection" @@ -2045,7 +2045,7 @@ "href": "new_pages/time_series.html#outbreak-detection", "title": "23  Time series and outbreak detection", "section": "23.7 Outbreak detection", - "text": "23.7 Outbreak detection\nWe will demonstrate two (similar) methods of detecting outbreaks here. The first builds on the sections above. We use the trending package to fit regressions to previous years, and then predict what we expect to see in the following year. If observed counts are above what we expect, then it could suggest there is an outbreak. The second method is based on similar principles but uses the surveillance package, which has a number of different algorithms for aberration detection.\nCAUTION: Normally, you are interested in the current year (where you only know counts up to the present week). So in this example we are pretending to be in week 39 of 2011.\n\n\ntrending package\nFor this method we define a baseline (which should usually be about 5 years of data). We fit a regression to the baseline data, and then use that to predict the estimates for the next year.\n\n\nCut-off date\nIt is easier to define your dates in one place and then use these throughout the rest of your code.\nHere we define a start date (when our observations started) and a cut-off date (the end of our baseline period - and when the period we want to predict for starts). ~We also define how many weeks are in our year of interest (the one we are going to be predicting)~. 
We also define how many weeks are between our baseline cut-off and the end date that we are interested in predicting for.\nNOTE: In this example we pretend to currently be at the end of September 2011 (“2011 W39”).\n\n## define start date (when observations began)\nstart_date <- min(counts$epiweek)\n\n## define a cut-off week (end of baseline, start of prediction period)\ncut_off <- yearweek(\"2010-12-31\")\n\n## define the last date interested in (i.e. end of prediction)\nend_date <- yearweek(\"2011-12-31\")\n\n## find how many weeks in period (year) of interest\nnum_weeks <- as.numeric(end_date - cut_off)\n\n\n\n\nAdd rows\nTo be able to forecast in a tidyverse format, we need to have the right number of rows in our dataset, i.e. one row for each week up to the end_datedefined above. The code below allows you to add these rows for by a grouping variable - for example if we had multiple countries in one dataset, we could group by country and then add rows appropriately for each. The group_by_key() function from tsibble allows us to do this grouping and then pass the grouped data to dplyr functions, group_modify() and add_row(). Then we specify the sequence of weeks between one after the maximum week currently available in the data and the end week.\n\n## add in missing weeks till end of year \ncounts <- counts %>%\n ## group by the region\n group_by_key() %>%\n ## for each group add rows from the highest epiweek to the end of year\n group_modify(~add_row(.,\n epiweek = seq(max(.$epiweek) + 1, \n end_date,\n by = 1)))\n\n\n\n\nFourier terms\nWe need to redefine our fourier terms - as we want to fit them to the baseline date only and then predict (extrapolate) those terms for the next year. To do this we need to combine two output lists from the fourier() function together; the first one is for the baseline data, and the second one predicts for the year of interest (by defining the h argument).\nN.b. to bind rows we have to use rbind() (rather than tidyverse bind_rows) as the fourier columns are a list (so not named individually).\n\n## define fourier terms (sincos) \ncounts <- counts %>% \n mutate(\n ## combine fourier terms for weeks prior to and after 2010 cut-off date\n ## (nb. 2011 fourier terms are predicted)\n fourier = rbind(\n ## get fourier terms for previous years\n fourier(\n ## only keep the rows before 2011\n filter(counts, \n epiweek <= cut_off), \n ## include one set of sin cos terms \n K = 1\n ), \n ## predict the fourier terms for 2011 (using baseline data)\n fourier(\n ## only keep the rows before 2011\n filter(counts, \n epiweek <= cut_off),\n ## include one set of sin cos terms \n K = 1, \n ## predict 52 weeks ahead\n h = num_weeks\n )\n )\n )\n\n\n\n\nSplit data and fit regression\nWe now have to split our dataset in to the baseline period and the prediction period. This is done using the dplyr group_split() function after group_by(), and will create a list with two data frames, one for before your cut-off and one for after.\nWe then use the purrr package pluck() function to pull the datasets out of the list (equivalent of using square brackets, e.g. dat[[1]]), and can then fit our model to the baseline data, and then use the predict() function for our data of interest after the cut-off.\nSee the page on Iteration, loops, and lists to learn more about purrr.\nCAUTION: Note the use of simulate_pi = FALSE within the predict() argument. This is because the default behaviour of trending is to use the ciTools package to estimate a prediction interval. 
This does not work if there are NA counts, and also produces more granular intervals. See ?trending::predict.trending_model_fit for details. \n\n# split data for fitting and prediction\ndat <- counts %>% \n group_by(epiweek <= cut_off) %>%\n group_split()\n\n## define the model you want to fit (negative binomial) \nmodel <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the furier terms to account for seasonality\n fourier\n)\n\n# define which data to use for fitting and which for predicting\nfitting_data <- pluck(dat, 2)\npred_data <- pluck(dat, 1) %>% \n select(case_int, epiweek, fourier)\n\n# fit model \nfitted_model <- trending::fit(model, data.frame(fitting_data))\n\n# get confint and estimates for fitted data\nobserved <- fitted_model %>% \n predict(simulate_pi = FALSE)\n\n# forecast with data want to predict with \nforecasts <- fitted_model %>% \n predict(data.frame(pred_data), simulate_pi = FALSE)\n\n## combine baseline and predicted datasets\nobserved <- bind_rows(observed$result, forecasts$result)\n\nAs previously, we can visualise our model with ggplot. We highlight alerts with red dots for observed counts above the 95% prediction interval. This time we also add a vertical line to label when the forecast starts.\n\n## plot your regression \nggplot(data = observed, aes(x = epiweek)) + \n ## add in a line for the model estimate\n geom_line(aes(y = estimate),\n col = \"grey\") + \n ## add in a band for the prediction intervals \n geom_ribbon(aes(ymin = lower_pi, \n ymax = upper_pi), \n alpha = 0.25) + \n ## add in a line for your observed case counts\n geom_line(aes(y = case_int), \n col = \"black\") + \n ## plot in points for the observed counts above expected\n geom_point(\n data = filter(observed, case_int > upper_pi), \n aes(y = case_int), \n colour = \"red\", \n size = 2) + \n ## add vertical line and label to show where forecasting started\n geom_vline(\n xintercept = as.Date(cut_off), \n linetype = \"dashed\") + \n annotate(geom = \"text\", \n label = \"Forecast\", \n x = cut_off, \n y = max(observed$upper_pi) - 250, \n angle = 90, \n vjust = 1\n ) + \n ## make a traditional plot (with black axes and white background)\n theme_classic()\n\nWarning: Removed 13 rows containing missing values or values outside the scale range\n(`geom_line()`).\n\n\n\n\n\n\n\n\n\n\n\n\nPrediction validation\nBeyond inspecting residuals, it is important to investigate how good your model is at predicting cases in the future. This gives you an idea of how reliable your threshold alerts are.\nThe traditional way of validating is to see how well you can predict the latest year before the present one (because you don’t yet know the counts for the “current year”). For example in our data set we would use the data from 2002 to 2009 to predict 2010, and then see how accurate those predictions are. Then refit the model to include 2010 data and use that to predict 2011 counts.\nAs can be seen in the figure below by Hyndman et al in “Forecasting principles and practice”.\n figure reproduced with permission from the authors\nThe downside of this is that you are not using all the data available to you, and it is not the final model that you are using for prediction.\nAn alternative is to use a method called cross-validation. In this scenario you roll over all of the data available to fit multiple models to predict one year ahead. 
You use more and more data in each model, as seen in the figure below from the same [Hyndman et al text]((https://otexts.com/fpp3/). For example, the first model uses 2002 to predict 2003, the second uses 2002 and 2003 to predict 2004, and so on. figure reproduced with permission from the authors\nIn the below we use purrr package map() function to loop over each dataset. We then put estimates in one data set and merge with the original case counts, to use the yardstick package to compute measures of accuracy. We compute four measures including: Root mean squared error (RMSE), Mean absolute error (MAE), Mean absolute scaled error (MASE), Mean absolute percent error (MAPE).\nCAUTION: Note the use of simulate_pi = FALSE within the predict() argument. This is because the default behaviour of trending is to use the ciTools package to estimate a prediction interval. This does not work if there are NA counts, and also produces more granular intervals. See ?trending::predict.trending_model_fit for details. \n\n## Cross validation: predicting week(s) ahead based on sliding window\n\n## expand your data by rolling over in 52 week windows (before + after) \n## to predict 52 week ahead\n## (creates longer and longer chains of observations - keeps older data)\n\n## define window want to roll over\nroll_window <- 52\n\n## define weeks ahead want to predict \nweeks_ahead <- 52\n\n## create a data set of repeating, increasingly long data\n## label each data set with a unique id\n## only use cases before year of interest (i.e. 2011)\ncase_roll <- counts %>% \n filter(epiweek < cut_off) %>% \n ## only keep the week and case counts variables\n select(epiweek, case_int) %>% \n ## drop the last x observations \n ## depending on how many weeks ahead forecasting \n ## (otherwise will be an actual forecast to \"unknown\")\n slice(1:(n() - weeks_ahead)) %>%\n as_tsibble(index = epiweek) %>% \n ## roll over each week in x after windows to create grouping ID \n ## depending on what rolling window specify\n stretch_tsibble(.init = roll_window, .step = 1) %>% \n ## drop the first couple - as have no \"before\" cases\n filter(.id > roll_window)\n\n\n## for each of the unique data sets run the code below\nforecasts <- purrr::map(unique(case_roll$.id), \n function(i) {\n \n ## only keep the current fold being fit \n mini_data <- filter(case_roll, .id == i) %>% \n as_tibble()\n \n ## create an empty data set for forecasting on \n forecast_data <- tibble(\n epiweek = seq(max(mini_data$epiweek) + 1,\n max(mini_data$epiweek) + weeks_ahead,\n by = 1),\n case_int = rep.int(NA, weeks_ahead),\n .id = rep.int(i, weeks_ahead)\n )\n \n ## add the forecast data to the original \n mini_data <- bind_rows(mini_data, forecast_data)\n \n ## define the cut off based on latest non missing count data \n cv_cut_off <- mini_data %>% \n ## only keep non-missing rows\n drop_na(case_int) %>% \n ## get the latest week\n summarise(max(epiweek)) %>% \n ## extract so is not in a dataframe\n pull()\n \n ## make mini_data back in to a tsibble\n mini_data <- tsibble(mini_data, index = epiweek)\n \n ## define fourier terms (sincos) \n mini_data <- mini_data %>% \n mutate(\n ## combine fourier terms for weeks prior to and after cut-off date\n fourier = rbind(\n ## get fourier terms for previous years\n forecast::fourier(\n ## only keep the rows before cut-off\n filter(mini_data, \n epiweek <= cv_cut_off), \n ## include one set of sin cos terms \n K = 1\n ), \n ## predict the fourier terms for following year (using baseline data)\n fourier(\n ## only 
keep the rows before cut-off\n filter(mini_data, \n epiweek <= cv_cut_off),\n ## include one set of sin cos terms \n K = 1, \n ## predict 52 weeks ahead\n h = weeks_ahead\n )\n )\n )\n \n \n # split data for fitting and prediction\n dat <- mini_data %>% \n group_by(epiweek <= cv_cut_off) %>%\n group_split()\n\n ## define the model you want to fit (negative binomial) \n model <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the furier terms to account for seasonality\n fourier\n )\n\n # define which data to use for fitting and which for predicting\n fitting_data <- pluck(dat, 2)\n pred_data <- pluck(dat, 1)\n \n # fit model \n fitted_model <- trending::fit(model, fitting_data)\n \n # forecast with data want to predict with \n forecasts <- fitted_model %>% \n predict(data.frame(pred_data), simulate_pi = FALSE)\n forecasts <- data.frame(forecasts$result[[1]]) %>% \n ## only keep the week and the forecast estimate\n select(epiweek, estimate)\n \n }\n )\n\n## make the list in to a data frame with all the forecasts\nforecasts <- bind_rows(forecasts)\n\n## join the forecasts with the observed\nforecasts <- left_join(forecasts, \n select(counts, epiweek, case_int),\n by = \"epiweek\")\n\n## using {yardstick} compute metrics\n ## RMSE: Root mean squared error\n ## MAE: Mean absolute error \n ## MASE: Mean absolute scaled error\n ## MAPE: Mean absolute percent error\nmodel_metrics <- bind_rows(\n ## in your forcasted dataset compare the observed to the predicted\n rmse(forecasts, case_int, estimate), \n mae( forecasts, case_int, estimate),\n mase(forecasts, case_int, estimate),\n mape(forecasts, case_int, estimate),\n ) %>% \n ## only keep the metric type and its output\n select(Metric = .metric, \n Measure = .estimate) %>% \n ## make in to wide format so can bind rows after\n pivot_wider(names_from = Metric, values_from = Measure)\n\n## return model metrics \nmodel_metrics\n\n# A tibble: 1 × 4\n rmse mae mase mape\n <dbl> <dbl> <dbl> <dbl>\n1 252. 199. 1.96 17.3\n\n\n\n\n\n\nsurveillance package\nIn this section we use the surveillance package to create alert thresholds based on outbreak detection algorithms. There are several different methods available in the package, however we will focus on two options here. For details, see these papers on the application and theory of the alogirthms used.\nThe first option uses the improved Farrington method. This fits a negative binomial glm (including trend) and down-weights past outbreaks (outliers) to create a threshold level.\nThe second option use the glrnb method. This also fits a negative binomial glm but includes trend and fourier terms (so is favoured here). The regression is used to calculate the “control mean” (~fitted values) - it then uses a computed generalized likelihood ratio statistic to assess if there is shift in the mean for each week. Note that the threshold for each week takes in to account previous weeks so if there is a sustained shift an alarm will be triggered. (Also note that after each alarm the algorithm is reset)\nIn order to work with the surveillance package, we first need to define a “surveillance time series” object (using the sts() function) to fit within the framework.\n\n## define surveillance time series object\n## nb. 
you can include a denominator with the population object (see ?sts)\ncounts_sts <- sts(observed = counts$case_int[!is.na(counts$case_int)],\n start = c(\n ## subset to only keep the year from start_date \n as.numeric(str_sub(start_date, 1, 4)), \n ## subset to only keep the week from start_date\n as.numeric(str_sub(start_date, 7, 8))), \n ## define the type of data (in this case weekly)\n freq = 52)\n\n## define the week range that you want to include (ie. prediction period)\n## nb. the sts object only counts observations without assigning a week or \n## year identifier to them - so we use our data to define the appropriate observations\nweekrange <- cut_off - start_date\n\n\n\nFarrington method\nWe then define each of our parameters for the Farrington method in a list. Then we run the algorithm using farringtonFlexible() and then we can extract the threshold for an alert using farringtonmethod@upperboundto include this in our dataset. It is also possible to extract a TRUE/FALSE for each week if it triggered an alert (was above the threshold) using farringtonmethod@alarm.\n\n## define control\nctrl <- list(\n ## define what time period that want threshold for (i.e. 2011)\n range = which(counts_sts@epoch > weekrange),\n b = 9, ## how many years backwards for baseline\n w = 2, ## rolling window size in weeks\n weightsThreshold = 2.58, ## reweighting past outbreaks (improved noufaily method - original suggests 1)\n ## pastWeeksNotIncluded = 3, ## use all weeks available (noufaily suggests drop 26)\n trend = TRUE,\n pThresholdTrend = 1, ## 0.05 normally, however 1 is advised in the improved method (i.e. always keep)\n thresholdMethod = \"nbPlugin\",\n populationOffset = TRUE\n )\n\n## apply farrington flexible method\nfarringtonmethod <- farringtonFlexible(counts_sts, ctrl)\n\n## create a new variable in the original dataset called threshold\n## containing the upper bound from farrington \n## nb. this is only for the weeks in 2011 (so need to subset rows)\ncounts[which(counts$epiweek >= cut_off & \n !is.na(counts$case_int)),\n \"threshold\"] <- farringtonmethod@upperbound\n\nWe can then visualise the results in ggplot as done previously.\n\nggplot(counts, aes(x = epiweek)) + \n ## add in observed case counts as a line\n geom_line(aes(y = case_int, colour = \"Observed\")) + \n ## add in upper bound of aberration algorithm\n geom_line(aes(y = threshold, colour = \"Alert threshold\"), \n linetype = \"dashed\", \n size = 1.5) +\n ## define colours\n scale_colour_manual(values = c(\"Observed\" = \"black\", \n \"Alert threshold\" = \"red\")) + \n ## make a traditional plot (with black axes and white background)\n theme_classic() + \n ## remove title of legend \n theme(legend.title = element_blank())\n\n\n\n\n\n\n\n\n\n\n\nGLRNB method\nSimilarly for the GLRNB method we define each of our parameters for the in a list, then fit the algorithm and extract the upper bounds.\nCAUTION: This method uses “brute force” (similar to bootstrapping) for calculating thresholds, so can take a long time!\nSee the GLRNB vignette for details.\n\n## define control options\nctrl <- list(\n ## define what time period that want threshold for (i.e. 
2011)\n range = which(counts_sts@epoch > weekrange),\n mu0 = list(S = 1, ## number of fourier terms (harmonics) to include\n trend = TRUE, ## whether to include trend or not\n refit = FALSE), ## whether to refit model after each alarm\n ## cARL = threshold for GLR statistic (arbitrary)\n ## 3 ~ middle ground for minimising false positives\n ## 1 fits to the 99%PI of glm.nb - with changes after peaks (threshold lowered for alert)\n c.ARL = 2,\n # theta = log(1.5), ## equates to a 50% increase in cases in an outbreak\n ret = \"cases\" ## return threshold upperbound as case counts\n )\n\n## apply the glrnb method\nglrnbmethod <- glrnb(counts_sts, control = ctrl, verbose = FALSE)\n\n## create a new variable in the original dataset called threshold\n## containing the upper bound from glrnb \n## nb. this is only for the weeks in 2011 (so need to subset rows)\ncounts[which(counts$epiweek >= cut_off & \n !is.na(counts$case_int)),\n \"threshold_glrnb\"] <- glrnbmethod@upperbound\n\nVisualise the outputs as previously.\n\nggplot(counts, aes(x = epiweek)) + \n ## add in observed case counts as a line\n geom_line(aes(y = case_int, colour = \"Observed\")) + \n ## add in upper bound of aberration algorithm\n geom_line(aes(y = threshold_glrnb, colour = \"Alert threshold\"), \n linetype = \"dashed\", \n size = 1.5) +\n ## define colours\n scale_colour_manual(values = c(\"Observed\" = \"black\", \n \"Alert threshold\" = \"red\")) + \n ## make a traditional plot (with black axes and white background)\n theme_classic() + \n ## remove title of legend \n theme(legend.title = element_blank())", + "text": "23.7 Outbreak detection\nWe will demonstrate two (similar) methods of detecting outbreaks here. The first builds on the sections above. We use the trending package to fit regressions to previous years, and then predict what we expect to see in the following year. If observed counts are above what we expect, then it could suggest there is an outbreak. The second method is based on similar principles but uses the surveillance package, which has a number of different algorithms for aberration detection.\nCAUTION: Normally, you are interested in the current year (where you only know counts up to the present week). So in this example we are pretending to be in week 39 of 2011.\n\n\ntrending package\nFor this method we define a baseline (which should usually be about 5 years of data). We fit a regression to the baseline data, and then use that to predict the estimates for the next year.\n\n\nCut-off date\nIt is easier to define your dates in one place and then use these throughout the rest of your code.\nHere we define a start date (when our observations started) and a cut-off date (the end of our baseline period - and when the period we want to predict for starts). ~We also define how many weeks are in our year of interest (the one we are going to be predicting)~. We also define how many weeks are between our baseline cut-off and the end date that we are interested in predicting for.\nNOTE: In this example we pretend to currently be at the end of September 2011 (“2011 W39”).\n\n## define start date (when observations began)\nstart_date <- min(counts$epiweek)\n\n## define a cut-off week (end of baseline, start of prediction period)\ncut_off <- yearweek(\"2010-12-31\")\n\n## define the last date interested in (i.e. 
end of prediction)\nend_date <- yearweek(\"2011-12-31\")\n\n## find how many weeks in period (year) of interest\nnum_weeks <- as.numeric(end_date - cut_off)\n\n\n\n\nAdd rows\nTo be able to forecast in a tidyverse format, we need to have the right number of rows in our dataset, i.e. one row for each week up to the end_datedefined above. The code below allows you to add these rows for by a grouping variable - for example if we had multiple countries in one dataset, we could group by country and then add rows appropriately for each. The group_by_key() function from tsibble allows us to do this grouping and then pass the grouped data to dplyr functions, group_modify() and add_row(). Then we specify the sequence of weeks between one after the maximum week currently available in the data and the end week.\n\n## add in missing weeks till end of year \ncounts <- counts %>%\n ## group by the region\n group_by_key() %>%\n ## for each group add rows from the highest epiweek to the end of year\n group_modify(~add_row(.,\n epiweek = seq(max(.$epiweek) + 1, \n end_date,\n by = 1)))\n\n\n\n\nFourier terms\nWe need to redefine our fourier terms - as we want to fit them to the baseline date only and then predict (extrapolate) those terms for the next year. To do this we need to combine two output lists from the fourier() function together; the first one is for the baseline data, and the second one predicts for the year of interest (by defining the h argument).\nN.b. to bind rows we have to use rbind() (rather than tidyverse bind_rows) as the fourier columns are a list (so not named individually).\n\n## define fourier terms (sincos) \ncounts <- counts %>% \n mutate(\n ## combine fourier terms for weeks prior to and after 2010 cut-off date\n ## (nb. 2011 fourier terms are predicted)\n fourier = rbind(\n ## get fourier terms for previous years\n fourier(\n ## only keep the rows before 2011\n filter(counts, \n epiweek <= cut_off), \n ## include one set of sin cos terms \n K = 1\n ), \n ## predict the fourier terms for 2011 (using baseline data)\n fourier(\n ## only keep the rows before 2011\n filter(counts, \n epiweek <= cut_off),\n ## include one set of sin cos terms \n K = 1, \n ## predict 52 weeks ahead\n h = num_weeks\n )\n )\n )\n\n\n\n\nSplit data and fit regression\nWe now have to split our dataset in to the baseline period and the prediction period. This is done using the dplyr group_split() function after group_by(), and will create a list with two data frames, one for before your cut-off and one for after.\nWe then use the purrr package pluck() function to pull the datasets out of the list (equivalent of using square brackets, e.g. dat[[1]]), and can then fit our model to the baseline data, and then use the predict() function for our data of interest after the cut-off.\nSee the page on Iteration, loops, and lists to learn more about purrr.\nCAUTION: Note the use of simulate_pi = FALSE within the predict() argument. This is because the default behaviour of trending is to use the ciTools package to estimate a prediction interval. This does not work if there are NA counts, and also produces more granular intervals. See ?trending::predict.trending_model_fit for details. 
\n\n# split data for fitting and prediction\ndat <- counts %>% \n group_by(epiweek <= cut_off) %>%\n group_split()\n\n## define the model you want to fit (negative binomial) \nmodel <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the furier terms to account for seasonality\n fourier\n)\n\n# define which data to use for fitting and which for predicting\nfitting_data <- pluck(dat, 2)\npred_data <- pluck(dat, 1) %>% \n select(case_int, epiweek, fourier)\n\n# fit model \nfitted_model <- trending::fit(model, data.frame(fitting_data))\n\n# get confint and estimates for fitted data\nobserved <- fitted_model %>% \n predict(simulate_pi = FALSE)\n\n# forecast with data want to predict with \nforecasts <- fitted_model %>% \n predict(data.frame(pred_data), simulate_pi = FALSE)\n\n## combine baseline and predicted datasets\nobserved <- bind_rows(observed$result, forecasts$result)\n\nAs previously, we can visualise our model with ggplot. We highlight alerts with red dots for observed counts above the 95% prediction interval. This time we also add a vertical line to label when the forecast starts.\n\n## plot your regression \nggplot(data = observed, aes(x = epiweek)) + \n ## add in a line for the model estimate\n geom_line(aes(y = estimate),\n col = \"grey\") + \n ## add in a band for the prediction intervals \n geom_ribbon(aes(ymin = lower_pi, \n ymax = upper_pi), \n alpha = 0.25) + \n ## add in a line for your observed case counts\n geom_line(aes(y = case_int), \n col = \"black\") + \n ## plot in points for the observed counts above expected\n geom_point(\n data = filter(observed, case_int > upper_pi), \n aes(y = case_int), \n colour = \"red\", \n size = 2) + \n ## add vertical line and label to show where forecasting started\n geom_vline(\n xintercept = as.Date(cut_off), \n linetype = \"dashed\") + \n annotate(geom = \"text\", \n label = \"Forecast\", \n x = cut_off, \n y = max(observed$upper_pi) - 250, \n angle = 90, \n vjust = 1\n ) + \n ## make a traditional plot (with black axes and white background)\n theme_classic()\n\nWarning: Removed 13 rows containing missing values or values outside the scale range\n(`geom_line()`).\n\n\n\n\n\n\n\n\n\n\n\n\nPrediction validation\nBeyond inspecting residuals, it is important to investigate how good your model is at predicting cases in the future. This gives you an idea of how reliable your threshold alerts are.\nThe traditional way of validating is to see how well you can predict the latest year before the present one (because you don’t yet know the counts for the “current year”). For example in our data set we would use the data from 2002 to 2009 to predict 2010, and then see how accurate those predictions are. Then refit the model to include 2010 data and use that to predict 2011 counts.\nAs can be seen in the figure below by Hyndman et al in “Forecasting principles and practice”.\n figure reproduced with permission from the authors\nThe downside of this is that you are not using all the data available to you, and it is not the final model that you are using for prediction.\nAn alternative is to use a method called cross-validation. In this scenario you roll over all of the data available to fit multiple models to predict one year ahead. You use more and more data in each model, as seen in the figure below from the same Hyndman et al.,.\nFor example, the first model uses 2002 to predict 2003, the second uses 2002 and 2003 to predict 2004, and so on. 
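\nIn code, these expanding windows can be built with the stretch_tsibble() function from tsibble (used in the chunk further below). A minimal toy sketch with made-up values shows the idea:\n\nlibrary(tsibble)\nlibrary(dplyr)\n\n## a toy weekly series of five observations\ntoy <- tsibble(week = 1:5, cases = c(10, 12, 9, 14, 11), index = week)\n\n## each .id is a progressively longer window: weeks 1-3, then 1-4, then 1-5\ntoy %>% \n  stretch_tsibble(.init = 3, .step = 1)\n\nThe same expanding-window idea is illustrated in the figure below.\n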
figure reproduced with permission from the authors\nIn the below we use purrr package map() function to loop over each dataset. We then put estimates in one data set and merge with the original case counts, to use the yardstick package to compute measures of accuracy. We compute four measures including: Root mean squared error (RMSE), Mean absolute error (MAE), Mean absolute scaled error (MASE), Mean absolute percent error (MAPE).\nCAUTION: Note the use of simulate_pi = FALSE within the predict() argument. This is because the default behaviour of trending is to use the ciTools package to estimate a prediction interval. This does not work if there are NA counts, and also produces more granular intervals. See ?trending::predict.trending_model_fit for details. \n\n## Cross validation: predicting week(s) ahead based on sliding window\n\n## expand your data by rolling over in 52 week windows (before + after) \n## to predict 52 week ahead\n## (creates longer and longer chains of observations - keeps older data)\n\n## define window want to roll over\nroll_window <- 52\n\n## define weeks ahead want to predict \nweeks_ahead <- 52\n\n## create a data set of repeating, increasingly long data\n## label each data set with a unique id\n## only use cases before year of interest (i.e. 2011)\ncase_roll <- counts %>% \n filter(epiweek < cut_off) %>% \n ## only keep the week and case counts variables\n select(epiweek, case_int) %>% \n ## drop the last x observations \n ## depending on how many weeks ahead forecasting \n ## (otherwise will be an actual forecast to \"unknown\")\n slice(1:(n() - weeks_ahead)) %>%\n as_tsibble(index = epiweek) %>% \n ## roll over each week in x after windows to create grouping ID \n ## depending on what rolling window specify\n stretch_tsibble(.init = roll_window, .step = 1) %>% \n ## drop the first couple - as have no \"before\" cases\n filter(.id > roll_window)\n\n\n## for each of the unique data sets run the code below\nforecasts <- purrr::map(unique(case_roll$.id), \n function(i) {\n \n ## only keep the current fold being fit \n mini_data <- filter(case_roll, .id == i) %>% \n as_tibble()\n \n ## create an empty data set for forecasting on \n forecast_data <- tibble(\n epiweek = seq(max(mini_data$epiweek) + 1,\n max(mini_data$epiweek) + weeks_ahead,\n by = 1),\n case_int = rep.int(NA, weeks_ahead),\n .id = rep.int(i, weeks_ahead)\n )\n \n ## add the forecast data to the original \n mini_data <- bind_rows(mini_data, forecast_data)\n \n ## define the cut off based on latest non missing count data \n cv_cut_off <- mini_data %>% \n ## only keep non-missing rows\n drop_na(case_int) %>% \n ## get the latest week\n summarise(max(epiweek)) %>% \n ## extract so is not in a dataframe\n pull()\n \n ## make mini_data back in to a tsibble\n mini_data <- tsibble(mini_data, index = epiweek)\n \n ## define fourier terms (sincos) \n mini_data <- mini_data %>% \n mutate(\n ## combine fourier terms for weeks prior to and after cut-off date\n fourier = rbind(\n ## get fourier terms for previous years\n forecast::fourier(\n ## only keep the rows before cut-off\n filter(mini_data, \n epiweek <= cv_cut_off), \n ## include one set of sin cos terms \n K = 1\n ), \n ## predict the fourier terms for following year (using baseline data)\n fourier(\n ## only keep the rows before cut-off\n filter(mini_data, \n epiweek <= cv_cut_off),\n ## include one set of sin cos terms \n K = 1, \n ## predict 52 weeks ahead\n h = weeks_ahead\n )\n )\n )\n \n \n # split data for fitting and prediction\n dat <- mini_data 
%>% \n group_by(epiweek <= cv_cut_off) %>%\n group_split()\n\n ## define the model you want to fit (negative binomial) \n model <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the furier terms to account for seasonality\n fourier\n )\n\n # define which data to use for fitting and which for predicting\n fitting_data <- pluck(dat, 2)\n pred_data <- pluck(dat, 1)\n \n # fit model \n fitted_model <- trending::fit(model, fitting_data)\n \n # forecast with data want to predict with \n forecasts <- fitted_model %>% \n predict(data.frame(pred_data), simulate_pi = FALSE)\n forecasts <- data.frame(forecasts$result[[1]]) %>% \n ## only keep the week and the forecast estimate\n select(epiweek, estimate)\n \n }\n )\n\n## make the list in to a data frame with all the forecasts\nforecasts <- bind_rows(forecasts)\n\n## join the forecasts with the observed\nforecasts <- left_join(forecasts, \n select(counts, epiweek, case_int),\n by = \"epiweek\")\n\n## using {yardstick} compute metrics\n ## RMSE: Root mean squared error\n ## MAE: Mean absolute error \n ## MASE: Mean absolute scaled error\n ## MAPE: Mean absolute percent error\nmodel_metrics <- bind_rows(\n ## in your forcasted dataset compare the observed to the predicted\n rmse(forecasts, case_int, estimate), \n mae( forecasts, case_int, estimate),\n mase(forecasts, case_int, estimate),\n mape(forecasts, case_int, estimate),\n ) %>% \n ## only keep the metric type and its output\n select(Metric = .metric, \n Measure = .estimate) %>% \n ## make in to wide format so can bind rows after\n pivot_wider(names_from = Metric, values_from = Measure)\n\n## return model metrics \nmodel_metrics\n\n# A tibble: 1 × 4\n rmse mae mase mape\n <dbl> <dbl> <dbl> <dbl>\n1 252. 199. 1.96 17.3\n\n\n\n\n\n\nsurveillance package\nIn this section we use the surveillance package to create alert thresholds based on outbreak detection algorithms. There are several different methods available in the package, however we will focus on two options here. For details, see these papers on the application and theory of the alogirthms used.\nThe first option uses the improved Farrington method. This fits a negative binomial glm (including trend) and down-weights past outbreaks (outliers) to create a threshold level.\nThe second option use the glrnb method. This also fits a negative binomial glm but includes trend and fourier terms (so is favoured here). The regression is used to calculate the “control mean” (~fitted values) - it then uses a computed generalized likelihood ratio statistic to assess if there is shift in the mean for each week. Note that the threshold for each week takes in to account previous weeks so if there is a sustained shift an alarm will be triggered. (Also note that after each alarm the algorithm is reset).\nIn order to work with the surveillance package, we first need to define a “surveillance time series” object (using the sts() function) to fit within the framework.\n\n## define surveillance time series object\n## nb. you can include a denominator with the population object (see ?sts)\ncounts_sts <- sts(observed = counts$case_int[!is.na(counts$case_int)],\n start = c(\n ## subset to only keep the year from start_date \n as.numeric(str_sub(start_date, 1, 4)), \n ## subset to only keep the week from start_date\n as.numeric(str_sub(start_date, 7, 8))), \n ## define the type of data (in this case weekly)\n freq = 52)\n\n## define the week range that you want to include (ie. 
prediction period)\n## nb. the sts object only counts observations without assigning a week or \n## year identifier to them - so we use our data to define the appropriate observations\nweekrange <- cut_off - start_date\n\n\n\nFarrington method\nWe then define each of our parameters for the Farrington method in a list. Then we run the algorithm using farringtonFlexible() and then we can extract the threshold for an alert using farringtonmethod@upperboundto include this in our dataset. It is also possible to extract a TRUE/FALSE for each week if it triggered an alert (was above the threshold) using farringtonmethod@alarm.\n\n## define control\nctrl <- list(\n ## define what time period that want threshold for (i.e. 2011)\n range = which(counts_sts@epoch > weekrange),\n b = 9, ## how many years backwards for baseline\n w = 2, ## rolling window size in weeks\n weightsThreshold = 2.58, ## reweighting past outbreaks (improved noufaily method - original suggests 1)\n ## pastWeeksNotIncluded = 3, ## use all weeks available (noufaily suggests drop 26)\n trend = TRUE,\n pThresholdTrend = 1, ## 0.05 normally, however 1 is advised in the improved method (i.e. always keep)\n thresholdMethod = \"nbPlugin\",\n populationOffset = TRUE\n )\n\n## apply farrington flexible method\nfarringtonmethod <- farringtonFlexible(counts_sts, ctrl)\n\n## create a new variable in the original dataset called threshold\n## containing the upper bound from farrington \n## nb. this is only for the weeks in 2011 (so need to subset rows)\ncounts[which(counts$epiweek >= cut_off & \n !is.na(counts$case_int)),\n \"threshold\"] <- farringtonmethod@upperbound\n\nWe can then visualise the results in ggplot as done previously.\n\nggplot(counts, aes(x = epiweek)) + \n ## add in observed case counts as a line\n geom_line(aes(y = case_int, colour = \"Observed\")) + \n ## add in upper bound of aberration algorithm\n geom_line(aes(y = threshold, colour = \"Alert threshold\"), \n linetype = \"dashed\", \n size = 1.5) +\n ## define colours\n scale_colour_manual(values = c(\"Observed\" = \"black\", \n \"Alert threshold\" = \"red\")) + \n ## make a traditional plot (with black axes and white background)\n theme_classic() + \n ## remove title of legend \n theme(legend.title = element_blank())\n\n\n\n\n\n\n\n\n\n\n\nGLRNB method\nSimilarly for the GLRNB method we define each of our parameters for the in a list, then fit the algorithm and extract the upper bounds.\nCAUTION: This method uses “brute force” (similar to bootstrapping) for calculating thresholds, so can take a long time!\nSee the GLRNB vignette for details.\n\n## define control options\nctrl <- list(\n ## define what time period that want threshold for (i.e. 2011)\n range = which(counts_sts@epoch > weekrange),\n mu0 = list(S = 1, ## number of fourier terms (harmonics) to include\n trend = TRUE, ## whether to include trend or not\n refit = FALSE), ## whether to refit model after each alarm\n ## cARL = threshold for GLR statistic (arbitrary)\n ## 3 ~ middle ground for minimising false positives\n ## 1 fits to the 99%PI of glm.nb - with changes after peaks (threshold lowered for alert)\n c.ARL = 2,\n # theta = log(1.5), ## equates to a 50% increase in cases in an outbreak\n ret = \"cases\" ## return threshold upperbound as case counts\n )\n\n## apply the glrnb method\nglrnbmethod <- glrnb(counts_sts, control = ctrl, verbose = FALSE)\n\n## create a new variable in the original dataset called threshold\n## containing the upper bound from glrnb \n## nb. 
this is only for the weeks in 2011 (so need to subset rows)\ncounts[which(counts$epiweek >= cut_off & \n !is.na(counts$case_int)),\n \"threshold_glrnb\"] <- glrnbmethod@upperbound\n\nVisualise the outputs as previously.\n\nggplot(counts, aes(x = epiweek)) + \n ## add in observed case counts as a line\n geom_line(aes(y = case_int, colour = \"Observed\")) + \n ## add in upper bound of aberration algorithm\n geom_line(aes(y = threshold_glrnb, colour = \"Alert threshold\"), \n linetype = \"dashed\", \n size = 1.5) +\n ## define colours\n scale_colour_manual(values = c(\"Observed\" = \"black\", \n \"Alert threshold\" = \"red\")) + \n ## make a traditional plot (with black axes and white background)\n theme_classic() + \n ## remove title of legend \n theme(legend.title = element_blank())", "crumbs": [ "Analysis", "23  Time series and outbreak detection" @@ -2056,7 +2056,7 @@ "href": "new_pages/time_series.html#interrupted-timeseries", "title": "23  Time series and outbreak detection", "section": "23.8 Interrupted timeseries", - "text": "23.8 Interrupted timeseries\nInterrupted timeseries (also called segmented regression or intervention analysis), is often used in assessing the impact of vaccines on the incidence of disease. But it can be used for assessing impact of a wide range of interventions or introductions. For example changes in hospital procedures or the introduction of a new disease strain to a population. In this example we will pretend that a new strain of Campylobacter was introduced to Germany at the end of 2008, and see if that affects the number of cases. We will use negative binomial regression again. The regression this time will be split in to two parts, one before the intervention (or introduction of new strain here) and one after (the pre and post-periods). This allows us to calculate an incidence rate ratio comparing the two time periods. Explaining the equation might make this clearer (if not then just ignore!).\nThe negative binomial regression can be defined as follows:\n\\[\\log(Y_t)= β_0 + β_1 \\times t+ β_2 \\times δ(t-t_0) + β_3\\times(t-t_0 )^+ + log(pop_t) + e_t\\]\nWhere: \\(Y_t\\)is the number of cases observed at time \\(t\\)\n\\(pop_t\\) is the population size in 100,000s at time \\(t\\) (not used here)\n\\(t_0\\) is the last year of the of the pre-period (including transition time if any)\n\\(δ(x\\) is the indicator function (it is 0 if x≤0 and 1 if x>0)\n\\((x)^+\\) is the cut off operator (it is x if x>0 and 0 otherwise)\n\\(e_t\\) denotes the residual Additional terms trend and season can be added as needed.\n\\(β_2 \\times δ(t-t_0) + β_3\\times(t-t_0 )^+\\) is the generalised linear part of the post-period and is zero in the pre-period. This means that the \\(β_2\\) and \\(β_3\\) estimates are the effects of the intervention.\nWe need to re-calculate the fourier terms without forecasting here, as we will use all the data available to us (i.e. retrospectively). 
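As a toy sketch of how the δ(t-t0) indicator and the (t-t0)+ cut-off terms in the equation above translate into R (made-up values, not the handbook data):\n\nt  <- 1:6                    ## week index\nt0 <- 3                      ## last pre-intervention week\ndelta <- as.numeric(t > t0)  ## indicator: 0 0 0 1 1 1\nramp  <- pmax(0, t - t0)     ## cut-off operator: 0 0 0 1 2 3\n\n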
Additionally we need to calculate the extra terms needed for the regression.\n\n## add in fourier terms using the epiweek and case_int variabless\ncounts$fourier <- select(counts, epiweek, case_int) %>% \n as_tsibble(index = epiweek) %>% \n fourier(K = 1)\n\n## define intervention week \nintervention_week <- yearweek(\"2008-12-31\")\n\n## define variables for regression \ncounts <- counts %>% \n mutate(\n ## corresponds to t in the formula\n ## count of weeks (could probably also just use straight epiweeks var)\n # linear = row_number(epiweek), \n ## corresponds to delta(t-t0) in the formula\n ## pre or post intervention period\n intervention = as.numeric(epiweek >= intervention_week), \n ## corresponds to (t-t0)^+ in the formula\n ## count of weeks post intervention\n ## (choose the larger number between 0 and whatever comes from calculation)\n time_post = pmax(0, epiweek - intervention_week + 1))\n\nWe then use these terms to fit a negative binomial regression, and produce a table with percentage change. What this example shows is that there was no significant change.\nCAUTION: Note the use of simulate_pi = FALSE within the predict() argument. This is because the default behaviour of trending is to use the ciTools package to estimate a prediction interval. This does not work if there are NA counts, and also produces more granular intervals. See ?trending::predict.trending_model_fit for details. \n\n## define the model you want to fit (negative binomial) \nmodel <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the furier terms to account for seasonality\n fourier + \n ## add in whether in the pre- or post-period \n intervention + \n ## add in the time post intervention \n time_post\n )\n\n## fit your model using the counts dataset\nfitted_model <- trending::fit(model, counts)\n\n## calculate confidence intervals and prediction intervals \nobserved <- predict(fitted_model, simulate_pi = FALSE)\n\n\n## show estimates and percentage change in a table\nfitted_model %>% \n ## extract original negative binomial regression\n get_model() %>% \n ## get a tidy dataframe of results\n tidy(exponentiate = TRUE, \n conf.int = TRUE) %>% \n ## only keep the intervention value \n filter(term == \"intervention\") %>% \n ## change the IRR to percentage change for estimate and CIs \n mutate(\n ## for each of the columns of interest - create a new column\n across(\n all_of(c(\"estimate\", \"conf.low\", \"conf.high\")), \n ## apply the formula to calculate percentage change\n .f = function(i) 100 * (i - 1), \n ## add a suffix to new column names with \"_perc\"\n .names = \"{.col}_perc\")\n ) %>% \n ## only keep (and rename) certain columns \n select(\"IRR\" = estimate, \n \"95%CI low\" = conf.low, \n \"95%CI high\" = conf.high,\n \"Percentage change\" = estimate_perc, \n \"95%CI low (perc)\" = conf.low_perc, \n \"95%CI high (perc)\" = conf.high_perc,\n \"p-value\" = p.value)\n\nAs previously we can visualise the outputs of the regression.\n\nestimate_res <- data.frame(observed$result)\n\nggplot(estimate_res, aes(x = epiweek)) + \n ## add in observed case counts as a line\n geom_line(aes(y = case_int, colour = \"Observed\")) + \n ## add in a line for the model estimate\n geom_line(aes(y = estimate, col = \"Estimate\")) + \n ## add in a band for the prediction intervals \n geom_ribbon(aes(ymin = lower_pi, \n ymax = upper_pi), \n alpha = 0.25) + \n ## add vertical line and label to show where forecasting started\n 
geom_vline(\n xintercept = as.Date(intervention_week), \n linetype = \"dashed\") + \n annotate(geom = \"text\", \n label = \"Intervention\", \n x = intervention_week, \n y = max(observed$upper_pi), \n angle = 90, \n vjust = 1\n ) + \n ## define colours\n scale_colour_manual(values = c(\"Observed\" = \"black\", \n \"Estimate\" = \"red\")) + \n ## make a traditional plot (with black axes and white background)\n theme_classic()\n\nWarning: Unknown or uninitialised column: `upper_pi`.\n\n\nWarning in max(observed$upper_pi): no non-missing arguments to max; returning\n-Inf", + "text": "23.8 Interrupted timeseries\nInterrupted timeseries (also called segmented regression or intervention analysis), is often used in assessing the impact of vaccines on the incidence of disease. But it can be used for assessing impact of a wide range of interventions or introductions. For example changes in hospital procedures or the introduction of a new disease strain to a population. In this example we will pretend that a new strain of Campylobacter was introduced to Germany at the end of 2008, and see if that affects the number of cases. We will use negative binomial regression again. The regression this time will be split in to two parts, one before the intervention (or introduction of new strain here) and one after (the pre and post-periods). This allows us to calculate an incidence rate ratio comparing the two time periods. Explaining the equation might make this clearer (if not then just ignore!).\nThe negative binomial regression can be defined as follows:\n\\[\\log(Y_t)= β_0 + β_1 \\times t+ β_2 \\times δ(t-t_0) + β_3\\times(t-t_0 )^+ + log(pop_t) + e_t\\]\nWhere: \\(Y_t\\)is the number of cases observed at time \\(t\\)\n\\(pop_t\\) is the population size in 100,000s at time \\(t\\) (not used here)\n\\(t_0\\) is the last year of the of the pre-period (including transition time if any)\n\\(δ(x\\) is the indicator function (it is 0 if x≤0 and 1 if x>0)\n\\((x)^+\\) is the cut off operator (it is x if x>0 and 0 otherwise)\n\\(e_t\\) denotes the residual Additional terms trend and season can be added as needed.\n\\(β_2 \\times δ(t-t_0) + β_3\\times(t-t_0 )^+\\) is the generalised linear part of the post-period and is zero in the pre-period. This means that the \\(β_2\\) and \\(β_3\\) estimates are the effects of the intervention.\nWe need to re-calculate the fourier terms without forecasting here, as we will use all the data available to us (i.e. retrospectively). Additionally we need to calculate the extra terms needed for the regression.\n\n## add in fourier terms using the epiweek and case_int variabless\ncounts$fourier <- select(counts, epiweek, case_int) %>% \n as_tsibble(index = epiweek) %>% \n fourier(K = 1)\n\n## define intervention week \nintervention_week <- yearweek(\"2008-12-31\")\n\n## define variables for regression \ncounts <- counts %>% \n mutate(\n ## corresponds to t in the formula\n ## count of weeks (could probably also just use straight epiweeks var)\n # linear = row_number(epiweek), \n ## corresponds to delta(t-t0) in the formula\n ## pre or post intervention period\n intervention = as.numeric(epiweek >= intervention_week), \n ## corresponds to (t-t0)^+ in the formula\n ## count of weeks post intervention\n ## (choose the larger number between 0 and whatever comes from calculation)\n time_post = pmax(0, epiweek - intervention_week + 1))\n\nWe then use these terms to fit a negative binomial regression, and produce a table with percentage change. 
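The percentage change reported below is simply 100 * (IRR - 1), so, for example, an IRR of 1.07 would be shown as a 7% increase. 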
What this example shows is that there was no significant change.\nCAUTION: Note the use of simulate_pi = FALSE within the predict() argument. This is because the default behaviour of trending is to use the ciTools package to estimate a prediction interval. This does not work if there are NA counts, and also produces more granular intervals. See ?trending::predict.trending_model_fit for details. \n\n## define the model you want to fit (negative binomial) \nmodel <- glm_nb_model(\n ## set number of cases as outcome of interest\n case_int ~\n ## use epiweek to account for the trend\n epiweek +\n ## use the furier terms to account for seasonality\n fourier + \n ## add in whether in the pre- or post-period \n intervention + \n ## add in the time post intervention \n time_post\n )\n\n## fit your model using the counts dataset\nfitted_model <- trending::fit(model, counts)\n\n## calculate confidence intervals and prediction intervals \nobserved <- predict(fitted_model, simulate_pi = FALSE)\n\n\n## extract original negative binomial regression\nfitted_model$result[[1]] %>%\n ## get a tidy dataframe of results\n tidy(exponentiate = TRUE, \n conf.int = TRUE) %>% \n ## only keep the intervention value \n filter(term == \"intervention\") %>% \n ## change the IRR to percentage change for estimate and CIs \n mutate(\n ## for each of the columns of interest - create a new column\n across(\n all_of(c(\"estimate\", \"conf.low\", \"conf.high\")), \n ## apply the formula to calculate percentage change\n .f = function(i) 100 * (i - 1), \n ## add a suffix to new column names with \"_perc\"\n .names = \"{.col}_perc\")\n ) %>% \n ## only keep (and rename) certain columns \n select(\"IRR\" = estimate, \n \"95%CI low\" = conf.low, \n \"95%CI high\" = conf.high,\n \"Percentage change\" = estimate_perc, \n \"95%CI low (perc)\" = conf.low_perc, \n \"95%CI high (perc)\" = conf.high_perc,\n \"p-value\" = p.value)\n\nAs previously we can visualise the outputs of the regression.\n\nestimate_res <- data.frame(observed$result)\n\nggplot(estimate_res, aes(x = epiweek)) + \n ## add in observed case counts as a line\n geom_line(aes(y = case_int, colour = \"Observed\")) + \n ## add in a line for the model estimate\n geom_line(aes(y = estimate, col = \"Estimate\")) + \n ## add in a band for the prediction intervals \n geom_ribbon(aes(ymin = lower_pi, \n ymax = upper_pi), \n alpha = 0.25) + \n ## add vertical line and label to show where forecasting started\n geom_vline(\n xintercept = as.Date(intervention_week), \n linetype = \"dashed\") + \n annotate(geom = \"text\", \n label = \"Intervention\", \n x = intervention_week, \n y = max(observed$upper_pi), \n angle = 90, \n vjust = 1\n ) + \n ## define colours\n scale_colour_manual(values = c(\"Observed\" = \"black\", \n \"Estimate\" = \"red\")) + \n ## make a traditional plot (with black axes and white background)\n theme_classic()\n\nWarning: Unknown or uninitialised column: `upper_pi`.\n\n\nWarning in max(observed$upper_pi): no non-missing arguments to max; returning\n-Inf", "crumbs": [ "Analysis", "23  Time series and outbreak detection" @@ -2232,7 +2232,7 @@ "href": "new_pages/survey_analysis.html#overview", "title": "26  Survey analysis", "section": "", - "text": "Survey data\nObservation time\nWeighting\nSurvey design objects\nDescriptive analysis\nWeighted proportions\nWeighted rates", + "text": "Survey data.\nObservation time.\nWeighting.\nSurvey design objects.\nDescriptive analysis.\nWeighted proportions.\nWeighted rates.", "crumbs": [ "Analysis", "26  Survey analysis" @@ 
-2254,7 +2254,7 @@ "href": "new_pages/survey_analysis.html#survey-data", "title": "26  Survey analysis", "section": "26.3 Survey data", - "text": "26.3 Survey data\nThere numerous different sampling designs that can be used for surveys. Here we will demonstrate code for: - Stratified - Cluster - Stratified and cluster\nAs described above (depending on how you design your questionnaire) the data for each level would be exported as a separate dataset from Kobo. In our example there is one level for households and one level for individuals within those households.\nThese two levels are linked by a unique identifier. For a Kobo dataset this variable is “_index” at the household level, which matches the “_parent_index” at the individual level. This will create new rows for household with each matching individual, see the handbook section on joining for details.\n\n## join the individual and household data to form a complete data set\nsurvey_data <- left_join(survey_data_hh, \n survey_data_indiv,\n by = c(\"_index\" = \"_parent_index\"))\n\n\n## create a unique identifier by combining indeces of the two levels \nsurvey_data <- survey_data %>% \n mutate(uid = str_glue(\"{index}_{index_y}\"))", + "text": "26.3 Survey data\nThere numerous different sampling designs that can be used for surveys. Here we will demonstrate code for: - Stratified - Cluster - Stratified and cluster", "crumbs": [ "Analysis", "26  Survey analysis" @@ -2265,7 +2265,7 @@ "href": "new_pages/survey_analysis.html#observation-time", "title": "26  Survey analysis", "section": "26.4 Observation time", - "text": "26.4 Observation time\nFor mortality surveys we want to now how long each individual was present for in the location to be able to calculate an appropriate mortality rate for our period of interest. This is not relevant to all surveys, but particularly for mortality surveys this is important as they are conducted frequently among mobile or displaced populations.\nTo do this we first define our time period of interest, also known as a recall period (i.e. the time that participants are asked to report on when answering questions). We can then use this period to set inappropriate dates to missing, i.e. if deaths are reported from outside the period of interest.\n\n## set the start/end of recall period\n## can be changed to date variables from dataset \n## (e.g. arrival date & date questionnaire)\nsurvey_data <- survey_data %>% \n mutate(recall_start = as.Date(\"2018-01-01\"), \n recall_end = as.Date(\"2018-05-01\")\n )\n\n\n# set inappropriate dates to NA based on rules \n## e.g. arrivals before start, departures departures after end\nsurvey_data <- survey_data %>%\n mutate(\n arrived_date = if_else(arrived_date < recall_start, \n as.Date(NA),\n arrived_date),\n birthday_date = if_else(birthday_date < recall_start,\n as.Date(NA),\n birthday_date),\n left_date = if_else(left_date > recall_end,\n as.Date(NA),\n left_date),\n death_date = if_else(death_date > recall_end,\n as.Date(NA),\n death_date)\n )\n\nWe can then use our date variables to define start and end dates for each individual. We can use the find_start_date() function from sitrep to fine the causes for the dates and then use that to calculate the difference between days (person-time).\nstart date: Earliest appropriate arrival event within your recall period Either the beginning of your recall period (which you define in advance), or a date after the start of recall if applicable (e.g. 
arrivals or births)\nend date: Earliest appropriate departure event within your recall period Either the end of your recall period, or a date before the end of recall if applicable (e.g. departures, deaths)\n\n## create new variables for start and end dates/causes\nsurvey_data <- survey_data %>% \n ## choose earliest date entered in survey\n ## from births, household arrivals, and camp arrivals \n find_start_date(\"birthday_date\",\n \"arrived_date\",\n period_start = \"recall_start\",\n period_end = \"recall_end\",\n datecol = \"startdate\",\n datereason = \"startcause\" \n ) %>%\n ## choose earliest date entered in survey\n ## from camp departures, death and end of the study\n find_end_date(\"left_date\",\n \"death_date\",\n period_start = \"recall_start\",\n period_end = \"recall_end\",\n datecol = \"enddate\",\n datereason = \"endcause\" \n )\n\n\n## label those that were present at the start/end (except births/deaths)\nsurvey_data <- survey_data %>% \n mutate(\n ## fill in start date to be the beginning of recall period (for those empty) \n startdate = if_else(is.na(startdate), recall_start, startdate), \n ## set the start cause to present at start if equal to recall period \n ## unless it is equal to the birth date \n startcause = if_else(startdate == recall_start & startcause != \"birthday_date\",\n \"Present at start\", startcause), \n ## fill in end date to be end of recall period (for those empty) \n enddate = if_else(is.na(enddate), recall_end, enddate), \n ## set the end cause to present at end if equall to recall end \n ## unless it is equal to the death date\n endcause = if_else(enddate == recall_end & endcause != \"death_date\", \n \"Present at end\", endcause))\n\n\n## Define observation time in days\nsurvey_data <- survey_data %>% \n mutate(obstime = as.numeric(enddate - startdate))", + "text": "26.4 Observation time\nFor mortality surveys we want to now how long each individual was present for in the location to be able to calculate an appropriate mortality rate for our period of interest. This is not relevant to all surveys, but particularly for mortality surveys this is important as they are conducted frequently among mobile or displaced populations.\nTo do this we first define our time period of interest, also known as a recall period (i.e. the time that participants are asked to report on when answering questions). We can then use this period to set inappropriate dates to missing, i.e. if deaths are reported from outside the period of interest.\n\n## set the start/end of recall period\n## can be changed to date variables from dataset \n## (e.g. arrival date & date questionnaire)\nsurvey_data <- survey_data %>% \n mutate(recall_start = as.Date(\"2018-01-01\"), \n recall_end = as.Date(\"2018-05-01\")\n )\n\n\n# set inappropriate dates to NA based on rules \n## e.g. arrivals before start, departures departures after end\nsurvey_data <- survey_data %>%\n mutate(\n arrived_date = if_else(arrived_date < recall_start, \n as.Date(NA),\n arrived_date),\n birthday_date = if_else(birthday_date < recall_start,\n as.Date(NA),\n birthday_date),\n left_date = if_else(left_date > recall_end,\n as.Date(NA),\n left_date),\n death_date = if_else(death_date > recall_end,\n as.Date(NA),\n death_date)\n )\n\nWe can then use our date variables to define start and end dates for each individual. 
We can use the find_start_date() function from sitrep to fine the causes for the dates and then use that to calculate the difference between days (person-time).\nstart date: Earliest appropriate arrival event within your recall period. Either the beginning of your recall period (which you define in advance), or a date after the start of recall if applicable (e.g. arrivals or births).\nend date: Earliest appropriate departure event within your recall period. Either the end of your recall period, or a date before the end of recall if applicable (e.g. departures, deaths).\n\n## create new variables for start and end dates/causes\nsurvey_data <- survey_data %>% \n ## choose earliest date entered in survey\n ## from births, household arrivals, and camp arrivals \n find_start_date(\"birthday_date\",\n \"arrived_date\",\n period_start = \"recall_start\",\n period_end = \"recall_end\",\n datecol = \"startdate\",\n datereason = \"startcause\" \n ) %>%\n ## choose earliest date entered in survey\n ## from camp departures, death and end of the study\n find_end_date(\"left_date\",\n \"death_date\",\n period_start = \"recall_start\",\n period_end = \"recall_end\",\n datecol = \"enddate\",\n datereason = \"endcause\" \n )\n\n\n## label those that were present at the start/end (except births/deaths)\nsurvey_data <- survey_data %>% \n mutate(\n ## fill in start date to be the beginning of recall period (for those empty) \n startdate = if_else(is.na(startdate), recall_start, startdate), \n ## set the start cause to present at start if equal to recall period \n ## unless it is equal to the birth date \n startcause = if_else(startdate == recall_start & startcause != \"birthday_date\",\n \"Present at start\", startcause), \n ## fill in end date to be end of recall period (for those empty) \n enddate = if_else(is.na(enddate), recall_end, enddate), \n ## set the end cause to present at end if equall to recall end \n ## unless it is equal to the death date\n endcause = if_else(enddate == recall_end & endcause != \"death_date\", \n \"Present at end\", endcause))\n\n\n## Define observation time in days\nsurvey_data <- survey_data %>% \n mutate(obstime = as.numeric(enddate - startdate))", "crumbs": [ "Analysis", "26  Survey analysis" @@ -2287,7 +2287,7 @@ "href": "new_pages/survey_analysis.html#survey-design-objects", "title": "26  Survey analysis", "section": "26.6 Survey design objects", - "text": "26.6 Survey design objects\nCreate survey object according to your study design. Used the same way as data frames to calculate weight proportions etc. Make sure that all necessary variables are created before this.\nThere are four options, comment out those you do not use: - Simple random - Stratified - Cluster - Stratified cluster\nFor this template - we will pretend that we cluster surveys in two separate strata (health districts A and B). So to get overall estimates we need have combined cluster and strata weights.\nAs mentioned previously, there are two packages available for doing this. The classic one is survey and then there is a wrapper package called srvyr that makes tidyverse-friendly objects and functions. We will demonstrate both, but note that most of the code in this chapter will use srvyr based objects. The one exception is that the gtsummary package only accepts survey objects.\n\n26.6.1 Survey package\nThe survey package effectively uses base R coding, and so it is not possible to use pipes (%>%) or other dplyr syntax. 
With the survey package we use the svydesign() function to define a survey object with appropriate clusters, weights and strata.\nNOTE: we need to use the tilde (~) in front of variables, this is because the package uses the base R syntax of assigning variables based on formulae. \n\n# simple random ---------------------------------------------------------------\nbase_survey_design_simple <- svydesign(ids = ~1, # 1 for no cluster ids\n weights = NULL, # No weight added\n strata = NULL, # sampling was simple (no strata)\n data = survey_data # have to specify the dataset\n )\n\n## stratified ------------------------------------------------------------------\nbase_survey_design_strata <- svydesign(ids = ~1, # 1 for no cluster ids\n weights = ~surv_weight_strata, # weight variable created above\n strata = ~health_district, # sampling was stratified by district\n data = survey_data # have to specify the dataset\n )\n\n# cluster ---------------------------------------------------------------------\nbase_survey_design_cluster <- svydesign(ids = ~village_name, # cluster ids\n weights = ~surv_weight_cluster, # weight variable created above\n strata = NULL, # sampling was simple (no strata)\n data = survey_data # have to specify the dataset\n )\n\n# stratified cluster ----------------------------------------------------------\nbase_survey_design <- svydesign(ids = ~village_name, # cluster ids\n weights = ~surv_weight_cluster_strata, # weight variable created above\n strata = ~health_district, # sampling was stratified by district\n data = survey_data # have to specify the dataset\n )\n\n\n\n26.6.2 Srvyr package\nWith the srvyr package we can use the as_survey_design() function, which has all the same arguments as above but allows pipes (%>%), and so we do not need to use the tilde (~).\n\n## simple random ---------------------------------------------------------------\nsurvey_design_simple <- survey_data %>% \n as_survey_design(ids = 1, # 1 for no cluster ids \n weights = NULL, # No weight added\n strata = NULL # sampling was simple (no strata)\n )\n## stratified ------------------------------------------------------------------\nsurvey_design_strata <- survey_data %>%\n as_survey_design(ids = 1, # 1 for no cluster ids\n weights = surv_weight_strata, # weight variable created above\n strata = health_district # sampling was stratified by district\n )\n## cluster ---------------------------------------------------------------------\nsurvey_design_cluster <- survey_data %>%\n as_survey_design(ids = village_name, # cluster ids\n weights = surv_weight_cluster, # weight variable created above\n strata = NULL # sampling was simple (no strata)\n )\n\n## stratified cluster ----------------------------------------------------------\nsurvey_design <- survey_data %>%\n as_survey_design(ids = village_name, # cluster ids\n weights = surv_weight_cluster_strata, # weight variable created above\n strata = health_district # sampling was stratified by district\n )", + "text": "26.6 Survey design objects\nNext you should create a survey object according to your study design. This is used in the same way as data frames to calculate weight proportions and other aspects of survey analysis. 
You should make sure all your necessary variables are created before this.\nThere are four options: - Simple random - Stratified - Cluster - Stratified cluster\nIn the template below - we will pretend that we cluster surveys in two separate strata (health districts A and B).\nTo get overall estimates we need to have combined cluster and strata weights.\nAs mentioned previously, there are two packages available for doing this. The classic one is survey and then there is a wrapper package called srvyr that makes tidyverse-friendly objects and functions. We will demonstrate both, but note that most of the code in this chapter will use srvyr based objects. The one exception is that the gtsummary package only accepts survey objects.\n\n26.6.1 Survey package\nThe survey package effectively uses base R coding, and so it is not possible to use pipes (%>%) or other dplyr syntax. With the survey package we use the svydesign() function to define a survey object with appropriate clusters, weights and strata.\nNOTE: we need to use the tilde (~) in front of variables, this is because the package uses the base R syntax of assigning variables based on formulae. \n\n# simple random ---------------------------------------------------------------\nbase_survey_design_simple <- svydesign(ids = ~1, # 1 for no cluster ids\n weights = NULL, # No weight added\n strata = NULL, # sampling was simple (no strata)\n data = survey_data # have to specify the dataset\n )\n\n## stratified ------------------------------------------------------------------\nbase_survey_design_strata <- svydesign(ids = ~1, # 1 for no cluster ids\n weights = ~surv_weight_strata, # weight variable created above\n strata = ~health_district, # sampling was stratified by district\n data = survey_data # have to specify the dataset\n )\n\n# cluster ---------------------------------------------------------------------\nbase_survey_design_cluster <- svydesign(ids = ~village_name, # cluster ids\n weights = ~surv_weight_cluster, # weight variable created above\n strata = NULL, # sampling was simple (no strata)\n data = survey_data # have to specify the dataset\n )\n\n# stratified cluster ----------------------------------------------------------\nbase_survey_design <- svydesign(ids = ~village_name, # cluster ids\n weights = ~surv_weight_cluster_strata, # weight variable created above\n strata = ~health_district, # sampling was stratified by district\n data = survey_data # have to specify the dataset\n )\n\n\n\n26.6.2 Srvyr package\nWith the srvyr package we can use the as_survey_design() function, which has all the same arguments as above but allows pipes (%>%), and so we do not need to use the tilde (~).\n\n## simple random ---------------------------------------------------------------\nsurvey_design_simple <- survey_data %>% \n as_survey_design(ids = 1, # 1 for no cluster ids \n weights = NULL, # No weight added\n strata = NULL # sampling was simple (no strata)\n )\n## stratified ------------------------------------------------------------------\nsurvey_design_strata <- survey_data %>%\n as_survey_design(ids = 1, # 1 for no cluster ids\n weights = surv_weight_strata, # weight variable created above\n strata = health_district # sampling was stratified by district\n )\n## cluster ---------------------------------------------------------------------\nsurvey_design_cluster <- survey_data %>%\n as_survey_design(ids = village_name, # cluster ids\n weights = surv_weight_cluster, # weight variable created above\n strata = NULL # sampling was simple (no 
strata)\n )\n\n## stratified cluster ----------------------------------------------------------\nsurvey_design <- survey_data %>%\n as_survey_design(ids = village_name, # cluster ids\n weights = surv_weight_cluster_strata, # weight variable created above\n strata = health_district # sampling was stratified by district\n )", "crumbs": [ "Analysis", "26  Survey analysis" @@ -2298,7 +2298,7 @@ "href": "new_pages/survey_analysis.html#descriptive-analysis", "title": "26  Survey analysis", "section": "26.7 Descriptive analysis", - "text": "26.7 Descriptive analysis\nBasic descriptive analysis and visualisation is covered extensively in other chapters of the handbook, so we will not dwell on it here. For details see the chapters on descriptive tables, statistical tests, tables for presentation, ggplot basics and R markdown reports.\nIn this section we will focus on how to investigate bias in your sample and visualise this. We will also look at visualising population flow in a survey setting using alluvial/sankey diagrams.\nIn general, you should consider including the following descriptive analyses:\n\nFinal number of clusters, households and individuals included\n\nNumber of excluded individuals and the reasons for exclusion\nMedian (range) number of households per cluster and individuals per household\n\n\n26.7.1 Sampling bias\nCompare the proportions in each age group between your sample and the source population. This is important to be able to highlight potential sampling bias. You could similarly repeat this looking at distributions by sex.\nNote that these p-values are just indicative, and a descriptive discussion (or visualisation with age-pyramids below) of the distributions in your study sample compared to the source population is more important than the binomial test itself. This is because increasing sample size will more often than not lead to differences that may be irrelevant after weighting your data.\n\n## counts and props of the study population\nag <- survey_data %>% \n group_by(age_group) %>% \n drop_na(age_group) %>% \n tally() %>% \n mutate(proportion = n / sum(n), \n n_total = sum(n))\n\n## counts and props of the source population\npropcount <- population %>% \n group_by(age_group) %>%\n tally(population) %>%\n mutate(proportion = n / sum(n))\n\n## bind together the columns of two tables, group by age, and perform a \n## binomial test to see if n/total is significantly different from population\n## proportion.\n ## suffix here adds to text to the end of columns in each of the two datasets\nleft_join(ag, propcount, by = \"age_group\", suffix = c(\"\", \"_pop\")) %>%\n group_by(age_group) %>%\n ## broom::tidy(binom.test()) makes a data frame out of the binomial test and\n ## will add the variables p.value, parameter, conf.low, conf.high, method, and\n ## alternative. We will only use p.value here. You can include other\n ## columns if you want to report confidence intervals\n mutate(binom = list(broom::tidy(binom.test(n, n_total, proportion_pop)))) %>%\n unnest(cols = c(binom)) %>% # important for expanding the binom.test data frame\n mutate(proportion_pop = proportion_pop * 100) %>%\n ## Adjusting the p-values to correct for false positives \n ## (because testing multiple age groups). 
This will only make \n ## a difference if you have many age categories\n mutate(p.value = p.adjust(p.value, method = \"holm\")) %>%\n \n ## Only show p-values over 0.001 (those under report as <0.001)\n mutate(p.value = ifelse(p.value < 0.001, \n \"<0.001\", \n as.character(round(p.value, 3)))) %>% \n \n ## rename the columns appropriately\n select(\n \"Age group\" = age_group,\n \"Study population (n)\" = n,\n \"Study population (%)\" = proportion,\n \"Source population (n)\" = n_pop,\n \"Source population (%)\" = proportion_pop,\n \"P-value\" = p.value\n )\n\n# A tibble: 5 × 6\n# Groups: Age group [5]\n `Age group` `Study population (n)` `Study population (%)`\n <chr> <int> <dbl>\n1 0-2 12 0.0256\n2 3-14 42 0.0896\n3 15-29 64 0.136 \n4 30-44 52 0.111 \n5 45+ 299 0.638 \n# ℹ 3 more variables: `Source population (n)` <dbl>,\n# `Source population (%)` <dbl>, `P-value` <chr>\n\n\n\n\n26.7.2 Demographic pyramids\nDemographic (or age-sex) pyramids are an easy way of visualising the distribution in your survey population. It is also worth considering creating descriptive tables of age and sex by survey strata. We will demonstrate using the apyramid package as it allows for weighted proportions using our survey design object created above. Other options for creating demographic pyramids are covered extensively in that chapter of the handbook. We will also use a wrapper function from apyramid called age_pyramid() which saves a few lines of coding for producing a plot with proportions.\nAs with the formal binomial test of difference, seen above in the sampling bias section, we are interested here in visualising whether our sampled population is substantially different from the source population and whether weighting corrects this difference. To do this we will use the patchwork package to show our ggplot visualisations side-by-side; for details see the section on combining plots in ggplot tips chapter of the handbook. We will visualise our source population, our un-weighted survey population and our weighted survey population. You may also consider visualising by each strata of your survey - in our example here that would be by using the argument stack_by = \"health_district\" (see ?plot_age_pyramid for details).\nNOTE: The x and y axes are flipped in pyramids \n\n## define x-axis limits and labels ---------------------------------------------\n## (update these numbers to be the values for your graph)\nmax_prop <- 35 # choose the highest proportion you want to show \nstep <- 5 # choose the space you want beween labels \n\n## this part defines vector using the above numbers with axis breaks\nbreaks <- c(\n seq(max_prop/100 * -1, 0 - step/100, step/100), \n 0, \n seq(0 + step / 100, max_prop/100, step/100)\n )\n\n## this part defines vector using the above numbers with axis limits\nlimits <- c(max_prop/100 * -1, max_prop/100)\n\n## this part defines vector using the above numbers with axis labels\nlabels <- c(\n seq(max_prop, step, -step), \n 0, \n seq(step, max_prop, step)\n )\n\n\n## create plots individually --------------------------------------------------\n\n## plot the source population \n## nb: this needs to be collapsed for the overall population (i.e. 
removing health districts)\nsource_population <- population %>%\n ## ensure that age and sex are factors\n mutate(age_group = factor(age_group, \n levels = c(\"0-2\", \n \"3-14\", \n \"15-29\",\n \"30-44\", \n \"45+\")), \n sex = factor(sex)) %>% \n group_by(age_group, sex) %>% \n ## add the counts for each health district together \n summarise(population = sum(population)) %>% \n ## remove the grouping so can calculate overall proportion\n ungroup() %>% \n mutate(proportion = population / sum(population)) %>% \n ## plot pyramid \n age_pyramid(\n age_group = age_group, \n split_by = sex, \n count = proportion, \n proportional = TRUE) +\n ## only show the y axis label (otherwise repeated in all three plots)\n labs(title = \"Source population\", \n y = \"\", \n x = \"Age group (years)\") + \n ## make the x axis the same for all plots \n scale_y_continuous(breaks = breaks, \n limits = limits, \n labels = labels)\n \n \n## plot the unweighted sample population \nsample_population <- age_pyramid(survey_data, \n age_group = \"age_group\", \n split_by = \"sex\",\n proportion = TRUE) + \n ## only show the x axis label (otherwise repeated in all three plots)\n labs(title = \"Unweighted sample population\", \n y = \"Proportion (%)\", \n x = \"\") + \n ## make the x axis the same for all plots \n scale_y_continuous(breaks = breaks, \n limits = limits, \n labels = labels)\n\n\n## plot the weighted sample population \nweighted_population <- survey_design %>% \n ## make sure the variables are factors\n mutate(age_group = factor(age_group), \n sex = factor(sex)) %>%\n age_pyramid(\n age_group = \"age_group\",\n split_by = \"sex\", \n proportion = TRUE) +\n ## only show the x axis label (otherwise repeated in all three plots)\n labs(title = \"Weighted sample population\", \n y = \"\", \n x = \"\") + \n ## make the x axis the same for all plots \n scale_y_continuous(breaks = breaks, \n limits = limits, \n labels = labels)\n\n## combine all three plots ----------------------------------------------------\n## combine three plots next to eachother using + \nsource_population + sample_population + weighted_population + \n ## only show one legend and define theme \n ## note the use of & for combining theme with plot_layout()\n plot_layout(guides = \"collect\") & \n theme(legend.position = \"bottom\", # move legend to bottom\n legend.title = element_blank(), # remove title\n text = element_text(size = 18), # change text size\n axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1) # turn x-axis text\n )\n\n\n\n\n\n\n\n\n\n\n26.7.3 Alluvial/sankey diagram\nVisualising starting points and outcomes for individuals can be very helpful to get an overview. There is quite an obvious application for mobile populations, however there are numerous other applications such as cohorts or any other situation where there are transitions in states for individuals. 
These diagrams have several different names including alluvial, sankey and parallel sets - the details are in the handbook chapter on diagrams and charts.\n\n## summarize data\nflow_table <- survey_data %>%\n count(startcause, endcause, sex) %>% # get counts \n gather_set_data(x = c(\"startcause\", \"endcause\")) # change format for plotting\n\n\n## plot your dataset \n ## on the x axis is the start and end causes\n ## gather_set_data generates an ID for each possible combination\n ## splitting by y gives the possible start/end combos\n ## value as n gives it as counts (could also be changed to proportion)\nggplot(flow_table, aes(x, id = id, split = y, value = n)) +\n ## colour lines by sex \n geom_parallel_sets(aes(fill = sex), alpha = 0.5, axis.width = 0.2) +\n ## fill in the label boxes grey\n geom_parallel_sets_axes(axis.width = 0.15, fill = \"grey80\", color = \"grey80\") +\n ## change text colour and angle (needs to be adjusted)\n geom_parallel_sets_labels(color = \"black\", angle = 0, size = 5) +\n ## remove axis labels\n theme_void()+\n ## move legend to bottom\n theme(legend.position = \"bottom\")", + "text": "26.7 Descriptive analysis\nBasic descriptive analysis and visualisation is covered extensively in other chapters of the handbook, so we will not dwell on it here. For details see the chapters on descriptive tables, statistical tests, tables for presentation, ggplot basics and R markdown reports.\nIn this section we will focus on how to investigate bias in your sample and visualise this. We will also look at visualising population flow in a survey setting using alluvial/sankey diagrams.\nIn general, you should consider including the following descriptive analyses:\n\nFinal number of clusters, households and individuals included.\n\nNumber of excluded individuals and the reasons for exclusion.\nMedian (range) number of households per cluster and individuals per household.\n\n\n26.7.1 Sampling bias\nCompare the proportions in each age group between your sample and the source population. This is important to be able to highlight potential sampling bias. You could similarly repeat this looking at distributions by sex.\nNote that these p-values are just indicative, and a descriptive discussion (or visualisation with age-pyramids below) of the distributions in your study sample compared to the source population is more important than the binomial test itself. This is because increasing sample size will more often than not lead to differences that may be irrelevant after weighting your data.\n\n## counts and props of the study population\nag <- survey_data %>% \n group_by(age_group) %>% \n drop_na(age_group) %>% \n tally() %>% \n mutate(proportion = n / sum(n), \n n_total = sum(n))\n\n## counts and props of the source population\npropcount <- population %>% \n group_by(age_group) %>%\n tally(population) %>%\n mutate(proportion = n / sum(n))\n\n## bind together the columns of two tables, group by age, and perform a \n## binomial test to see if n/total is significantly different from population\n## proportion.\n ## suffix here adds to text to the end of columns in each of the two datasets\nleft_join(ag, propcount, by = \"age_group\", suffix = c(\"\", \"_pop\")) %>%\n group_by(age_group) %>%\n ## broom::tidy(binom.test()) makes a data frame out of the binomial test and\n ## will add the variables p.value, parameter, conf.low, conf.high, method, and\n ## alternative. We will only use p.value here. 
You can include other\n ## columns if you want to report confidence intervals\n mutate(binom = list(broom::tidy(binom.test(n, n_total, proportion_pop)))) %>%\n unnest(cols = c(binom)) %>% # important for expanding the binom.test data frame\n mutate(proportion_pop = proportion_pop * 100) %>%\n ## Adjusting the p-values to correct for false positives \n ## (because testing multiple age groups). This will only make \n ## a difference if you have many age categories\n mutate(p.value = p.adjust(p.value, method = \"holm\")) %>%\n \n ## Only show p-values over 0.001 (those under report as <0.001)\n mutate(p.value = ifelse(p.value < 0.001, \n \"<0.001\", \n as.character(round(p.value, 3)))) %>% \n \n ## rename the columns appropriately\n select(\n \"Age group\" = age_group,\n \"Study population (n)\" = n,\n \"Study population (%)\" = proportion,\n \"Source population (n)\" = n_pop,\n \"Source population (%)\" = proportion_pop,\n \"P-value\" = p.value\n )\n\n# A tibble: 5 × 6\n# Groups: Age group [5]\n `Age group` `Study population (n)` `Study population (%)`\n <chr> <int> <dbl>\n1 0-2 12 0.0256\n2 3-14 42 0.0896\n3 15-29 64 0.136 \n4 30-44 52 0.111 \n5 45+ 299 0.638 \n# ℹ 3 more variables: `Source population (n)` <dbl>,\n# `Source population (%)` <dbl>, `P-value` <chr>\n\n\n\n\n26.7.2 Demographic pyramids\nDemographic (or age-sex) pyramids are an easy way of visualising the distribution in your survey population. It is also worth considering creating descriptive tables of age and sex by survey strata. We will demonstrate using the apyramid package as it allows for weighted proportions using our survey design object created above. Other options for creating demographic pyramids are covered extensively in that chapter of the handbook. We will also use a wrapper function from apyramid called age_pyramid() which saves a few lines of coding for producing a plot with proportions.\nAs with the formal binomial test of difference, seen above in the sampling bias section, we are interested here in visualising whether our sampled population is substantially different from the source population and whether weighting corrects this difference. To do this we will use the patchwork package to show our ggplot visualisations side-by-side; for details see the section on combining plots in ggplot tips chapter of the handbook. We will visualise our source population, our un-weighted survey population and our weighted survey population. 
You may also consider visualising by each strata of your survey - in our example here that would be by using the argument stack_by = \"health_district\" (see ?plot_age_pyramid for details).\nNOTE: The x and y axes are flipped in pyramids \n\n## define x-axis limits and labels ---------------------------------------------\n## (update these numbers to be the values for your graph)\nmax_prop <- 35 # choose the highest proportion you want to show \nstep <- 5 # choose the space you want beween labels \n\n## this part defines vector using the above numbers with axis breaks\nbreaks <- c(\n seq(max_prop/100 * -1, 0 - step/100, step/100), \n 0, \n seq(0 + step / 100, max_prop/100, step/100)\n )\n\n## this part defines vector using the above numbers with axis limits\nlimits <- c(max_prop/100 * -1, max_prop/100)\n\n## this part defines vector using the above numbers with axis labels\nlabels <- c(\n seq(max_prop, step, -step), \n 0, \n seq(step, max_prop, step)\n )\n\n\n## create plots individually --------------------------------------------------\n\n## plot the source population \n## nb: this needs to be collapsed for the overall population (i.e. removing health districts)\nsource_population <- population %>%\n ## ensure that age and sex are factors\n mutate(age_group = factor(age_group, \n levels = c(\"0-2\", \n \"3-14\", \n \"15-29\",\n \"30-44\", \n \"45+\")), \n sex = factor(sex)) %>% \n group_by(age_group, sex) %>% \n ## add the counts for each health district together \n summarise(population = sum(population)) %>% \n ## remove the grouping so can calculate overall proportion\n ungroup() %>% \n mutate(proportion = population / sum(population)) %>% \n ## plot pyramid \n age_pyramid(\n age_group = age_group, \n split_by = sex, \n count = proportion, \n proportional = TRUE) +\n ## only show the y axis label (otherwise repeated in all three plots)\n labs(title = \"Source population\", \n y = \"\", \n x = \"Age group (years)\") + \n ## make the x axis the same for all plots \n scale_y_continuous(breaks = breaks, \n limits = limits, \n labels = labels)\n \n \n## plot the unweighted sample population \nsample_population <- age_pyramid(survey_data, \n age_group = \"age_group\", \n split_by = \"sex\",\n proportion = TRUE) + \n ## only show the x axis label (otherwise repeated in all three plots)\n labs(title = \"Unweighted sample population\", \n y = \"Proportion (%)\", \n x = \"\") + \n ## make the x axis the same for all plots \n scale_y_continuous(breaks = breaks, \n limits = limits, \n labels = labels)\n\n\n## plot the weighted sample population \nweighted_population <- survey_design %>% \n ## make sure the variables are factors\n mutate(age_group = factor(age_group), \n sex = factor(sex)) %>%\n age_pyramid(\n age_group = \"age_group\",\n split_by = \"sex\", \n proportion = TRUE) +\n ## only show the x axis label (otherwise repeated in all three plots)\n labs(title = \"Weighted sample population\", \n y = \"\", \n x = \"\") + \n ## make the x axis the same for all plots \n scale_y_continuous(breaks = breaks, \n limits = limits, \n labels = labels)\n\n## combine all three plots ----------------------------------------------------\n## combine three plots next to eachother using + \nsource_population + sample_population + weighted_population + \n ## only show one legend and define theme \n ## note the use of & for combining theme with plot_layout()\n plot_layout(guides = \"collect\") & \n theme(legend.position = \"bottom\", # move legend to bottom\n legend.title = element_blank(), # remove title\n text 
= element_text(size = 18), # change text size\n axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1) # turn x-axis text\n )\n\n\n\n\n\n\n\n\n\n\n26.7.3 Alluvial/sankey diagram\nVisualising starting points and outcomes for individuals can be very helpful to get an overview. There is quite an obvious application for mobile populations, however there are numerous other applications such as cohorts or any other situation where there are transitions in states for individuals. These diagrams have several different names including alluvial, sankey and parallel sets - the details are in the handbook chapter on diagrams and charts.\n\n## summarize data\nflow_table <- survey_data %>%\n count(startcause, endcause, sex) %>% # get counts \n gather_set_data(x = c(\"startcause\", \"endcause\")) # change format for plotting\n\n\n## plot your dataset \n ## on the x axis is the start and end causes\n ## gather_set_data generates an ID for each possible combination\n ## splitting by y gives the possible start/end combos\n ## value as n gives it as counts (could also be changed to proportion)\nggplot(flow_table, aes(x, id = id, split = y, value = n)) +\n ## colour lines by sex \n geom_parallel_sets(aes(fill = sex), alpha = 0.5, axis.width = 0.2) +\n ## fill in the label boxes grey\n geom_parallel_sets_axes(axis.width = 0.15, fill = \"grey80\", color = \"grey80\") +\n ## change text colour and angle (needs to be adjusted)\n geom_parallel_sets_labels(color = \"black\", angle = 0, size = 5) +\n ## remove axis labels\n theme_void()+\n ## move legend to bottom\n theme(legend.position = \"bottom\")", "crumbs": [ "Analysis", "26  Survey analysis" @@ -2309,7 +2309,7 @@ "href": "new_pages/survey_analysis.html#weighted-proportions", "title": "26  Survey analysis", "section": "26.8 Weighted proportions", - "text": "26.8 Weighted proportions\nThis section will detail how to produce tables for weighted counts and proportions, with associated confidence intervals and design effect. There are four different options using functions from the following packages: survey, srvyr, sitrep and gtsummary. For minimal coding to produce a standard epidemiology style table, we would recommend the sitrep function - which is a wrapper for srvyr code; note however that this is not yet on CRAN and may change in the future. Otherwise, the survey code is likely to be the most stable long-term, whereas srvyr will fit most nicely within tidyverse work-flows. While gtsummary functions hold a lot of potential, they appear to be experimental and incomplete at the time of writing.\n\n26.8.1 Survey package\nWe can use the svyciprop() function from survey to get weighted proportions and accompanying 95% confidence intervals. An appropriate design effect can be extracted using the svymean() rather than svyprop() function. 
It is worth noting that svyprop() only appears to accept variables between 0 and 1 (or TRUE/FALSE), so categorical variables will not work.\nNOTE: Functions from survey also accept srvyr design objects, but here we have used the survey design object just for consistency \n\n## produce weighted counts \nsvytable(~died, base_survey_design)\n\ndied\n FALSE TRUE \n1406244.43 76213.01 \n\n## produce weighted proportions\nsvyciprop(~died, base_survey_design, na.rm = T)\n\n 2.5% 97.5%\ndied 0.0514 0.0208 0.12\n\n## get the design effect \nsvymean(~died, base_survey_design, na.rm = T, deff = T) %>% \n deff()\n\ndiedFALSE diedTRUE \n 3.755508 3.755508 \n\n\nWe can combine the functions from survey shown above in to a function which we define ourselves below, called svy_prop; and we can then use that function together with map() from the purrr package to iterate over several variables and create a table. See the handbook iteration chapter for details on purrr.\n\n# Define function to calculate weighted counts, proportions, CI and design effect\n# x is the variable in quotation marks \n# design is your survey design object\n\nsvy_prop <- function(design, x) {\n \n ## put the variable of interest in a formula \n form <- as.formula(paste0( \"~\" , x))\n ## only keep the TRUE column of counts from svytable\n weighted_counts <- svytable(form, design)[[2]]\n ## calculate proportions (multiply by 100 to get percentages)\n weighted_props <- svyciprop(form, design, na.rm = TRUE) * 100\n ## extract the confidence intervals and multiply to get percentages\n weighted_confint <- confint(weighted_props) * 100\n ## use svymean to calculate design effect and only keep the TRUE column\n design_eff <- deff(svymean(form, design, na.rm = TRUE, deff = TRUE))[[TRUE]]\n \n ## combine in to one data frame\n full_table <- cbind(\n \"Variable\" = x,\n \"Count\" = weighted_counts,\n \"Proportion\" = weighted_props,\n weighted_confint, \n \"Design effect\" = design_eff\n )\n \n ## return table as a dataframe\n full_table <- data.frame(full_table, \n ## remove the variable names from rows (is a separate column now)\n row.names = NULL)\n \n ## change numerics back to numeric\n full_table[ , 2:6] <- as.numeric(full_table[, 2:6])\n \n ## return dataframe\n full_table\n}\n\n## iterate over several variables to create a table \npurrr::map(\n ## define variables of interest\n c(\"left\", \"died\", \"arrived\"), \n ## state function using and arguments for that function (design)\n svy_prop, design = base_survey_design) %>% \n ## collapse list in to a single data frame\n bind_rows() %>% \n ## round \n mutate(across(where(is.numeric), round, digits = 1))\n\n Variable Count Proportion X2.5. X97.5. Design.effect\n1 left 701199.1 47.3 39.2 55.5 2.4\n2 died 76213.0 5.1 2.1 12.1 3.8\n3 arrived 761799.0 51.4 40.9 61.7 3.9\n\n\n\n\n26.8.2 Srvyr package\nWith srvyr we can use dplyr syntax to create a table. Note that the survey_mean() function is used and the proportion argument is specified, and also that the same function is used to calculate design effect. 
This is because srvyr wraps around both of the survey package functions svyciprop() and svymean(), which are used in the above section.\nNOTE: It does not seem to be possible to get proportions from categorical variables using srvyr either, if you need this then check out the section below using sitrep \n\n## use the srvyr design object\nsurvey_design %>% \n summarise(\n ## produce the weighted counts \n counts = survey_total(died), \n ## produce weighted proportions and confidence intervals \n ## multiply by 100 to get a percentage \n props = survey_mean(died, \n proportion = TRUE, \n vartype = \"ci\") * 100, \n ## produce the design effect \n deff = survey_mean(died, deff = TRUE)) %>% \n ## only keep the rows of interest\n ## (drop standard errors and repeat proportion calculation)\n select(counts, props, props_low, props_upp, deff_deff)\n\n# A tibble: 1 × 5\n counts props props_low props_upp deff_deff\n <dbl> <dbl> <dbl> <dbl> <dbl>\n1 76213. 5.14 2.08 12.1 3.76\n\n\nHere too we could write a function to then iterate over multiple variables using the purrr package. See the handbook iteration chapter for details on purrr.\n\n# Define function to calculate weighted counts, proportions, CI and design effect\n# design is your survey design object\n# x is the variable in quotation marks \n\n\nsrvyr_prop <- function(design, x) {\n \n summarise(\n ## using the survey design object\n design, \n ## produce the weighted counts \n counts = survey_total(.data[[x]]), \n ## produce weighted proportions and confidence intervals \n ## multiply by 100 to get a percentage \n props = survey_mean(.data[[x]], \n proportion = TRUE, \n vartype = \"ci\") * 100, \n ## produce the design effect \n deff = survey_mean(.data[[x]], deff = TRUE)) %>% \n ## add in the variable name\n mutate(variable = x) %>% \n ## only keep the rows of interest\n ## (drop standard errors and repeat proportion calculation)\n select(variable, counts, props, props_low, props_upp, deff_deff)\n \n}\n \n\n## iterate over several variables to create a table \npurrr::map(\n ## define variables of interest\n c(\"left\", \"died\", \"arrived\"), \n ## state function using and arguments for that function (design)\n ~srvyr_prop(.x, design = survey_design)) %>% \n ## collapse list in to a single data frame\n bind_rows()\n\n# A tibble: 3 × 6\n variable counts props props_low props_upp deff_deff\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl>\n1 left 701199. 47.3 39.2 55.5 2.38\n2 died 76213. 5.14 2.08 12.1 3.76\n3 arrived 761799. 51.4 40.9 61.7 3.93\n\n\n\n\n26.8.3 Sitrep package\nThe tab_survey() function from sitrep is a wrapper for srvyr, allowing you to create weighted tables with minimal coding. It also allows you to calculate weighted proportions for categorical variables.\n\n## using the survey design object\nsurvey_design %>% \n ## pass the names of variables of interest unquoted\n tab_survey(arrived, left, died, education_level,\n deff = TRUE, # calculate the design effect\n pretty = TRUE # merge the proportion and 95%CI\n )\n\nWarning: removing 257 missing value(s) from `education_level`\n\n\n# A tibble: 9 × 5\n variable value n deff ci \n <chr> <chr> <dbl> <dbl> <chr> \n1 arrived TRUE 761799. 3.93 51.4% (40.9-61.7)\n2 arrived FALSE 720658. 3.93 48.6% (38.3-59.1)\n3 left TRUE 701199. 2.38 47.3% (39.2-55.5)\n4 left FALSE 781258. 2.38 52.7% (44.5-60.8)\n5 died TRUE 76213. 3.76 5.1% (2.1-12.1) \n6 died FALSE 1406244. 3.76 94.9% (87.9-97.9)\n7 education_level higher 171644. 4.70 42.4% (26.9-59.7)\n8 education_level primary 102609. 
2.37 25.4% (16.2-37.3)\n9 education_level secondary 130201. 6.68 32.2% (16.5-53.3)\n\n\n\n\n26.8.4 Gtsummary package\nWith gtsummary there does not seem to be inbuilt functions yet to add confidence intervals or design effect. Here we show how to define a function for adding confidence intervals and then add confidence intervals to a gtsummary table created using the tbl_svysummary() function.\n\nconfidence_intervals <- function(data, variable, by, ...) {\n \n ## extract the confidence intervals and multiply to get percentages\n props <- svyciprop(as.formula(paste0( \"~\" , variable)),\n data, na.rm = TRUE)\n \n ## extract the confidence intervals \n as.numeric(confint(props) * 100) %>% ## make numeric and multiply for percentage\n round(., digits = 1) %>% ## round to one digit\n c(.) %>% ## extract the numbers from matrix\n paste0(., collapse = \"-\") ## combine to single character\n}\n\n## using the survey package design object\ntbl_svysummary(base_survey_design, \n include = c(arrived, left, died), ## define variables want to include\n statistic = list(everything() ~ c(\"{n} ({p}%)\"))) %>% ## define stats of interest\n add_n() %>% ## add the weighted total \n add_stat(fns = everything() ~ confidence_intervals) %>% ## add CIs\n ## modify the column headers\n modify_header(\n list(\n n ~ \"**Weighted total (N)**\",\n stat_0 ~ \"**Weighted Count**\",\n add_stat_1 ~ \"**95%CI**\"\n )\n )\n\n\n\n\n\n\n\n\nCharacteristic\nWeighted total (N)\nWeighted Count1\n95%CI\n\n\n\n\narrived\n1,482,457\n761,799 (51%)\n40.9-61.7\n\n\nleft\n1,482,457\n701,199 (47%)\n39.2-55.5\n\n\ndied\n1,482,457\n76,213 (5.1%)\n2.1-12.1\n\n\n\n1 n (%)", + "text": "26.8 Weighted proportions\nThis section will detail how to produce tables for weighted counts and proportions, with associated confidence intervals and design effect. There are four different options using functions from the following packages: survey, srvyr, sitrep and gtsummary. For minimal coding to produce a standard epidemiology style table, we would recommend the sitrep function - which is a wrapper for srvyr code. Otherwise, the survey code is likely to be the most stable long-term, whereas srvyr will fit most nicely within tidyverse work-flows. As another alternative, gtsummary offers an easy way to display survey data following its update to versions >=2.0.\n\n26.8.1 Survey package\nWe can use the svyciprop() function from survey to get weighted proportions and accompanying 95% confidence intervals. An appropriate design effect can be extracted using the svymean() rather than svyprop() function. It is worth noting that svyprop() only appears to accept variables between 0 and 1 (or TRUE/FALSE), so categorical variables will not work.\nNOTE: Functions from survey also accept srvyr design objects, but here we have used the survey design object just for consistency \n\n## produce weighted counts \nsvytable(~died, base_survey_design)\n\ndied\n FALSE TRUE \n1406244.43 76213.01 \n\n## produce weighted proportions\nsvyciprop(~died, base_survey_design, na.rm = T)\n\n 2.5% 97.5%\ndied 0.0514 0.0208 0.1213\n\n## get the design effect \nsvymean(~died, base_survey_design, na.rm = T, deff = T) %>% \n deff()\n\ndiedFALSE diedTRUE \n 3.755508 3.755508 \n\n\nWe can combine the functions from survey shown above in to a function which we define ourselves below, called svy_prop; and we can then use that function together with map() from the purrr package to iterate over several variables and create a table. 
See the handbook iteration chapter for details on purrr.\n\n# Define function to calculate weighted counts, proportions, CI and design effect\n# x is the variable in quotation marks \n# design is your survey design object\n\nsvy_prop <- function(design, x) {\n \n ## put the variable of interest in a formula \n form <- as.formula(paste0( \"~\" , x))\n ## only keep the TRUE column of counts from svytable\n weighted_counts <- svytable(form, design)[[2]]\n ## calculate proportions (multiply by 100 to get percentages)\n weighted_props <- svyciprop(form, design, na.rm = TRUE) * 100\n ## extract the confidence intervals and multiply to get percentages\n weighted_confint <- confint(weighted_props) * 100\n ## use svymean to calculate design effect and only keep the TRUE column\n design_eff <- deff(svymean(form, design, na.rm = TRUE, deff = TRUE))[[TRUE]]\n \n ## combine in to one data frame\n full_table <- cbind(\n \"Variable\" = x,\n \"Count\" = weighted_counts,\n \"Proportion\" = weighted_props,\n weighted_confint, \n \"Design effect\" = design_eff\n )\n \n ## return table as a dataframe\n full_table <- data.frame(full_table, \n ## remove the variable names from rows (is a separate column now)\n row.names = NULL)\n \n ## change numerics back to numeric\n full_table[ , 2:6] <- as.numeric(full_table[, 2:6])\n \n ## return dataframe\n full_table\n}\n\n## iterate over several variables to create a table \npurrr::map(\n ## define variables of interest\n c(\"left\", \"died\", \"arrived\"), \n ## state function using and arguments for that function (design)\n svy_prop, design = base_survey_design) %>% \n ## collapse list in to a single data frame\n bind_rows() %>% \n ## round \n mutate(across(where(is.numeric), round, digits = 1))\n\n Variable Count Proportion X2.5. X97.5. Design.effect\n1 left 701199.1 47.3 39.2 55.5 2.4\n2 died 76213.0 5.1 2.1 12.1 3.8\n3 arrived 761799.0 51.4 40.9 61.7 3.9\n\n\n\n\n26.8.2 Srvyr package\nWith srvyr we can use dplyr syntax to create a table. Note that the survey_mean() function is used and the proportion argument is specified, and also that the same function is used to calculate design effect. This is because srvyr wraps around both of the survey package functions svyciprop() and svymean(), which are used in the above section.\nNOTE: It does not seem to be possible to get proportions from categorical variables using srvyr either, if you need this then check out the section below using sitrep \n\n## use the srvyr design object\nsurvey_design %>% \n summarise(\n ## produce the weighted counts \n counts = survey_total(died), \n ## produce weighted proportions and confidence intervals \n ## multiply by 100 to get a percentage \n props = survey_mean(died, \n proportion = TRUE, \n vartype = \"ci\") * 100, \n ## produce the design effect \n deff = survey_mean(died, deff = TRUE)) %>% \n ## only keep the rows of interest\n ## (drop standard errors and repeat proportion calculation)\n select(counts, props, props_low, props_upp, deff_deff)\n\n# A tibble: 1 × 5\n counts props props_low props_upp deff_deff\n <dbl> <dbl> <dbl> <dbl> <dbl>\n1 76213. 5.14 2.08 12.1 3.76\n\n\nHere too we could write a function to then iterate over multiple variables using the purrr package. 
See the handbook iteration chapter for details on purrr.\n\n# Define function to calculate weighted counts, proportions, CI and design effect\n# design is your survey design object\n# x is the variable in quotation marks \n\n\nsrvyr_prop <- function(design, x) {\n \n summarise(\n ## using the survey design object\n design, \n ## produce the weighted counts \n counts = survey_total(.data[[x]]), \n ## produce weighted proportions and confidence intervals \n ## multiply by 100 to get a percentage \n props = survey_mean(.data[[x]], \n proportion = TRUE, \n vartype = \"ci\") * 100, \n ## produce the design effect \n deff = survey_mean(.data[[x]], deff = TRUE)) %>% \n ## add in the variable name\n mutate(variable = x) %>% \n ## only keep the rows of interest\n ## (drop standard errors and repeat proportion calculation)\n select(variable, counts, props, props_low, props_upp, deff_deff)\n \n}\n \n\n## iterate over several variables to create a table \npurrr::map(\n ## define variables of interest\n c(\"left\", \"died\", \"arrived\"), \n ## state function using and arguments for that function (design)\n ~srvyr_prop(.x, design = survey_design)) %>% \n ## collapse list in to a single data frame\n bind_rows()\n\n# A tibble: 3 × 6\n variable counts props props_low props_upp deff_deff\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl>\n1 left 701199. 47.3 39.2 55.5 2.38\n2 died 76213. 5.14 2.08 12.1 3.76\n3 arrived 761799. 51.4 40.9 61.7 3.93\n\n\n\n\n26.8.3 Sitrep package\nThe tab_survey() function from sitrep is a wrapper for srvyr, allowing you to create weighted tables with minimal coding. It also allows you to calculate weighted proportions for categorical variables.\n\n## using the survey design object\nsurvey_design %>% \n ## pass the names of variables of interest unquoted\n tab_survey(\n \"arrived\", \n \"left\", \n \"died\", \n \"education_level\",\n deff = TRUE, # calculate the design effect\n pretty = TRUE # merge the proportion and 95%CI\n )\n\n\n\n variable value n deff ci\n1 arrived TRUE 761799.05 3.925504 51.4% (40.9-61.7)\n2 arrived FALSE 720658.39 3.925504 48.6% (38.3-59.1)\n3 left TRUE 701199.14 2.379761 47.3% (39.2-55.5)\n4 left FALSE 781258.30 2.379761 52.7% (44.5-60.8)\n5 died TRUE 76213.01 3.755508 5.1% (2.1-12.1)\n6 died FALSE 1406244.43 3.755508 94.9% (87.9-97.9)\n7 education_level higher 171644.18 4.701398 42.4% (26.9-59.7)\n8 education_level primary 102609.44 2.368384 25.4% (16.2-37.3)\n9 education_level secondary 130200.67 6.677178 32.2% (16.5-53.3)\n\n\n\n\n26.8.4 Gtsummary package\nWith gtsummary we can use the function tbl_svysummary() and the addition add_ci() to add confidence intervals.\n\n## using the survey package design object\ntbl_svysummary(base_survey_design, \n include = c(arrived, left, died), ## define variables want to include\n statistic = list(everything() ~ c(\"{n} ({p}%)\"))) %>% ## define stats of interest\n add_ci() %>% ## add confidence intervals\n add_n() %>%\n modify_header(label = list(\n n ~ \"**Weighted total (N)**\",\n stat_0 ~ \"**Weighted count**\"\n ))\n\nWarning: The `update` argument of `modify_header()` is deprecated as of gtsummary 2.0.0.\nℹ Use `modify_header(...)` input instead. 
Dynamic dots allow for syntax like\n `modify_header(!!!list(...))`.\nℹ The deprecated feature was likely used in the gtsummary package.\n Please report the issue at <https://github.com/ddsjoberg/gtsummary/issues>.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\nWeighted total (N)\nWeighted count\n1\n95% CI\n2\n\n\n\n\narrived\n1,482,457\n761,799 (51%)\n41%, 62%\n\n\nleft\n1,482,457\n701,199 (47%)\n39%, 56%\n\n\ndied\n1,482,457\n76,213 (5.1%)\n2.1%, 12%\n\n\n\n1\nn (%)\n\n\n2\nCI = Confidence Interval", "crumbs": [ "Analysis", "26  Survey analysis" @@ -2330,8 +2330,8 @@ "objectID": "new_pages/survey_analysis.html#resources", "href": "new_pages/survey_analysis.html#resources", "title": "26  Survey analysis", - "section": "26.10 Resources", - "text": "26.10 Resources\nUCLA stats page\nAnalyze survey data free\nsrvyr packge\ngtsummary package\nEPIET survey case studies", + "section": "26.11 Resources", + "text": "26.11 Resources\nUCLA stats page\nAnalyze survey data free\nsrvyr packge\ngtsummary package\nEPIET survey case studies\nAnalyzing Complex Survey Data", "crumbs": [ "Analysis", "26  Survey analysis" @@ -4377,7 +4377,7 @@ "href": "new_pages/help.html#reproducible-example", "title": "48  Getting help", "section": "48.2 Reproducible example", - "text": "48.2 Reproducible example\nProviding a reproducible example (“reprex”) is key to getting help when posting in a forum or in a Github issue. People want to help you, but you have to give them an example that they can work with on their own computer. The example should:\n\nDemonstrate the problem you encountered\n\nBe minimal, in that it includes only the data and code required to reproduce your problem\n\nBe reproducible, such that all objects (e.g. data), package calls (e.g. library() or p_load()) are included\n\nAlso, be sure you do not post any sensitive data with the reprex! You can create example data frames, or use one of the data frames built into R (enter data() to open a list of these datasets).\n\nThe reprex package\nThe reprex package can assist you with making a reproducible example:\n\nreprex is installed with tidyverse, so load either package\n\n\n# install/load tidyverse (which includes reprex)\npacman::p_load(tidyverse)\n\n\nBegin an R script that creates your problem, step-by-step, starting from loading packages and data.\n\n\n# load packages\npacman::p_load(\n tidyverse, # data mgmt and vizualization\n outbreaks) # example outbreak datasets\n\n# flu epidemic case linelist\noutbreak_raw <- outbreaks::fluH7N9_china_2013 # retrieve dataset from outbreaks package\n\n# Clean dataset\noutbreak <- outbreak_raw %>% \n mutate(across(contains(\"date\"), as.Date))\n\n# Plot epidemic\n\nggplot(data = outbreak)+\n geom_histogram(\n mapping = aes(x = date_of_onset),\n binwidth = 7\n )+\n scale_x_date(\n date_format = \"%d %m\"\n )\n\nCopy all the code to your clipboard, and run the following command:\n\nreprex::reprex()\n\nYou will see an HTML output appear in the RStudio Viewer pane. It will contain all your code and any warnings, errors, or plot outputs. 
This output is also copied to your clipboard, so you can post it directly into a Github issue or a forum post.\n\n\n\n\n\n\n\n\n\n\nIf you set session_info = TRUE the output of sessioninfo::session_info() with your R and R package versions will be included\n\nYou can provide a working directory to wd =\n\nYou can read more about the arguments and possible variations at the documentation or by entering ?reprex\n\nIn the example above, the ggplot() command did not run because the arguemnt date_format = is not correct - it should be date_labels =.\n\n\nMinimal data\nThe helpers need to be able to use your data - ideally they need to be able to create it with code.\nTo create a minumal dataset, consider anonymising and using only a subset of the observations.\nUNDER CONSTRUCTION - you can also use the function dput() to create minimal dataset.", + "text": "48.2 Reproducible example\nProviding a reproducible example (“reprex”) is key to getting help when posting in a forum or in a Github issue. People want to help you, but you have to give them an example that they can work with on their own computer. The example should:\n\nDemonstrate the problem you encountered.\n\nBe minimal, in that it includes only the data and code required to reproduce your problem.\n\nBe reproducible, such that all objects (e.g. data), package calls (e.g. library() or p_load()) are included.\n\nAlso, be sure you do not post any sensitive data with the reprex! You can create example data frames, or use one of the data frames built into R (enter data() to open a list of these datasets).\n\nThe reprex package\nThe reprex package can assist you with making a reproducible example:\n\nreprex is installed with tidyverse, so load either package.\n\n\n# install/load tidyverse (which includes reprex)\npacman::p_load(tidyverse)\n\n\nBegin an R script that creates your problem, step-by-step, starting from loading packages and data.\n\n\n# load packages\npacman::p_load(\n tidyverse, # data mgmt and vizualization\n outbreaks # example outbreak datasets\n ) \n\n# flu epidemic case linelist\noutbreak_raw <- outbreaks::fluH7N9_china_2013 # retrieve dataset from outbreaks package\n\n# Clean dataset\noutbreak <- outbreak_raw %>% \n mutate(across(contains(\"date\"), as.Date))\n\n# Plot epidemic\n\nggplot(data = outbreak) +\n geom_histogram(\n mapping = aes(x = date_of_onset),\n binwidth = 7\n ) +\n scale_x_date(\n date_labels = \"%d %m\"\n )\n\nCopy all the code to your clipboard, and run the following command:\n\nreprex::reprex()\n\nYou will see an HTML output appear in the RStudio Viewer pane. It will contain all your code and any warnings, errors, or plot outputs. This output is also copied to your clipboard, so you can post it directly into a Github issue or a forum post.\n\n\n\n\n\n\n\n\n\n\nIf you set session_info = TRUE the output of sessioninfo::session_info() with your R and R package versions will be included.\n\nYou can provide a working directory to wd =.\n\nYou can read more about the arguments and possible variations at the documentation or by entering ?reprex.\n\nIn the example above, the ggplot() command did not run because the argument date_format = is not correct - it should be date_labels =.\n\n\nMinimal data\nThe helpers need to be able to use your data - ideally they need to be able to create it with code. To create a minimal dataset, consider anonymising and using only a subset of the observations, or replicating the data using “dummy values”. 
These seek to replicate the style (same class of variables, same amount of data, etc), but do not contain any sensitive information.\nFor example, imagine we have a linelist of those infected with a novel pathogen.\n\n#Create dataset\nlinelist_data <- data.frame(\n first_name = c(\"John\", \"Jane\", \"Joe\", \"Tom\", \"Richard\", \"Harry\"),\n last_name = c(\"Doe\", \"Doe\", \"Bloggs\", \"Jerry\", \"Springer\", \"Potter\"),\n age_years = c(25, 34, 27, 89, 52, 47),\n location = as.factor(c(\"London\", \"Paris\", \"Berlin\", \"Tokyo\", \"Canberra\", \"Rio de Janeiro\")),\n outcome = c(\"Died\", \"Recovered\", NA, \"Recovered\", \"Recovered\", \"Died\"),\n symptoms = as.factor(c(T, F, F, T, T, F))\n)\n\nYou can imagine that this would be very sensitive information, and it would not be appropriate (or even legal!) to share this information. To anonymise the data you could think about removing columns that are not necessary for the analysis, but are identifying. For instance you could:\n\nReplace first_name and last_name with an id column, that uses the rownumber.\nRecode the location column so that it is not apparent where they were located.\nChange the outcome to a binary value, to avoid disclosing their condition.\n\n\nlinelist_data %>%\n mutate(id = row_number(),\n location = as.factor(LETTERS[as.numeric(as.factor(location))]),\n outcome = as.character(as.factor(outcome))) %>%\n select(id, age_years, location, outcome, symptoms)\n\n id age_years location outcome symptoms\n1 1 25 C Died TRUE\n2 2 34 D Recovered FALSE\n3 3 27 A <NA> FALSE\n4 4 89 F Recovered TRUE\n5 5 52 B Recovered TRUE\n6 6 47 E Died FALSE\n\n\nYou could also generate new data, where columns are the same class (class()). This would also produce a reproducible dataset that does not contain any sensitive information.\n\ndummy_dataset <- data.frame(\n first_name = LETTERS[1:6],\n last_name = unique(iris$Species),\n age_years = sample(1:100, 6, replace = T),\n location = sample(row.names(USArrests), 6, replace = T),\n outcome = sample(c(\"Died\", \"Recovered\"), 6, replace = T),\n symptoms = sample(c(T, F), 6, replace = T)\n )\n\ndummy_dataset\n\n first_name last_name age_years location outcome symptoms\n1 A setosa 58 New Mexico Died TRUE\n2 B versicolor 22 New Mexico Died TRUE\n3 C virginica 53 Alaska Recovered TRUE\n4 D setosa 60 North Carolina Died TRUE\n5 E versicolor 11 Utah Died FALSE\n6 F virginica 74 Pennsylvania Recovered FALSE", "crumbs": [ "Miscellaneous", "48  Getting help" @@ -4388,7 +4388,7 @@ "href": "new_pages/help.html#posting-to-a-forum", "title": "48  Getting help", "section": "48.3 Posting to a forum", - "text": "48.3 Posting to a forum\nRead lots of forum posts. Get an understanding for which posts are well-written, and which ones are not.\n\nFirst, decide whether to ask the question at all. Have you thoroughly reviewed the forum website, trying various search terms, to see if your question has already been asked?\nGive your question an informative title (not “Help! this isn’t working”).\nWrite your question:\n\n\nIntroduce your situation and problem\n\nLink to posts of similar issues and explain how they do not answer your question\n\nInclude any relevant information to help someone who does not know the context of your work\n\nGive a minimal reproducible example with your R session information\n\nUse proper spelling, grammar, punctuation, and break your question into paragraphs so that it is easier to read\n\n\nMonitor your question once posted to respond to any requests for clarification. 
Be courteous and gracious - often the people answering are volunteering their time to help you. If you have a follow-up question consider whether it should be a separate posted question.\nMark the question as answered, if you get an answer that meets the original request. This helps others later quickly recognize the solution.\n\nRead these posts about how to ask a good question the Stack overflow code of conduct.", + "text": "48.3 Posting to a forum\nOne highly recommend forum for posting your questions on is the Applied Epi Community Forum. There are a wealth of topics, and experts, to be found on our forum, ready to help you with questions on Epi methods, R code, and many other topics. Note, you will have to make an account in order to post a question!\nIn order to make it as easy as possible for others to understand and help you, you should approach posting your question as follows.\n\nFirst, decide whether to ask the question at all. Have you thoroughly reviewed the forum website, trying various search terms, to see if your question has already been asked?\nGive your question an informative title (not “Help! this isn’t working”).\nWrite your question:\n\n\nIntroduce your situation and problem.\n\nLink to posts of similar issues and explain how they do not answer your question.\n\nInclude any relevant information to help someone who does not know the context of your work.\n\nGive a minimal reproducible example with your R session information.\n\nUse proper spelling, grammar, punctuation, and break your question into paragraphs so that it is easier to read.\n\n\nMonitor your question once posted to respond to any requests for clarification. Be courteous and gracious - often the people answering are volunteering their time to help you. If you have a follow-up question consider whether it should be a separate posted question.\nMark the question as answered, if you get an answer that meets the original request. This helps others later quickly recognize the solution.\n\nRead these posts about how to ask a good question the Stack overflow code of conduct, and the Applied Epi Community post on “How to make a reproducible R code example”.", "crumbs": [ "Miscellaneous", "48  Getting help" @@ -4399,7 +4399,7 @@ "href": "new_pages/help.html#resources", "title": "48  Getting help", "section": "48.4 Resources", - "text": "48.4 Resources\nTidyverse page on how to get help!\nTips on producing a minimal dataset\nDocumentation for the dput function", + "text": "48.4 Resources\nTidyverse page on how to get help!\nTips on producing a minimal dataset\nThe Applied Epi Community forum\nApplied Epi Community post on “How to make a reproducible R code example”.", "crumbs": [ "Miscellaneous", "48  Getting help" @@ -4454,7 +4454,7 @@ "href": "new_pages/network_drives.html#troubleshooting-common-errors", "title": "49  R on network drives", "section": "49.4 Troubleshooting common errors", - "text": "49.4 Troubleshooting common errors\n“Failed to compile…tex in rmarkdown”\n\nCheck the installation of TinyTex, or install TinyTex to C: location. 
See the R basics page on how to install TinyTex.\n\n\n# check/install tinytex, to C: location\ntinytex::install_tinytex()\ntinytex:::is_tinytex() # should return TRUE (note three colons)\n\nInternet routines cannot be loaded\nFor example, Error in tools::startDynamicHelp() : internet routines cannot be loaded\n\nTry selecting 32-bit version from RStudio via Tools/Global Options.\n\nnote: if 32-bit version does not appear in menu, make sure you are not using RStudio v1.2.\n\n\nAlternatively, try uninstalling R and re-installing with different bit version (32 instead of 64).\n\nC: library does not appear as an option when I try to install packages manually\n\nRun RStudio as an administrator, then this option will appear.\n\nTo set-up RStudio to always run as administrator (advantageous when using an Rproject where you don’t click RStudio icon to open)… right-click the Rstudio icon\n\nThe image below shows how you can manually select the library to install a package to. This window appears when you open the Packages RStudio pane and click “Install”.\n\n\n\n\n\n\n\n\n\nPandoc 1 error\nIf you are getting “pandoc error 1” when knitting R Markdowns scripts on network drives:\n\nOf multiple library locations, have the one with a lettered drive listed first (see codes above)\n\nThe above solution worked when knitting on local drive but while on a networked internet connection\n\nSee more tips here: https://ciser.cornell.edu/rmarkdown-knit-to-html-word-pdf/\n\nPandoc Error 83\nThe error will look something like this: can't find file...rmarkdown...lua.... This means that it was unable to find this file.\nSee https://stackoverflow.com/questions/58830927/rmarkdown-unable-to-locate-lua-filter-when-knitting-to-word\nPossibilities:\n\nRmarkdown package is not installed\n\nRmarkdown package is not findable\n\nAn admin rights issue.\n\nIt is possible that R is not able to find the rmarkdown package file, so check which library the rmarkdown package lives (see code above). If the package is installed to a library that in inaccessible (e.g. starts with “\\\") consider manually moving it to C: or other named drive library. Be aware that the rmarkdown package has to be able to connect to TinyTex installation, so can not live in a library on a network drive.\nPandoc Error 61\nFor example: Error: pandoc document conversion failed with error 61 or Could not fetch...\n\nTry running RStudio as administrator (right click icon, select run as admin, see above instructions)\n\nAlso see if the specific package that was unable to be reached can be moved to C: library.\n\nLaTex error (see below)\nAn error like: ! Package pdftex.def Error: File 'cict_qm2_2020-06-29_files/figure-latex/unnamed-chunk-5-1.png' not found: using draft setting. or Error: LaTeX failed to compile file_name.tex.\n\nSee https://yihui.org/tinytex/r/#debugging for debugging tips.\n\nSee file_name.log for more info.\n\nPandoc Error 127\nThis could be a RAM (space) issue. Re-start your R session and try again.\nMapping network drives\nMapping a network drive can be risky. 
Consult with your IT department before attempting this.\nA tip borrowed from this forum discussion:\nHow does one open a file “through a mapped network drive”?\n\nFirst, you’ll need to know the network location you’re trying to access.\n\nNext, in the Windows file manager, you will need to right click on “This PC” on the right hand pane, and select “Map a network drive”.\n\nGo through the dialogue to define the network location from earlier as a lettered drive.\n\nNow you have two ways to get to the file you’re opening. Using the drive-letter path should work.\n\nError in install.packages()\nIf you get an error that includes mention of a “lock” directory, for example: Error in install.packages : ERROR: failed to lock directory...\nLook in your package library and you will see a folder whose name begins with “00LOCK”. Try the following tips:\n\nManually delete the “00LOCK” folder directory from your package library. Try installing the package again.\n\nYou can also try the command pacman::p_unlock() (you can also put this command in the Rprofile so it runs every time project opens.). Then try installing the package again. It may take several tries.\n\nTry running RStudio in Administrator mode, and try installing the packages one-by-one.\n\nIf all else fails, install the package to another library or folder (e.g. Temp) and then manually copy the package’s folder over to the desired library.", + "text": "49.4 Troubleshooting common errors\n“Failed to compile…tex in rmarkdown”\n\nCheck the installation of TinyTex, or install TinyTex to C: location. See the R basics page on how to install TinyTex.\n\n\n# check/install tinytex, to C: location\ntinytex::install_tinytex()\ntinytex:::is_tinytex() # should return TRUE (note three colons)\n\nInternet routines cannot be loaded\nFor example, Error in tools::startDynamicHelp() : internet routines cannot be loaded\n\nTry selecting 32-bit version from RStudio via Tools/Global Options.\n\nnote: if 32-bit version does not appear in menu, make sure you are not using RStudio v1.2.\n\n\nAlternatively, try uninstalling R and re-installing with different bit version (32 instead of 64).\n\nC: library does not appear as an option when I try to install packages manually\n\nRun RStudio as an administrator, then this option will appear.\n\nTo set-up RStudio to always run as administrator (advantageous when using an Rproject where you don’t click RStudio icon to open), right-click the Rstudio icon.\n\nThe image below shows how you can manually select the library to install a package to. This window appears when you open the Packages RStudio pane and click “Install”.\n\n\n\n\n\n\n\n\n\nPandoc 1 error\nIf you are getting “pandoc error 1” when knitting R Markdowns scripts on network drives:\n\nOf multiple library locations, have the one with a lettered drive listed first (see codes above).\n\nThe above solution worked when knitting on local drive but while on a networked internet connection.\n\nSee more tips here.\n\nPandoc Error 83\nThe error will look something like this: can't find file...rmarkdown...lua.... This means that it was unable to find this file.\nSee here.\nPossibilities:\n\nRmarkdown package is not installed.\n\nRmarkdown package is not findable.\n\nAn admin rights issue.\n\nIt is possible that R is not able to find the rmarkdown package file, so check which library the rmarkdown package lives (see code above). If the package is installed to a library that in inaccessible (e.g. 
starts with “\\\") consider manually moving it to C: or other named drive library. Be aware that the rmarkdown package has to be able to connect to TinyTex installation, so can not live in a library on a network drive.\nPandoc Error 61\nFor example: Error: pandoc document conversion failed with error 61 or Could not fetch...\n\nTry running RStudio as administrator (right click icon, select run as admin, see above instructions).\n\nAlso see if the specific package that was unable to be reached can be moved to C: library.\n\nLaTex error (see below)\nAn error like: ! Package pdftex.def Error: File 'cict_qm2_2020-06-29_files/figure-latex/unnamed-chunk-5-1.png' not found: using draft setting. or Error: LaTeX failed to compile file_name.tex.\n\nSee this article for debugging tips.\n\nSee file_name.log for more info.\n\nPandoc Error 127\nThis could be a RAM (space) issue. Re-start your R session and try again.\nMapping network drives\nMapping a network drive can be risky. Consult with your IT department before attempting this.\nA tip borrowed from this forum discussion:\nHow does one open a file “through a mapped network drive”?\n\nFirst, you’ll need to know the network location you’re trying to access.\n\nNext, in the Windows file manager, you will need to right click on “This PC” on the right hand pane, and select “Map a network drive”.\n\nGo through the dialogue to define the network location from earlier as a lettered drive.\n\nNow you have two ways to get to the file you’re opening. Using the drive-letter path should work.\n\nError in install.packages()\nIf you get an error that includes mention of a “lock” directory, for example: Error in install.packages : ERROR: failed to lock directory...\nLook in your package library and you will see a folder whose name begins with “00LOCK”. Try the following tips:\n\nManually delete the “00LOCK” folder directory from your package library. Try installing the package again.\n\nYou can also try the command pacman::p_unlock() (you can also put this command in the Rprofile so it runs every time project opens.). Then try installing the package again. It may take several tries.\n\nTry running RStudio in Administrator mode, and try installing the packages one-by-one.\n\nIf all else fails, install the package to another library or folder (e.g. Temp) and then manually copy the package’s folder over to the desired library.", "crumbs": [ "Miscellaneous", "49  R on network drives" @@ -4558,5 +4558,38 @@ "Data Visualization", "32  Epidemic curves" ] + }, + { + "objectID": "new_pages/regression.html#preparation-preparation", + "href": "new_pages/regression.html#preparation-preparation", + "title": "19  Univariate and multivariable regression", + "section": "", + "text": "Load packages\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize p_load() from pacman, which installs the package if necessary and loads it for use. You can also load installed packages with library() from base R. 
See the page on R basics for more information on R packages.\n\npacman::p_load(\n rio, # File import\n here, # File locator\n tidyverse, # data management + ggplot2 graphics, \n stringr, # manipulate text strings \n purrr, # loop over objects in a tidy way\n gtsummary, # summary statistics and tests \n broom, # tidy up results from regressions\n lmtest, # likelihood-ratio tests\n parameters, # alternative to tidy up results from regressions\n see # alternative to visualise forest plots\n )\n\n\n\nImport data\nWe import the dataset of cases from a simulated Ebola epidemic. If you want to follow along, click to download the “clean” linelist (as .rds file). Import your data with the import() function from the rio package (it accepts many file types like .xlsx, .rds, .csv - see the Import and export page for details).\n\n\nWarning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.\n\n\n\n# import the linelist\nlinelist <- import(\"linelist_cleaned.rds\")\n\nThe first 50 rows of the linelist are displayed below.\n\n\n\n\n\n\n\n\nClean data\n\nStore explanatory variables\nWe store the names of the explanatory columns as a character vector. This will be referenced later.\n\n## define variables of interest \nexplanatory_vars <- c(\"gender\", \"fever\", \"chills\", \"cough\", \"aches\", \"vomit\")\n\n\n\nConvert to 1’s and 0’s\nBelow we convert the explanatory columns from “yes”/“no”, “m”/“f”, and “dead”/“alive” to 1 / 0, to cooperate with the expectations of logistic regression models. To do this efficiently, used across() from dplyr to transform multiple columns at one time. The function we apply to each column is case_when() (also dplyr) which applies logic to convert specified values to 1’s and 0’s. See sections on across() and case_when() in the Cleaning data and core functions page.\nNote: the “.” below represents the column that is being processed by across() at that moment.\n\n## convert dichotomous variables to 0/1 \nlinelist <- linelist %>% \n mutate(across( \n .cols = all_of(c(explanatory_vars, \"outcome\")), ## for each column listed and \"outcome\"\n .fns = ~case_when( \n . %in% c(\"m\", \"yes\", \"Death\") ~ 1, ## recode male, yes and death to 1\n . %in% c(\"f\", \"no\", \"Recover\") ~ 0, ## female, no and recover to 0\n TRUE ~ NA_real_) ## otherwise set to missing\n )\n )\n\n\n\nDrop rows with missing values\nTo drop rows with missing values, can use the tidyr function drop_na(). However, we only want to do this for rows that are missing values in the columns of interest.\nThe first thing we must to is make sure our explanatory_vars vector includes the column age (age would have produced an error in the previous case_when() operation, which was only for dichotomous variables). 
Then we pipe the linelist to drop_na() to remove any rows with missing values in the outcome column or any of the explanatory_vars columns.\nBefore running the code, the number of rows in the linelist is nrow(linelist).\n\n## add in age_category to the explanatory vars \nexplanatory_vars <- c(explanatory_vars, \"age_cat\")\n\n## drop rows with missing information for variables of interest \nlinelist <- linelist %>% \n drop_na(any_of(c(\"outcome\", explanatory_vars)))\n\nThe number of rows remaining in linelist is nrow(linelist).", + "crumbs": [ + "Analysis", + "19  Univariate and multivariable regression" + ] + }, + { + "objectID": "new_pages/regression.html#model-performance", + "href": "new_pages/regression.html#model-performance", + "title": "19  Univariate and multivariable regression", + "section": "19.6 Model performance", + "text": "19.6 Model performance\nOnce you have built your regression models, you may want to assess how well the model has fit the data. There are many different approaches to do this, and many different metrics with which to assess your model fit, and how it compares with other model formulations. How you assess your model fit will depend on your model, the data, and the context in which you are conducting your work.\nWhile there are many different functions, and many different packages, to assess model fit, one package that nicely combines several different metrics and approaches into a single source is the performance package. This package allows you to assess model assumptions (such as linearity, homogeneity, highlight outliers, etc.) and check how well the model performs (Akaike Information Criterion values, R2, RMSE, etc) with a few simple functions.\nUnfortunately, we are unable to use this package with gtsummary, but it readily accepts objects generated by other packages such as stats, lmerMod and tidymodels. Here we will demonstrate its application using the function glm() for a multivariable regression. 
To do this we can use the function performance() to assess model fit, and compare_perfomrance() to compare the two models.\n\n#Load in packages\npacman::p_load(performance)\n\n#Set up regression models\nregression_one <- linelist %>%\n select(outcome, gender, fever, chills, cough) %>%\n glm(formula = outcome ~ .,\n family = binomial)\n\nregression_two <- linelist %>%\n select(outcome, days_onset_hosp, aches, vomit, age_years) %>%\n glm(formula = outcome ~ .,\n family = binomial)\n\n#Assess model fit\nperformance(regression_one)\n\n# Indices of model performance\n\nAIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP\n-----------------------------------------------------------------------------------------\n5719.746 | 5719.760 | 5751.421 | 6.342e-04 | 0.496 | 1.000 | 0.685 | -Inf | 0.508\n\nperformance(regression_two)\n\n# Indices of model performance\n\nAIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP\n-----------------------------------------------------------------------------------------\n5411.752 | 5411.767 | 5443.187 | 0.012 | 0.493 | 1.000 | 0.680 | -Inf | 0.513\n\n#Compare model fit\ncompare_performance(regression_one,\n regression_two)\n\nWhen comparing models, please note that probably not all models were fit\n from same data.\n\n\n# Comparison of Model Performance Indices\n\nName | Model | AIC (weights) | AICc (weights) | BIC (weights) | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP\n------------------------------------------------------------------------------------------------------------------------------------\nregression_one | glm | 5719.7 (<.001) | 5719.8 (<.001) | 5751.4 (<.001) | 6.342e-04 | 0.496 | 1.000 | 0.685 | -Inf | 0.508\nregression_two | glm | 5411.8 (>.999) | 5411.8 (>.999) | 5443.2 (>.999) | 0.012 | 0.493 | 1.000 | 0.680 | -Inf | 0.513\n\n\nFor further reading on the performance package, and the model tests you can carry out, see their github.", + "crumbs": [ + "Analysis", + "19  Univariate and multivariable regression" + ] + }, + { + "objectID": "new_pages/survey_analysis.html#weighted-regression", + "href": "new_pages/survey_analysis.html#weighted-regression", + "title": "26  Survey analysis", + "section": "26.10 Weighted regression", + "text": "26.10 Weighted regression\nAnother tool we can use to analyse our survey data is to use weighted regression. This allows us to carry out to account for the survey design in our regression in order to avoid biases that may be introduced from the survey process.\nTo carry out a univariate regression, we can use the packages survey for the function svyglm() and the package gtsummary which allows us to call svyglm() inside the function tbl_uvregression. To do this we first use the survey_design object created above. This is then provided to the function tbl_uvregression() as in the Univariate and multivariable regression chapter. 
We then make one key change, we change method = glm to method = survey::svyglm in order to carry out our survey weighted regression.\nHere we will be using the previously created object survey_design to predict whether the value in the column died is TRUE, using the columns malaria_treatment, bednet, and age_years.\n\nsurvey_design %>%\n tbl_uvregression( #Carry out a univariate regression, if we wanted a multivariable regression we would use tbl_\n method = survey::svyglm, #Set this to survey::svyglm to carry out our weighted regression on the survey data\n y = died, #The column we are trying to predict\n method.args = list(family = binomial), #The family, we are carrying out a logistic regression so we want the family as binomial\n include = c(malaria_treatment, #These are the columns we want to evaluate\n bednet,\n age_years),\n exponentiate = T #To transform the log odds to odds ratio for easier interpretation\n )\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\nN\nOR\n1\n95% CI\n1\np-value\n\n\n\n\nmalaria_treatment\n1,357,145\n\n\n\n\n\n\n\n\n    FALSE\n\n\n—\n—\n\n\n\n\n    TRUE\n\n\n1.12\n0.33, 3.82\n0.8\n\n\nbednet\n1,482,457\n\n\n\n\n\n\n\n\n    FALSE\n\n\n—\n—\n\n\n\n\n    TRUE\n\n\n0.50\n0.13, 1.88\n0.3\n\n\nage_years\n1,482,457\n1.01\n1.00, 1.02\n0.013\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval\n\n\n\n\n\n\n\n\nIf we wanted to carry out a multivariable regression, we would have to first use the function svyglm() and pipe (%>%) the results into the function tbl_regression. Note that we need to specify the formula.\n\nsurvey_design %>%\n svyglm(formula = died ~ malaria_treatment + \n bednet + \n age_years,\n family = binomial) %>% #The family, we are carrying out a logistic regression so we want the family as binomial\n tbl_regression( \n exponentiate = T #To transform the log odds to odds ratio for easier interpretation \n )\n\n\n\n\n\n\n\n\n\n\n\n\n\nCharacteristic\nOR\n1\n95% CI\n1\np-value\n\n\n\n\nmalaria_treatment\n\n\n\n\n\n\n\n\n    FALSE\n—\n—\n\n\n\n\n    TRUE\n1.11\n0.28, 4.40\n0.9\n\n\nbednet\n\n\n\n\n\n\n\n\n    FALSE\n—\n—\n\n\n\n\n    TRUE\n0.62\n0.12, 3.21\n0.5\n\n\nage_years\n1.01\n1.00, 1.02\n0.018\n\n\n\n1\nOR = Odds Ratio, CI = Confidence Interval", + "crumbs": [ + "Analysis", + "26  Survey analysis" + ] } ] \ No newline at end of file diff --git a/html_outputs/site_libs/leaflet-binding-2.2.2/leaflet.js b/html_outputs/site_libs/leaflet-binding-2.2.2/leaflet.js new file mode 100644 index 00000000..79dbe714 --- /dev/null +++ b/html_outputs/site_libs/leaflet-binding-2.2.2/leaflet.js @@ -0,0 +1,2787 @@ +(function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i this.effectiveLength) throw new Error("Row argument was out of bounds: " + row + " > " + this.effectiveLength); + var colIndex = -1; + + if (typeof col === "undefined") { + var rowData = {}; + this.colnames.forEach(function (name, i) { + rowData[name] = _this3.columns[i][row % _this3.columns[i].length]; + }); + return rowData; + } else if (typeof col === "string") { + colIndex = this._colIndex(col); + } else if (typeof col === "number") { + colIndex = col; + } + + if (colIndex < 0 || colIndex > this.columns.length) { + if (missingOK) return void 
0;else throw new Error("Unknown column index: " + col); + } + + return this.columns[colIndex][row % this.columns[colIndex].length]; + } + }, { + key: "nrow", + value: function nrow() { + return this.effectiveLength; + } + }]); + + return DataFrame; +}(); + +exports["default"] = DataFrame; + + +},{"./util":17}],5:[function(require,module,exports){ +"use strict"; + +var _leaflet = require("./global/leaflet"); + +var _leaflet2 = _interopRequireDefault(_leaflet); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } + +// In RMarkdown's self-contained mode, we don't have a way to carry around the +// images that Leaflet needs but doesn't load into the page. Instead, we'll use +// the unpkg CDN. +if (typeof _leaflet2["default"].Icon.Default.imagePath === "undefined") { + _leaflet2["default"].Icon.Default.imagePath = "https://unpkg.com/leaflet@1.3.1/dist/images/"; +} + + +},{"./global/leaflet":10}],6:[function(require,module,exports){ +"use strict"; + +var _leaflet = require("./global/leaflet"); + +var _leaflet2 = _interopRequireDefault(_leaflet); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } + +// add texxtsize, textOnly, and style +_leaflet2["default"].Tooltip.prototype.options.textsize = "10px"; +_leaflet2["default"].Tooltip.prototype.options.textOnly = false; +_leaflet2["default"].Tooltip.prototype.options.style = null; // copy original layout to not completely stomp it. + +var initLayoutOriginal = _leaflet2["default"].Tooltip.prototype._initLayout; + +_leaflet2["default"].Tooltip.prototype._initLayout = function () { + initLayoutOriginal.call(this); + this._container.style.fontSize = this.options.textsize; + + if (this.options.textOnly) { + _leaflet2["default"].DomUtil.addClass(this._container, "leaflet-tooltip-text-only"); + } + + if (this.options.style) { + for (var property in this.options.style) { + this._container.style[property] = this.options.style[property]; + } + } +}; + + +},{"./global/leaflet":10}],7:[function(require,module,exports){ +"use strict"; + +var _leaflet = require("./global/leaflet"); + +var _leaflet2 = _interopRequireDefault(_leaflet); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } + +var protocolRegex = /^\/\//; + +var upgrade_protocol = function upgrade_protocol(urlTemplate) { + if (protocolRegex.test(urlTemplate)) { + if (window.location.protocol === "file:") { + // if in a local file, support http + // http should auto upgrade if necessary + urlTemplate = "http:" + urlTemplate; + } + } + + return urlTemplate; +}; + +var originalLTileLayerInitialize = _leaflet2["default"].TileLayer.prototype.initialize; + +_leaflet2["default"].TileLayer.prototype.initialize = function (urlTemplate, options) { + urlTemplate = upgrade_protocol(urlTemplate); + originalLTileLayerInitialize.call(this, urlTemplate, options); +}; + +var originalLTileLayerWMSInitialize = _leaflet2["default"].TileLayer.WMS.prototype.initialize; + +_leaflet2["default"].TileLayer.WMS.prototype.initialize = function (urlTemplate, options) { + urlTemplate = upgrade_protocol(urlTemplate); + originalLTileLayerWMSInitialize.call(this, urlTemplate, options); +}; + + +},{"./global/leaflet":10}],8:[function(require,module,exports){ +(function (global){(function (){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports["default"] = global.HTMLWidgets; + + +}).call(this)}).call(this,typeof global !== "undefined" ? 
global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) +},{}],9:[function(require,module,exports){ +(function (global){(function (){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports["default"] = global.jQuery; + + +}).call(this)}).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) +},{}],10:[function(require,module,exports){ +(function (global){(function (){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports["default"] = global.L; + + +}).call(this)}).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) +},{}],11:[function(require,module,exports){ +(function (global){(function (){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports["default"] = global.L.Proj; + + +}).call(this)}).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) +},{}],12:[function(require,module,exports){ +(function (global){(function (){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports["default"] = global.Shiny; + + +}).call(this)}).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) +},{}],13:[function(require,module,exports){ +"use strict"; + +var _jquery = require("./global/jquery"); + +var _jquery2 = _interopRequireDefault(_jquery); + +var _leaflet = require("./global/leaflet"); + +var _leaflet2 = _interopRequireDefault(_leaflet); + +var _shiny = require("./global/shiny"); + +var _shiny2 = _interopRequireDefault(_shiny); + +var _htmlwidgets = require("./global/htmlwidgets"); + +var _htmlwidgets2 = _interopRequireDefault(_htmlwidgets); + +var _util = require("./util"); + +var _crs_utils = require("./crs_utils"); + +var _controlStore = require("./control-store"); + +var _controlStore2 = _interopRequireDefault(_controlStore); + +var _layerManager = require("./layer-manager"); + +var _layerManager2 = _interopRequireDefault(_layerManager); + +var _methods = require("./methods"); + +var _methods2 = _interopRequireDefault(_methods); + +require("./fixup-default-icon"); + +require("./fixup-default-tooltip"); + +require("./fixup-url-protocol"); + +var _dataframe = require("./dataframe"); + +var _dataframe2 = _interopRequireDefault(_dataframe); + +var _clusterLayerStore = require("./cluster-layer-store"); + +var _clusterLayerStore2 = _interopRequireDefault(_clusterLayerStore); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } + +window.LeafletWidget = {}; +window.LeafletWidget.utils = {}; + +var methods = window.LeafletWidget.methods = _jquery2["default"].extend({}, _methods2["default"]); + +window.LeafletWidget.DataFrame = _dataframe2["default"]; +window.LeafletWidget.ClusterLayerStore = _clusterLayerStore2["default"]; +window.LeafletWidget.utils.getCRS = _crs_utils.getCRS; // Send updated bounds back to app. Takes a leaflet event object as input. 
+ +function updateBounds(map) { + var id = map.getContainer().id; + var bounds = map.getBounds(); + + _shiny2["default"].onInputChange(id + "_bounds", { + north: bounds.getNorthEast().lat, + east: bounds.getNorthEast().lng, + south: bounds.getSouthWest().lat, + west: bounds.getSouthWest().lng + }); + + _shiny2["default"].onInputChange(id + "_center", { + lng: map.getCenter().lng, + lat: map.getCenter().lat + }); + + _shiny2["default"].onInputChange(id + "_zoom", map.getZoom()); +} + +function preventUnintendedZoomOnScroll(map) { + // Prevent unwanted scroll capturing. Similar in purpose to + // https://github.com/CliffCloud/Leaflet.Sleep but with a + // different set of heuristics. + // The basic idea is that when a mousewheel/DOMMouseScroll + // event is seen, we disable scroll wheel zooming until the + // user moves their mouse cursor or clicks on the map. This + // is slightly trickier than just listening for mousemove, + // because mousemove is fired when the page is scrolled, + // even if the user did not physically move the mouse. We + // handle this by examining the mousemove event's screenX + // and screenY properties; if they change, we know it's a + // "true" move. + // lastScreen can never be null, but its x and y can. + var lastScreen = { + x: null, + y: null + }; + (0, _jquery2["default"])(document).on("mousewheel DOMMouseScroll", "*", function (e) { + // Disable zooming (until the mouse moves or click) + map.scrollWheelZoom.disable(); // Any mousemove events at this screen position will be ignored. + + lastScreen = { + x: e.originalEvent.screenX, + y: e.originalEvent.screenY + }; + }); + (0, _jquery2["default"])(document).on("mousemove", "*", function (e) { + // Did the mouse really move? + if (map.options.scrollWheelZoom) { + if (lastScreen.x !== null && e.screenX !== lastScreen.x || e.screenY !== lastScreen.y) { + // It really moved. Enable zooming. + map.scrollWheelZoom.enable(); + lastScreen = { + x: null, + y: null + }; + } + } + }); + (0, _jquery2["default"])(document).on("mousedown", ".leaflet", function (e) { + // Clicking always enables zooming. + if (map.options.scrollWheelZoom) { + map.scrollWheelZoom.enable(); + lastScreen = { + x: null, + y: null + }; + } + }); +} + +_htmlwidgets2["default"].widget({ + name: "leaflet", + type: "output", + factory: function factory(el, width, height) { + var map = null; + return { + // we need to store our map in our returned object. + getMap: function getMap() { + return map; + }, + renderValue: function renderValue(data) { + // Create an appropriate CRS Object if specified + if (data && data.options && data.options.crs) { + data.options.crs = (0, _crs_utils.getCRS)(data.options.crs); + } // As per https://github.com/rstudio/leaflet/pull/294#discussion_r79584810 + + + if (map) { + map.remove(); + + map = function () { + return; + }(); // undefine map + + } + + if (data.options.mapFactory && typeof data.options.mapFactory === "function") { + map = data.options.mapFactory(el, data.options); + } else { + map = _leaflet2["default"].map(el, data.options); + } + + preventUnintendedZoomOnScroll(map); // Store some state in the map object + + map.leafletr = { + // Has the map ever rendered successfully? 
+ hasRendered: false, + // Data to be rendered when resize is called with area != 0 + pendingRenderData: null + }; // Check if the map is rendered statically (no output binding) + + if (_htmlwidgets2["default"].shinyMode && /\bshiny-bound-output\b/.test(el.className)) { + map.id = el.id; // Store the map on the element so we can find it later by ID + + (0, _jquery2["default"])(el).data("leaflet-map", map); // When the map is clicked, send the coordinates back to the app + + map.on("click", function (e) { + _shiny2["default"].onInputChange(map.id + "_click", { + lat: e.latlng.lat, + lng: e.latlng.lng, + ".nonce": Math.random() // Force reactivity if lat/lng hasn't changed + + }); + }); + var groupTimerId = null; + map.on("moveend", function (e) { + updateBounds(e.target); + }).on("layeradd layerremove", function (e) { + // If the layer that's coming or going is a group we created, tell + // the server. + if (map.layerManager.getGroupNameFromLayerGroup(e.layer)) { + // But to avoid chattiness, coalesce events + if (groupTimerId) { + clearTimeout(groupTimerId); + groupTimerId = null; + } + + groupTimerId = setTimeout(function () { + groupTimerId = null; + + _shiny2["default"].onInputChange(map.id + "_groups", map.layerManager.getVisibleGroups()); + }, 100); + } + }); + } + + this.doRenderValue(data, map); + }, + doRenderValue: function doRenderValue(data, map) { + // Leaflet does not behave well when you set up a bunch of layers when + // the map is not visible (width/height == 0). Popups get misaligned + // relative to their owning markers, and the fitBounds calculations + // are off. Therefore we wait until the map is actually showing to + // render the value (we rely on the resize() callback being invoked + // at the appropriate time). + if (el.offsetWidth === 0 || el.offsetHeight === 0) { + map.leafletr.pendingRenderData = data; + return; + } + + map.leafletr.pendingRenderData = null; // Merge data options into defaults + + var options = _jquery2["default"].extend({ + zoomToLimits: "always" + }, data.options); + + if (!map.layerManager) { + map.controls = new _controlStore2["default"](map); + map.layerManager = new _layerManager2["default"](map); + } else { + map.controls.clear(); + map.layerManager.clear(); + } + + var explicitView = false; + + if (data.setView) { + explicitView = true; + map.setView.apply(map, data.setView); + } + + if (data.fitBounds) { + explicitView = true; + methods.fitBounds.apply(map, data.fitBounds); + } + + if (data.flyTo) { + if (!explicitView && !map.leafletr.hasRendered) { + // must be done to give a initial starting point + map.fitWorld(); + } + + explicitView = true; + map.flyTo.apply(map, data.flyTo); + } + + if (data.flyToBounds) { + if (!explicitView && !map.leafletr.hasRendered) { + // must be done to give a initial starting point + map.fitWorld(); + } + + explicitView = true; + methods.flyToBounds.apply(map, data.flyToBounds); + } + + if (data.options.center) { + explicitView = true; + } // Returns true if the zoomToLimits option says that the map should be + // zoomed to map elements. 
+ + + function needsZoom() { + return options.zoomToLimits === "always" || options.zoomToLimits === "first" && !map.leafletr.hasRendered; + } + + if (!explicitView && needsZoom() && !map.getZoom()) { + if (data.limits && !_jquery2["default"].isEmptyObject(data.limits)) { + // Use the natural limits of what's being drawn on the map + // If the size of the bounding box is 0, leaflet gets all weird + var pad = 0.006; + + if (data.limits.lat[0] === data.limits.lat[1]) { + data.limits.lat[0] = data.limits.lat[0] - pad; + data.limits.lat[1] = data.limits.lat[1] + pad; + } + + if (data.limits.lng[0] === data.limits.lng[1]) { + data.limits.lng[0] = data.limits.lng[0] - pad; + data.limits.lng[1] = data.limits.lng[1] + pad; + } + + map.fitBounds([[data.limits.lat[0], data.limits.lng[0]], [data.limits.lat[1], data.limits.lng[1]]]); + } else { + map.fitWorld(); + } + } + + for (var i = 0; data.calls && i < data.calls.length; i++) { + var call = data.calls[i]; + if (methods[call.method]) methods[call.method].apply(map, call.args);else (0, _util.log)("Unknown method " + call.method); + } + + map.leafletr.hasRendered = true; + + if (_htmlwidgets2["default"].shinyMode) { + setTimeout(function () { + updateBounds(map); + }, 1); + } + }, + resize: function resize(width, height) { + if (map) { + map.invalidateSize(); + + if (map.leafletr.pendingRenderData) { + this.doRenderValue(map.leafletr.pendingRenderData, map); + } + } + } + }; + } +}); + +if (_htmlwidgets2["default"].shinyMode) { + _shiny2["default"].addCustomMessageHandler("leaflet-calls", function (data) { + var id = data.id; + var el = document.getElementById(id); + var map = el ? (0, _jquery2["default"])(el).data("leaflet-map") : null; + + if (!map) { + (0, _util.log)("Couldn't find map with id " + id); + return; + } // If the map has not rendered, stash the proposed `leafletProxy()` calls + // in `pendingRenderData.calls` to be run on display via `doRenderValue()`. + // This is necessary if the map has not been rendered. + // If new pendingRenderData is set via a new `leaflet()`, the previous calls will be discarded. + + + if (!map.leafletr.hasRendered) { + map.leafletr.pendingRenderData.calls = map.leafletr.pendingRenderData.calls.concat(data.calls); + return; + } + + for (var i = 0; i < data.calls.length; i++) { + var call = data.calls[i]; + var args = call.args; + + for (var _i = 0; _i < call.evals.length; _i++) { + window.HTMLWidgets.evaluateStringMember(args, call.evals[_i]); + } + + if (call.dependencies) { + _shiny2["default"].renderDependencies(call.dependencies); + } + + if (methods[call.method]) methods[call.method].apply(map, args);else (0, _util.log)("Unknown method " + call.method); + } + }); +} + + +},{"./cluster-layer-store":1,"./control-store":2,"./crs_utils":3,"./dataframe":4,"./fixup-default-icon":5,"./fixup-default-tooltip":6,"./fixup-url-protocol":7,"./global/htmlwidgets":8,"./global/jquery":9,"./global/leaflet":10,"./global/shiny":12,"./layer-manager":14,"./methods":15,"./util":17}],14:[function(require,module,exports){ +(function (global){(function (){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports["default"] = undefined; + +var _jquery = require("./global/jquery"); + +var _jquery2 = _interopRequireDefault(_jquery); + +var _leaflet = require("./global/leaflet"); + +var _leaflet2 = _interopRequireDefault(_leaflet); + +var _util = require("./util"); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? 
obj : { "default": obj }; } + +function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } + +function _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } + +function _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; } + +var LayerManager = /*#__PURE__*/function () { + function LayerManager(map) { + _classCallCheck(this, LayerManager); + + this._map = map; // BEGIN layer indices + // {: {: layer}} + + this._byGroup = {}; // {: {: layer}} + + this._byCategory = {}; // {: layer} + + this._byLayerId = {}; // {: { + // "group": , + // "layerId": , + // "category": , + // "container": + // } + // } + + this._byStamp = {}; // {: {: [, , ...], ...}} + + this._byCrosstalkGroup = {}; // END layer indices + // {: L.layerGroup} + + this._categoryContainers = {}; // {: L.layerGroup} + + this._groupContainers = {}; + } + + _createClass(LayerManager, [{ + key: "addLayer", + value: function addLayer(layer, category, layerId, group, ctGroup, ctKey) { + var _this = this; + + // Was a group provided? + var hasId = typeof layerId === "string"; + var grouped = typeof group === "string"; + var stamp = _leaflet2["default"].Util.stamp(layer) + ""; // This will be the default layer group to add the layer to. + // We may overwrite this let before using it (i.e. if a group is assigned). + // This one liner creates the _categoryContainers[category] entry if it + // doesn't already exist. + + var container = this._categoryContainers[category] = this._categoryContainers[category] || _leaflet2["default"].layerGroup().addTo(this._map); + + var oldLayer = null; + + if (hasId) { + // First, remove any layer with the same category and layerId + var prefixedLayerId = this._layerIdKey(category, layerId); + + oldLayer = this._byLayerId[prefixedLayerId]; + + if (oldLayer) { + this._removeLayer(oldLayer); + } // Update layerId index + + + this._byLayerId[prefixedLayerId] = layer; + } // Update group index + + + if (grouped) { + this._byGroup[group] = this._byGroup[group] || {}; + this._byGroup[group][stamp] = layer; // Since a group is assigned, don't add the layer to the category's layer + // group; instead, use the group's layer group. + // This one liner creates the _groupContainers[group] entry if it doesn't + // already exist. + + container = this.getLayerGroup(group, true); + } // Update category index + + + this._byCategory[category] = this._byCategory[category] || {}; + this._byCategory[category][stamp] = layer; // Update stamp index + + var layerInfo = this._byStamp[stamp] = { + layer: layer, + group: group, + ctGroup: ctGroup, + ctKey: ctKey, + layerId: layerId, + category: category, + container: container, + hidden: false + }; // Update crosstalk group index + + if (ctGroup) { + if (layer.setStyle) { + // Need to save this info so we know what to set opacity to later + layer.options.origOpacity = typeof layer.options.opacity !== "undefined" ? layer.options.opacity : 0.5; + layer.options.origFillOpacity = typeof layer.options.fillOpacity !== "undefined" ? 
layer.options.fillOpacity : 0.2; + } + + var ctg = this._byCrosstalkGroup[ctGroup]; + + if (!ctg) { + ctg = this._byCrosstalkGroup[ctGroup] = {}; + var crosstalk = global.crosstalk; + + var handleFilter = function handleFilter(e) { + if (!e.value) { + var groupKeys = Object.keys(ctg); + + for (var i = 0; i < groupKeys.length; i++) { + var key = groupKeys[i]; + var _layerInfo = _this._byStamp[ctg[key]]; + + _this._setVisibility(_layerInfo, true); + } + } else { + var selectedKeys = {}; + + for (var _i = 0; _i < e.value.length; _i++) { + selectedKeys[e.value[_i]] = true; + } + + var _groupKeys = Object.keys(ctg); + + for (var _i2 = 0; _i2 < _groupKeys.length; _i2++) { + var _key = _groupKeys[_i2]; + var _layerInfo2 = _this._byStamp[ctg[_key]]; + + _this._setVisibility(_layerInfo2, selectedKeys[_groupKeys[_i2]]); + } + } + }; + + var filterHandle = new crosstalk.FilterHandle(ctGroup); + filterHandle.on("change", handleFilter); + + var handleSelection = function handleSelection(e) { + if (!e.value || !e.value.length) { + var groupKeys = Object.keys(ctg); + + for (var i = 0; i < groupKeys.length; i++) { + var key = groupKeys[i]; + var _layerInfo3 = _this._byStamp[ctg[key]]; + + _this._setOpacity(_layerInfo3, 1.0); + } + } else { + var selectedKeys = {}; + + for (var _i3 = 0; _i3 < e.value.length; _i3++) { + selectedKeys[e.value[_i3]] = true; + } + + var _groupKeys2 = Object.keys(ctg); + + for (var _i4 = 0; _i4 < _groupKeys2.length; _i4++) { + var _key2 = _groupKeys2[_i4]; + var _layerInfo4 = _this._byStamp[ctg[_key2]]; + + _this._setOpacity(_layerInfo4, selectedKeys[_groupKeys2[_i4]] ? 1.0 : 0.2); + } + } + }; + + var selHandle = new crosstalk.SelectionHandle(ctGroup); + selHandle.on("change", handleSelection); + setTimeout(function () { + handleFilter({ + value: filterHandle.filteredKeys + }); + handleSelection({ + value: selHandle.value + }); + }, 100); + } + + if (!ctg[ctKey]) ctg[ctKey] = []; + ctg[ctKey].push(stamp); + } // Add to container + + + if (!layerInfo.hidden) container.addLayer(layer); + return oldLayer; + } + }, { + key: "brush", + value: function brush(bounds, extraInfo) { + var _this2 = this; + + /* eslint-disable no-console */ + // For each Crosstalk group... + Object.keys(this._byCrosstalkGroup).forEach(function (ctGroupName) { + var ctg = _this2._byCrosstalkGroup[ctGroupName]; + var selection = []; // ...iterate over each Crosstalk key (each of which may have multiple + // layers)... + + Object.keys(ctg).forEach(function (ctKey) { + // ...and for each layer... + ctg[ctKey].forEach(function (stamp) { + var layerInfo = _this2._byStamp[stamp]; // ...if it's something with a point... + + if (layerInfo.layer.getLatLng) { + // ... and it's inside the selection bounds... + // TODO: Use pixel containment, not lat/lng containment + if (bounds.contains(layerInfo.layer.getLatLng())) { + // ...add the key to the selection. 
+ selection.push(ctKey); + } + } + }); + }); + new global.crosstalk.SelectionHandle(ctGroupName).set(selection, extraInfo); + }); + } + }, { + key: "unbrush", + value: function unbrush(extraInfo) { + Object.keys(this._byCrosstalkGroup).forEach(function (ctGroupName) { + new global.crosstalk.SelectionHandle(ctGroupName).clear(extraInfo); + }); + } + }, { + key: "_setVisibility", + value: function _setVisibility(layerInfo, visible) { + if (layerInfo.hidden ^ visible) { + return; + } else if (visible) { + layerInfo.container.addLayer(layerInfo.layer); + layerInfo.hidden = false; + } else { + layerInfo.container.removeLayer(layerInfo.layer); + layerInfo.hidden = true; + } + } + }, { + key: "_setOpacity", + value: function _setOpacity(layerInfo, opacity) { + if (layerInfo.layer.setOpacity) { + layerInfo.layer.setOpacity(opacity); + } else if (layerInfo.layer.setStyle) { + layerInfo.layer.setStyle({ + opacity: opacity * layerInfo.layer.options.origOpacity, + fillOpacity: opacity * layerInfo.layer.options.origFillOpacity + }); + } + } + }, { + key: "getLayer", + value: function getLayer(category, layerId) { + return this._byLayerId[this._layerIdKey(category, layerId)]; + } + }, { + key: "removeLayer", + value: function removeLayer(category, layerIds) { + var _this3 = this; + + // Find layer info + _jquery2["default"].each((0, _util.asArray)(layerIds), function (i, layerId) { + var layer = _this3._byLayerId[_this3._layerIdKey(category, layerId)]; + + if (layer) { + _this3._removeLayer(layer); + } + }); + } + }, { + key: "clearLayers", + value: function clearLayers(category) { + var _this4 = this; + + // Find all layers in _byCategory[category] + var catTable = this._byCategory[category]; + + if (!catTable) { + return false; + } // Remove all layers. Make copy of keys to avoid mutating the collection + // behind the iterator you're accessing. + + + var stamps = []; + + _jquery2["default"].each(catTable, function (k, v) { + stamps.push(k); + }); + + _jquery2["default"].each(stamps, function (i, stamp) { + _this4._removeLayer(stamp); + }); + } + }, { + key: "getLayerGroup", + value: function getLayerGroup(group, ensureExists) { + var g = this._groupContainers[group]; + + if (ensureExists && !g) { + this._byGroup[group] = this._byGroup[group] || {}; + g = this._groupContainers[group] = _leaflet2["default"].featureGroup(); + g.groupname = group; + g.addTo(this._map); + } + + return g; + } + }, { + key: "getGroupNameFromLayerGroup", + value: function getGroupNameFromLayerGroup(layerGroup) { + return layerGroup.groupname; + } + }, { + key: "getVisibleGroups", + value: function getVisibleGroups() { + var _this5 = this; + + var result = []; + + _jquery2["default"].each(this._groupContainers, function (k, v) { + if (_this5._map.hasLayer(v)) { + result.push(k); + } + }); + + return result; + } + }, { + key: "getAllGroupNames", + value: function getAllGroupNames() { + var result = []; + + _jquery2["default"].each(this._groupContainers, function (k, v) { + result.push(k); + }); + + return result; + } + }, { + key: "clearGroup", + value: function clearGroup(group) { + var _this6 = this; + + // Find all layers in _byGroup[group] + var groupTable = this._byGroup[group]; + + if (!groupTable) { + return false; + } // Remove all layers. Make copy of keys to avoid mutating the collection + // behind the iterator you're accessing. 
+ + + var stamps = []; + + _jquery2["default"].each(groupTable, function (k, v) { + stamps.push(k); + }); + + _jquery2["default"].each(stamps, function (i, stamp) { + _this6._removeLayer(stamp); + }); + } + }, { + key: "clear", + value: function clear() { + function clearLayerGroup(key, layerGroup) { + layerGroup.clearLayers(); + } // Clear all indices and layerGroups + + + this._byGroup = {}; + this._byCategory = {}; + this._byLayerId = {}; + this._byStamp = {}; + this._byCrosstalkGroup = {}; + + _jquery2["default"].each(this._categoryContainers, clearLayerGroup); + + this._categoryContainers = {}; + + _jquery2["default"].each(this._groupContainers, clearLayerGroup); + + this._groupContainers = {}; + } + }, { + key: "_removeLayer", + value: function _removeLayer(layer) { + var stamp; + + if (typeof layer === "string") { + stamp = layer; + } else { + stamp = _leaflet2["default"].Util.stamp(layer); + } + + var layerInfo = this._byStamp[stamp]; + + if (!layerInfo) { + return false; + } + + layerInfo.container.removeLayer(stamp); + + if (typeof layerInfo.group === "string") { + delete this._byGroup[layerInfo.group][stamp]; + } + + if (typeof layerInfo.layerId === "string") { + delete this._byLayerId[this._layerIdKey(layerInfo.category, layerInfo.layerId)]; + } + + delete this._byCategory[layerInfo.category][stamp]; + delete this._byStamp[stamp]; + + if (layerInfo.ctGroup) { + var ctGroup = this._byCrosstalkGroup[layerInfo.ctGroup]; + var layersForKey = ctGroup[layerInfo.ctKey]; + var idx = layersForKey ? layersForKey.indexOf(stamp) : -1; + + if (idx >= 0) { + if (layersForKey.length === 1) { + delete ctGroup[layerInfo.ctKey]; + } else { + layersForKey.splice(idx, 1); + } + } + } + } + }, { + key: "_layerIdKey", + value: function _layerIdKey(category, layerId) { + return category + "\n" + layerId; + } + }]); + + return LayerManager; +}(); + +exports["default"] = LayerManager; + + +}).call(this)}).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) +},{"./global/jquery":9,"./global/leaflet":10,"./util":17}],15:[function(require,module,exports){ +(function (global){(function (){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); + +var _jquery = require("./global/jquery"); + +var _jquery2 = _interopRequireDefault(_jquery); + +var _leaflet = require("./global/leaflet"); + +var _leaflet2 = _interopRequireDefault(_leaflet); + +var _shiny = require("./global/shiny"); + +var _shiny2 = _interopRequireDefault(_shiny); + +var _htmlwidgets = require("./global/htmlwidgets"); + +var _htmlwidgets2 = _interopRequireDefault(_htmlwidgets); + +var _util = require("./util"); + +var _crs_utils = require("./crs_utils"); + +var _dataframe = require("./dataframe"); + +var _dataframe2 = _interopRequireDefault(_dataframe); + +var _clusterLayerStore = require("./cluster-layer-store"); + +var _clusterLayerStore2 = _interopRequireDefault(_clusterLayerStore); + +var _mipmapper = require("./mipmapper"); + +var _mipmapper2 = _interopRequireDefault(_mipmapper); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } + +var methods = {}; +exports["default"] = methods; + +function mouseHandler(mapId, layerId, group, eventName, extraInfo) { + return function (e) { + if (!_htmlwidgets2["default"].shinyMode) return; + var latLng = e.target.getLatLng ? 
e.target.getLatLng() : e.latlng; + + if (latLng) { + // retrieve only lat, lon values to remove prototype + // and extra parameters added by 3rd party modules + // these objects are for json serialization, not javascript + var latLngVal = _leaflet2["default"].latLng(latLng); // make sure it has consistent shape + + + latLng = { + lat: latLngVal.lat, + lng: latLngVal.lng + }; + } + + var eventInfo = _jquery2["default"].extend({ + id: layerId, + ".nonce": Math.random() // force reactivity + + }, group !== null ? { + group: group + } : null, latLng, extraInfo); + + _shiny2["default"].onInputChange(mapId + "_" + eventName, eventInfo); + }; +} + +methods.mouseHandler = mouseHandler; + +methods.clearGroup = function (group) { + var _this = this; + + _jquery2["default"].each((0, _util.asArray)(group), function (i, v) { + _this.layerManager.clearGroup(v); + }); +}; + +methods.setView = function (center, zoom, options) { + this.setView(center, zoom, options); +}; + +methods.fitBounds = function (lat1, lng1, lat2, lng2, options) { + this.fitBounds([[lat1, lng1], [lat2, lng2]], options); +}; + +methods.flyTo = function (center, zoom, options) { + this.flyTo(center, zoom, options); +}; + +methods.flyToBounds = function (lat1, lng1, lat2, lng2, options) { + this.flyToBounds([[lat1, lng1], [lat2, lng2]], options); +}; + +methods.setMaxBounds = function (lat1, lng1, lat2, lng2) { + this.setMaxBounds([[lat1, lng1], [lat2, lng2]]); +}; + +methods.addPopups = function (lat, lng, popup, layerId, group, options) { + var _this2 = this; + + var df = new _dataframe2["default"]().col("lat", lat).col("lng", lng).col("popup", popup).col("layerId", layerId).col("group", group).cbind(options); + + var _loop = function _loop(i) { + if (_jquery2["default"].isNumeric(df.get(i, "lat")) && _jquery2["default"].isNumeric(df.get(i, "lng"))) { + (function () { + var popup = _leaflet2["default"].popup(df.get(i)).setLatLng([df.get(i, "lat"), df.get(i, "lng")]).setContent(df.get(i, "popup")); + + var thisId = df.get(i, "layerId"); + var thisGroup = df.get(i, "group"); + this.layerManager.addLayer(popup, "popup", thisId, thisGroup); + }).call(_this2); + } + }; + + for (var i = 0; i < df.nrow(); i++) { + _loop(i); + } +}; + +methods.removePopup = function (layerId) { + this.layerManager.removeLayer("popup", layerId); +}; + +methods.clearPopups = function () { + this.layerManager.clearLayers("popup"); +}; + +methods.addTiles = function (urlTemplate, layerId, group, options) { + this.layerManager.addLayer(_leaflet2["default"].tileLayer(urlTemplate, options), "tile", layerId, group); +}; + +methods.removeTiles = function (layerId) { + this.layerManager.removeLayer("tile", layerId); +}; + +methods.clearTiles = function () { + this.layerManager.clearLayers("tile"); +}; + +methods.addWMSTiles = function (baseUrl, layerId, group, options) { + if (options && options.crs) { + options.crs = (0, _crs_utils.getCRS)(options.crs); + } + + this.layerManager.addLayer(_leaflet2["default"].tileLayer.wms(baseUrl, options), "tile", layerId, group); +}; // Given: +// {data: ["a", "b", "c"], index: [0, 1, 0, 2]} +// returns: +// ["a", "b", "a", "c"] + + +function unpackStrings(iconset) { + if (!iconset) { + return iconset; + } + + if (typeof iconset.index === "undefined") { + return iconset; + } + + iconset.data = (0, _util.asArray)(iconset.data); + iconset.index = (0, _util.asArray)(iconset.index); + return _jquery2["default"].map(iconset.index, function (e, i) { + return iconset.data[e]; + }); +} + +function addMarkers(map, df, group, clusterOptions, 
clusterId, markerFunc) { + (function () { + var _this3 = this; + + var clusterGroup = this.layerManager.getLayer("cluster", clusterId), + cluster = clusterOptions !== null; + + if (cluster && !clusterGroup) { + clusterGroup = _leaflet2["default"].markerClusterGroup.layerSupport(clusterOptions); + + if (clusterOptions.freezeAtZoom) { + var freezeAtZoom = clusterOptions.freezeAtZoom; + delete clusterOptions.freezeAtZoom; + clusterGroup.freezeAtZoom(freezeAtZoom); + } + + clusterGroup.clusterLayerStore = new _clusterLayerStore2["default"](clusterGroup); + } + + var extraInfo = cluster ? { + clusterId: clusterId + } : {}; + + var _loop2 = function _loop2(i) { + if (_jquery2["default"].isNumeric(df.get(i, "lat")) && _jquery2["default"].isNumeric(df.get(i, "lng"))) { + (function () { + var marker = markerFunc(df, i); + var thisId = df.get(i, "layerId"); + var thisGroup = cluster ? null : df.get(i, "group"); + + if (cluster) { + clusterGroup.clusterLayerStore.add(marker, thisId); + } else { + this.layerManager.addLayer(marker, "marker", thisId, thisGroup, df.get(i, "ctGroup", true), df.get(i, "ctKey", true)); + } + + var popup = df.get(i, "popup"); + var popupOptions = df.get(i, "popupOptions"); + + if (popup !== null) { + if (popupOptions !== null) { + marker.bindPopup(popup, popupOptions); + } else { + marker.bindPopup(popup); + } + } + + var label = df.get(i, "label"); + var labelOptions = df.get(i, "labelOptions"); + + if (label !== null) { + if (labelOptions !== null) { + if (labelOptions.permanent) { + marker.bindTooltip(label, labelOptions).openTooltip(); + } else { + marker.bindTooltip(label, labelOptions); + } + } else { + marker.bindTooltip(label); + } + } + + marker.on("click", mouseHandler(this.id, thisId, thisGroup, "marker_click", extraInfo), this); + marker.on("mouseover", mouseHandler(this.id, thisId, thisGroup, "marker_mouseover", extraInfo), this); + marker.on("mouseout", mouseHandler(this.id, thisId, thisGroup, "marker_mouseout", extraInfo), this); + marker.on("dragend", mouseHandler(this.id, thisId, thisGroup, "marker_dragend", extraInfo), this); + }).call(_this3); + } + }; + + for (var i = 0; i < df.nrow(); i++) { + _loop2(i); + } + + if (cluster) { + this.layerManager.addLayer(clusterGroup, "cluster", clusterId, group); + } + }).call(map); +} + +methods.addGenericMarkers = addMarkers; + +methods.addMarkers = function (lat, lng, icon, layerId, group, options, popup, popupOptions, clusterOptions, clusterId, label, labelOptions, crosstalkOptions) { + var icondf; + var getIcon; + + if (icon) { + // Unpack icons + icon.iconUrl = unpackStrings(icon.iconUrl); + icon.iconRetinaUrl = unpackStrings(icon.iconRetinaUrl); + icon.shadowUrl = unpackStrings(icon.shadowUrl); + icon.shadowRetinaUrl = unpackStrings(icon.shadowRetinaUrl); // This cbinds the icon URLs and any other icon options; they're all + // present on the icon object. + + icondf = new _dataframe2["default"]().cbind(icon); // Constructs an icon from a specified row of the icon dataframe. + + getIcon = function getIcon(i) { + var opts = icondf.get(i); + + if (!opts.iconUrl) { + return new _leaflet2["default"].Icon.Default(); + } // Composite options (like points or sizes) are passed from R with each + // individual component as its own option. We need to combine them now + // into their composite form. 
+ + + if (opts.iconWidth) { + opts.iconSize = [opts.iconWidth, opts.iconHeight]; + } + + if (opts.shadowWidth) { + opts.shadowSize = [opts.shadowWidth, opts.shadowHeight]; + } + + if (opts.iconAnchorX) { + opts.iconAnchor = [opts.iconAnchorX, opts.iconAnchorY]; + } + + if (opts.shadowAnchorX) { + opts.shadowAnchor = [opts.shadowAnchorX, opts.shadowAnchorY]; + } + + if (opts.popupAnchorX) { + opts.popupAnchor = [opts.popupAnchorX, opts.popupAnchorY]; + } + + return new _leaflet2["default"].Icon(opts); + }; + } + + if (!(_jquery2["default"].isEmptyObject(lat) || _jquery2["default"].isEmptyObject(lng)) || _jquery2["default"].isNumeric(lat) && _jquery2["default"].isNumeric(lng)) { + var df = new _dataframe2["default"]().col("lat", lat).col("lng", lng).col("layerId", layerId).col("group", group).col("popup", popup).col("popupOptions", popupOptions).col("label", label).col("labelOptions", labelOptions).cbind(options).cbind(crosstalkOptions || {}); + if (icon) icondf.effectiveLength = df.nrow(); + addMarkers(this, df, group, clusterOptions, clusterId, function (df, i) { + var options = df.get(i); + if (icon) options.icon = getIcon(i); + return _leaflet2["default"].marker([df.get(i, "lat"), df.get(i, "lng")], options); + }); + } +}; + +methods.addAwesomeMarkers = function (lat, lng, icon, layerId, group, options, popup, popupOptions, clusterOptions, clusterId, label, labelOptions, crosstalkOptions) { + var icondf; + var getIcon; + + if (icon) { + // This cbinds the icon URLs and any other icon options; they're all + // present on the icon object. + icondf = new _dataframe2["default"]().cbind(icon); // Constructs an icon from a specified row of the icon dataframe. + + getIcon = function getIcon(i) { + var opts = icondf.get(i); + + if (!opts) { + return new _leaflet2["default"].AwesomeMarkers.icon(); + } + + if (opts.squareMarker) { + opts.className = "awesome-marker awesome-marker-square"; + } + + return new _leaflet2["default"].AwesomeMarkers.icon(opts); + }; + } + + if (!(_jquery2["default"].isEmptyObject(lat) || _jquery2["default"].isEmptyObject(lng)) || _jquery2["default"].isNumeric(lat) && _jquery2["default"].isNumeric(lng)) { + var df = new _dataframe2["default"]().col("lat", lat).col("lng", lng).col("layerId", layerId).col("group", group).col("popup", popup).col("popupOptions", popupOptions).col("label", label).col("labelOptions", labelOptions).cbind(options).cbind(crosstalkOptions || {}); + if (icon) icondf.effectiveLength = df.nrow(); + addMarkers(this, df, group, clusterOptions, clusterId, function (df, i) { + var options = df.get(i); + if (icon) options.icon = getIcon(i); + return _leaflet2["default"].marker([df.get(i, "lat"), df.get(i, "lng")], options); + }); + } +}; + +function addLayers(map, category, df, layerFunc) { + var _loop3 = function _loop3(i) { + (function () { + var layer = layerFunc(df, i); + + if (!_jquery2["default"].isEmptyObject(layer)) { + var thisId = df.get(i, "layerId"); + var thisGroup = df.get(i, "group"); + this.layerManager.addLayer(layer, category, thisId, thisGroup, df.get(i, "ctGroup", true), df.get(i, "ctKey", true)); + + if (layer.bindPopup) { + var popup = df.get(i, "popup"); + var popupOptions = df.get(i, "popupOptions"); + + if (popup !== null) { + if (popupOptions !== null) { + layer.bindPopup(popup, popupOptions); + } else { + layer.bindPopup(popup); + } + } + } + + if (layer.bindTooltip) { + var label = df.get(i, "label"); + var labelOptions = df.get(i, "labelOptions"); + + if (label !== null) { + if (labelOptions !== null) { + 
layer.bindTooltip(label, labelOptions); + } else { + layer.bindTooltip(label); + } + } + } + + layer.on("click", mouseHandler(this.id, thisId, thisGroup, category + "_click"), this); + layer.on("mouseover", mouseHandler(this.id, thisId, thisGroup, category + "_mouseover"), this); + layer.on("mouseout", mouseHandler(this.id, thisId, thisGroup, category + "_mouseout"), this); + var highlightStyle = df.get(i, "highlightOptions"); + + if (!_jquery2["default"].isEmptyObject(highlightStyle)) { + var defaultStyle = {}; + + _jquery2["default"].each(highlightStyle, function (k, v) { + if (k != "bringToFront" && k != "sendToBack") { + if (df.get(i, k)) { + defaultStyle[k] = df.get(i, k); + } + } + }); + + layer.on("mouseover", function (e) { + this.setStyle(highlightStyle); + + if (highlightStyle.bringToFront) { + this.bringToFront(); + } + }); + layer.on("mouseout", function (e) { + this.setStyle(defaultStyle); + + if (highlightStyle.sendToBack) { + this.bringToBack(); + } + }); + } + } + }).call(map); + }; + + for (var i = 0; i < df.nrow(); i++) { + _loop3(i); + } +} + +methods.addGenericLayers = addLayers; + +methods.addCircles = function (lat, lng, radius, layerId, group, options, popup, popupOptions, label, labelOptions, highlightOptions, crosstalkOptions) { + if (!(_jquery2["default"].isEmptyObject(lat) || _jquery2["default"].isEmptyObject(lng)) || _jquery2["default"].isNumeric(lat) && _jquery2["default"].isNumeric(lng)) { + var df = new _dataframe2["default"]().col("lat", lat).col("lng", lng).col("radius", radius).col("layerId", layerId).col("group", group).col("popup", popup).col("popupOptions", popupOptions).col("label", label).col("labelOptions", labelOptions).col("highlightOptions", highlightOptions).cbind(options).cbind(crosstalkOptions || {}); + addLayers(this, "shape", df, function (df, i) { + if (_jquery2["default"].isNumeric(df.get(i, "lat")) && _jquery2["default"].isNumeric(df.get(i, "lng")) && _jquery2["default"].isNumeric(df.get(i, "radius"))) { + return _leaflet2["default"].circle([df.get(i, "lat"), df.get(i, "lng")], df.get(i, "radius"), df.get(i)); + } else { + return null; + } + }); + } +}; + +methods.addCircleMarkers = function (lat, lng, radius, layerId, group, options, clusterOptions, clusterId, popup, popupOptions, label, labelOptions, crosstalkOptions) { + if (!(_jquery2["default"].isEmptyObject(lat) || _jquery2["default"].isEmptyObject(lng)) || _jquery2["default"].isNumeric(lat) && _jquery2["default"].isNumeric(lng)) { + var df = new _dataframe2["default"]().col("lat", lat).col("lng", lng).col("radius", radius).col("layerId", layerId).col("group", group).col("popup", popup).col("popupOptions", popupOptions).col("label", label).col("labelOptions", labelOptions).cbind(crosstalkOptions || {}).cbind(options); + addMarkers(this, df, group, clusterOptions, clusterId, function (df, i) { + return _leaflet2["default"].circleMarker([df.get(i, "lat"), df.get(i, "lng")], df.get(i)); + }); + } +}; +/* + * @param lat Array of arrays of latitude coordinates for polylines + * @param lng Array of arrays of longitude coordinates for polylines + */ + + +methods.addPolylines = function (polygons, layerId, group, options, popup, popupOptions, label, labelOptions, highlightOptions) { + if (polygons.length > 0) { + var df = new _dataframe2["default"]().col("shapes", polygons).col("layerId", layerId).col("group", group).col("popup", popup).col("popupOptions", popupOptions).col("label", label).col("labelOptions", labelOptions).col("highlightOptions", highlightOptions).cbind(options); + 
addLayers(this, "shape", df, function (df, i) { + var shapes = df.get(i, "shapes"); + shapes = shapes.map(function (shape) { + return _htmlwidgets2["default"].dataframeToD3(shape[0]); + }); + + if (shapes.length > 1) { + return _leaflet2["default"].polyline(shapes, df.get(i)); + } else { + return _leaflet2["default"].polyline(shapes[0], df.get(i)); + } + }); + } +}; + +methods.removeMarker = function (layerId) { + this.layerManager.removeLayer("marker", layerId); +}; + +methods.clearMarkers = function () { + this.layerManager.clearLayers("marker"); +}; + +methods.removeMarkerCluster = function (layerId) { + this.layerManager.removeLayer("cluster", layerId); +}; + +methods.removeMarkerFromCluster = function (layerId, clusterId) { + var cluster = this.layerManager.getLayer("cluster", clusterId); + if (!cluster) return; + cluster.clusterLayerStore.remove(layerId); +}; + +methods.clearMarkerClusters = function () { + this.layerManager.clearLayers("cluster"); +}; + +methods.removeShape = function (layerId) { + this.layerManager.removeLayer("shape", layerId); +}; + +methods.clearShapes = function () { + this.layerManager.clearLayers("shape"); +}; + +methods.addRectangles = function (lat1, lng1, lat2, lng2, layerId, group, options, popup, popupOptions, label, labelOptions, highlightOptions) { + var df = new _dataframe2["default"]().col("lat1", lat1).col("lng1", lng1).col("lat2", lat2).col("lng2", lng2).col("layerId", layerId).col("group", group).col("popup", popup).col("popupOptions", popupOptions).col("label", label).col("labelOptions", labelOptions).col("highlightOptions", highlightOptions).cbind(options); + addLayers(this, "shape", df, function (df, i) { + if (_jquery2["default"].isNumeric(df.get(i, "lat1")) && _jquery2["default"].isNumeric(df.get(i, "lng1")) && _jquery2["default"].isNumeric(df.get(i, "lat2")) && _jquery2["default"].isNumeric(df.get(i, "lng2"))) { + return _leaflet2["default"].rectangle([[df.get(i, "lat1"), df.get(i, "lng1")], [df.get(i, "lat2"), df.get(i, "lng2")]], df.get(i)); + } else { + return null; + } + }); +}; +/* + * @param lat Array of arrays of latitude coordinates for polygons + * @param lng Array of arrays of longitude coordinates for polygons + */ + + +methods.addPolygons = function (polygons, layerId, group, options, popup, popupOptions, label, labelOptions, highlightOptions) { + if (polygons.length > 0) { + var df = new _dataframe2["default"]().col("shapes", polygons).col("layerId", layerId).col("group", group).col("popup", popup).col("popupOptions", popupOptions).col("label", label).col("labelOptions", labelOptions).col("highlightOptions", highlightOptions).cbind(options); + addLayers(this, "shape", df, function (df, i) { + // This code used to use L.multiPolygon, but that caused + // double-click on a multipolygon to fail to zoom in on the + // map. Surprisingly, putting all the rings in a single + // polygon seems to still work; complicated multipolygons + // are still rendered correctly. 
+ var shapes = df.get(i, "shapes").map(function (polygon) { + return polygon.map(_htmlwidgets2["default"].dataframeToD3); + }).reduce(function (acc, val) { + return acc.concat(val); + }, []); + return _leaflet2["default"].polygon(shapes, df.get(i)); + }); + } +}; + +methods.addGeoJSON = function (data, layerId, group, style) { + // This time, self is actually needed because the callbacks below need + // to access both the inner and outer senses of "this" + var self = this; + + if (typeof data === "string") { + data = JSON.parse(data); + } + + var globalStyle = _jquery2["default"].extend({}, style, data.style || {}); + + var gjlayer = _leaflet2["default"].geoJson(data, { + style: function style(feature) { + if (feature.style || feature.properties.style) { + return _jquery2["default"].extend({}, globalStyle, feature.style, feature.properties.style); + } else { + return globalStyle; + } + }, + onEachFeature: function onEachFeature(feature, layer) { + var extraInfo = { + featureId: feature.id, + properties: feature.properties + }; + var popup = feature.properties ? feature.properties.popup : null; + if (typeof popup !== "undefined" && popup !== null) layer.bindPopup(popup); + layer.on("click", mouseHandler(self.id, layerId, group, "geojson_click", extraInfo), this); + layer.on("mouseover", mouseHandler(self.id, layerId, group, "geojson_mouseover", extraInfo), this); + layer.on("mouseout", mouseHandler(self.id, layerId, group, "geojson_mouseout", extraInfo), this); + } + }); + + this.layerManager.addLayer(gjlayer, "geojson", layerId, group); +}; + +methods.removeGeoJSON = function (layerId) { + this.layerManager.removeLayer("geojson", layerId); +}; + +methods.clearGeoJSON = function () { + this.layerManager.clearLayers("geojson"); +}; + +methods.addTopoJSON = function (data, layerId, group, style) { + // This time, self is actually needed because the callbacks below need + // to access both the inner and outer senses of "this" + var self = this; + + if (typeof data === "string") { + data = JSON.parse(data); + } + + var globalStyle = _jquery2["default"].extend({}, style, data.style || {}); + + var gjlayer = _leaflet2["default"].geoJson(null, { + style: function style(feature) { + if (feature.style || feature.properties.style) { + return _jquery2["default"].extend({}, globalStyle, feature.style, feature.properties.style); + } else { + return globalStyle; + } + }, + onEachFeature: function onEachFeature(feature, layer) { + var extraInfo = { + featureId: feature.id, + properties: feature.properties + }; + var popup = feature.properties.popup; + if (typeof popup !== "undefined" && popup !== null) layer.bindPopup(popup); + layer.on("click", mouseHandler(self.id, layerId, group, "topojson_click", extraInfo), this); + layer.on("mouseover", mouseHandler(self.id, layerId, group, "topojson_mouseover", extraInfo), this); + layer.on("mouseout", mouseHandler(self.id, layerId, group, "topojson_mouseout", extraInfo), this); + } + }); + + global.omnivore.topojson.parse(data, null, gjlayer); + this.layerManager.addLayer(gjlayer, "topojson", layerId, group); +}; + +methods.removeTopoJSON = function (layerId) { + this.layerManager.removeLayer("topojson", layerId); +}; + +methods.clearTopoJSON = function () { + this.layerManager.clearLayers("topojson"); +}; + +methods.addControl = function (html, position, layerId, classes) { + function onAdd(map) { + var div = _leaflet2["default"].DomUtil.create("div", classes); + + if (typeof layerId !== "undefined" && layerId !== null) { + div.setAttribute("id", layerId); + } + 
+ this._div = div; // It's possible for window.Shiny to be true but Shiny.initializeInputs to + // not be, when a static leaflet widget is included as part of the shiny + // UI directly (not through leafletOutput or uiOutput). In this case we + // don't do the normal Shiny stuff as that will all happen when Shiny + // itself loads and binds the entire doc. + + if (window.Shiny && _shiny2["default"].initializeInputs) { + _shiny2["default"].renderHtml(html, this._div); + + _shiny2["default"].initializeInputs(this._div); + + _shiny2["default"].bindAll(this._div); + } else { + this._div.innerHTML = html; + } + + return this._div; + } + + function onRemove(map) { + if (window.Shiny && _shiny2["default"].unbindAll) { + _shiny2["default"].unbindAll(this._div); + } + } + + var Control = _leaflet2["default"].Control.extend({ + options: { + position: position + }, + onAdd: onAdd, + onRemove: onRemove + }); + + this.controls.add(new Control(), layerId, html); +}; + +methods.addCustomControl = function (control, layerId) { + this.controls.add(control, layerId); +}; + +methods.removeControl = function (layerId) { + this.controls.remove(layerId); +}; + +methods.getControl = function (layerId) { + this.controls.get(layerId); +}; + +methods.clearControls = function () { + this.controls.clear(); +}; + +methods.addLegend = function (options) { + var legend = _leaflet2["default"].control({ + position: options.position + }); + + var gradSpan; + + legend.onAdd = function (map) { + var div = _leaflet2["default"].DomUtil.create("div", options.className), + colors = options.colors, + labels = options.labels, + legendHTML = ""; + + if (options.type === "numeric") { + // # Formatting constants. + var singleBinHeight = 20; // The distance between tick marks, in px + + var vMargin = 8; // If 1st tick mark starts at top of gradient, how + // many extra px are needed for the top half of the + // 1st label? (ditto for last tick mark/label) + + var tickWidth = 4; // How wide should tick marks be, in px? + + var labelPadding = 6; // How much distance to reserve for tick mark? + // (Must be >= tickWidth) + // # Derived formatting parameters. + // What's the height of a single bin, in percentage (of gradient height)? + // It might not just be 1/(n-1), if the gradient extends past the tick + // marks (which can be the case for pretty cut points). + + var singleBinPct = (options.extra.p_n - options.extra.p_1) / (labels.length - 1); // Each bin is `singleBinHeight` high. How tall is the gradient? + + var totalHeight = 1 / singleBinPct * singleBinHeight + 1; // How far should the first tick be shifted down, relative to the top + // of the gradient? + + var tickOffset = singleBinHeight / singleBinPct * options.extra.p_1; + gradSpan = (0, _jquery2["default"])("").css({ + "background": "linear-gradient(" + colors + ")", + "opacity": options.opacity, + "height": totalHeight + "px", + "width": "18px", + "display": "block", + "margin-top": vMargin + "px" + }); + var leftDiv = (0, _jquery2["default"])("
    ").css("float", "left"), + rightDiv = (0, _jquery2["default"])("
    ").css("float", "left"); + leftDiv.append(gradSpan); + (0, _jquery2["default"])(div).append(leftDiv).append(rightDiv).append((0, _jquery2["default"])("
    ")); // Have to attach the div to the body at this early point, so that the + // svg text getComputedTextLength() actually works, below. + + document.body.appendChild(div); + var ns = "http://www.w3.org/2000/svg"; + var svg = document.createElementNS(ns, "svg"); + rightDiv.append(svg); + var g = document.createElementNS(ns, "g"); + (0, _jquery2["default"])(g).attr("transform", "translate(0, " + vMargin + ")"); + svg.appendChild(g); // max label width needed to set width of svg, and right-justify text + + var maxLblWidth = 0; // Create tick marks and labels + + _jquery2["default"].each(labels, function (i, label) { + var y = tickOffset + i * singleBinHeight + 0.5; + var thisLabel = document.createElementNS(ns, "text"); + (0, _jquery2["default"])(thisLabel).text(labels[i]).attr("y", y).attr("dx", labelPadding).attr("dy", "0.5ex"); + g.appendChild(thisLabel); + maxLblWidth = Math.max(maxLblWidth, thisLabel.getComputedTextLength()); + var thisTick = document.createElementNS(ns, "line"); + (0, _jquery2["default"])(thisTick).attr("x1", 0).attr("x2", tickWidth).attr("y1", y).attr("y2", y).attr("stroke-width", 1); + g.appendChild(thisTick); + }); // Now that we know the max label width, we can right-justify + + + (0, _jquery2["default"])(svg).find("text").attr("dx", labelPadding + maxLblWidth).attr("text-anchor", "end"); // Final size for + + (0, _jquery2["default"])(svg).css({ + width: maxLblWidth + labelPadding + "px", + height: totalHeight + vMargin * 2 + "px" + }); + + if (options.na_color && _jquery2["default"].inArray(options.na_label, labels) < 0) { + (0, _jquery2["default"])(div).append("
    " + options.na_label + "
    "); + } + } else { + if (options.na_color && _jquery2["default"].inArray(options.na_label, labels) < 0) { + colors.push(options.na_color); + labels.push(options.na_label); + } + + for (var i = 0; i < colors.length; i++) { + legendHTML += " " + labels[i] + "
    "; + } + + div.innerHTML = legendHTML; + } + + if (options.title) (0, _jquery2["default"])(div).prepend("
    " + options.title + "
    "); + return div; + }; + + if (options.group) { + // Auto generate a layerID if not provided + if (!options.layerId) { + options.layerId = _leaflet2["default"].Util.stamp(legend); + } + + var map = this; + map.on("overlayadd", function (e) { + if (e.name === options.group) { + map.controls.add(legend, options.layerId); + } + }); + map.on("overlayremove", function (e) { + if (e.name === options.group) { + map.controls.remove(options.layerId); + } + }); + map.on("groupadd", function (e) { + if (e.name === options.group) { + map.controls.add(legend, options.layerId); + } + }); + map.on("groupremove", function (e) { + if (e.name === options.group) { + map.controls.remove(options.layerId); + } + }); + } + + this.controls.add(legend, options.layerId); +}; + +methods.addLayersControl = function (baseGroups, overlayGroups, options) { + var _this4 = this; + + // Only allow one layers control at a time + methods.removeLayersControl.call(this); + var firstLayer = true; + var base = {}; + + _jquery2["default"].each((0, _util.asArray)(baseGroups), function (i, g) { + var layer = _this4.layerManager.getLayerGroup(g, true); + + if (layer) { + base[g] = layer; // Check if >1 base layers are visible; if so, hide all but the first one + + if (_this4.hasLayer(layer)) { + if (firstLayer) { + firstLayer = false; + } else { + _this4.removeLayer(layer); + } + } + } + }); + + var overlay = {}; + + _jquery2["default"].each((0, _util.asArray)(overlayGroups), function (i, g) { + var layer = _this4.layerManager.getLayerGroup(g, true); + + if (layer) { + overlay[g] = layer; + } + }); + + this.currentLayersControl = _leaflet2["default"].control.layers(base, overlay, options); + this.addControl(this.currentLayersControl); +}; + +methods.removeLayersControl = function () { + if (this.currentLayersControl) { + this.removeControl(this.currentLayersControl); + this.currentLayersControl = null; + } +}; + +methods.addScaleBar = function (options) { + // Only allow one scale bar at a time + methods.removeScaleBar.call(this); + + var scaleBar = _leaflet2["default"].control.scale(options).addTo(this); + + this.currentScaleBar = scaleBar; +}; + +methods.removeScaleBar = function () { + if (this.currentScaleBar) { + this.currentScaleBar.remove(); + this.currentScaleBar = null; + } +}; + +methods.hideGroup = function (group) { + var _this5 = this; + + _jquery2["default"].each((0, _util.asArray)(group), function (i, g) { + var layer = _this5.layerManager.getLayerGroup(g, true); + + if (layer) { + _this5.removeLayer(layer); + } + }); +}; + +methods.showGroup = function (group) { + var _this6 = this; + + _jquery2["default"].each((0, _util.asArray)(group), function (i, g) { + var layer = _this6.layerManager.getLayerGroup(g, true); + + if (layer) { + _this6.addLayer(layer); + } + }); +}; + +function setupShowHideGroupsOnZoom(map) { + if (map.leafletr._hasInitializedShowHideGroups) { + return; + } + + map.leafletr._hasInitializedShowHideGroups = true; + + function setVisibility(layer, visible, group) { + if (visible !== map.hasLayer(layer)) { + if (visible) { + map.addLayer(layer); + map.fire("groupadd", { + "name": group, + "layer": layer + }); + } else { + map.removeLayer(layer); + map.fire("groupremove", { + "name": group, + "layer": layer + }); + } + } + } + + function showHideGroupsOnZoom() { + if (!map.layerManager) return; + var zoom = map.getZoom(); + map.layerManager.getAllGroupNames().forEach(function (group) { + var layer = map.layerManager.getLayerGroup(group, false); + + if (layer && typeof layer.zoomLevels !== 
"undefined") { + setVisibility(layer, layer.zoomLevels === true || layer.zoomLevels.indexOf(zoom) >= 0, group); + } + }); + } + + map.showHideGroupsOnZoom = showHideGroupsOnZoom; + map.on("zoomend", showHideGroupsOnZoom); +} + +methods.setGroupOptions = function (group, options) { + var _this7 = this; + + _jquery2["default"].each((0, _util.asArray)(group), function (i, g) { + var layer = _this7.layerManager.getLayerGroup(g, true); // This slightly tortured check is because 0 is a valid value for zoomLevels + + + if (typeof options.zoomLevels !== "undefined" && options.zoomLevels !== null) { + layer.zoomLevels = (0, _util.asArray)(options.zoomLevels); + } + }); + + setupShowHideGroupsOnZoom(this); + this.showHideGroupsOnZoom(); +}; + +methods.addRasterImage = function (uri, bounds, layerId, group, options) { + // uri is a data URI containing an image. We want to paint this image as a + // layer at (top-left) bounds[0] to (bottom-right) bounds[1]. + // We can't simply use ImageOverlay, as it uses bilinear scaling which looks + // awful as you zoom in (and sometimes shifts positions or disappears). + // Instead, we'll use a TileLayer.Canvas to draw pieces of the image. + // First, some helper functions. + // degree2tile converts latitude, longitude, and zoom to x and y tile + // numbers. The tile numbers returned can be non-integral, as there's no + // reason to expect that the lat/lng inputs are exactly on the border of two + // tiles. + // + // We'll use this to convert the bounds we got from the server, into coords + // in tile-space at a given zoom level. Note that once we do the conversion, + // we don't to do any more trigonometry to convert between pixel coordinates + // and tile coordinates; the source image pixel coords, destination canvas + // pixel coords, and tile coords all can be scaled linearly. + function degree2tile(lat, lng, zoom) { + // See http://wiki.openstreetmap.org/wiki/Slippy_map_tilenames + var latRad = lat * Math.PI / 180; + var n = Math.pow(2, zoom); + var x = (lng + 180) / 360 * n; + var y = (1 - Math.log(Math.tan(latRad) + 1 / Math.cos(latRad)) / Math.PI) / 2 * n; + return { + x: x, + y: y + }; + } // Given a range [from,to) and either one or two numbers, returns true if + // there is any overlap between [x,x1) and the range--or if x1 is omitted, + // then returns true if x is within [from,to). + + + function overlap(from, to, x, + /* optional */ + x1) { + if (arguments.length == 3) x1 = x; + return x < to && x1 >= from; + } + + function getCanvasSmoothingProperty(ctx) { + var candidates = ["imageSmoothingEnabled", "mozImageSmoothingEnabled", "webkitImageSmoothingEnabled", "msImageSmoothingEnabled"]; + + for (var i = 0; i < candidates.length; i++) { + if (typeof ctx[candidates[i]] !== "undefined") { + return candidates[i]; + } + } + + return null; + } // Our general strategy is to: + // 1. Load the data URI in an Image() object, so we can get its pixel + // dimensions and the underlying image data. (We could have done this + // by not encoding as PNG at all but just send an array of RGBA values + // from the server, but that would inflate the JSON too much.) + // 2. Create a hidden canvas that we use just to extract the image data + // from the Image (using Context2D.getImageData()). + // 3. Create a TileLayer.Canvas and add it to the map. + // We want to synchronously create and attach the TileLayer.Canvas (so an + // immediate call to clearRasters() will be respected, for example), but + // Image loads its data asynchronously. 
Fortunately we can resolve this + // by putting TileLayer.Canvas into async mode, which will let us create + // and attach the layer but have it wait until the image is loaded before + // it actually draws anything. + // These are the variables that we will populate once the image is loaded. + + + var imgData = null; // 1d row-major array, four [0-255] integers per pixel + + var imgDataMipMapper = null; + var w = null; // image width in pixels + + var h = null; // image height in pixels + // We'll use this array to store callbacks that need to be invoked once + // imgData, w, and h have been resolved. + + var imgDataCallbacks = []; // Consumers of imgData, w, and h can call this to be notified when data + // is available. + + function getImageData(callback) { + if (imgData != null) { + // Must not invoke the callback immediately; it's too confusing and + // fragile to have a function invoke the callback *either* immediately + // or in the future. Better to be consistent here. + setTimeout(function () { + callback(imgData, w, h, imgDataMipMapper); + }, 0); + } else { + imgDataCallbacks.push(callback); + } + } + + var img = new Image(); + + img.onload = function () { + // Save size + w = img.width; + h = img.height; // Create a dummy canvas to extract the image data + + var imgDataCanvas = document.createElement("canvas"); + imgDataCanvas.width = w; + imgDataCanvas.height = h; + imgDataCanvas.style.display = "none"; + document.body.appendChild(imgDataCanvas); + var imgDataCtx = imgDataCanvas.getContext("2d"); + imgDataCtx.drawImage(img, 0, 0); // Save the image data. + + imgData = imgDataCtx.getImageData(0, 0, w, h).data; + imgDataMipMapper = new _mipmapper2["default"](img); // Done with the canvas, remove it from the page so it can be gc'd. + + document.body.removeChild(imgDataCanvas); // Alert any getImageData callers who are waiting. + + for (var i = 0; i < imgDataCallbacks.length; i++) { + imgDataCallbacks[i](imgData, w, h, imgDataMipMapper); + } + + imgDataCallbacks = []; + }; + + img.src = uri; + + var canvasTiles = _leaflet2["default"].gridLayer(Object.assign({}, options, { + detectRetina: true, + async: true + })); // NOTE: The done() function MUST NOT be invoked until after the current + // tick; done() looks in Leaflet's tile cache for the current tile, and + // since it's still being constructed, it won't be found. + + + canvasTiles.createTile = function (tilePoint, done) { + var zoom = tilePoint.z; + + var canvas = _leaflet2["default"].DomUtil.create("canvas"); + + var error; // setup tile width and height according to the options + + var size = this.getTileSize(); + canvas.width = size.x; + canvas.height = size.y; + getImageData(function (imgData, w, h, mipmapper) { + try { + // The Context2D we'll being drawing onto. It's always 256x256. + var ctx = canvas.getContext("2d"); // Convert our image data's top-left and bottom-right locations into + // x/y tile coordinates. This is essentially doing a spherical mercator + // projection, then multiplying by 2^zoom. + + var topLeft = degree2tile(bounds[0][0], bounds[0][1], zoom); + var bottomRight = degree2tile(bounds[1][0], bounds[1][1], zoom); // The size of the image in x/y tile coordinates. + + var extent = { + x: bottomRight.x - topLeft.x, + y: bottomRight.y - topLeft.y + }; // Short circuit if tile is totally disjoint from image. 
+ + if (!overlap(tilePoint.x, tilePoint.x + 1, topLeft.x, bottomRight.x)) return; + if (!overlap(tilePoint.y, tilePoint.y + 1, topLeft.y, bottomRight.y)) return; // The linear resolution of the tile we're drawing is always 256px per tile unit. + // If the linear resolution (in either direction) of the image is less than 256px + // per tile unit, then use nearest neighbor; otherwise, use the canvas's built-in + // scaling. + + var imgRes = { + x: w / extent.x, + y: h / extent.y + }; // We can do the actual drawing in one of three ways: + // - Call drawImage(). This is easy and fast, and results in smooth + // interpolation (bilinear?). This is what we want when we are + // reducing the image from its native size. + // - Call drawImage() with imageSmoothingEnabled=false. This is easy + // and fast and gives us nearest-neighbor interpolation, which is what + // we want when enlarging the image. However, it's unsupported on many + // browsers (including QtWebkit). + // - Do a manual nearest-neighbor interpolation. This is what we'll fall + // back to when enlarging, and imageSmoothingEnabled isn't supported. + // In theory it's slower, but still pretty fast on my machine, and the + // results look the same AFAICT. + // Is imageSmoothingEnabled supported? If so, we can let canvas do + // nearest-neighbor interpolation for us. + + var smoothingProperty = getCanvasSmoothingProperty(ctx); + + if (smoothingProperty || imgRes.x >= 256 && imgRes.y >= 256) { + // Use built-in scaling + // Turn off anti-aliasing if necessary + if (smoothingProperty) { + ctx[smoothingProperty] = imgRes.x >= 256 && imgRes.y >= 256; + } // Don't necessarily draw with the full-size image; if we're + // downscaling, use the mipmapper to get a pre-downscaled image + // (see comments on Mipmapper class for why this matters). + + + mipmapper.getBySize(extent.x * 256, extent.y * 256, function (mip) { + // It's possible that the image will go off the edge of the canvas-- + // that's OK, the canvas should clip appropriately. + ctx.drawImage(mip, // Convert abs tile coords to rel tile coords, then *256 to convert + // to rel pixel coords + (topLeft.x - tilePoint.x) * 256, (topLeft.y - tilePoint.y) * 256, // Always draw the whole thing and let canvas clip; so we can just + // convert from size in tile coords straight to pixels + extent.x * 256, extent.y * 256); + }); + } else { + // Use manual nearest-neighbor interpolation + // Calculate the source image pixel coordinates that correspond with + // the top-left and bottom-right of this tile. (If the source image + // only partially overlaps the tile, we use max/min to limit the + // sourceStart/End to only reflect the overlapping portion.) + var sourceStart = { + x: Math.max(0, Math.floor((tilePoint.x - topLeft.x) * imgRes.x)), + y: Math.max(0, Math.floor((tilePoint.y - topLeft.y) * imgRes.y)) + }; + var sourceEnd = { + x: Math.min(w, Math.ceil((tilePoint.x + 1 - topLeft.x) * imgRes.x)), + y: Math.min(h, Math.ceil((tilePoint.y + 1 - topLeft.y) * imgRes.y)) + }; // The size, in dest pixels, that each source pixel should occupy. + // This might be greater or less than 1 (e.g. if x and y resolution + // are very different). + + var pixelSize = { + x: 256 / imgRes.x, + y: 256 / imgRes.y + }; // For each pixel in the source image that overlaps the tile... + + for (var row = sourceStart.y; row < sourceEnd.y; row++) { + for (var col = sourceStart.x; col < sourceEnd.x; col++) { + // ...extract the pixel data... 
+ var i = (row * w + col) * 4; + var r = imgData[i]; + var g = imgData[i + 1]; + var b = imgData[i + 2]; + var a = imgData[i + 3]; + ctx.fillStyle = "rgba(" + [r, g, b, a / 255].join(",") + ")"; // ...calculate the corresponding pixel coord in the dest image + // where it should be drawn... + + var pixelPos = { + x: (col / imgRes.x + topLeft.x - tilePoint.x) * 256, + y: (row / imgRes.y + topLeft.y - tilePoint.y) * 256 + }; // ...and draw a rectangle there. + + ctx.fillRect(Math.round(pixelPos.x), Math.round(pixelPos.y), // Looks crazy, but this is necessary to prevent rounding from + // causing overlap between this rect and its neighbors. The + // minuend is the location of the next pixel, while the + // subtrahend is the position of the current pixel (to turn an + // absolute coordinate to a width/height). Yes, I had to look + // up minuend and subtrahend. + Math.round(pixelPos.x + pixelSize.x) - Math.round(pixelPos.x), Math.round(pixelPos.y + pixelSize.y) - Math.round(pixelPos.y)); + } + } + } + } catch (e) { + error = e; + } finally { + done(error, canvas); + } + }); + return canvas; + }; + + this.layerManager.addLayer(canvasTiles, "image", layerId, group); +}; + +methods.removeImage = function (layerId) { + this.layerManager.removeLayer("image", layerId); +}; + +methods.clearImages = function () { + this.layerManager.clearLayers("image"); +}; + +methods.addMeasure = function (options) { + // if a measureControl already exists, then remove it and + // replace with a new one + methods.removeMeasure.call(this); + this.measureControl = _leaflet2["default"].control.measure(options); + this.addControl(this.measureControl); +}; + +methods.removeMeasure = function () { + if (this.measureControl) { + this.removeControl(this.measureControl); + this.measureControl = null; + } +}; + +methods.addSelect = function (ctGroup) { + var _this8 = this; + + methods.removeSelect.call(this); + this._selectButton = _leaflet2["default"].easyButton({ + states: [{ + stateName: "select-inactive", + icon: "ion-qr-scanner", + title: "Make a selection", + onClick: function onClick(btn, map) { + btn.state("select-active"); + _this8._locationFilter = new _leaflet2["default"].LocationFilter2(); + + if (ctGroup) { + var selectionHandle = new global.crosstalk.SelectionHandle(ctGroup); + selectionHandle.on("change", function (e) { + if (e.sender !== selectionHandle) { + if (_this8._locationFilter) { + _this8._locationFilter.disable(); + + btn.state("select-inactive"); + } + } + }); + + var handler = function handler(e) { + _this8.layerManager.brush(_this8._locationFilter.getBounds(), { + sender: selectionHandle + }); + }; + + _this8._locationFilter.on("enabled", handler); + + _this8._locationFilter.on("change", handler); + + _this8._locationFilter.on("disabled", function () { + selectionHandle.close(); + _this8._locationFilter = null; + }); + } + + _this8._locationFilter.addTo(map); + } + }, { + stateName: "select-active", + icon: "ion-close-round", + title: "Dismiss selection", + onClick: function onClick(btn, map) { + btn.state("select-inactive"); + + _this8._locationFilter.disable(); // If explicitly dismissed, clear the crosstalk selections + + + _this8.layerManager.unbrush(); + } + }] + }); + + this._selectButton.addTo(this); +}; + +methods.removeSelect = function () { + if (this._locationFilter) { + this._locationFilter.disable(); + } + + if (this._selectButton) { + this.removeControl(this._selectButton); + this._selectButton = null; + } +}; + +methods.createMapPane = function (name, zIndex) { + 
this.createPane(name); + this.getPane(name).style.zIndex = zIndex; +}; + + +}).call(this)}).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) +},{"./cluster-layer-store":1,"./crs_utils":3,"./dataframe":4,"./global/htmlwidgets":8,"./global/jquery":9,"./global/leaflet":10,"./global/shiny":12,"./mipmapper":16,"./util":17}],16:[function(require,module,exports){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); + +function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } + +function _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } + +function _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; } + +// This class simulates a mipmap, which shrinks images by powers of two. This +// stepwise reduction results in "pixel-perfect downscaling" (where every +// pixel of the original image has some contribution to the downscaled image) +// as opposed to a single-step downscaling which will discard a lot of data +// (and with sparse images at small scales can give very surprising results). +var Mipmapper = /*#__PURE__*/function () { + function Mipmapper(img) { + _classCallCheck(this, Mipmapper); + + this._layers = [img]; + } // The various functions on this class take a callback function BUT MAY OR MAY + // NOT actually behave asynchronously. + + + _createClass(Mipmapper, [{ + key: "getBySize", + value: function getBySize(desiredWidth, desiredHeight, callback) { + var _this = this; + + var i = 0; + var lastImg = this._layers[0]; + + var testNext = function testNext() { + _this.getByIndex(i, function (img) { + // If current image is invalid (i.e. too small to be rendered) or + // it's smaller than what we wanted, return the last known good image. + if (!img || img.width < desiredWidth || img.height < desiredHeight) { + callback(lastImg); + return; + } else { + lastImg = img; + i++; + testNext(); + return; + } + }); + }; + + testNext(); + } + }, { + key: "getByIndex", + value: function getByIndex(i, callback) { + var _this2 = this; + + if (this._layers[i]) { + callback(this._layers[i]); + return; + } + + this.getByIndex(i - 1, function (prevImg) { + if (!prevImg) { + // prevImg could not be calculated (too small, possibly) + callback(null); + return; + } + + if (prevImg.width < 2 || prevImg.height < 2) { + // Can't reduce this image any further + callback(null); + return; + } // If reduce ever becomes truly asynchronous, we should stuff a promise or + // something into this._layers[i] before calling this.reduce(), to prevent + // redundant reduce operations from happening. 
+ + + _this2.reduce(prevImg, function (reducedImg) { + _this2._layers[i] = reducedImg; + callback(reducedImg); + return; + }); + }); + } + }, { + key: "reduce", + value: function reduce(img, callback) { + var imgDataCanvas = document.createElement("canvas"); + imgDataCanvas.width = Math.ceil(img.width / 2); + imgDataCanvas.height = Math.ceil(img.height / 2); + imgDataCanvas.style.display = "none"; + document.body.appendChild(imgDataCanvas); + + try { + var imgDataCtx = imgDataCanvas.getContext("2d"); + imgDataCtx.drawImage(img, 0, 0, img.width / 2, img.height / 2); + callback(imgDataCanvas); + } finally { + document.body.removeChild(imgDataCanvas); + } + } + }]); + + return Mipmapper; +}(); + +exports["default"] = Mipmapper; + + +},{}],17:[function(require,module,exports){ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports.log = log; +exports.recycle = recycle; +exports.asArray = asArray; + +function log(message) { + /* eslint-disable no-console */ + if (console && console.log) console.log(message); + /* eslint-enable no-console */ +} + +function recycle(values, length, inPlace) { + if (length === 0 && !inPlace) return []; + + if (!(values instanceof Array)) { + if (inPlace) { + throw new Error("Can't do in-place recycling of a non-Array value"); + } + + values = [values]; + } + + if (typeof length === "undefined") length = values.length; + var dest = inPlace ? values : []; + var origLength = values.length; + + while (dest.length < length) { + dest.push(values[dest.length % origLength]); + } + + if (dest.length > length) { + dest.splice(length, dest.length - length); + } + + return dest; +} + +function asArray(value) { + if (value instanceof Array) return value;else return [value]; +} + + +},{}]},{},[13]); diff --git a/html_outputs/site_libs/leaflet-providers-plugin-2.2.2/leaflet-providers-plugin.js b/html_outputs/site_libs/leaflet-providers-plugin-2.2.2/leaflet-providers-plugin.js new file mode 100644 index 00000000..82cd6301 --- /dev/null +++ b/html_outputs/site_libs/leaflet-providers-plugin-2.2.2/leaflet-providers-plugin.js @@ -0,0 +1,3 @@ +LeafletWidget.methods.addProviderTiles = function(provider, layerId, group, options) { + this.layerManager.addLayer(L.tileLayer.provider(provider, options), "tile", layerId, group); +}; diff --git a/new_pages/help.qmd b/new_pages/help.qmd index 5a1282f3..1525d809 100644 --- a/new_pages/help.qmd +++ b/new_pages/help.qmd @@ -3,7 +3,9 @@ This page covers how to get help by posting a Github issue or by posting a reproducible example ("reprex") to an online forum. 
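The new `leaflet-providers-plugin.js` file added above registers an `addProviderTiles` method on the JavaScript side. For orientation, a minimal sketch of the R-side call it serves, assuming the **leaflet** R package and an arbitrary provider name:

```r
# Sketch only: R call backed by the addProviderTiles JS binding added above.
# The provider name is an arbitrary example.
library(leaflet)

leaflet() %>%
  addProviderTiles("CartoDB.Positron", group = "Base map")
```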
- +```{r, echo = F, warning = F, message = F} +library(tidyverse) +``` ## Github issues diff --git a/new_pages/missing_data.qmd b/new_pages/missing_data.qmd index 3df4e988..d4f95fa6 100644 --- a/new_pages/missing_data.qmd +++ b/new_pages/missing_data.qmd @@ -560,7 +560,7 @@ You could also do a similar process for replacing categorical data with a specif ```{r} linelist <- linelist %>% - mutate(outcome_replace_na_with_death = replace_na(outcome, "Death")) + mutate(outcome_replace_na_with_death = replace_na(outcome, 1)) ``` ### Regression imputation {.unnumbered} diff --git a/new_pages/regression_files/execute-results/html.json b/new_pages/regression_files/execute-results/html.json deleted file mode 100644 index a319d8b6..00000000 --- a/new_pages/regression_files/execute-results/html.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "hash": "747d81f90a204d35bd7b85d08f706d55", - "result": { - "engine": "knitr", - "markdown": "# Univariate and multivariable regression { }\n\n\n\nThis page demonstrates the use of **base** R regression functions such as `glm()` and the **gtsummary** package to \nlook at associations between variables (e.g. odds ratios, risk ratios and hazard\nratios). It also uses functions like `tidy()` from the **broom** package to clean-up regression outputs. \n\n1. Univariate: two-by-two tables. \n2. Stratified: Mantel–Haenszel estimates. \n3. Multivariable: variable selection, model selection, final table.\n4. Forest plots.\n\nFor Cox proportional hazard regression, see the [Survival analysis](survival_analysis.qmd) page. \n\n**_NOTE:_** We use the term *multivariable* to refer to a regression with multiple explanatory variables. In this sense a *multivariate* model would be a regression with several outcomes - see this [editorial](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3518362/) for detail . \n\n\n\n## Preparation { }\n\n\n### Load packages {.unnumbered}\n\nThis code chunk shows the loading of packages required for the analyses. In this handbook we emphasize `p_load()` from **pacman**, which installs the package if necessary *and* loads it for use. You can also load installed packages with `library()` from **base** R. See the page on [R basics](basics.qmd) for more information on R packages. \n\n\n::: {.cell}\n\n```{.r .cell-code}\npacman::p_load(\n rio, # File import\n here, # File locator\n tidyverse, # data management + ggplot2 graphics, \n stringr, # manipulate text strings \n purrr, # loop over objects in a tidy way\n gtsummary, # summary statistics and tests \n broom, # tidy up results from regressions\n lmtest, # likelihood-ratio tests\n parameters, # alternative to tidy up results from regressions\n see # alternative to visualise forest plots\n )\n```\n:::\n\n\n### Import data {.unnumbered}\n\nWe import the dataset of cases from a simulated Ebola epidemic. If you want to follow along, click to download the \"clean\" linelist (as .rds file). Import your data with the `import()` function from the **rio** package (it accepts many file types like .xlsx, .rds, .csv - see the [Import and export](importing.qmd) page for details). 
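The captured output just below warns that rio's `trust` argument should be explicit for serialized formats such as .rds (rio 1.0.3 and later). A minimal hedged sketch of making it explicit, using the same file name the chapter imports:

```r
# Sketch only: set the trust argument explicitly when importing .rds files,
# which avoids the rio deprecation warning shown below.
pacman::p_load(rio)

linelist <- import("linelist_cleaned.rds", trust = TRUE)
```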
\n\n\n\n::: {.cell}\n::: {.cell-output .cell-output-stderr}\n\n```\nWarning: The `trust` argument of `import()` should be explicit for serialization formats\nas of rio 1.0.3.\nℹ Missing `trust` will be set to FALSE by default for RDS in 2.0.0.\nℹ The deprecated feature was likely used in the rio package.\n Please report the issue at .\n```\n\n\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# import the linelist\nlinelist <- import(\"linelist_cleaned.rds\")\n```\n:::\n\n\nThe first 50 rows of the linelist are displayed below.\n\n\n::: {.cell}\n::: {.cell-output-display}\n\n```{=html}\n
    \n\n```\n\n:::\n:::\n\n\n### Clean data {.unnumbered}\n\n#### Store explanatory variables {.unnumbered} \n\nWe store the names of the explanatory columns as a character vector. This will be referenced later. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## define variables of interest \nexplanatory_vars <- c(\"gender\", \"fever\", \"chills\", \"cough\", \"aches\", \"vomit\")\n```\n:::\n\n\n\n#### Convert to 1's and 0's {.unnumbered} \n\nBelow we convert the explanatory columns from \"yes\"/\"no\", \"m\"/\"f\", and \"dead\"/\"alive\" to **1 / 0**, to cooperate with the expectations of logistic regression models. To do this efficiently, used `across()` from **dplyr** to transform multiple columns at one time. The function we apply to each column is `case_when()` (also **dplyr**) which applies logic to convert specified values to 1's and 0's. See sections on `across()` and `case_when()` in the [Cleaning data and core functions page](cleaning.qmd#clean_across). \n\nNote: the \".\" below represents the column that is being processed by `across()` at that moment.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n## convert dichotomous variables to 0/1 \nlinelist <- linelist %>% \n mutate(across( \n .cols = all_of(c(explanatory_vars, \"outcome\")), ## for each column listed and \"outcome\"\n .fns = ~case_when( \n . %in% c(\"m\", \"yes\", \"Death\") ~ 1, ## recode male, yes and death to 1\n . %in% c(\"f\", \"no\", \"Recover\") ~ 0, ## female, no and recover to 0\n TRUE ~ NA_real_) ## otherwise set to missing\n )\n )\n```\n:::\n\n\n#### Drop rows with missing values {.unnumbered} \n\nTo drop rows with missing values, can use the **tidyr** function `drop_na()`. However, we only want to do this for rows that are missing values in the columns of interest. \n\nThe first thing we must to is make sure our `explanatory_vars` vector includes the column `age` (`age` would have produced an error in the previous `case_when()` operation, which was only for dichotomous variables). Then we pipe the `linelist` to `drop_na()` to remove any rows with missing values in the `outcome` column or any of the `explanatory_vars` columns. \n\nBefore running the code, the number of rows in the `linelist` is ` nrow(linelist)`. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## add in age_category to the explanatory vars \nexplanatory_vars <- c(explanatory_vars, \"age_cat\")\n\n## drop rows with missing information for variables of interest \nlinelist <- linelist %>% \n drop_na(any_of(c(\"outcome\", explanatory_vars)))\n```\n:::\n\n\nThe number of rows remaining in `linelist` is ` nrow(linelist)`. \n\n\n\n\n## Univariate { }\n\nJust like in the page on [Descriptive tables](tables_descriptive.qmd), your use case will determine which R package you use. We present two options for doing univariate analysis: \n\n* Use functions available in **base** R to quickly print results to the console. Use the **broom** package to tidy up the outputs. \n* Use the **gtsummary** package to model and get publication-ready outputs. \n\n\n\n\n\n### **base** R {.unnumbered}\n\n#### Linear regression {.unnumbered} \n\nThe **base** R function `lm()` perform linear regression, assessing the relationship between numeric response and explanatory variables that are assumed to have a linear relationship. \n\nProvide the equation as a formula, with the response and explanatory column names separated by a tilde `~`. Also, specify the dataset to `data = `. Define the model results as an R object, to use later. 
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlm_results <- lm(ht_cm ~ age, data = linelist)\n```\n:::\n\n\nYou can then run `summary()` on the model results to see the coefficients (Estimates), P-value, residuals, and other measures. \n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(lm_results)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nCall:\nlm(formula = ht_cm ~ age, data = linelist)\n\nResiduals:\n Min 1Q Median 3Q Max \n-128.579 -15.854 1.177 15.887 175.483 \n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|) \n(Intercept) 69.9051 0.5979 116.9 <2e-16 ***\nage 3.4354 0.0293 117.2 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 23.75 on 4165 degrees of freedom\nMultiple R-squared: 0.7675,\tAdjusted R-squared: 0.7674 \nF-statistic: 1.375e+04 on 1 and 4165 DF, p-value: < 2.2e-16\n```\n\n\n:::\n:::\n\n\nAlternatively you can use the `tidy()` function from the **broom** package to pull \nthe results in to a table. What the results tell us is that for each year increase in age the height increases\nby 3.5 cm and this is statistically significant. \n\n\n::: {.cell}\n\n```{.r .cell-code}\ntidy(lm_results)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 2 × 5\n term estimate std.error statistic p.value\n \n1 (Intercept) 69.9 0.598 117. 0\n2 age 3.44 0.0293 117. 0\n```\n\n\n:::\n:::\n\n\nYou can then also use this regression to add it to a **ggplot**, to do this we \nfirst pull the points for the observed data and the fitted line in to one data frame \nusing the `augment()` function from **broom**. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## pull the regression points and observed data in to one dataset\npoints <- augment(lm_results)\n\n## plot the data using age as the x-axis \nggplot(points, aes(x = age)) + \n ## add points for height \n geom_point(aes(y = ht_cm)) + \n ## add your regression line \n geom_line(aes(y = .fitted), colour = \"red\")\n```\n\n::: {.cell-output-display}\n![](regression_files/figure-html/lin_reg_res_plot-1.png){width=672}\n:::\n:::\n\n\nIt is also possible to add a simple linear regression straight straight in **ggplot** \nusing the `geom_smooth()` function. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## add your data to a plot \n ggplot(linelist, aes(x = age, y = ht_cm)) + \n ## show points\n geom_point() + \n ## add a linear regression \n geom_smooth(method = \"lm\", se = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr}\n\n```\n`geom_smooth()` using formula = 'y ~ x'\n```\n\n\n:::\n\n::: {.cell-output-display}\n![](regression_files/figure-html/geom_smooth-1.png){width=672}\n:::\n:::\n\n\nSee the Resource section at the end of this chapter for more detailed tutorials. \n\n\n#### Logistic regression {.unnumbered} \n\nThe function `glm()` from the **stats** package (part of **base** R) is used to fit [Generalized Linear Models (GLM)](https://online.stat.psu.edu/stat504/lesson/6/6.1). \n\n`glm()` can be used for univariate and multivariable logistic regression (e.g. to get Odds Ratios). Here are the core parts: \n\n\n::: {.cell}\n\n```{.r .cell-code}\n# arguments for glm()\nglm(formula, family, data, weights, subset, ...)\n```\n:::\n\n\n* `formula = ` The model is provided to `glm()` as an equation, with the outcome on the left and explanatory variables on the right of a tilde `~`. \n* `family = ` This determines the type of model to run. For logistic regression, use `family = \"binomial\"`, for poisson use `family = \"poisson\"`. Other examples are in the table below. 
\n* `data = ` Specify your data frame. \n\n\nIf necessary, you can also specify the link function via the syntax `family = familytype(link = \"linkfunction\"))`. You can read more in the documentation about other families and optional arguments such as `weights = ` and `subset = ` (`?glm`). \n\n\n\nFamily | Default link function \n-----------------------|------------------------------------------- \n`\"binomial\"` | `(link = \"logit\")` \n`\"gaussian\"` | `(link = \"identity\")` \n`\"Gamma\"` | `(link = \"inverse\")` \n`\"inverse.gaussian\"` | `(link = \"1/mu^2\")` \n`\"poisson\"` | `(link = \"log\")` \n`\"quasi\"` | `(link = \"identity\", variance = \"constant\")` \n`\"quasibinomial\"` | `(link = \"logit\")` \n`\"quasipoisson\"` | `(link = \"log\")` \n\n\nWhen running `glm()` it is most common to save the results as a named R object. Then you can print the results to your console using `summary()` as shown below, or perform other operations on the results (e.g. exponentiate). \n\nIf you need to run a negative binomial regression you can use the **MASS** package; the `glm.nb()` uses the same syntax as `glm()`. \nFor a walk-through of different regressions, see the [UCLA stats page](https://stats.idre.ucla.edu/other/dae/). \n\n#### Univariate `glm()` {.unnumbered}\n\nIn this example we are assessing the association between different age categories and the outcome of death (coded as 1 in the Preparation section). Below is a univariate model of `outcome` by `age_cat`. We save the model output as `model` and then print it with `summary()` to the console. Note the estimates provided are the *log odds* and that the baseline level is the first factor level of `age_cat` (\"0-4\"). \n\n\n::: {.cell}\n\n```{.r .cell-code}\nmodel <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist)\nsummary(model)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nCall:\nglm(formula = outcome ~ age_cat, family = \"binomial\", data = linelist)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 0.233738 0.072805 3.210 0.00133 **\nage_cat5-9 -0.062898 0.101733 -0.618 0.53640 \nage_cat10-14 0.138204 0.107186 1.289 0.19726 \nage_cat15-19 -0.005565 0.113343 -0.049 0.96084 \nage_cat20-29 0.027511 0.102133 0.269 0.78765 \nage_cat30-49 0.063764 0.113771 0.560 0.57517 \nage_cat50-69 -0.387889 0.259240 -1.496 0.13459 \nage_cat70+ -0.639203 0.915770 -0.698 0.48518 \n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5705.1 on 4159 degrees of freedom\nAIC: 5721.1\n\nNumber of Fisher Scoring iterations: 4\n```\n\n\n:::\n:::\n\n\nTo alter the baseline level of a given variable, ensure the column is class Factor and move the desired level to the first position with `fct_relevel()` (see page on [Factors](factors.qmd)). For example, below we take column `age_cat` and set \"20-29\" as the baseline before piping the modified data frame into `glm()`. \n\n\n::: {.cell}\n\n```{.r .cell-code}\nlinelist %>% \n mutate(age_cat = fct_relevel(age_cat, \"20-29\", after = 0)) %>% \n glm(formula = outcome ~ age_cat, family = \"binomial\") %>% \n summary()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nCall:\nglm(formula = outcome ~ age_cat, family = \"binomial\", data = .)\n\nCoefficients:\n Estimate Std. 
Error z value Pr(>|z|) \n(Intercept) 0.26125 0.07163 3.647 0.000265 ***\nage_cat0-4 -0.02751 0.10213 -0.269 0.787652 \nage_cat5-9 -0.09041 0.10090 -0.896 0.370220 \nage_cat10-14 0.11069 0.10639 1.040 0.298133 \nage_cat15-19 -0.03308 0.11259 -0.294 0.768934 \nage_cat30-49 0.03625 0.11302 0.321 0.748390 \nage_cat50-69 -0.41540 0.25891 -1.604 0.108625 \nage_cat70+ -0.66671 0.91568 -0.728 0.466546 \n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5705.1 on 4159 degrees of freedom\nAIC: 5721.1\n\nNumber of Fisher Scoring iterations: 4\n```\n\n\n:::\n:::\n\n\n#### Printing results {.unnumbered}\n\nFor most uses, several modifications must be made to the above outputs. The function `tidy()` from the package **broom** is convenient for making the model results presentable. \n\nHere we demonstrate how to combine model outputs with a table of counts. \n\n1) Get the *exponentiated* log odds ratio estimates and confidence intervals by passing the model to `tidy()` and setting `exponentiate = TRUE` and `conf.int = TRUE`. \n\n\n::: {.cell}\n\n```{.r .cell-code}\nmodel <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist) %>% \n tidy(exponentiate = TRUE, conf.int = TRUE) %>% # exponentiate and produce CIs\n mutate(across(where(is.numeric), round, digits = 2)) # round all numeric columns\n```\n:::\n\n\nBelow is the outputted tibble `model`: \n\n\n::: {.cell}\n::: {.cell-output-display}\n\n```{=html}\n
    \n\n```\n\n:::\n:::\n\n\n2) Combine these model results with a table of counts. Below, we create the a counts cross-table with the `tabyl()` function from **janitor**, as covered in the [Descriptive tables](tables_descriptive.qmd) page. \n\n\n::: {.cell}\n\n```{.r .cell-code}\ncounts_table <- linelist %>% \n janitor::tabyl(age_cat, outcome)\n```\n:::\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHere is what this `counts_table` data frame looks like: \n\n\n::: {.cell}\n::: {.cell-output-display}\n\n```{=html}\n
    \n\n```\n\n:::\n:::\n\n\nNow we can bind the `counts_table` and the `model` results together horizontally with `bind_cols()` (**dplyr**). Remember that with `bind_cols()` the rows in the two data frames must be aligned perfectly. In this code, because we are binding within a pipe chain, we use `.` to represent the piped object `counts_table` as we bind it to `model`. To finish the process, we use `select()` to pick the desired columns and their order, and finally apply the **base** R `round()` function across all numeric columns to specify 2 decimal places. \n\n\n::: {.cell}\n\n```{.r .cell-code}\ncombined <- counts_table %>% # begin with table of counts\n bind_cols(., model) %>% # combine with the outputs of the regression \n select(term, 2:3, estimate, # select and re-order cols\n conf.low, conf.high, p.value) %>% \n mutate(across(where(is.numeric), round, digits = 2)) ## round to 2 decimal places\n```\n:::\n\n\nHere is what the combined data frame looks like, printed nicely as an image with a function from **flextable**. The [Tables for presentation](tables_presentation.qmd) explains how to customize such tables with **flextable**, or or you can use numerous other packages such as **knitr** or **GT**. \n\n\n::: {.cell}\n\n```{.r .cell-code}\ncombined <- combined %>% \n flextable::qflextable()\n```\n:::\n\n\n\n#### Looping multiple univariate models {.unnumbered} \n\nBelow we present a method using `glm()` and `tidy()` for a more simple approach, see the section on **gtsummary**. \n\nTo run the models on several exposure variables to produce univariate odds ratios (i.e. not controlling for each other), you can use the approach below. It uses `str_c()` from **stringr** to create univariate formulas (see [Characters and strings](characters_strings.qmd)), runs the `glm()` regression on each formula, passes each `glm()` output to `tidy()` and finally collapses all the model outputs together with `bind_rows()` from **tidyr**. This approach uses `map()` from the package **purrr** to iterate - see the page on [Iteration, loops, and lists](iteration.qmd) for more information on this tool. \n\n1) Create a vector of column names of the explanatory variables. We already have this as `explanatory_vars` from the Preparation section of this page. \n\n2) Use `str_c()` to create multiple string formulas, with `outcome` on the left, and a column name from `explanatory_vars` on the right. The period `.` substitutes for the column name in `explanatory_vars`. \n\n\n::: {.cell}\n\n```{.r .cell-code}\nexplanatory_vars %>% str_c(\"outcome ~ \", .)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] \"outcome ~ gender\" \"outcome ~ fever\" \"outcome ~ chills\" \n[4] \"outcome ~ cough\" \"outcome ~ aches\" \"outcome ~ vomit\" \n[7] \"outcome ~ age_cat\"\n```\n\n\n:::\n:::\n\n\n3) Pass these string formulas to `map()` and set `~glm()` as the function to apply to each input. Within `glm()`, set the regression formula as `as.formula(.x)` where `.x` will be replaced by the string formula defined in the step above. `map()` will loop over each of the string formulas, running regressions for each one. \n\n4) The outputs of this first `map()` are passed to a second `map()` command, which applies `tidy()` to the regression outputs. \n\n5) Finally the output of the second `map()` (a list of tidied data frames) is condensed with `bind_rows()`, resulting in one data frame with all the univariate results. 
\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmodels <- explanatory_vars %>% # begin with variables of interest\n str_c(\"outcome ~ \", .) %>% # combine each variable into formula (\"outcome ~ variable of interest\")\n \n # iterate through each univariate formula\n map( \n .f = ~glm( # pass the formulas one-by-one to glm()\n formula = as.formula(.x), # within glm(), the string formula is .x\n family = \"binomial\", # specify type of glm (logistic)\n data = linelist)) %>% # dataset\n \n # tidy up each of the glm regression outputs from above\n map(\n .f = ~tidy(\n .x, \n exponentiate = TRUE, # exponentiate \n conf.int = TRUE)) %>% # return confidence intervals\n \n # collapse the list of regression outputs in to one data frame\n bind_rows() %>% \n \n # round all numeric columns\n mutate(across(where(is.numeric), round, digits = 2))\n```\n:::\n\n\nThis time, the end object `models` is longer because it now represents the combined results of several univariate regressions. Click through to see all the rows of `model`. \n\n\n::: {.cell}\n::: {.cell-output-display}\n\n```{=html}\n
    \n\n```\n\n:::\n:::\n\n\nAs before, we can create a counts table from the `linelist` for each explanatory variable, bind it to `models`, and make a nice table. We begin with the variables, and iterate through them with `map()`. We iterate through a user-defined function which involves creating a counts table with **dplyr** functions. Then the results are combined and bound with the `models` model results. \n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n## for each explanatory variable\nuniv_tab_base <- explanatory_vars %>% \n map(.f = \n ~{linelist %>% ## begin with linelist\n group_by(outcome) %>% ## group data set by outcome\n count(.data[[.x]]) %>% ## produce counts for variable of interest\n pivot_wider( ## spread to wide format (as in cross-tabulation)\n names_from = outcome,\n values_from = n) %>% \n drop_na(.data[[.x]]) %>% ## drop rows with missings\n rename(\"variable\" = .x) %>% ## change variable of interest column to \"variable\"\n mutate(variable = as.character(variable))} ## convert to character, else non-dichotomous (categorical) variables come out as factor and cant be merged\n ) %>% \n \n ## collapse the list of count outputs in to one data frame\n bind_rows() %>% \n \n ## merge with the outputs of the regression \n bind_cols(., models) %>% \n \n ## only keep columns interested in \n select(term, 2:3, estimate, conf.low, conf.high, p.value) %>% \n \n ## round decimal places\n mutate(across(where(is.numeric), round, digits = 2))\n```\n:::\n\n\nBelow is what the data frame looks like. See the page on [Tables for presentation](tables_descriptive.qmd) for ideas on how to further convert this table to pretty HTML output (e.g. with **flextable**). \n\n\n::: {.cell}\n::: {.cell-output-display}\n\n```{=html}\n
    \n\n```\n\n:::\n:::\n\n\n\n\n\n\n\n\n### **gtsummary** package {#reg_gt_uni .unnumbered}\n\nBelow we present the use of `tbl_uvregression()` from the **gtsummary** package. Just like in the page on [Descriptive tables](https://epirhandbook.com/descriptive-tables.html), **gtsummary** functions do a good job of running statistics *and* producing professional-looking outputs. This function produces a table of univariate regression results. \n\nWe select only the necessary columns from the `linelist` (explanatory variables and the outcome variable) and pipe them into `tbl_uvregression()`. We are going to run univariate regression on each of the columns we defined as `explanatory_vars` in the data Preparation section (gender, fever, chills, cough, aches, vomit, and age_cat). \n\nWithin the function itself, we provide the `method = ` as `glm` (no quotes), the `y = ` outcome column (`outcome`), specify to `method.args = ` that we want to run logistic regression via `family = binomial`, and we tell it to exponentiate the results. \n\nThe output is HTML and contains the counts\n\n\n::: {.cell}\n\n```{.r .cell-code}\nuniv_tab <- linelist %>% \n dplyr::select(explanatory_vars, outcome) %>% ## select variables of interest\n\n tbl_uvregression( ## produce univariate table\n method = glm, ## define regression want to run (generalised linear model)\n y = outcome, ## define outcome variable\n method.args = list(family = binomial), ## define what type of glm want to run (logistic)\n exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)\n )\n\n## view univariate results table \nuniv_tab\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| Characteristic | N     | OR   | 95% CI     | p-value |
|----------------|-------|------|------------|---------|
| gender         | 4,167 | 1.00 | 0.88, 1.13 | >0.9    |
| fever          | 4,167 | 1.00 | 0.85, 1.17 | >0.9    |
| chills         | 4,167 | 1.03 | 0.89, 1.21 | 0.7     |
| cough          | 4,167 | 1.15 | 0.97, 1.37 | 0.11    |
| aches          | 4,167 | 0.93 | 0.76, 1.14 | 0.5     |
| vomit          | 4,167 | 1.09 | 0.96, 1.23 | 0.2     |
| age_cat        | 4,167 |      |            |         |
| 0-4 (ref.)     |       |      |            |         |
| 5-9            |       | 0.94 | 0.77, 1.15 | 0.5     |
| 10-14          |       | 1.15 | 0.93, 1.42 | 0.2     |
| 15-19          |       | 0.99 | 0.80, 1.24 | >0.9    |
| 20-29          |       | 1.03 | 0.84, 1.26 | 0.8     |
| 30-49          |       | 1.07 | 0.85, 1.33 | 0.6     |
| 50-69          |       | 0.68 | 0.41, 1.13 | 0.13    |
| 70+            |       | 0.53 | 0.07, 3.20 | 0.5     |

OR = Odds Ratio, CI = Confidence Interval
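The paragraph that follows mentions common tweaks to this output (adjusting labels, bolding rows by p-value). A hedged sketch applying standard **gtsummary** helpers to the `univ_tab` object created above:

```r
# Sketch of the kinds of modifications described below, using gtsummary helpers.
univ_tab %>%
  bold_labels() %>%                            # bold the variable labels
  bold_p(t = 0.05) %>%                         # bold p-values below 0.05
  modify_header(label = "**Characteristic**")  # relabel the first column
```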
    \n```\n\n:::\n:::\n\n\nThere are many modifications you can make to this table output, such as adjusting the text labels, bolding rows by their p-value, etc. See tutorials [here](http://www.danieldsjoberg.com/gtsummary/articles/tbl_regression.html) and elsewhere online. \n\n\n\n\n\n## Stratified { }\n\nStratified analysis is currently still being worked on for **gtsummary**, \nthis page will be updated in due course. \n\n\n\n\n## Multivariable \n\nFor multivariable analysis, we again present two approaches: \n\n* `glm()` and `tidy()` \n* **gtsummary** package \n\nThe workflow is similar for each and only the last step of pulling together a final table is different.\n\n\n### Conduct multivariable {.unnumbered} \n\n\nHere we use `glm()` but add more variables to the right side of the equation, separated by plus symbols (`+`). \n\n\nTo run the model with all of our explanatory variables we would run: \n\n\n::: {.cell}\n\n```{.r .cell-code}\nmv_reg <- glm(outcome ~ gender + fever + chills + cough + aches + vomit + age_cat, family = \"binomial\", data = linelist)\n\nsummary(mv_reg)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nCall:\nglm(formula = outcome ~ gender + fever + chills + cough + aches + \n vomit + age_cat, family = \"binomial\", data = linelist)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|)\n(Intercept) 0.069054 0.131726 0.524 0.600\ngender 0.002448 0.065133 0.038 0.970\nfever 0.004309 0.080522 0.054 0.957\nchills 0.034112 0.078924 0.432 0.666\ncough 0.138584 0.089909 1.541 0.123\naches -0.070705 0.104078 -0.679 0.497\nvomit 0.086098 0.062618 1.375 0.169\nage_cat5-9 -0.063562 0.101851 -0.624 0.533\nage_cat10-14 0.136372 0.107275 1.271 0.204\nage_cat15-19 -0.011074 0.113640 -0.097 0.922\nage_cat20-29 0.026552 0.102780 0.258 0.796\nage_cat30-49 0.059569 0.116402 0.512 0.609\nage_cat50-69 -0.388964 0.262384 -1.482 0.138\nage_cat70+ -0.647443 0.917375 -0.706 0.480\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 5712.4 on 4166 degrees of freedom\nResidual deviance: 5700.2 on 4153 degrees of freedom\nAIC: 5728.2\n\nNumber of Fisher Scoring iterations: 4\n```\n\n\n:::\n:::\n\n\nIf you want to include two variables and an interaction between them you can separate them with an asterisk `*` instead of a `+`. Separate them with a colon `:` if you are only specifying the interaction. For example: \n\n\n::: {.cell}\n\n```{.r .cell-code}\nglm(outcome ~ gender + age_cat * fever, family = \"binomial\", data = linelist)\n```\n:::\n\n\n\n*Optionally*, you can use this code to leverage the pre-defined vector of column names and re-create the above command using `str_c()`. This might be useful if your explanatory variable names are changing, or you don't want to type them all out again. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## run a regression with all variables of interest \nmv_reg <- explanatory_vars %>% ## begin with vector of explanatory column names\n str_c(collapse = \"+\") %>% ## combine all names of the variables of interest separated by a plus\n str_c(\"outcome ~ \", .) %>% ## combine the names of variables of interest with outcome in formula style\n glm(family = \"binomial\", ## define type of glm as logistic,\n data = linelist) ## define your dataset\n```\n:::\n\n\n\n#### Building the model {.unnumbered} \n\nYou can build your model step-by-step, saving various models that include certain explanatory variables. 
You can compare these models with likelihood-ratio tests using `lrtest()` from the package **lmtest**, as below: \n\n**_NOTE:_** Using **base** `anova(model1, model2, test = \"Chisq)` produces the same results \n\n\n::: {.cell}\n\n```{.r .cell-code}\nmodel1 <- glm(outcome ~ age_cat, family = \"binomial\", data = linelist)\nmodel2 <- glm(outcome ~ age_cat + gender, family = \"binomial\", data = linelist)\n\nlmtest::lrtest(model1, model2)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nLikelihood ratio test\n\nModel 1: outcome ~ age_cat\nModel 2: outcome ~ age_cat + gender\n #Df LogLik Df Chisq Pr(>Chisq)\n1 8 -2852.6 \n2 9 -2852.6 1 2e-04 0.9883\n```\n\n\n:::\n:::\n\n\nAnother option is to take the model object and apply the `step()` function from the **stats** package. Specify which variable selection direction you want use when building the model. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## choose a model using forward selection based on AIC\n## you can also do \"backward\" or \"both\" by adjusting the direction\nfinal_mv_reg <- mv_reg %>%\n step(direction = \"forward\", trace = FALSE)\n```\n:::\n\n\n\nYou can also turn off scientific notation in your R session, for clarity: \n\n\n::: {.cell}\n\n```{.r .cell-code}\noptions(scipen=999)\n```\n:::\n\n\nAs described in the section on univariate analysis, pass the model output to `tidy()` to exponentiate the log odds and CIs. Finally we round all numeric columns to two decimal places. Scroll through to see all the rows. \n\n\n::: {.cell}\n\n```{.r .cell-code}\nmv_tab_base <- final_mv_reg %>% \n broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>% ## get a tidy dataframe of estimates \n mutate(across(where(is.numeric), round, digits = 2)) ## round \n```\n:::\n\n\nHere is what the resulting data frame looks like: \n\n\n::: {.cell}\n::: {.cell-output-display}\n\n```{=html}\n
    \n\n```\n\n:::\n:::\n\n\n\n\n\n\n\n\n### Combine univariate and multivariable {.unnumbered}\n\n#### Combine with **gtsummary** {.unnumbered} \n\nThe **gtsummary** package provides the `tbl_regression()` function, which will \ntake the outputs from a regression (`glm()` in this case) and produce an nice \nsummary table. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## show results table of final regression \nmv_tab <- tbl_regression(final_mv_reg, exponentiate = TRUE)\n```\n:::\n\n\nLet's see the table: \n\n\n::: {.cell}\n\n```{.r .cell-code}\nmv_tab\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| Characteristic | OR   | 95% CI     | p-value |
|----------------|------|------------|---------|
| gender         | 1.00 | 0.88, 1.14 | >0.9    |
| fever          | 1.00 | 0.86, 1.18 | >0.9    |
| chills         | 1.03 | 0.89, 1.21 | 0.7     |
| cough          | 1.15 | 0.96, 1.37 | 0.12    |
| aches          | 0.93 | 0.76, 1.14 | 0.5     |
| vomit          | 1.09 | 0.96, 1.23 | 0.2     |
| age_cat        |      |            |         |
| 0-4 (ref.)     |      |            |         |
| 5-9            | 0.94 | 0.77, 1.15 | 0.5     |
| 10-14          | 1.15 | 0.93, 1.41 | 0.2     |
| 15-19          | 0.99 | 0.79, 1.24 | >0.9    |
| 20-29          | 1.03 | 0.84, 1.26 | 0.8     |
| 30-49          | 1.06 | 0.85, 1.33 | 0.6     |
| 50-69          | 0.68 | 0.40, 1.13 | 0.14    |
| 70+            | 0.52 | 0.07, 3.19 | 0.5     |

OR = Odds Ratio, CI = Confidence Interval
    \n```\n\n:::\n:::\n\n\nYou can also combine several different output tables produced by **gtsummary** with \nthe `tbl_merge()` function. We now combine the multivariable results with the **gtsummary** *univariate* results that we created [above](#reg_gt_uni): \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## combine with univariate results \ntbl_merge(\n tbls = list(univ_tab, mv_tab), # combine\n tab_spanner = c(\"**Univariate**\", \"**Multivariable**\")) # set header names\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| Characteristic | N     | OR (Univariate) | 95% CI (Univariate) | p-value (Univariate) | OR (Multivariable) | 95% CI (Multivariable) | p-value (Multivariable) |
|----------------|-------|-----------------|---------------------|----------------------|--------------------|------------------------|-------------------------|
| gender         | 4,167 | 1.00            | 0.88, 1.13          | >0.9                 | 1.00               | 0.88, 1.14             | >0.9                    |
| fever          | 4,167 | 1.00            | 0.85, 1.17          | >0.9                 | 1.00               | 0.86, 1.18             | >0.9                    |
| chills         | 4,167 | 1.03            | 0.89, 1.21          | 0.7                  | 1.03               | 0.89, 1.21             | 0.7                     |
| cough          | 4,167 | 1.15            | 0.97, 1.37          | 0.11                 | 1.15               | 0.96, 1.37             | 0.12                    |
| aches          | 4,167 | 0.93            | 0.76, 1.14          | 0.5                  | 0.93               | 0.76, 1.14             | 0.5                     |
| vomit          | 4,167 | 1.09            | 0.96, 1.23          | 0.2                  | 1.09               | 0.96, 1.23             | 0.2                     |
| age_cat        | 4,167 |                 |                     |                      |                    |                        |                         |
|     0-4        |       |                 |                     |                      |                    |                        |                         |
|     5-9        |       | 0.94            | 0.77, 1.15          | 0.5                  | 0.94               | 0.77, 1.15             | 0.5                     |
|     10-14      |       | 1.15            | 0.93, 1.42          | 0.2                  | 1.15               | 0.93, 1.41             | 0.2                     |
|     15-19      |       | 0.99            | 0.80, 1.24          | >0.9                 | 0.99               | 0.79, 1.24             | >0.9                    |
|     20-29      |       | 1.03            | 0.84, 1.26          | 0.8                  | 1.03               | 0.84, 1.26             | 0.8                     |
|     30-49      |       | 1.07            | 0.85, 1.33          | 0.6                  | 1.06               | 0.85, 1.33             | 0.6                     |
|     50-69      |       | 0.68            | 0.41, 1.13          | 0.13                 | 0.68               | 0.40, 1.13             | 0.14                    |
|     70+        |       | 0.53            | 0.07, 3.20          | 0.5                  | 0.52               | 0.07, 3.19             | 0.5                     |

OR = Odds Ratio, CI = Confidence Interval
    \n```\n\n:::\n:::\n\n\n\n\n#### Combine with **dplyr** {.unnumbered} \n\nAn alternative way of combining the `glm()`/`tidy()` univariate and multivariable outputs is with the **dplyr** join functions. \n\n* Join the univariate results from earlier (`univ_tab_base`, which contains counts) with the tidied multivariable results `mv_tab_base` \n* Use `select()` to keep only the columns we want, specify their order, and re-name them \n* Use `round()` with two decimal places on all the columns that are of class Double \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## combine univariate and multivariable tables \nleft_join(univ_tab_base, mv_tab_base, by = \"term\") %>% \n ## choose columns and rename them\n select( # new name = old name\n \"characteristic\" = term, \n \"recovered\" = \"0\", \n \"dead\" = \"1\", \n \"univ_or\" = estimate.x, \n \"univ_ci_low\" = conf.low.x, \n \"univ_ci_high\" = conf.high.x,\n \"univ_pval\" = p.value.x, \n \"mv_or\" = estimate.y, \n \"mv_ci_low\" = conf.low.y, \n \"mv_ci_high\" = conf.high.y,\n \"mv_pval\" = p.value.y \n ) %>% \n mutate(across(where(is.double), round, 2)) \n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 20 × 11\n characteristic recovered dead univ_or univ_ci_low univ_ci_high univ_pval\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n 1 (Intercept) 909 1168 1.28 1.18 1.4 0 \n 2 gender 916 1174 1 0.88 1.13 0.97\n 3 (Intercept) 340 436 1.28 1.11 1.48 0 \n 4 fever 1485 1906 1 0.85 1.17 0.99\n 5 (Intercept) 1472 1877 1.28 1.19 1.37 0 \n 6 chills 353 465 1.03 0.89 1.21 0.68\n 7 (Intercept) 272 309 1.14 0.97 1.34 0.13\n 8 cough 1553 2033 1.15 0.97 1.37 0.11\n 9 (Intercept) 1636 2114 1.29 1.21 1.38 0 \n10 aches 189 228 0.93 0.76 1.14 0.51\n11 (Intercept) 931 1144 1.23 1.13 1.34 0 \n12 vomit 894 1198 1.09 0.96 1.23 0.17\n13 (Intercept) 338 427 1.26 1.1 1.46 0 \n14 age_cat5-9 365 433 0.94 0.77 1.15 0.54\n15 age_cat10-14 273 396 1.15 0.93 1.42 0.2 \n16 age_cat15-19 238 299 0.99 0.8 1.24 0.96\n17 age_cat20-29 345 448 1.03 0.84 1.26 0.79\n18 age_cat30-49 228 307 1.07 0.85 1.33 0.58\n19 age_cat50-69 35 30 0.68 0.41 1.13 0.13\n20 age_cat70+ 3 2 0.53 0.07 3.2 0.49\n# ℹ 4 more variables: mv_or <dbl>, mv_ci_low <dbl>, mv_ci_high <dbl>,\n# mv_pval <dbl>\n```\n\n\n:::\n:::\n\n\n\n\n\n\n\n## Forest plot { }\n\nThis section shows how to produce a plot with the outputs of your regression.\nThere are two options: you can build a plot yourself using **ggplot2**, or use a \nmeta-package called **easystats** (a package that includes many packages). \n\nSee the page on [ggplot basics](ggplot_basics.qmd) if you are unfamiliar with the **ggplot2** plotting package. \n\n\n\n\n### **ggplot2** package {.unnumbered}\n\nYou can build a forest plot with `ggplot()` by plotting elements of the multivariable regression results. Add the layers of the plot using these \"geoms\": \n\n* estimates with `geom_point()` \n* confidence intervals with `geom_errorbar()` \n* a vertical line at OR = 1 with `geom_vline()` \n\nBefore plotting, you may want to use `fct_relevel()` from the **forcats** package to set the order of the variables/levels on the y-axis. `ggplot()` may display them in alpha-numeric order, which would not work well for these age category values (\"30\" would appear before \"5\"). See the page on [Factors](factors.qmd) for more details. 
    \n\n\n::: {.cell}\n\n```{.r .cell-code}\n## start from your multivariable results (the intercept row is removed below)\nmv_tab_base %>% \n \n # set order of levels to appear along y-axis\n mutate(term = fct_relevel(\n term,\n \"vomit\", \"gender\", \"fever\", \"cough\", \"chills\", \"aches\",\n \"age_cat5-9\", \"age_cat10-14\", \"age_cat15-19\", \"age_cat20-29\",\n \"age_cat30-49\", \"age_cat50-69\", \"age_cat70+\")) %>%\n \n # remove \"intercept\" row from plot\n filter(term != \"(Intercept)\") %>% \n \n ## plot with variable on the y axis and estimate (OR) on the x axis\n ggplot(aes(x = estimate, y = term)) +\n \n ## show the estimate as a point\n geom_point() + \n \n ## add in an error bar for the confidence intervals\n geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) + \n \n ## show where OR = 1 is for reference as a dashed line\n geom_vline(xintercept = 1, linetype = \"dashed\")\n```\n\n::: {.cell-output-display}\n![](regression_files/figure-html/ggplot_forest-1.png){width=672}\n:::\n:::\n\n\n\n\n\n### **easystats** packages {.unnumbered}\n\nAn alternative, if you do not want the fine level of control that **ggplot2** provides, is to use a combination of **easystats** packages. \n\nThe function `model_parameters()` from the **parameters** package does the equivalent\nof the **broom** package function `tidy()`. The **see** package then accepts those outputs\nand creates a default forest plot as a `ggplot()` object. \n\n\n::: {.cell}\n\n```{.r .cell-code}\npacman::p_load(easystats)\n\n## produce a default forest plot of your multivariable results\nfinal_mv_reg %>% \n model_parameters(exponentiate = TRUE) %>% \n plot()\n```\n\n::: {.cell-output-display}\n![](regression_files/figure-html/easystats_forest-1.png){width=672}\n:::\n:::\n\n\n\n\n\n## Resources { }\n\nThe content of this page was informed by these resources and vignettes online: \n\n[Linear regression in R](https://www.datacamp.com/community/tutorials/linear-regression-R) \n\n[gtsummary](http://www.danieldsjoberg.com/gtsummary/articles/tbl_regression.html) \n\n[UCLA stats page](https://stats.idre.ucla.edu/other/dae/) \n\n[sthda stepwise regression](http://www.sthda.com/english/articles/36-classification-methods-essentials/150-stepwise-logistic-regression-essentials-in-r/) \n\n", - "supporting": [ - "regression_files" - ], - "filters": [ - "rmarkdown/pagebreak.lua" - ], - "includes": { - "include-in-header": [ - "\n\n\n\n\n\n\n\n\n\n" - ] - }, - "engineDependencies": {}, - "preserve": {}, - "postProcess": true - } -} \ No newline at end of file diff --git a/new_pages/regression_files/figure-html/easystats_forest-1.png b/new_pages/regression_files/figure-html/easystats_forest-1.png deleted file mode 100644 index c7cb391d..00000000 Binary files a/new_pages/regression_files/figure-html/easystats_forest-1.png and /dev/null differ diff --git a/new_pages/regression_files/figure-html/geom_smooth-1.png b/new_pages/regression_files/figure-html/geom_smooth-1.png deleted file mode 100644 index b6d63b2d..00000000 Binary files a/new_pages/regression_files/figure-html/geom_smooth-1.png and /dev/null differ diff --git a/new_pages/regression_files/figure-html/ggplot_forest-1.png b/new_pages/regression_files/figure-html/ggplot_forest-1.png deleted file mode 100644 index c66aea52..00000000 Binary files a/new_pages/regression_files/figure-html/ggplot_forest-1.png and /dev/null differ diff --git a/new_pages/regression_files/figure-html/lin_reg_res_plot-1.png b/new_pages/regression_files/figure-html/lin_reg_res_plot-1.png deleted file mode 100644 index 
96fefc14..00000000 Binary files a/new_pages/regression_files/figure-html/lin_reg_res_plot-1.png and /dev/null differ diff --git a/new_pages/standardization.qmd b/new_pages/standardization.qmd index 0aa23eca..1ddcd76f 100644 --- a/new_pages/standardization.qmd +++ b/new_pages/standardization.qmd @@ -263,7 +263,6 @@ Currently, the values of the column `age_cat5` from the `standard_pop_data` data ```{r} # Remove specific string from column values standard_pop_clean <- standard_pop_data %>% - rename(age_cat5 = AgeGroup) %>% #Change name of age group column to match that in our population data and death counts mutate( age_cat5 = str_replace_all(age_cat5, "years", ""), # remove "year" age_cat5 = str_replace_all(age_cat5, "plus", ""), # remove "plus" @@ -279,7 +278,9 @@ standard_pop_clean <- standard_pop_data %>% Finally, the package **PHEindicatormethods**, detailed [below](#standard_phe), expects the standard populations joined to the country event and population counts. So, we will create a dataset `all_data` for that purpose. ```{r} -all_data <- left_join(country_data, standard_pop_clean, by=c("age_cat5", "Sex")) +all_data <- left_join(country_data, + standard_pop_clean, + by = c("age_cat5", "Sex")) ``` This complete dataset looks like this: diff --git a/new_pages/survey_analysis.qmd b/new_pages/survey_analysis.qmd index 70f7d47b..2a100e8f 100644 --- a/new_pages/survey_analysis.qmd +++ b/new_pages/survey_analysis.qmd @@ -964,27 +964,31 @@ purrr::map( ``` - - ### **Sitrep** package The `tab_survey()` function from **sitrep** is a wrapper for **srvyr**, allowing you to create weighted tables with minimal coding. It also allows you to calculate weighted proportions for categorical variables. -```{r sitrep_props} +```{r eval = F, sitrep_props} ## using the survey design object survey_design %>% ## pass the names of variables of interest unquoted - tab_survey(arrived, left, died, education_level, - deff = TRUE, # calculate the design effect - pretty = TRUE # merge the proportion and 95%CI - ) + tab_survey( + "arrived", + "left", + "died", + "education_level", + deff = TRUE, # calculate the design effect + pretty = TRUE # merge the proportion and 95%CI + ) ``` - +```{r, echo = F, warning = F, message=FALSE} +import(here("data", "surveys", "tab_survey_output.rds")) +``` ### **Gtsummary** package @@ -1037,7 +1041,7 @@ cbind( ### **Srvyr** package -```{r srvyr_ratio} +```{r srvyr_ratio, warning=FALSE, message=F} survey_design %>% ## survey ratio used to account for observation time diff --git a/new_pages/time_series_files/figure-html/autocorrelation-1.png b/new_pages/time_series_files/figure-html/autocorrelation-1.png deleted file mode 100644 index e7166ceb..00000000 Binary files a/new_pages/time_series_files/figure-html/autocorrelation-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/autocorrelation-2.png b/new_pages/time_series_files/figure-html/autocorrelation-2.png deleted file mode 100644 index 5c968235..00000000 Binary files a/new_pages/time_series_files/figure-html/autocorrelation-2.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/basic_plot-1.png b/new_pages/time_series_files/figure-html/basic_plot-1.png deleted file mode 100644 index e13a1624..00000000 Binary files a/new_pages/time_series_files/figure-html/basic_plot-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/basic_plot_bivar-1.png b/new_pages/time_series_files/figure-html/basic_plot_bivar-1.png deleted file mode 100644 index 1004aebe..00000000 
Binary files a/new_pages/time_series_files/figure-html/basic_plot_bivar-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/decomposition-1.png b/new_pages/time_series_files/figure-html/decomposition-1.png deleted file mode 100644 index 657732e3..00000000 Binary files a/new_pages/time_series_files/figure-html/decomposition-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/forecast_plot-1.png b/new_pages/time_series_files/figure-html/forecast_plot-1.png deleted file mode 100644 index 48e92066..00000000 Binary files a/new_pages/time_series_files/figure-html/forecast_plot-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/missings-1.png b/new_pages/time_series_files/figure-html/missings-1.png deleted file mode 100644 index 62651daa..00000000 Binary files a/new_pages/time_series_files/figure-html/missings-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/moving_averages-1.png b/new_pages/time_series_files/figure-html/moving_averages-1.png deleted file mode 100644 index 2dd07f75..00000000 Binary files a/new_pages/time_series_files/figure-html/moving_averages-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/nb_reg-1.png b/new_pages/time_series_files/figure-html/nb_reg-1.png deleted file mode 100644 index 2a5a2ca2..00000000 Binary files a/new_pages/time_series_files/figure-html/nb_reg-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/periodogram-1.png b/new_pages/time_series_files/figure-html/periodogram-1.png deleted file mode 100644 index c0fc1a76..00000000 Binary files a/new_pages/time_series_files/figure-html/periodogram-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/plot_nb_reg_bivar-1.png b/new_pages/time_series_files/figure-html/plot_nb_reg_bivar-1.png deleted file mode 100644 index 10cf6bbe..00000000 Binary files a/new_pages/time_series_files/figure-html/plot_nb_reg_bivar-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-2-1.png b/new_pages/time_series_files/figure-html/unnamed-chunk-2-1.png deleted file mode 100644 index 8163b423..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-2-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-2-2.png b/new_pages/time_series_files/figure-html/unnamed-chunk-2-2.png deleted file mode 100644 index abbc0366..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-2-2.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-2-3.png b/new_pages/time_series_files/figure-html/unnamed-chunk-2-3.png deleted file mode 100644 index 79ec7d23..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-2-3.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-2-4.png b/new_pages/time_series_files/figure-html/unnamed-chunk-2-4.png deleted file mode 100644 index 79497455..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-2-4.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-3-1.png b/new_pages/time_series_files/figure-html/unnamed-chunk-3-1.png deleted file mode 100644 index 969228f0..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-3-1.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-3-2.png 
b/new_pages/time_series_files/figure-html/unnamed-chunk-3-2.png deleted file mode 100644 index a1f74cf2..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-3-2.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-3-3.png b/new_pages/time_series_files/figure-html/unnamed-chunk-3-3.png deleted file mode 100644 index 6968c9ed..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-3-3.png and /dev/null differ diff --git a/new_pages/time_series_files/figure-html/unnamed-chunk-3-4.png b/new_pages/time_series_files/figure-html/unnamed-chunk-3-4.png deleted file mode 100644 index 75fd5640..00000000 Binary files a/new_pages/time_series_files/figure-html/unnamed-chunk-3-4.png and /dev/null differ