# A tibble: 4 × 8
term estimate std.error statistic p.value conf.low conf.high gender
@@ -2621,54 +2622,54 @@ gtsummary
The same approach is repeated using gtsummary, however it is easier to produce publication ready tables with gtsummary and compare the two tables with the function tbl_merge()
.
-
#Run model for f
-f_model_gt <- f_linelist %>%
- dplyr::select(vomit, outcome) %>% ## select variables of interest
- tbl_uvregression( ## produce univariate table
- method = glm, ## define regression want to run (generalised linear model)
- y = outcome, ## define outcome variable
- method.args = list(family = binomial), ## define what type of glm want to run (logistic)
- exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)
- )
-
-#Run model for m
-m_model_gt <- m_linelist %>%
- dplyr::select(vomit, outcome) %>% ## select variables of interest
- tbl_uvregression( ## produce univariate table
- method = glm, ## define regression want to run (generalised linear model)
- y = outcome, ## define outcome variable
- method.args = list(family = binomial), ## define what type of glm want to run (logistic)
- exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)
- )
-
-#Combine gtsummary tables
-f_and_m_table <- tbl_merge(
- tbls = list(f_model_gt,
- m_model_gt),
- tab_spanner = c("Female",
- "Male")
-)
-
-#Print
-f_and_m_table
+
#Run model for f
+f_model_gt <- f_linelist %>%
+ dplyr::select(vomit, outcome) %>% ## select variables of interest
+ tbl_uvregression( ## produce univariate table
+ method = glm, ## define regression want to run (generalised linear model)
+ y = outcome, ## define outcome variable
+ method.args = list(family = binomial), ## define what type of glm want to run (logistic)
+ exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)
+ )
+
+#Run model for m
+m_model_gt <- m_linelist %>%
+ dplyr::select(vomit, outcome) %>% ## select variables of interest
+ tbl_uvregression( ## produce univariate table
+ method = glm, ## define regression want to run (generalised linear model)
+ y = outcome, ## define outcome variable
+ method.args = list(family = binomial), ## define what type of glm want to run (logistic)
+ exponentiate = TRUE ## exponentiate to produce odds ratios (rather than log odds)
+ )
+
+#Combine gtsummary tables
+f_and_m_table <- tbl_merge(
+ tbls = list(f_model_gt,
+ m_model_gt),
+ tab_spanner = c("Female",
+ "Male")
+)
+
+#Print
+f_and_m_table
-
-
@@ -3190,9 +3191,9 @@
Conduct m
Here we use glm()
but add more variables to the right side of the equation, separated by plus symbols (+
).
To run the model with all of our explanatory variables we would run:
-
mv_reg <- glm(outcome ~ gender + fever + chills + cough + aches + vomit + age_cat, family = "binomial", data = linelist)
-
-summary(mv_reg)
+
mv_reg <- glm(outcome ~ gender + fever + chills + cough + aches + vomit + age_cat, family = "binomial", data = linelist)
+
+summary(mv_reg)
Call:
@@ -3227,26 +3228,26 @@ Conduct m
If you want to include two variables and an interaction between them you can separate them with an asterisk *
instead of a +
. Separate them with a colon :
if you are only specifying the interaction. For example:
-
glm(outcome ~ gender + age_cat * fever, family = "binomial", data = linelist)
+
glm(outcome ~ gender + age_cat * fever, family = "binomial", data = linelist)
Optionally, you can use this code to leverage the pre-defined vector of column names and re-create the above command using str_c()
. This might be useful if your explanatory variable names are changing, or you don’t want to type them all out again.
-
## run a regression with all variables of interest
-mv_reg <- explanatory_vars %>% ## begin with vector of explanatory column names
- str_c(collapse = "+") %>% ## combine all names of the variables of interest separated by a plus
- str_c("outcome ~ ", .) %>% ## combine the names of variables of interest with outcome in formula style
- glm(family = "binomial", ## define type of glm as logistic,
- data = linelist) ## define your dataset
+
## run a regression with all variables of interest
+mv_reg <- explanatory_vars %>% ## begin with vector of explanatory column names
+ str_c(collapse = "+") %>% ## combine all names of the variables of interest separated by a plus
+ str_c("outcome ~ ", .) %>% ## combine the names of variables of interest with outcome in formula style
+ glm(family = "binomial", ## define type of glm as logistic,
+ data = linelist) ## define your dataset
Building the model
You can build your model step-by-step, saving various models that include certain explanatory variables. You can compare these models with likelihood-ratio tests using lrtest()
from the package lmtest, as below:
NOTE: Using base anova(model1, model2, test = "Chisq)
produces the same results
-
model1 <- glm(outcome ~ age_cat, family = "binomial", data = linelist)
-model2 <- glm(outcome ~ age_cat + gender, family = "binomial", data = linelist)
-
-lmtest::lrtest(model1, model2)
+
model1 <- glm(outcome ~ age_cat, family = "binomial", data = linelist)
+model2 <- glm(outcome ~ age_cat + gender, family = "binomial", data = linelist)
+
+lmtest::lrtest(model1, model2)
Likelihood ratio test
@@ -3259,26 +3260,26 @@ Building the
Another option is to take the model object and apply the step()
function from the stats package. Specify which variable selection direction you want use when building the model.
-
## choose a model using forward selection based on AIC
-## you can also do "backward" or "both" by adjusting the direction
-final_mv_reg <- mv_reg %>%
- step(direction = "forward", trace = FALSE)
+
## choose a model using forward selection based on AIC
+## you can also do "backward" or "both" by adjusting the direction
+final_mv_reg <- mv_reg %>%
+ step(direction = "forward", trace = FALSE)
You can also turn off scientific notation in your R session, for clarity:
As described in the section on univariate analysis, pass the model output to tidy()
to exponentiate the log odds and CIs. Finally we round all numeric columns to two decimal places. Scroll through to see all the rows.
-
mv_tab_base <- final_mv_reg %>%
- broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>% ## get a tidy dataframe of estimates
- mutate(across(where(is.numeric), round, digits = 2)) ## round
+
mv_tab_base <- final_mv_reg %>%
+ broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>% ## get a tidy dataframe of estimates
+ mutate(across(where(is.numeric), round, digits = 2)) ## round
Here is what the resulting data frame looks like:
@@ -3290,30 +3291,30 @@
Combine with gtsummary
The gtsummary package provides the tbl_regression()
function, which will take the outputs from a regression (glm()
in this case) and produce a nice summary table.
-
## show results table of final regression
-mv_tab <- tbl_regression(final_mv_reg, exponentiate = TRUE)
+
## show results table of final regression
+mv_tab <- tbl_regression(final_mv_reg, exponentiate = TRUE)
Let’s see the table:
-
+
-
-
@@ -3877,28 +3878,28 @@
Combine
You can also combine several different output tables produced by gtsummary with the tbl_merge()
function. We now combine the multivariable results with the gtsummary univariate results that we created above:
-
## combine with univariate results
-tbl_merge(
- tbls = list(univ_tab, mv_tab), # combine
- tab_spanner = c("**Univariate**", "**Multivariable**")) # set header names
+
## combine with univariate results
+tbl_merge(
+ tbls = list(univ_tab, mv_tab), # combine
+ tab_spanner = c("**Univariate**", "**Multivariable**")) # set header names
-
-
@@ -4568,23 +4569,23 @@
Combine with
Use round()
with two decimal places on all the column that are class Double.
-
## combine univariate and multivariable tables
-left_join(univ_tab_base, mv_tab_base, by = "term") %>%
- ## choose columns and rename them
- select( # new name = old name
- "characteristic" = term,
- "recovered" = "0",
- "dead" = "1",
- "univ_or" = estimate.x,
- "univ_ci_low" = conf.low.x,
- "univ_ci_high" = conf.high.x,
- "univ_pval" = p.value.x,
- "mv_or" = estimate.y,
- "mvv_ci_low" = conf.low.y,
- "mv_ci_high" = conf.high.y,
- "mv_pval" = p.value.y
- ) %>%
- mutate(across(where(is.double), round, 2))
+
## combine univariate and multivariable tables
+left_join(univ_tab_base, mv_tab_base, by = "term") %>%
+ ## choose columns and rename them
+ select( # new name = old name
+ "characteristic" = term,
+ "recovered" = "0",
+ "dead" = "1",
+ "univ_or" = estimate.x,
+ "univ_ci_low" = conf.low.x,
+ "univ_ci_high" = conf.high.x,
+ "univ_pval" = p.value.x,
+ "mv_or" = estimate.y,
+ "mvv_ci_low" = conf.low.y,
+ "mv_ci_high" = conf.high.y,
+ "mv_pval" = p.value.y
+ ) %>%
+ mutate(across(where(is.double), round, 2))
# A tibble: 20 × 11
characteristic recovered dead univ_or univ_ci_low univ_ci_high univ_pval
@@ -4634,30 +4635,30 @@ ggplot2
Before plotting, you may want to use fct_relevel()
from the forcats package to set the order of the variables/levels on the y-axis. ggplot()
may display them in alpha-numeric order which would not work well for these age category values (“30” would appear before “5”). See the page on Factors for more details.
-
## remove the intercept term from your multivariable results
-mv_tab_base %>%
-
- #set order of levels to appear along y-axis
- mutate(term = fct_relevel(
- term,
- "vomit", "gender", "fever", "cough", "chills", "aches",
- "age_cat5-9", "age_cat10-14", "age_cat15-19", "age_cat20-29",
- "age_cat30-49", "age_cat50-69", "age_cat70+")) %>%
-
- # remove "intercept" row from plot
- filter(term != "(Intercept)") %>%
-
- ## plot with variable on the y axis and estimate (OR) on the x axis
- ggplot(aes(x = estimate, y = term)) +
-
- ## show the estimate as a point
- geom_point() +
-
- ## add in an error bar for the confidence intervals
- geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) +
-
- ## show where OR = 1 is for reference as a dashed line
- geom_vline(xintercept = 1, linetype = "dashed")
+
## remove the intercept term from your multivariable results
+mv_tab_base %>%
+
+ #set order of levels to appear along y-axis
+ mutate(term = fct_relevel(
+ term,
+ "vomit", "gender", "fever", "cough", "chills", "aches",
+ "age_cat5-9", "age_cat10-14", "age_cat15-19", "age_cat20-29",
+ "age_cat30-49", "age_cat50-69", "age_cat70+")) %>%
+
+ # remove "intercept" row from plot
+ filter(term != "(Intercept)") %>%
+
+ ## plot with variable on the y axis and estimate (OR) on the x axis
+ ggplot(aes(x = estimate, y = term)) +
+
+ ## show the estimate as a point
+ geom_point() +
+
+ ## add in an error bar for the confidence intervals
+ geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) +
+
+ ## show where OR = 1 is for reference as a dashed line
+ geom_vline(xintercept = 1, linetype = "dashed")
-
-
also installing the dependency 'repr'
-
-
-
Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
- cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
-
-
-
package 'repr' successfully unpacked and MD5 sums checked
-package 'skimr' successfully unpacked and MD5 sums checked
-
-The downloaded binary packages are in
- C:\Users\ah1114\AppData\Local\Temp\RtmpeoxLnM\downloaded_packages
-
-
-
-
also installing the dependencies 'iterators', 'permute', 'ca', 'foreach', 'gclus', 'qap', 'registry', 'TSP', 'vegan', 'seriation'
-
-
-
Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
- cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
-
-
-
package 'iterators' successfully unpacked and MD5 sums checked
-package 'permute' successfully unpacked and MD5 sums checked
-package 'ca' successfully unpacked and MD5 sums checked
-package 'foreach' successfully unpacked and MD5 sums checked
-package 'gclus' successfully unpacked and MD5 sums checked
-package 'qap' successfully unpacked and MD5 sums checked
-package 'registry' successfully unpacked and MD5 sums checked
-package 'TSP' successfully unpacked and MD5 sums checked
-package 'vegan' successfully unpacked and MD5 sums checked
-package 'seriation' successfully unpacked and MD5 sums checked
-package 'corrr' successfully unpacked and MD5 sums checked
-
-The downloaded binary packages are in
- C:\Users\ah1114\AppData\Local\Temp\RtmpeoxLnM\downloaded_packages
-
-
Import data
We import the dataset of cases from a simulated Ebola epidemic. If you want to follow along, click to download the “clean” linelist (as .rds file). Import your data with the import()
function from the rio package (it accepts many file types like .xlsx, .rds, .csv - see the Import and export page for details).
-
-
Warning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.
-
-
-
-
# import the linelist
-linelist <- import("linelist_cleaned.rds")
+
# import the linelist
+linelist <- import("linelist_cleaned.rds")
The first 50 rows of the linelist are displayed below.
@@ -919,8 +874,8 @@
T-tests
A t-test, also called “Student’s t-Test”, is typically used to determine if there is a significant difference between the means of some numeric variable between two groups. Here we’ll show the syntax to do this test depending on whether the columns are in the same data frame.
Syntax 1: This is the syntax when your numeric and categorical columns are in the same data frame. Provide the numeric column on the left side of the equation and the categorical column on the right side. Specify the dataset to data =
. Optionally, set paired = TRUE
, and conf.level =
(0.95 default), and alternative =
(either “two.sided”, “less”, or “greater”). Enter ?t.test
for more details.
-
## compare mean age by outcome group with a t-test
-t.test(age_years ~ gender, data = linelist)
+
## compare mean age by outcome group with a t-test
+t.test(age_years ~ gender, data = linelist)
Welch Two Sample t-test
@@ -937,26 +892,26 @@ T-tests
Syntax 2: You can compare two separate numeric vectors using this alternative syntax. For example, if the two columns are in different data sets.
-
t.test(df1$age_years, df2$age_years)
+
t.test(df1$age_years, df2$age_years)
You can also use a t-test to determine whether a sample mean is significantly different from some specific value. Here we conduct a one-sample t-test with the known/hypothesized population mean as mu =
:
-
t.test(linelist$age_years, mu = 45)
+
t.test(linelist$age_years, mu = 45)
Shapiro-Wilk test
The Shapiro-Wilk test can be used to determine whether a sample came from a normally-distributed population (an assumption of many other tests and analysis, such as the t-test). However, this can only be used on a sample between 3 and 5000 observations. For larger samples a quantile-quantile plot may be helpful.
-
shapiro.test(linelist$age_years)
+
shapiro.test(linelist$age_years)
Wilcoxon rank sum test
The Wilcoxon rank sum test, also called the Mann–Whitney U test, is often used to help determine if two numeric samples are from the same distribution when their populations are not normally distributed or have unequal variance.
-
## compare age distribution by outcome group with a wilcox test
-wilcox.test(age_years ~ outcome, data = linelist)
+
## compare age distribution by outcome group with a wilcox test
+wilcox.test(age_years ~ outcome, data = linelist)
Wilcoxon rank sum test with continuity correction
@@ -971,8 +926,8 @@ Wilcoxon
Kruskal-Wallis test
The Kruskal-Wallis test is an extension of the Wilcoxon rank sum test that can be used to test for differences in the distribution of more than two samples. When only two samples are used it gives identical results to the Wilcoxon rank sum test.
-
## compare age distribution by outcome group with a kruskal-wallis test
-kruskal.test(age_years ~ outcome, linelist)
+
## compare age distribution by outcome group with a kruskal-wallis test
+kruskal.test(age_years ~ outcome, linelist)
Kruskal-Wallis rank sum test
@@ -986,8 +941,8 @@ Kruskal-Wal
Chi-squared test
Pearson’s Chi-squared test is used in testing for significant differences between categorical croups.
-
## compare the proportions in each group with a chi-squared test
-chisq.test(linelist$gender, linelist$outcome)
+
## compare the proportions in each group with a chi-squared test
+chisq.test(linelist$gender, linelist$outcome)
Pearson's Chi-squared test with Yates' continuity correction
@@ -1006,8 +961,8 @@ Summary stat
The function get_summary_stats()
is a quick way to return summary statistics. Simply pipe your dataset to this function and provide the columns to analyse. If no columns are specified, the statistics are calculated for all columns.
By default, a full range of summary statistics are returned: n, max, min, median, 25%ile, 75%ile, IQR, median absolute deviation (mad), mean, standard deviation, standard error, and a confidence interval of the mean.
-
linelist %>%
- rstatix::get_summary_stats(age, temp)
+
linelist %>%
+ rstatix::get_summary_stats(age, temp)
# A tibble: 2 × 13
variable n min max median q1 q3 iqr mad mean sd se
@@ -1020,9 +975,9 @@ Summary stat
You can specify a subset of summary statistics to return by providing one of the following values to type =
: “full”, “common”, “robust”, “five_number”, “mean_sd”, “mean_se”, “mean_ci”, “median_iqr”, “median_mad”, “quantile”, “mean”, “median”, “min”, “max”.
It can be used with grouped data as well, such that a row is returned for each grouping-variable:
-
linelist %>%
- group_by(hospital) %>%
- rstatix::get_summary_stats(age, temp, type = "common")
+
linelist %>%
+ group_by(hospital) %>%
+ rstatix::get_summary_stats(age, temp, type = "common")
# A tibble: 12 × 11
hospital variable n min max median iqr mean sd se ci
@@ -1047,8 +1002,8 @@ Summary stat
T-test
Use a formula syntax to specify the numeric and categorical columns:
-
linelist %>%
- t_test(age_years ~ gender)
+
linelist %>%
+ t_test(age_years ~ gender)
# A tibble: 1 × 10
.y. group1 group2 n1 n2 statistic df p p.adj p.adj.signif
@@ -1058,8 +1013,8 @@ T-test
Or use ~ 1
and specify mu =
for a one-sample T-test. This can also be done by group.
-
linelist %>%
- t_test(age_years ~ 1, mu = 30)
+
linelist %>%
+ t_test(age_years ~ 1, mu = 30)
# A tibble: 1 × 7
.y. group1 group2 n statistic df p
@@ -1069,9 +1024,9 @@ T-test
If applicable, the statistical tests can be done by group, as shown below:
-
linelist %>%
- group_by(gender) %>%
- t_test(age_years ~ 1, mu = 18)
+
linelist %>%
+ group_by(gender) %>%
+ t_test(age_years ~ 1, mu = 18)
# A tibble: 3 × 8
gender .y. group1 group2 n statistic df p
@@ -1086,9 +1041,9 @@ T-test
Shapiro-Wilk test
As stated above, sample size must be between 3 and 5000.
-
linelist %>%
- head(500) %>% # first 500 rows of case linelist, for example only
- shapiro_test(age_years)
+
linelist %>%
+ head(500) %>% # first 500 rows of case linelist, for example only
+ shapiro_test(age_years)
# A tibble: 1 × 3
variable statistic p
@@ -1100,8 +1055,8 @@ Shapiro-Wil
Wilcoxon rank sum test
-
linelist %>%
- wilcox_test(age_years ~ gender)
+
linelist %>%
+ wilcox_test(age_years ~ gender)
# A tibble: 1 × 9
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
@@ -1114,8 +1069,8 @@ Wilcox
Kruskal-Wallis test
Also known as the Mann-Whitney U test.
-
linelist %>%
- kruskal_test(age_years ~ outcome)
+
linelist %>%
+ kruskal_test(age_years ~ outcome)
# A tibble: 1 × 6
.y. n statistic df p method
@@ -1128,10 +1083,10 @@ Kruskal-W
Chi-squared test
The chi-square test function accepts a table, so first we create a cross-tabulation. There are many ways to create a cross-tabulation (see Descriptive tables) but here we use tabyl()
from janitor and remove the left-most column of value labels before passing to chisq_test()
.
-
linelist %>%
- tabyl(gender, outcome) %>%
- select(-1) %>%
- chisq_test()
+
linelist %>%
+ tabyl(gender, outcome) %>%
+ select(-1) %>%
+ chisq_test()
# A tibble: 1 × 6
n statistic p df method p.signif
@@ -1150,34 +1105,31 @@ Chi-squared test
Compare the proportions of a categorical variable in two groups. The default statistical test for add_p()
when applied to a categorical variable is to perform a chi-squared test of independence with continuity correction, but if any expected call count is below 5 then a Fisher’s exact test is used.
-
linelist %>%
- select(gender, outcome) %>% # keep variables of interest
- tbl_summary(by = outcome) %>% # produce summary table and specify grouping variable
- add_p() # specify what test to perform
+
linelist %>%
+ select(gender, outcome) %>% # keep variables of interest
+ tbl_summary(by = outcome) %>% # produce summary table and specify grouping variable
+ add_p() # specify what test to perform
-
1323 missing rows in the "outcome" column have been removed.
-The following errors were returned during `add_p()`:
-✖ For variable `gender` (`outcome`) and "p.value" statistic: The package
- "cardx" (>= 0.2.1) is required.
+
1323 missing rows in the "outcome" column have been removed.
-
-
@@ -1633,7 +1585,8 @@
Chi-squared
N = 1,983
p-value |
+
" class="gt_col_heading gt_columns_bottom_border gt_center" data-quarto-table-cell-role="th" scope="col">
p-value
+
@@ -1643,8 +1596,7 @@ Chi-squared
|
-
- |
+>0.9 |
f |
@@ -1672,6 +1624,10 @@ Chi-squared
+
+
+
@@ -1684,36 +1640,33 @@ Chi-squared
T-tests
Compare the difference in means for a continuous variable in two groups. For example, compare the mean age by patient outcome.
-
linelist %>%
- select(age_years, outcome) %>% # keep variables of interest
- tbl_summary( # produce summary table
- statistic = age_years ~ "{mean} ({sd})", # specify what statistics to show
- by = outcome) %>% # specify the grouping variable
- add_p(age_years ~ "t.test") # specify what tests to perform
+
linelist %>%
+ select(age_years, outcome) %>% # keep variables of interest
+ tbl_summary( # produce summary table
+ statistic = age_years ~ "{mean} ({sd})", # specify what statistics to show
+ by = outcome) %>% # specify the grouping variable
+ add_p(age_years ~ "t.test") # specify what tests to perform
-
1323 missing rows in the "outcome" column have been removed.
-The following errors were returned during `add_p()`:
-✖ For variable `age_years` (`outcome`) and "p.value" statistic: The package
- "cardx" (>= 0.2.1) is required.
+
1323 missing rows in the "outcome" column have been removed.
-
-
@@ -2169,7 +2122,8 @@
T-tests
N = 1,983
p-value |
+
" class="gt_col_heading gt_columns_bottom_border gt_center" data-quarto-table-cell-role="th" scope="col">
p-value
+
@@ -2177,8 +2131,7 @@ T-tests
age_years |
16 (12) |
16 (13) |
-
- |
+0.6 |
Unknown |
@@ -2192,6 +2145,10 @@ T-tests
+
+
+
@@ -2204,36 +2161,33 @@ T-tests
Wilcoxon rank sum test
Compare the distribution of a continuous variable in two groups. The default is to use the Wilcoxon rank sum test and the median (IQR) when comparing two groups. However for non-normally distributed data or comparing multiple groups, the Kruskal-wallis test is more appropriate.
-
linelist %>%
- select(age_years, outcome) %>% # keep variables of interest
- tbl_summary( # produce summary table
- statistic = age_years ~ "{median} ({p25}, {p75})", # specify what statistic to show (this is default so could remove)
- by = outcome) %>% # specify the grouping variable
- add_p(age_years ~ "wilcox.test") # specify what test to perform (default so could leave brackets empty)
+
linelist %>%
+ select(age_years, outcome) %>% # keep variables of interest
+ tbl_summary( # produce summary table
+ statistic = age_years ~ "{median} ({p25}, {p75})", # specify what statistic to show (this is default so could remove)
+ by = outcome) %>% # specify the grouping variable
+ add_p(age_years ~ "wilcox.test") # specify what test to perform (default so could leave brackets empty)
-
1323 missing rows in the "outcome" column have been removed.
-The following errors were returned during `add_p()`:
-✖ For variable `age_years` (`outcome`) and "p.value" statistic: The package
- "cardx" (>= 0.2.1) is required.
+
1323 missing rows in the "outcome" column have been removed.
-
-
@@ -2689,7 +2643,8 @@
Wilcox
N = 1,983
p-value |
+
" class="gt_col_heading gt_columns_bottom_border gt_center" data-quarto-table-cell-role="th" scope="col">
p-value
+
@@ -2697,8 +2652,7 @@ Wilcox
age_years |
13 (6, 23) |
13 (6, 23) |
-
- |
+0.8 |
Unknown |
@@ -2712,6 +2666,10 @@ Wilcox
+
+
+
@@ -2724,36 +2682,33 @@ Wilcox
Kruskal-wallis test
Compare the distribution of a continuous variable in two or more groups, regardless of whether the data is normally distributed.
-
linelist %>%
- select(age_years, outcome) %>% # keep variables of interest
- tbl_summary( # produce summary table
- statistic = age_years ~ "{median} ({p25}, {p75})", # specify what statistic to show (default, so could remove)
- by = outcome) %>% # specify the grouping variable
- add_p(age_years ~ "kruskal.test") # specify what test to perform
+
linelist %>%
+ select(age_years, outcome) %>% # keep variables of interest
+ tbl_summary( # produce summary table
+ statistic = age_years ~ "{median} ({p25}, {p75})", # specify what statistic to show (default, so could remove)
+ by = outcome) %>% # specify the grouping variable
+ add_p(age_years ~ "kruskal.test") # specify what test to perform
-
1323 missing rows in the "outcome" column have been removed.
-The following errors were returned during `add_p()`:
-✖ For variable `age_years` (`outcome`) and "p.value" statistic: The package
- "cardx" (>= 0.2.1) is required.
+
1323 missing rows in the "outcome" column have been removed.
-
-
@@ -3209,7 +3164,8 @@
Kruskal-w
N = 1,983
p-value |
+
" class="gt_col_heading gt_columns_bottom_border gt_center" data-quarto-table-cell-role="th" scope="col">
p-value
+
@@ -3217,8 +3173,7 @@ Kruskal-w
age_years |
13 (6, 23) |
13 (6, 23) |
-
- |
+0.8 |
Unknown |
@@ -3232,6 +3187,10 @@ Kruskal-w
+
+
+
@@ -3371,11 +3330,11 @@ Correlation between numeric variables can be investigated using the tidyverse
corrr package. It allows you to compute correlations using Pearson, Kendall tau or Spearman rho. The package creates a table and also has a function to automatically plot the values.
-
correlation_tab <- linelist %>%
- select(generation, age, ct_blood, days_onset_hosp, wt_kg, ht_cm) %>% # keep numeric variables of interest
- correlate() # create correlation table (using default pearson)
-
-correlation_tab # print
+
correlation_tab <- linelist %>%
+ select(generation, age, ct_blood, days_onset_hosp, wt_kg, ht_cm) %>% # keep numeric variables of interest
+ correlate() # create correlation table (using default pearson)
+
+correlation_tab # print
# A tibble: 6 × 7
term generation age ct_blood days_onset_hosp wt_kg ht_cm
@@ -3387,12 +3346,12 @@
-
## remove duplicate entries (the table above is mirrored)
-correlation_tab <- correlation_tab %>%
- shave()
-
-## view correlation table
-correlation_tab
+
## remove duplicate entries (the table above is mirrored)
+correlation_tab <- correlation_tab %>%
+ shave()
+
+## view correlation table
+correlation_tab
# A tibble: 6 × 7
term generation age ct_blood days_onset_hosp wt_kg ht_cm
@@ -3404,8 +3363,8 @@
-
## plot correlations
-rplot(correlation_tab)
+
## plot correlations
+rplot(correlation_tab)
@@ -4018,7 +3977,7 @@
+
+
@@ -687,43 +693,43 @@
@@ -846,9 +852,9 @@
Packages
Load data
The example dataset used in this section:
-- fictional mortality survey data.
-- fictional population counts for the survey area.
-- data dictionary for the fictional mortality survey data.
+- Fictional mortality survey data.
+- Fictional population counts for the survey area.
+- Data dictionary for the fictional mortality survey data.
This is based off the MSF OCA ethical review board pre-approved survey. The fictional dataset was produced as part of the “R4Epis” project. This is all based off data collected using KoboToolbox, which is a data collection software based off Open Data Kit.
Kobo allows you to export both the collected data, as well as the data dictionary for that dataset. We strongly recommend doing this as it simplifies data cleaning and is useful for looking up variables/questions.
@@ -865,8 +871,8 @@
Load data
The first 10 rows of the survey are displayed below.
We also want to import the data on sampling population so that we can produce appropriate weights. This data can be in different formats, however we would suggest to have it as seen below (this can just be typed in to an excel).
@@ -877,8 +883,8 @@
Load data
The first 10 rows of the survey are displayed below.
For cluster surveys you may want to add survey weights at the cluster level. You could read this data in as above. Alternatively if there are only a few counts, these could be entered as below in to a tibble. In any case you will need to have one column with a cluster identifier which matches your survey data, and another column with the number of households in each cluster.
@@ -942,19 +948,6 @@
Clean data
mutate(across(all_of(YNVARS),
str_detect,
pattern = "yes"))
-
-
Warning: There was 1 warning in `mutate()`.
-ℹ In argument: `across(all_of(YNVARS), str_detect, pattern = "yes")`.
-Caused by warning:
-! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
-Supply arguments directly to `.fns` through an anonymous function instead.
-
- # Previously
- across(a:b, mean, na.rm = TRUE)
-
- # Now
- across(a:b, \(x) mean(x, na.rm = TRUE))
-
@@ -969,79 +962,79 @@ For mortality surveys we want to now how long each individual was present for in the location to be able to calculate an appropriate mortality rate for our period of interest. This is not relevant to all surveys, but particularly for mortality surveys this is important as they are conducted frequently among mobile or displaced populations.
To do this we first define our time period of interest, also known as a recall period (i.e. the time that participants are asked to report on when answering questions). We can then use this period to set inappropriate dates to missing, i.e. if deaths are reported from outside the period of interest.
-
## set the start/end of recall period
-## can be changed to date variables from dataset
-## (e.g. arrival date & date questionnaire)
-survey_data <- survey_data %>%
- mutate(recall_start = as.Date("2018-01-01"),
- recall_end = as.Date("2018-05-01")
- )
-
-
-# set inappropriate dates to NA based on rules
-## e.g. arrivals before start, departures departures after end
-survey_data <- survey_data %>%
- mutate(
- arrived_date = if_else(arrived_date < recall_start,
- as.Date(NA),
- arrived_date),
- birthday_date = if_else(birthday_date < recall_start,
- as.Date(NA),
- birthday_date),
- left_date = if_else(left_date > recall_end,
- as.Date(NA),
- left_date),
- death_date = if_else(death_date > recall_end,
- as.Date(NA),
- death_date)
- )
+
## set the start/end of recall period
+## can be changed to date variables from dataset
+## (e.g. arrival date & date questionnaire)
+survey_data <- survey_data %>%
+ mutate(recall_start = as.Date("2018-01-01"),
+ recall_end = as.Date("2018-05-01")
+ )
+
+
+# set inappropriate dates to NA based on rules
+## e.g. arrivals before start, departures departures after end
+survey_data <- survey_data %>%
+ mutate(
+ arrived_date = if_else(arrived_date < recall_start,
+ as.Date(NA),
+ arrived_date),
+ birthday_date = if_else(birthday_date < recall_start,
+ as.Date(NA),
+ birthday_date),
+ left_date = if_else(left_date > recall_end,
+ as.Date(NA),
+ left_date),
+ death_date = if_else(death_date > recall_end,
+ as.Date(NA),
+ death_date)
+ )
We can then use our date variables to define start and end dates for each individual. We can use the find_start_date()
function from sitrep to fine the causes for the dates and then use that to calculate the difference between days (person-time).
start date: Earliest appropriate arrival event within your recall period. Either the beginning of your recall period (which you define in advance), or a date after the start of recall if applicable (e.g. arrivals or births).
end date: Earliest appropriate departure event within your recall period. Either the end of your recall period, or a date before the end of recall if applicable (e.g. departures, deaths).
-
## create new variables for start and end dates/causes
-survey_data <- survey_data %>%
- ## choose earliest date entered in survey
- ## from births, household arrivals, and camp arrivals
- find_start_date("birthday_date",
- "arrived_date",
- period_start = "recall_start",
- period_end = "recall_end",
- datecol = "startdate",
- datereason = "startcause"
- ) %>%
- ## choose earliest date entered in survey
- ## from camp departures, death and end of the study
- find_end_date("left_date",
- "death_date",
- period_start = "recall_start",
- period_end = "recall_end",
- datecol = "enddate",
- datereason = "endcause"
- )
-
-
-## label those that were present at the start/end (except births/deaths)
-survey_data <- survey_data %>%
- mutate(
- ## fill in start date to be the beginning of recall period (for those empty)
- startdate = if_else(is.na(startdate), recall_start, startdate),
- ## set the start cause to present at start if equal to recall period
- ## unless it is equal to the birth date
- startcause = if_else(startdate == recall_start & startcause != "birthday_date",
- "Present at start", startcause),
- ## fill in end date to be end of recall period (for those empty)
- enddate = if_else(is.na(enddate), recall_end, enddate),
- ## set the end cause to present at end if equall to recall end
- ## unless it is equal to the death date
- endcause = if_else(enddate == recall_end & endcause != "death_date",
- "Present at end", endcause))
-
-
-## Define observation time in days
-survey_data <- survey_data %>%
- mutate(obstime = as.numeric(enddate - startdate))
+
## create new variables for start and end dates/causes
+survey_data <- survey_data %>%
+ ## choose earliest date entered in survey
+ ## from births, household arrivals, and camp arrivals
+ find_start_date("birthday_date",
+ "arrived_date",
+ period_start = "recall_start",
+ period_end = "recall_end",
+ datecol = "startdate",
+ datereason = "startcause"
+ ) %>%
+ ## choose earliest date entered in survey
+ ## from camp departures, death and end of the study
+ find_end_date("left_date",
+ "death_date",
+ period_start = "recall_start",
+ period_end = "recall_end",
+ datecol = "enddate",
+ datereason = "endcause"
+ )
+
+
+## label those that were present at the start/end (except births/deaths)
+survey_data <- survey_data %>%
+ mutate(
+ ## fill in start date to be the beginning of recall period (for those empty)
+ startdate = if_else(is.na(startdate), recall_start, startdate),
+ ## set the start cause to present at start if equal to recall period
+ ## unless it is equal to the birth date
+ startcause = if_else(startdate == recall_start & startcause != "birthday_date",
+ "Present at start", startcause),
+ ## fill in end date to be end of recall period (for those empty)
+ enddate = if_else(is.na(enddate), recall_end, enddate),
+ ## set the end cause to present at end if equall to recall end
+ ## unless it is equal to the death date
+ endcause = if_else(enddate == recall_end & endcause != "death_date",
+ "Present at end", endcause))
+
+
+## Define observation time in days
+survey_data <- survey_data %>%
+ mutate(obstime = as.numeric(enddate - startdate))
@@ -1051,52 +1044,52 @@ DANGER: You cant have missing values in your weight variable, or any of the variables relevant to your survey design (e.g. age, sex, strata or cluster variables).
-
## store the cases that you drop so you can describe them (e.g. non-consenting
-## or wrong village/cluster)
-dropped <- survey_data %>%
- filter(!consent | is.na(startdate) | is.na(enddate) | village_name == "other")
-
-## use the dropped cases to remove the unused rows from the survey data set
-survey_data <- anti_join(survey_data, dropped, by = names(dropped))
+
## store the cases that you drop so you can describe them (e.g. non-consenting
+## or wrong village/cluster)
+dropped <- survey_data %>%
+ filter(!consent | is.na(startdate) | is.na(enddate) | village_name == "other")
+
+## use the dropped cases to remove the unused rows from the survey data set
+survey_data <- anti_join(survey_data, dropped, by = names(dropped))
As mentioned above we demonstrate how to add weights for three different study designs (stratified, cluster and stratified cluster). These require information on the source population and/or the clusters surveyed. We will use the stratified cluster code for this example, but use whichever is most appropriate for your study design.
-
# stratified ------------------------------------------------------------------
-# create a variable called "surv_weight_strata"
-# contains weights for each individual - by age group, sex and health district
-survey_data <- add_weights_strata(x = survey_data,
- p = population,
- surv_weight = "surv_weight_strata",
- surv_weight_ID = "surv_weight_ID_strata",
- age_group, sex, health_district)
-
-## cluster ---------------------------------------------------------------------
-
-# get the number of people of individuals interviewed per household
-# adds a variable with counts of the household (parent) index variable
-survey_data <- survey_data %>%
- add_count(index, name = "interviewed")
-
-
-## create cluster weights
-survey_data <- add_weights_cluster(x = survey_data,
- cl = cluster_counts,
- eligible = member_number,
- interviewed = interviewed,
- cluster_x = village_name,
- cluster_cl = cluster,
- household_x = index,
- household_cl = households,
- surv_weight = "surv_weight_cluster",
- surv_weight_ID = "surv_weight_ID_cluster",
- ignore_cluster = FALSE,
- ignore_household = FALSE)
-
-
-# stratified and cluster ------------------------------------------------------
-# create a survey weight for cluster and strata
-survey_data <- survey_data %>%
- mutate(surv_weight_cluster_strata = surv_weight_strata * surv_weight_cluster)
+
# stratified ------------------------------------------------------------------
+# create a variable called "surv_weight_strata"
+# contains weights for each individual - by age group, sex and health district
+survey_data <- add_weights_strata(x = survey_data,
+ p = population,
+ surv_weight = "surv_weight_strata",
+ surv_weight_ID = "surv_weight_ID_strata",
+ age_group, sex, health_district)
+
+## cluster ---------------------------------------------------------------------
+
+# get the number of people of individuals interviewed per household
+# adds a variable with counts of the household (parent) index variable
+survey_data <- survey_data %>%
+ add_count(index, name = "interviewed")
+
+
+## create cluster weights
+survey_data <- add_weights_cluster(x = survey_data,
+ cl = cluster_counts,
+ eligible = member_number,
+ interviewed = interviewed,
+ cluster_x = village_name,
+ cluster_cl = cluster,
+ household_x = index,
+ household_cl = households,
+ surv_weight = "surv_weight_cluster",
+ surv_weight_ID = "surv_weight_ID_cluster",
+ ignore_cluster = FALSE,
+ ignore_household = FALSE)
+
+
+# stratified and cluster ------------------------------------------------------
+# create a survey weight for cluster and strata
+survey_data <- survey_data %>%
+ mutate(surv_weight_cluster_strata = surv_weight_strata * surv_weight_cluster)
@@ -1112,64 +1105,64 @@ The survey package effectively uses base R coding, and so it is not possible to use pipes (%>%
) or other dplyr syntax. With the survey package we use the svydesign()
function to define a survey object with appropriate clusters, weights and strata.
NOTE: we need to use the tilde (~
) in front of variables, this is because the package uses the base R syntax of assigning variables based on formulae.
-
# simple random ---------------------------------------------------------------
-base_survey_design_simple <- svydesign(ids = ~1, # 1 for no cluster ids
- weights = NULL, # No weight added
- strata = NULL, # sampling was simple (no strata)
- data = survey_data # have to specify the dataset
- )
-
-## stratified ------------------------------------------------------------------
-base_survey_design_strata <- svydesign(ids = ~1, # 1 for no cluster ids
- weights = ~surv_weight_strata, # weight variable created above
- strata = ~health_district, # sampling was stratified by district
- data = survey_data # have to specify the dataset
- )
-
-# cluster ---------------------------------------------------------------------
-base_survey_design_cluster <- svydesign(ids = ~village_name, # cluster ids
- weights = ~surv_weight_cluster, # weight variable created above
- strata = NULL, # sampling was simple (no strata)
- data = survey_data # have to specify the dataset
- )
-
-# stratified cluster ----------------------------------------------------------
-base_survey_design <- svydesign(ids = ~village_name, # cluster ids
- weights = ~surv_weight_cluster_strata, # weight variable created above
- strata = ~health_district, # sampling was stratified by district
- data = survey_data # have to specify the dataset
- )
+
# simple random ---------------------------------------------------------------
+base_survey_design_simple <- svydesign(ids = ~1, # 1 for no cluster ids
+ weights = NULL, # No weight added
+ strata = NULL, # sampling was simple (no strata)
+ data = survey_data # have to specify the dataset
+ )
+
+## stratified ------------------------------------------------------------------
+base_survey_design_strata <- svydesign(ids = ~1, # 1 for no cluster ids
+ weights = ~surv_weight_strata, # weight variable created above
+ strata = ~health_district, # sampling was stratified by district
+ data = survey_data # have to specify the dataset
+ )
+
+# cluster ---------------------------------------------------------------------
+base_survey_design_cluster <- svydesign(ids = ~village_name, # cluster ids
+ weights = ~surv_weight_cluster, # weight variable created above
+ strata = NULL, # sampling was simple (no strata)
+ data = survey_data # have to specify the dataset
+ )
+
+# stratified cluster ----------------------------------------------------------
+base_survey_design <- svydesign(ids = ~village_name, # cluster ids
+ weights = ~surv_weight_cluster_strata, # weight variable created above
+ strata = ~health_district, # sampling was stratified by district
+ data = survey_data # have to specify the dataset
+ )
Srvyr package
With the srvyr package we can use the as_survey_design()
function, which has all the same arguments as above but allows pipes (%>%
), and so we do not need to use the tilde (~
).
-
## simple random ---------------------------------------------------------------
-survey_design_simple <- survey_data %>%
- as_survey_design(ids = 1, # 1 for no cluster ids
- weights = NULL, # No weight added
- strata = NULL # sampling was simple (no strata)
- )
-## stratified ------------------------------------------------------------------
-survey_design_strata <- survey_data %>%
- as_survey_design(ids = 1, # 1 for no cluster ids
- weights = surv_weight_strata, # weight variable created above
- strata = health_district # sampling was stratified by district
- )
-## cluster ---------------------------------------------------------------------
-survey_design_cluster <- survey_data %>%
- as_survey_design(ids = village_name, # cluster ids
- weights = surv_weight_cluster, # weight variable created above
- strata = NULL # sampling was simple (no strata)
- )
-
-## stratified cluster ----------------------------------------------------------
-survey_design <- survey_data %>%
- as_survey_design(ids = village_name, # cluster ids
- weights = surv_weight_cluster_strata, # weight variable created above
- strata = health_district # sampling was stratified by district
- )
+
## simple random ---------------------------------------------------------------
+survey_design_simple <- survey_data %>%
+ as_survey_design(ids = 1, # 1 for no cluster ids
+ weights = NULL, # No weight added
+ strata = NULL # sampling was simple (no strata)
+ )
+## stratified ------------------------------------------------------------------
+survey_design_strata <- survey_data %>%
+ as_survey_design(ids = 1, # 1 for no cluster ids
+ weights = surv_weight_strata, # weight variable created above
+ strata = health_district # sampling was stratified by district
+ )
+## cluster ---------------------------------------------------------------------
+survey_design_cluster <- survey_data %>%
+ as_survey_design(ids = village_name, # cluster ids
+ weights = surv_weight_cluster, # weight variable created above
+ strata = NULL # sampling was simple (no strata)
+ )
+
+## stratified cluster ----------------------------------------------------------
+survey_design <- survey_data %>%
+ as_survey_design(ids = village_name, # cluster ids
+ weights = surv_weight_cluster_strata, # weight variable created above
+ strata = health_district # sampling was stratified by district
+ )
@@ -1190,52 +1183,52 @@ Compare the proportions in each age group between your sample and the source population. This is important to be able to highlight potential sampling bias. You could similarly repeat this looking at distributions by sex.
Note that these p-values are just indicative, and a descriptive discussion (or visualisation with age-pyramids below) of the distributions in your study sample compared to the source population is more important than the binomial test itself. This is because increasing sample size will more often than not lead to differences that may be irrelevant after weighting your data.
-
## counts and props of the study population
-ag <- survey_data %>%
- group_by(age_group) %>%
- drop_na(age_group) %>%
- tally() %>%
- mutate(proportion = n / sum(n),
- n_total = sum(n))
-
-## counts and props of the source population
-propcount <- population %>%
- group_by(age_group) %>%
- tally(population) %>%
- mutate(proportion = n / sum(n))
-
-## bind together the columns of two tables, group by age, and perform a
-## binomial test to see if n/total is significantly different from population
-## proportion.
- ## suffix here adds to text to the end of columns in each of the two datasets
-left_join(ag, propcount, by = "age_group", suffix = c("", "_pop")) %>%
- group_by(age_group) %>%
- ## broom::tidy(binom.test()) makes a data frame out of the binomial test and
- ## will add the variables p.value, parameter, conf.low, conf.high, method, and
- ## alternative. We will only use p.value here. You can include other
- ## columns if you want to report confidence intervals
- mutate(binom = list(broom::tidy(binom.test(n, n_total, proportion_pop)))) %>%
- unnest(cols = c(binom)) %>% # important for expanding the binom.test data frame
- mutate(proportion_pop = proportion_pop * 100) %>%
- ## Adjusting the p-values to correct for false positives
- ## (because testing multiple age groups). This will only make
- ## a difference if you have many age categories
- mutate(p.value = p.adjust(p.value, method = "holm")) %>%
-
- ## Only show p-values over 0.001 (those under report as <0.001)
- mutate(p.value = ifelse(p.value < 0.001,
- "<0.001",
- as.character(round(p.value, 3)))) %>%
-
- ## rename the columns appropriately
- select(
- "Age group" = age_group,
- "Study population (n)" = n,
- "Study population (%)" = proportion,
- "Source population (n)" = n_pop,
- "Source population (%)" = proportion_pop,
- "P-value" = p.value
- )
+
## counts and props of the study population
+ag <- survey_data %>%
+ group_by(age_group) %>%
+ drop_na(age_group) %>%
+ tally() %>%
+ mutate(proportion = n / sum(n),
+ n_total = sum(n))
+
+## counts and props of the source population
+propcount <- population %>%
+ group_by(age_group) %>%
+ tally(population) %>%
+ mutate(proportion = n / sum(n))
+
+## bind together the columns of two tables, group by age, and perform a
+## binomial test to see if n/total is significantly different from population
+## proportion.
+ ## suffix here adds to text to the end of columns in each of the two datasets
+left_join(ag, propcount, by = "age_group", suffix = c("", "_pop")) %>%
+ group_by(age_group) %>%
+ ## broom::tidy(binom.test()) makes a data frame out of the binomial test and
+ ## will add the variables p.value, parameter, conf.low, conf.high, method, and
+ ## alternative. We will only use p.value here. You can include other
+ ## columns if you want to report confidence intervals
+ mutate(binom = list(broom::tidy(binom.test(n, n_total, proportion_pop)))) %>%
+ unnest(cols = c(binom)) %>% # important for expanding the binom.test data frame
+ mutate(proportion_pop = proportion_pop * 100) %>%
+ ## Adjusting the p-values to correct for false positives
+ ## (because testing multiple age groups). This will only make
+ ## a difference if you have many age categories
+ mutate(p.value = p.adjust(p.value, method = "holm")) %>%
+
+ ## Only show p-values over 0.001 (those under report as <0.001)
+ mutate(p.value = ifelse(p.value < 0.001,
+ "<0.001",
+ as.character(round(p.value, 3)))) %>%
+
+ ## rename the columns appropriately
+ select(
+ "Age group" = age_group,
+ "Study population (n)" = n,
+ "Study population (%)" = proportion,
+ "Source population (n)" = n_pop,
+ "Source population (%)" = proportion_pop,
+ "P-value" = p.value
+ )
# A tibble: 5 × 6
# Groups: Age group [5]
@@ -1257,108 +1250,108 @@
As with the formal binomial test of difference, seen above in the sampling bias section, we are interested here in visualising whether our sampled population is substantially different from the source population and whether weighting corrects this difference. To do this we will use the patchwork package to show our ggplot visualisations side-by-side; for details see the section on combining plots in ggplot tips chapter of the handbook. We will visualise our source population, our un-weighted survey population and our weighted survey population. You may also consider visualising by each strata of your survey - in our example here that would be by using the argument stack_by = "health_district"
(see ?plot_age_pyramid
for details).
NOTE: The x and y axes are flipped in pyramids
-
## define x-axis limits and labels ---------------------------------------------
-## (update these numbers to be the values for your graph)
-max_prop <- 35 # choose the highest proportion you want to show
-step <- 5 # choose the space you want beween labels
-
-## this part defines vector using the above numbers with axis breaks
-breaks <- c(
- seq(max_prop/100 * -1, 0 - step/100, step/100),
- 0,
- seq(0 + step / 100, max_prop/100, step/100)
- )
-
-## this part defines vector using the above numbers with axis limits
-limits <- c(max_prop/100 * -1, max_prop/100)
-
-## this part defines vector using the above numbers with axis labels
-labels <- c(
- seq(max_prop, step, -step),
- 0,
- seq(step, max_prop, step)
- )
-
-
-## create plots individually --------------------------------------------------
-
-## plot the source population
-## nb: this needs to be collapsed for the overall population (i.e. removing health districts)
-source_population <- population %>%
- ## ensure that age and sex are factors
- mutate(age_group = factor(age_group,
- levels = c("0-2",
- "3-14",
- "15-29",
- "30-44",
- "45+")),
- sex = factor(sex)) %>%
- group_by(age_group, sex) %>%
- ## add the counts for each health district together
- summarise(population = sum(population)) %>%
- ## remove the grouping so can calculate overall proportion
- ungroup() %>%
- mutate(proportion = population / sum(population)) %>%
- ## plot pyramid
- age_pyramid(
- age_group = age_group,
- split_by = sex,
- count = proportion,
- proportional = TRUE) +
- ## only show the y axis label (otherwise repeated in all three plots)
- labs(title = "Source population",
- y = "",
- x = "Age group (years)") +
- ## make the x axis the same for all plots
- scale_y_continuous(breaks = breaks,
- limits = limits,
- labels = labels)
-
-
-## plot the unweighted sample population
-sample_population <- age_pyramid(survey_data,
- age_group = "age_group",
- split_by = "sex",
- proportion = TRUE) +
- ## only show the x axis label (otherwise repeated in all three plots)
- labs(title = "Unweighted sample population",
- y = "Proportion (%)",
- x = "") +
- ## make the x axis the same for all plots
- scale_y_continuous(breaks = breaks,
- limits = limits,
- labels = labels)
-
-
-## plot the weighted sample population
-weighted_population <- survey_design %>%
- ## make sure the variables are factors
- mutate(age_group = factor(age_group),
- sex = factor(sex)) %>%
- age_pyramid(
- age_group = "age_group",
- split_by = "sex",
- proportion = TRUE) +
- ## only show the x axis label (otherwise repeated in all three plots)
- labs(title = "Weighted sample population",
- y = "",
- x = "") +
- ## make the x axis the same for all plots
- scale_y_continuous(breaks = breaks,
- limits = limits,
- labels = labels)
-
-## combine all three plots ----------------------------------------------------
-## combine three plots next to eachother using +
-source_population + sample_population + weighted_population +
- ## only show one legend and define theme
- ## note the use of & for combining theme with plot_layout()
- plot_layout(guides = "collect") &
- theme(legend.position = "bottom", # move legend to bottom
- legend.title = element_blank(), # remove title
- text = element_text(size = 18), # change text size
- axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1) # turn x-axis text
- )
+
## define x-axis limits and labels ---------------------------------------------
+## (update these numbers to be the values for your graph)
+max_prop <- 35 # choose the highest proportion you want to show
+step <- 5 # choose the space you want beween labels
+
+## this part defines vector using the above numbers with axis breaks
+breaks <- c(
+ seq(max_prop/100 * -1, 0 - step/100, step/100),
+ 0,
+ seq(0 + step / 100, max_prop/100, step/100)
+ )
+
+## this part defines vector using the above numbers with axis limits
+limits <- c(max_prop/100 * -1, max_prop/100)
+
+## this part defines vector using the above numbers with axis labels
+labels <- c(
+ seq(max_prop, step, -step),
+ 0,
+ seq(step, max_prop, step)
+ )
+
+
+## create plots individually --------------------------------------------------
+
+## plot the source population
+## nb: this needs to be collapsed for the overall population (i.e. removing health districts)
+source_population <- population %>%
+ ## ensure that age and sex are factors
+ mutate(age_group = factor(age_group,
+ levels = c("0-2",
+ "3-14",
+ "15-29",
+ "30-44",
+ "45+")),
+ sex = factor(sex)) %>%
+ group_by(age_group, sex) %>%
+ ## add the counts for each health district together
+ summarise(population = sum(population)) %>%
+ ## remove the grouping so can calculate overall proportion
+ ungroup() %>%
+ mutate(proportion = population / sum(population)) %>%
+ ## plot pyramid
+ age_pyramid(
+ age_group = age_group,
+ split_by = sex,
+ count = proportion,
+ proportional = TRUE) +
+ ## only show the y axis label (otherwise repeated in all three plots)
+ labs(title = "Source population",
+ y = "",
+ x = "Age group (years)") +
+ ## make the x axis the same for all plots
+ scale_y_continuous(breaks = breaks,
+ limits = limits,
+ labels = labels)
+
+
+## plot the unweighted sample population
+sample_population <- age_pyramid(survey_data,
+ age_group = "age_group",
+ split_by = "sex",
+ proportion = TRUE) +
+ ## only show the x axis label (otherwise repeated in all three plots)
+ labs(title = "Unweighted sample population",
+ y = "Proportion (%)",
+ x = "") +
+ ## make the x axis the same for all plots
+ scale_y_continuous(breaks = breaks,
+ limits = limits,
+ labels = labels)
+
+
+## plot the weighted sample population
+weighted_population <- survey_design %>%
+ ## make sure the variables are factors
+ mutate(age_group = factor(age_group),
+ sex = factor(sex)) %>%
+ age_pyramid(
+ age_group = "age_group",
+ split_by = "sex",
+ proportion = TRUE) +
+ ## only show the x axis label (otherwise repeated in all three plots)
+ labs(title = "Weighted sample population",
+ y = "",
+ x = "") +
+ ## make the x axis the same for all plots
+ scale_y_continuous(breaks = breaks,
+ limits = limits,
+ labels = labels)
+
+## combine all three plots ----------------------------------------------------
+## combine three plots next to eachother using +
+source_population + sample_population + weighted_population +
+ ## only show one legend and define theme
+ ## note the use of & for combining theme with plot_layout()
+ plot_layout(guides = "collect") &
+ theme(legend.position = "bottom", # move legend to bottom
+ legend.title = element_blank(), # remove title
+ text = element_text(size = 18), # change text size
+ axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1) # turn x-axis text
+ )
@@ -1372,28 +1365,28 @@
Alluvial/sankey diagram
Visualising starting points and outcomes for individuals can be very helpful to get an overview. There is quite an obvious application for mobile populations, however there are numerous other applications such as cohorts or any other situation where there are transitions in states for individuals. These diagrams have several different names including alluvial, sankey and parallel sets - the details are in the handbook chapter on diagrams and charts.
-
## summarize data
-flow_table <- survey_data %>%
- count(startcause, endcause, sex) %>% # get counts
- gather_set_data(x = c("startcause", "endcause")) # change format for plotting
-
-
-## plot your dataset
- ## on the x axis is the start and end causes
- ## gather_set_data generates an ID for each possible combination
- ## splitting by y gives the possible start/end combos
- ## value as n gives it as counts (could also be changed to proportion)
-ggplot(flow_table, aes(x, id = id, split = y, value = n)) +
- ## colour lines by sex
- geom_parallel_sets(aes(fill = sex), alpha = 0.5, axis.width = 0.2) +
- ## fill in the label boxes grey
- geom_parallel_sets_axes(axis.width = 0.15, fill = "grey80", color = "grey80") +
- ## change text colour and angle (needs to be adjusted)
- geom_parallel_sets_labels(color = "black", angle = 0, size = 5) +
- ## remove axis labels
- theme_void()+
- ## move legend to bottom
- theme(legend.position = "bottom")
+
## summarize data
+flow_table <- survey_data %>%
+ count(startcause, endcause, sex) %>% # get counts
+ gather_set_data(x = c("startcause", "endcause")) # change format for plotting
+
+
+## plot your dataset
+ ## on the x axis is the start and end causes
+ ## gather_set_data generates an ID for each possible combination
+ ## splitting by y gives the possible start/end combos
+ ## value as n gives it as counts (could also be changed to proportion)
+ggplot(flow_table, aes(x, id = id, split = y, value = n)) +
+ ## colour lines by sex
+ geom_parallel_sets(aes(fill = sex), alpha = 0.5, axis.width = 0.2) +
+ ## fill in the label boxes grey
+ geom_parallel_sets_axes(axis.width = 0.15, fill = "grey80", color = "grey80") +
+ ## change text colour and angle (needs to be adjusted)
+ geom_parallel_sets_labels(color = "black", angle = 0, size = 5) +
+ ## remove axis labels
+ theme_void()+
+ ## move legend to bottom
+ theme(legend.position = "bottom")
@@ -1413,22 +1406,22 @@ We can use the svyciprop()
function from survey to get weighted proportions and accompanying 95% confidence intervals. An appropriate design effect can be extracted using the svymean()
rather than svyprop()
function. It is worth noting that svyprop()
only appears to accept variables between 0 and 1 (or TRUE/FALSE), so categorical variables will not work.
NOTE: Functions from survey also accept srvyr design objects, but here we have used the survey design object just for consistency
-
## produce weighted counts
-svytable(~died, base_survey_design)
+
## produce weighted counts
+svytable(~died, base_survey_design)
died
FALSE TRUE
1406244.43 76213.01
-
## produce weighted proportions
-svyciprop(~died, base_survey_design, na.rm = T)
+
## produce weighted proportions
+svyciprop(~died, base_survey_design, na.rm = T)
2.5% 97.5%
died 0.0514 0.0208 0.1213
-
## get the design effect
-svymean(~died, base_survey_design, na.rm = T, deff = T) %>%
- deff()
+
## get the design effect
+svymean(~died, base_survey_design, na.rm = T, deff = T) %>%
+ deff()
diedFALSE diedTRUE
3.755508 3.755508
@@ -1436,54 +1429,54 @@
We can combine the functions from survey shown above in to a function which we define ourselves below, called svy_prop
; and we can then use that function together with map()
from the purrr package to iterate over several variables and create a table. See the handbook iteration chapter for details on purrr.
-
# Define function to calculate weighted counts, proportions, CI and design effect
-# x is the variable in quotation marks
-# design is your survey design object
-
-svy_prop <- function(design, x) {
-
- ## put the variable of interest in a formula
- form <- as.formula(paste0( "~" , x))
- ## only keep the TRUE column of counts from svytable
- weighted_counts <- svytable(form, design)[[2]]
- ## calculate proportions (multiply by 100 to get percentages)
- weighted_props <- svyciprop(form, design, na.rm = TRUE) * 100
- ## extract the confidence intervals and multiply to get percentages
- weighted_confint <- confint(weighted_props) * 100
- ## use svymean to calculate design effect and only keep the TRUE column
- design_eff <- deff(svymean(form, design, na.rm = TRUE, deff = TRUE))[[TRUE]]
-
- ## combine in to one data frame
- full_table <- cbind(
- "Variable" = x,
- "Count" = weighted_counts,
- "Proportion" = weighted_props,
- weighted_confint,
- "Design effect" = design_eff
- )
-
- ## return table as a dataframe
- full_table <- data.frame(full_table,
- ## remove the variable names from rows (is a separate column now)
- row.names = NULL)
-
- ## change numerics back to numeric
- full_table[ , 2:6] <- as.numeric(full_table[, 2:6])
-
- ## return dataframe
- full_table
-}
-
-## iterate over several variables to create a table
-purrr::map(
- ## define variables of interest
- c("left", "died", "arrived"),
- ## state function using and arguments for that function (design)
- svy_prop, design = base_survey_design) %>%
- ## collapse list in to a single data frame
- bind_rows() %>%
- ## round
- mutate(across(where(is.numeric), round, digits = 1))
+
# Define function to calculate weighted counts, proportions, CI and design effect
+# x is the variable in quotation marks
+# design is your survey design object
+
+svy_prop <- function(design, x) {
+
+ ## put the variable of interest in a formula
+ form <- as.formula(paste0( "~" , x))
+ ## only keep the TRUE column of counts from svytable
+ weighted_counts <- svytable(form, design)[[2]]
+ ## calculate proportions (multiply by 100 to get percentages)
+ weighted_props <- svyciprop(form, design, na.rm = TRUE) * 100
+ ## extract the confidence intervals and multiply to get percentages
+ weighted_confint <- confint(weighted_props) * 100
+ ## use svymean to calculate design effect and only keep the TRUE column
+ design_eff <- deff(svymean(form, design, na.rm = TRUE, deff = TRUE))[[TRUE]]
+
+ ## combine in to one data frame
+ full_table <- cbind(
+ "Variable" = x,
+ "Count" = weighted_counts,
+ "Proportion" = weighted_props,
+ weighted_confint,
+ "Design effect" = design_eff
+ )
+
+ ## return table as a dataframe
+ full_table <- data.frame(full_table,
+ ## remove the variable names from rows (is a separate column now)
+ row.names = NULL)
+
+ ## change numerics back to numeric
+ full_table[ , 2:6] <- as.numeric(full_table[, 2:6])
+
+ ## return dataframe
+ full_table
+}
+
+## iterate over several variables to create a table
+purrr::map(
+ ## define variables of interest
+ c("left", "died", "arrived"),
+ ## state function using and arguments for that function (design)
+ svy_prop, design = base_survey_design) %>%
+ ## collapse list in to a single data frame
+ bind_rows() %>%
+ ## round
+ mutate(across(where(is.numeric), round, digits = 1))
Variable Count Proportion X2.5. X97.5. Design.effect
1 left 701199.1 47.3 39.2 55.5 2.4
@@ -1497,21 +1490,21 @@ With srvyr we can use dplyr syntax to create a table. Note that the survey_mean()
function is used and the proportion argument is specified, and also that the same function is used to calculate design effect. This is because srvyr wraps around both of the survey package functions svyciprop()
and svymean()
, which are used in the above section.
NOTE: It does not seem to be possible to get proportions from categorical variables using srvyr either, if you need this then check out the section below using sitrep
-
## use the srvyr design object
-survey_design %>%
- summarise(
- ## produce the weighted counts
- counts = survey_total(died),
- ## produce weighted proportions and confidence intervals
- ## multiply by 100 to get a percentage
- props = survey_mean(died,
- proportion = TRUE,
- vartype = "ci") * 100,
- ## produce the design effect
- deff = survey_mean(died, deff = TRUE)) %>%
- ## only keep the rows of interest
- ## (drop standard errors and repeat proportion calculation)
- select(counts, props, props_low, props_upp, deff_deff)
+
## use the srvyr design object
+survey_design %>%
+ summarise(
+ ## produce the weighted counts
+ counts = survey_total(died),
+ ## produce weighted proportions and confidence intervals
+ ## multiply by 100 to get a percentage
+ props = survey_mean(died,
+ proportion = TRUE,
+ vartype = "ci") * 100,
+ ## produce the design effect
+ deff = survey_mean(died, deff = TRUE)) %>%
+ ## only keep the rows of interest
+ ## (drop standard errors and repeat proportion calculation)
+ select(counts, props, props_low, props_upp, deff_deff)
# A tibble: 1 × 5
counts props props_low props_upp deff_deff
@@ -1521,42 +1514,42 @@
Here too we could write a function to then iterate over multiple variables using the purrr package. See the handbook iteration chapter for details on purrr.
-
# Define function to calculate weighted counts, proportions, CI and design effect
-# design is your survey design object
-# x is the variable in quotation marks
-
-
-srvyr_prop <- function(design, x) {
-
- summarise(
- ## using the survey design object
- design,
- ## produce the weighted counts
- counts = survey_total(.data[[x]]),
- ## produce weighted proportions and confidence intervals
- ## multiply by 100 to get a percentage
- props = survey_mean(.data[[x]],
- proportion = TRUE,
- vartype = "ci") * 100,
- ## produce the design effect
- deff = survey_mean(.data[[x]], deff = TRUE)) %>%
- ## add in the variable name
- mutate(variable = x) %>%
- ## only keep the rows of interest
- ## (drop standard errors and repeat proportion calculation)
- select(variable, counts, props, props_low, props_upp, deff_deff)
-
-}
-
-
-## iterate over several variables to create a table
-purrr::map(
- ## define variables of interest
- c("left", "died", "arrived"),
- ## state function using and arguments for that function (design)
- ~srvyr_prop(.x, design = survey_design)) %>%
- ## collapse list in to a single data frame
- bind_rows()
+
# Define function to calculate weighted counts, proportions, CI and design effect
+# design is your survey design object
+# x is the variable in quotation marks
+
+
+srvyr_prop <- function(design, x) {
+
+ summarise(
+ ## using the survey design object
+ design,
+ ## produce the weighted counts
+ counts = survey_total(.data[[x]]),
+ ## produce weighted proportions and confidence intervals
+ ## multiply by 100 to get a percentage
+ props = survey_mean(.data[[x]],
+ proportion = TRUE,
+ vartype = "ci") * 100,
+ ## produce the design effect
+ deff = survey_mean(.data[[x]], deff = TRUE)) %>%
+ ## add in the variable name
+ mutate(variable = x) %>%
+ ## only keep the rows of interest
+ ## (drop standard errors and repeat proportion calculation)
+ select(variable, counts, props, props_low, props_upp, deff_deff)
+
+}
+
+
+## iterate over several variables to create a table
+purrr::map(
+ ## define variables of interest
+ c("left", "died", "arrived"),
+ ## state function using and arguments for that function (design)
+ ~srvyr_prop(.x, design = survey_design)) %>%
+ ## collapse list in to a single data frame
+ bind_rows()
# A tibble: 3 × 6
variable counts props props_low props_upp deff_deff
@@ -1571,17 +1564,17 @@ Sitrep package
The tab_survey()
function from sitrep is a wrapper for srvyr, allowing you to create weighted tables with minimal coding. It also allows you to calculate weighted proportions for categorical variables.
-
## using the survey design object
-survey_design %>%
- ## pass the names of variables of interest unquoted
- tab_survey(
- "arrived",
- "left",
- "died",
- "education_level",
- deff = TRUE, # calculate the design effect
- pretty = TRUE # merge the proportion and 95%CI
- )
+
## using the survey design object
+survey_design %>%
+ ## pass the names of variables of interest unquoted
+ tab_survey(
+ "arrived",
+ "left",
+ "died",
+ "education_level",
+ deff = TRUE, # calculate the design effect
+ pretty = TRUE # merge the proportion and 95%CI
+ )
@@ -1602,16 +1595,16 @@
Gtsummary package
With gtsummary we can use the function tbl_svysummary()
and the addition add_ci()
to add confidence intervals.
-
## using the survey package design object
-tbl_svysummary(base_survey_design,
- include = c(arrived, left, died), ## define variables want to include
- statistic = list(everything() ~ c("{n} ({p}%)"))) %>% ## define stats of interest
- add_ci() %>% ## add confidence intervals
- add_n() %>%
- modify_header(label = list(
- n ~ "**Weighted total (N)**",
- stat_0 ~ "**Weighted count**"
- ))
+
## using the survey package design object
+tbl_svysummary(base_survey_design,
+ include = c(arrived, left, died), ## define variables want to include
+ statistic = list(everything() ~ c("{n} ({p}%)"))) %>% ## define stats of interest
+ add_ci() %>% ## add confidence intervals
+ add_n() %>%
+ modify_header(label = list(
+ n ~ "**Weighted total (N)**",
+ stat_0 ~ "**Weighted count**"
+ ))
Warning: The `update` argument of `modify_header()` is deprecated as of gtsummary 2.0.0.
ℹ Use `modify_header(...)` input instead. Dynamic dots allow for syntax like
@@ -1620,23 +1613,23 @@
-
-
@@ -2135,16 +2128,16 @@
Survey package
-
ratio <- svyratio(~died,
- denominator = ~obstime,
- design = base_survey_design)
-
-ci <- confint(ratio)
-
-cbind(
- ratio$ratio * 10000,
- ci * 10000
-)
+
ratio <- svyratio(~died,
+ denominator = ~obstime,
+ design = base_survey_design)
+
+ci <- confint(ratio)
+
+cbind(
+ ratio$ratio * 10000,
+ ci * 10000
+)
obstime 2.5 % 97.5 %
died 5.981922 1.194294 10.76955
@@ -2154,14 +2147,14 @@
Srvyr package
-
survey_design %>%
- ## survey ratio used to account for observation time
- summarise(
- mortality = survey_ratio(
- as.numeric(died) * 10000,
- obstime,
- vartype = "ci")
- )
+
survey_design %>%
+ ## survey ratio used to account for observation time
+ summarise(
+ mortality = survey_ratio(
+ as.numeric(died) * 10000,
+ obstime,
+ vartype = "ci")
+ )
# A tibble: 1 × 3
mortality mortality_low mortality_upp
@@ -2177,34 +2170,34 @@ To carry out a univariate regression, we can use the packages survey for the function svyglm()
and the package gtsummary which allows us to call svyglm()
inside the function tbl_uvregression
. To do this we first use the survey_design
object created above. This is then provided to the function tbl_uvregression()
as in the Univariate and multivariable regression chapter. We then make one key change, we change method = glm
to method = survey::svyglm
in order to carry out our survey weighted regression.
Here we will be using the previously created object survey_design
to predict whether the value in the column died
is TRUE
, using the columns malaria_treatment
, bednet
, and age_years
.
-
survey_design %>%
- tbl_uvregression( #Carry out a univariate regression, if we wanted a multivariable regression we would use tbl_
- method = survey::svyglm, #Set this to survey::svyglm to carry out our weighted regression on the survey data
- y = died, #The column we are trying to predict
- method.args = list(family = binomial), #The family, we are carrying out a logistic regression so we want the family as binomial
- include = c(malaria_treatment, #These are the columns we want to evaluate
- bednet,
- age_years),
- exponentiate = T #To transform the log odds to odds ratio for easier interpretation
- )
+
survey_design %>%
+ tbl_uvregression( #Carry out a univariate regression, if we wanted a multivariable regression we would use tbl_
+ method = survey::svyglm, #Set this to survey::svyglm to carry out our weighted regression on the survey data
+ y = died, #The column we are trying to predict
+ method.args = list(family = binomial), #The family, we are carrying out a logistic regression so we want the family as binomial
+ include = c(malaria_treatment, #These are the columns we want to evaluate
+ bednet,
+ age_years),
+ exponentiate = T #To transform the log odds to odds ratio for easier interpretation
+ )
-
-
@@ -2738,32 +2731,32 @@
If we wanted to carry out a multivariable regression, we would have to first use the function svyglm()
and pipe (%>%
) the results into the function tbl_regression
. Note that we need to specify the formula.
-
survey_design %>%
- svyglm(formula = died ~ malaria_treatment +
- bednet +
- age_years,
- family = binomial) %>% #The family, we are carrying out a logistic regression so we want the family as binomial
- tbl_regression(
- exponentiate = T #To transform the log odds to odds ratio for easier interpretation
- )
+
survey_design %>%
+ svyglm(formula = died ~ malaria_treatment +
+ bednet +
+ age_years,
+ family = binomial) %>% #The family, we are carrying out a logistic regression so we want the family as binomial
+ tbl_regression(
+ exponentiate = T #To transform the log odds to odds ratio for easier interpretation
+ )
-
-
@@ -3886,7 +3879,7 @@
-
+
+
@@ -1614,21 +1582,21 @@
Expand pati
tdc()
creates the time-dependent covariate column, agvhd
, to go with the newly created time intervals.
-
td_dat <-
- tmerge(
- data1 = bmt %>% select(my_id, T1, delta1),
- data2 = bmt %>% select(my_id, T1, delta1, TA, deltaA),
- id = my_id,
- death = event(T1, delta1),
- agvhd = tdc(TA)
- )
+
td_dat <-
+ tmerge(
+ data1 = bmt %>% select(my_id, T1, delta1),
+ data2 = bmt %>% select(my_id, T1, delta1, TA, deltaA),
+ id = my_id,
+ death = event(T1, delta1),
+ agvhd = tdc(TA)
+ )
To see what this does, let’s look at the data for the first 5 individual patients.
The variables of interest in the original data looked like this:
-
bmt %>%
- select(my_id, T1, delta1, TA, deltaA) %>%
- filter(my_id %in% seq(1, 5))
+
bmt %>%
+ select(my_id, T1, delta1, TA, deltaA) %>%
+ filter(my_id %in% seq(1, 5))
my_id T1 delta1 TA deltaA
1 1 2081 0 67 1
@@ -1640,8 +1608,8 @@ Expand pati
The new dataset for these same patients looks like this:
-
td_dat %>%
- filter(my_id %in% seq(1, 5))
+
td_dat %>%
+ filter(my_id %in% seq(1, 5))
my_id T1 delta1 tstart tstop death agvhd
1 1 2081 0 0 67 0 0
@@ -1660,12 +1628,12 @@ Expand pati
Cox regression with time-dependent covariates
Now that we’ve reshaped our data and added the new time-dependent aghvd
variable, let’s fit a simple single variable cox regression model. We can use the same coxph()
function as before, we just need to change our Surv()
function to specify both the start and stop time for each interval using the time1 =
and time2 =
arguments.
-
bmt_td_model = coxph(
- Surv(time = tstart, time2 = tstop, event = death) ~ agvhd,
- data = td_dat
- )
-
-summary(bmt_td_model)
+
bmt_td_model = coxph(
+ Surv(time = tstart, time2 = tstop, event = death) ~ agvhd,
+ data = td_dat
+ )
+
+summary(bmt_td_model)
Call:
coxph(formula = Surv(time = tstart, time2 = tstop, event = death) ~
@@ -1687,7 +1655,7 @@
-ggforest(bmt_td_model, data = td_dat)
+ggforest(bmt_td_model, data = td_dat)
@@ -2304,7 +2272,7 @@
+
Age Category/Gender | f | m | NA_ | Total |
---|
0-4 | 640 (22.8%) | 416 (14.8%) | 39 (14.0%) | 1,095 (18.6%) |
5-9 | 641 (22.8%) | 412 (14.7%) | 42 (15.1%) | 1,095 (18.6%) |
10-14 | 518 (18.5%) | 383 (13.7%) | 40 (14.4%) | 941 (16.0%) |
15-19 | 359 (12.8%) | 364 (13.0%) | 20 (7.2%) | 743 (12.6%) |
20-29 | 468 (16.7%) | 575 (20.5%) | 30 (10.8%) | 1,073 (18.2%) |
30-49 | 179 (6.4%) | 557 (19.9%) | 18 (6.5%) | 754 (12.8%) |
50-69 | 2 (0.1%) | 91 (3.2%) | 2 (0.7%) | 95 (1.6%) |
70+ | 0 (0.0%) | 5 (0.2%) | 1 (0.4%) | 6 (0.1%) |
| 0 (0.0%) | 0 (0.0%) | 86 (30.9%) | 86 (1.5%) |
@@ -1601,9 +1598,9 @@
Printing the
Use on other tables
You can use janitor’s adorn_*()
functions on other tables, such as those created by summarise()
and count()
from dplyr, or table()
from base R. Simply pipe the table to the desired janitor function. For example:
-
linelist %>%
- count(hospital) %>% # dplyr function
- adorn_totals() # janitor function
+
linelist %>%
+ count(hospital) %>% # dplyr function
+ adorn_totals() # janitor function
hospital n
Central Hospital 454
@@ -1620,19 +1617,19 @@ Use on othe
Saving the tabyl
If you convert the table to a “pretty” image with a package like flextable, you can save it with functions from that package - like save_as_html()
, save_as_word()
, save_as_ppt()
, and save_as_image()
from flextable (as discussed more extensively in the Tables for presentation page). Below, the table is saved as a Word document, in which it can be further hand-edited.
-
linelist %>%
- tabyl(age_cat, gender) %>%
- adorn_totals(where = "col") %>%
- adorn_percentages(denominator = "col") %>%
- adorn_pct_formatting() %>%
- adorn_ns(position = "front") %>%
- adorn_title(
- row_name = "Age Category",
- col_name = "Gender",
- placement = "combined") %>%
- flextable::flextable() %>% # convert to image
- flextable::autofit() %>% # ensure only one line per row
- flextable::save_as_docx(path = "tabyl.docx") # save as Word document to filepath
+
linelist %>%
+ tabyl(age_cat, gender) %>%
+ adorn_totals(where = "col") %>%
+ adorn_percentages(denominator = "col") %>%
+ adorn_pct_formatting() %>%
+ adorn_ns(position = "front") %>%
+ adorn_title(
+ row_name = "Age Category",
+ col_name = "Gender",
+ placement = "combined") %>%
+ flextable::flextable() %>% # convert to image
+ flextable::autofit() %>% # ensure only one line per row
+ flextable::save_as_docx(path = "tabyl.docx") # save as Word document to filepath
@@ -1648,10 +1645,10 @@
Saving the tab
Statistics
You can apply statistical tests on tabyls, like chisq.test()
or fisher.test()
from the stats package, as shown below. Note missing values are not allowed so they are excluded from the tabyl with show_na = FALSE
.
-
age_by_outcome <- linelist %>%
- tabyl(age_cat, outcome, show_na = FALSE)
-
-chisq.test(age_by_outcome)
+
age_by_outcome <- linelist %>%
+ tabyl(age_cat, outcome, show_na = FALSE)
+
+chisq.test(age_by_outcome)
Pearson's Chi-squared test
@@ -1683,8 +1680,8 @@ Get counts
The most simple function to apply within summarise()
is n()
. Leave the parentheses empty to count the number of rows.
-
linelist %>% # begin with linelist
- summarise(n_rows = n()) # return new summary dataframe with column n_rows
+
linelist %>% # begin with linelist
+ summarise(n_rows = n()) # return new summary dataframe with column n_rows
n_rows
1 5888
@@ -1692,9 +1689,9 @@
Get counts
This gets more interesting if we have grouped the data beforehand.
-
linelist %>%
- group_by(age_cat) %>% # group data by unique values in column age_cat
- summarise(n_rows = n()) # return number of rows *per group*
+
linelist %>%
+ group_by(age_cat) %>% # group data by unique values in column age_cat
+ summarise(n_rows = n()) # return number of rows *per group*
# A tibble: 9 × 2
age_cat n_rows
@@ -1719,8 +1716,8 @@ Get counts
Un-groups the data.
-
linelist %>%
- count(age_cat)
+
linelist %>%
+ count(age_cat)
age_cat n
1 0-4 1095
@@ -1737,8 +1734,8 @@ Get counts
You can change the name of the counts column from the default n
to something else by specifying it to name =
.
Tabulating counts of two or more grouping columns are still returned in “long” format, with the counts in the n
column. See the page on Pivoting data to learn about “long” and “wide” data formats.
-
linelist %>%
- count(age_cat, outcome)
+
linelist %>%
+ count(age_cat, outcome)
age_cat outcome n
1 0-4 Death 471
@@ -1782,14 +1779,14 @@ Proportions
Note that in this case, sum()
in the mutate()
command will return the sum of the whole column n
for use as the proportion denominator. As explained in the Grouping data page, if sum()
is used in grouped data (e.g. if the mutate()
immediately followed a group_by()
command), it will return sums by group. As stated just above, count()
finishes its actions by ungrouping. Thus, in this scenario we get full column proportions.
To easily display percents, you can wrap the proportion in the function percent()
from the package scales (note this convert to class character).
-
age_summary <- linelist %>%
- count(age_cat) %>% # group and count by gender (produces "n" column)
- mutate( # create percent of column - note the denominator
- percent = scales::percent(n / sum(n))
- )
-
-# print
-age_summary
+
age_summary <- linelist %>%
+ count(age_cat) %>% # group and count by gender (produces "n" column)
+ mutate( # create percent of column - note the denominator
+ percent = scales::percent(n / sum(n))
+ )
+
+# print
+age_summary
age_cat n percent
1 0-4 1095 18.60%
@@ -1805,15 +1802,15 @@ Proportions
Below is a method to calculate proportions within groups. It relies on different levels of data grouping being selectively applied and removed. First, the data are grouped on outcome
via group_by()
. Then, count()
is applied. This function further groups the data by age_cat
and returns counts for each outcome
-age-cat
combination. Importantly - as it finishes its process, count()
also ungroups the age_cat
grouping, so the only remaining data grouping is the original grouping by outcome
. Thus, the final step of calculating proportions (denominator sum(n)
) is still grouped by outcome
.
-
age_by_outcome <- linelist %>% # begin with linelist
- group_by(outcome) %>% # group by outcome
- count(age_cat) %>% # group and count by age_cat, and then remove age_cat grouping
- mutate(percent = scales::percent(n / sum(n))) # calculate percent - note the denominator is by outcome group
+
age_by_outcome <- linelist %>% # begin with linelist
+ group_by(outcome) %>% # group by outcome
+ count(age_cat) %>% # group and count by age_cat, and then remove age_cat grouping
+ mutate(percent = scales::percent(n / sum(n))) # calculate percent - note the denominator is by outcome group
@@ -1821,14 +1818,14 @@
Proportions
Plotting
To display a “long” table output like the above with ggplot()
is relatively straight-forward. The data are naturally in “long” format, which is naturally accepted by ggplot()
. See further examples in the pages ggplot basics and ggplot tips.
-
linelist %>% # begin with linelist
- count(age_cat, outcome) %>% # group and tabulate counts by two columns
- ggplot()+ # pass new data frame to ggplot
- geom_col( # create bar plot
- mapping = aes(
- x = outcome, # map outcome to x-axis
- fill = age_cat, # map age_cat to the fill
- y = n)) # map the counts column `n` to the height
+
linelist %>% # begin with linelist
+ count(age_cat, outcome) %>% # group and tabulate counts by two columns
+ ggplot()+ # pass new data frame to ggplot
+ geom_col( # create bar plot
+ mapping = aes(
+ x = outcome, # map outcome to x-axis
+ fill = age_cat, # map age_cat to the fill
+ y = n)) # map the counts column `n` to the height
@@ -1852,18 +1849,18 @@ Summary st
Below, linelist
data are summarised to describe the days delay from symptom onset to hospital admission (column days_onset_hosp
), by hospital.
-
summary_table <- linelist %>% # begin with linelist, save out as new object
- group_by(hospital) %>% # group all calculations by hospital
- summarise( # only the below summary columns will be returned
- cases = n(), # number of rows per group
- delay_max = max(days_onset_hosp, na.rm = T), # max delay
- delay_mean = round(mean(days_onset_hosp, na.rm=T), digits = 1), # mean delay, rounded
- delay_sd = round(sd(days_onset_hosp, na.rm = T), digits = 1), # standard deviation of delays, rounded
- delay_3 = sum(days_onset_hosp >= 3, na.rm = T), # number of rows with delay of 3 or more days
- pct_delay_3 = scales::percent(delay_3 / cases) # convert previously-defined delay column to percent
- )
-
-summary_table # print
+
summary_table <- linelist %>% # begin with linelist, save out as new object
+ group_by(hospital) %>% # group all calculations by hospital
+ summarise( # only the below summary columns will be returned
+ cases = n(), # number of rows per group
+ delay_max = max(days_onset_hosp, na.rm = T), # max delay
+ delay_mean = round(mean(days_onset_hosp, na.rm=T), digits = 1), # mean delay, rounded
+ delay_sd = round(sd(days_onset_hosp, na.rm = T), digits = 1), # standard deviation of delays, rounded
+ delay_3 = sum(days_onset_hosp >= 3, na.rm = T), # number of rows with delay of 3 or more days
+ pct_delay_3 = scales::percent(delay_3 / cases) # convert previously-defined delay column to percent
+ )
+
+summary_table # print
# A tibble: 6 × 7
hospital cases delay_max delay_mean delay_sd delay_3 pct_delay_3
@@ -1897,12 +1894,12 @@ Summary st
Conditional statistics
You may want to return conditional statistics - e.g. the maximum of rows that meet certain criteria. This can be done by subsetting the column with brackets [ ]
. The example below returns the maximum temperature for patients classified having or not having fever. Be aware however - it may be more appropriate to add another column to the group_by()
command and pivot_wider()
(as demonstrated below).
-
linelist %>%
- group_by(hospital) %>%
- summarise(
- max_temp_fvr = max(temp[fever == "yes"], na.rm = T),
- max_temp_no = max(temp[fever == "no"], na.rm = T)
- )
+
linelist %>%
+ group_by(hospital) %>%
+ summarise(
+ max_temp_fvr = max(temp[fever == "yes"], na.rm = T),
+ max_temp_no = max(temp[fever == "no"], na.rm = T)
+ )
# A tibble: 6 × 3
hospital max_temp_fvr max_temp_no
@@ -1924,18 +1921,18 @@ Glueing togeth
Then, to make the table more presentable, a total row is added with adorn_totals()
from janitor (which ignores non-numeric columns). Lastly, we use select()
from dplyr to both re-order and rename to nicer column names.
Now you could pass to flextable and print the table to Word, .png, .jpeg, .html, Powerpoint, RMarkdown, etc.! (see the Tables for presentation page).
-
summary_table %>%
- mutate(delay = str_glue("{delay_mean} ({delay_sd})")) %>% # combine and format other values
- select(-c(delay_mean, delay_sd)) %>% # remove two old columns
- adorn_totals(where = "row") %>% # add total row
- select( # order and rename cols
- "Hospital Name" = hospital,
- "Cases" = cases,
- "Max delay" = delay_max,
- "Mean (sd)" = delay,
- "Delay 3+ days" = delay_3,
- "% delay 3+ days" = pct_delay_3
- )
+
summary_table %>%
+ mutate(delay = str_glue("{delay_mean} ({delay_sd})")) %>% # combine and format other values
+ select(-c(delay_mean, delay_sd)) %>% # remove two old columns
+ adorn_totals(where = "row") %>% # add total row
+ select( # order and rename cols
+ "Hospital Name" = hospital,
+ "Cases" = cases,
+ "Max delay" = delay_max,
+ "Mean (sd)" = delay,
+ "Delay 3+ days" = delay_3,
+ "% delay 3+ days" = pct_delay_3
+ )
Hospital Name Cases Max delay Mean (sd) Delay 3+ days
Central Hospital 454 12 1.9 (1.9) 108
@@ -1959,9 +1956,9 @@ Glueing togeth
Percentiles
Percentiles and quantiles in dplyr deserve a special mention. To return quantiles, use quantile()
with the defaults or specify the value(s) you would like with probs =
.
-
# get default percentile values of age (0%, 25%, 50%, 75%, 100%)
-linelist %>%
- summarise(age_percentiles = quantile(age_years, na.rm = TRUE))
+
# get default percentile values of age (0%, 25%, 50%, 75%, 100%)
+linelist %>%
+ summarise(age_percentiles = quantile(age_years, na.rm = TRUE))
age_percentiles
1 0
@@ -1970,14 +1967,14 @@ Percentiles
4 23
5 84
-
# get manually-specified percentile values of age (5%, 50%, 75%, 98%)
-linelist %>%
- summarise(
- age_percentiles = quantile(
- age_years,
- probs = c(.05, 0.5, 0.75, 0.98),
- na.rm=TRUE)
- )
+
# get manually-specified percentile values of age (5%, 50%, 75%, 98%)
+linelist %>%
+ summarise(
+ age_percentiles = quantile(
+ age_years,
+ probs = c(.05, 0.5, 0.75, 0.98),
+ na.rm=TRUE)
+ )
age_percentiles
1 1
@@ -1988,15 +1985,15 @@ Percentiles
If you want to return quantiles by group, you may encounter long and less useful outputs if you simply add another column to group_by()
. So, try this approach instead - create a column for each quantile level desired.
-
# get manually-specified percentile values of age (5%, 50%, 75%, 98%)
-linelist %>%
- group_by(hospital) %>%
- summarise(
- p05 = quantile(age_years, probs = 0.05, na.rm=T),
- p50 = quantile(age_years, probs = 0.5, na.rm=T),
- p75 = quantile(age_years, probs = 0.75, na.rm=T),
- p98 = quantile(age_years, probs = 0.98, na.rm=T)
- )
+
# get manually-specified percentile values of age (5%, 50%, 75%, 98%)
+linelist %>%
+ group_by(hospital) %>%
+ summarise(
+ p05 = quantile(age_years, probs = 0.05, na.rm=T),
+ p50 = quantile(age_years, probs = 0.5, na.rm=T),
+ p75 = quantile(age_years, probs = 0.75, na.rm=T),
+ p98 = quantile(age_years, probs = 0.98, na.rm=T)
+ )
# A tibble: 6 × 5
hospital p05 p50 p75 p98
@@ -2011,9 +2008,9 @@ Percentiles
While dplyr summarise()
certainly offers more fine control, you may find that all the summary statistics you need can be produced with get_summary_stat()
from the rstatix package. If operating on grouped data, if will return 0%, 25%, 50%, 75%, and 100%. If applied to ungrouped data, you can specify the percentiles with probs = c(.05, .5, .75, .98)
.
-
linelist %>%
- group_by(hospital) %>%
- rstatix::get_summary_stats(age, type = "quantile")
+
linelist %>%
+ group_by(hospital) %>%
+ rstatix::get_summary_stats(age, type = "quantile")
# A tibble: 6 × 8
hospital variable n `0%` `25%` `50%` `75%` `100%`
@@ -2027,8 +2024,8 @@ Percentiles
-
linelist %>%
- rstatix::get_summary_stats(age, type = "quantile")
+
linelist %>%
+ rstatix::get_summary_stats(age, type = "quantile")
# A tibble: 1 × 7
variable n `0%` `25%` `50%` `75%` `100%`
@@ -2044,11 +2041,11 @@ Summa
For example, let’s say you are beginning with the data frame of counts below, called linelist_agg
- it shows in “long” format the case counts by outcome and gender.
Below we create this example data frame of linelist
case counts by outcome and gender (missing values removed for clarity).
-
linelist_agg <- linelist %>%
- drop_na(gender, outcome) %>%
- count(outcome, gender)
-
-linelist_agg
+
linelist_agg <- linelist %>%
+ drop_na(gender, outcome) %>%
+ count(outcome, gender)
+
+linelist_agg
outcome gender n
1 Death f 1227
@@ -2059,12 +2056,12 @@ Summa
To sum the counts (in column n
) by group you can use summarise()
but set the new column equal to sum(n, na.rm=T)
. To add a conditional element to the sum operation, you can use the subset bracket [ ] syntax on the counts column.
-
linelist_agg %>%
- group_by(outcome) %>%
- summarise(
- total_cases = sum(n, na.rm=T),
- male_cases = sum(n[gender == "m"], na.rm=T),
- female_cases = sum(n[gender == "f"], na.rm=T))
+
linelist_agg %>%
+ group_by(outcome) %>%
+ summarise(
+ total_cases = sum(n, na.rm=T),
+ male_cases = sum(n[gender == "m"], na.rm=T),
+ female_cases = sum(n[gender == "f"], na.rm=T))
# A tibble: 2 × 4
outcome total_cases male_cases female_cases
@@ -2085,11 +2082,11 @@ a
Below, mean()
is applied to several numeric columns. A vector of columns are named explicitly to .cols =
and a single function mean
is specified (no parentheses) to .fns =
. Any additional arguments for the function (e.g. na.rm=TRUE
) are provided after .fns =
, separated by a comma.
It can be difficult to get the order of parentheses and commas correct when using across()
. Remember that within across()
you must include the columns, the functions, and any extra arguments needed for the functions.
-
linelist %>%
- group_by(outcome) %>%
- summarise(across(.cols = c(age_years, temp, wt_kg, ht_cm), # columns
- .fns = mean, # function
- na.rm=T)) # extra arguments
+
linelist %>%
+ group_by(outcome) %>%
+ summarise(across(.cols = c(age_years, temp, wt_kg, ht_cm), # columns
+ .fns = mean, # function
+ na.rm=T)) # extra arguments
# A tibble: 3 × 5
outcome age_years temp wt_kg ht_cm
@@ -2101,11 +2098,11 @@ a
Multiple functions can be run at once. Below the functions mean
and sd
are provided to .fns =
within a list()
. You have the opportunity to provide character names (e.g. “mean” and “sd”) which are appended in the new column names.
-
linelist %>%
- group_by(outcome) %>%
- summarise(across(.cols = c(age_years, temp, wt_kg, ht_cm), # columns
- .fns = list("mean" = mean, "sd" = sd), # multiple functions
- na.rm=T)) # extra arguments
+
linelist %>%
+ group_by(outcome) %>%
+ summarise(across(.cols = c(age_years, temp, wt_kg, ht_cm), # columns
+ .fns = list("mean" = mean, "sd" = sd), # multiple functions
+ na.rm=T)) # extra arguments
# A tibble: 3 × 9
outcome age_years_mean age_years_sd temp_mean temp_sd wt_kg_mean wt_kg_sd
@@ -2135,12 +2132,12 @@ a
For example, to return the mean of every numeric column use where()
and provide the function as.numeric()
(without parentheses). All this remains within the across()
command.
-
linelist %>%
- group_by(outcome) %>%
- summarise(across(
- .cols = where(is.numeric), # all numeric columns in the data frame
- .fns = mean,
- na.rm=T))
+
linelist %>%
+ group_by(outcome) %>%
+ summarise(across(
+ .cols = where(is.numeric), # all numeric columns in the data frame
+ .fns = mean,
+ na.rm=T))
# A tibble: 3 × 12
outcome generation age age_years lon lat wt_kg ht_cm ct_blood temp
@@ -2157,22 +2154,22 @@ Pivot widerIf you prefer your table in “wide” format you can transform it using the tidyr pivot_wider()
function. You will likely need to re-name the columns with rename()
. For more information see the page on Pivoting data.
The example below begins with the “long” table age_by_outcome
from the proportions section. We create it again and print, for clarity:
-
age_by_outcome <- linelist %>% # begin with linelist
- group_by(outcome) %>% # group by outcome
- count(age_cat) %>% # group and count by age_cat, and then remove age_cat grouping
- mutate(percent = scales::percent(n / sum(n))) # calculate percent - note the denominator is by outcome group
+
age_by_outcome <- linelist %>% # begin with linelist
+ group_by(outcome) %>% # group by outcome
+ count(age_cat) %>% # group and count by age_cat, and then remove age_cat grouping
+ mutate(percent = scales::percent(n / sum(n))) # calculate percent - note the denominator is by outcome group
To pivot wider, we create the new columns from the values in the existing column age_cat
(by setting names_from = age_cat
). We also specify that the new table values will come from the existing column n
, with values_from = n
. The columns not mentioned in our pivoting command (outcome
) will remain unchanged on the far left side.
-
age_by_outcome %>%
- select(-percent) %>% # keep only counts for simplicity
- pivot_wider(names_from = age_cat, values_from = n)
+
age_by_outcome %>%
+ select(-percent) %>% # keep only counts for simplicity
+ pivot_wider(names_from = age_cat, values_from = n)
# A tibble: 3 × 10
# Groups: outcome [3]
@@ -2192,17 +2189,17 @@ j
If your table consists only of counts or proportions/percents that can be summed into a total, then you can add sum totals using janitor’s adorn_totals()
as described in the section above. Note that this function can only sum the numeric columns - if you want to calculate other total summary statistics see the next approach with dplyr.
Below, linelist
is grouped by gender and summarised into a table that described the number of cases with known outcome, deaths, and recovered. Piping the table to adorn_totals()
adds a total row at the bottom reflecting the sum of each column. The further adorn_*()
functions adjust the display as noted in the code.
-
linelist %>%
- group_by(gender) %>%
- summarise(
- known_outcome = sum(!is.na(outcome)), # Number of rows in group where outcome is not missing
- n_death = sum(outcome == "Death", na.rm=T), # Number of rows in group where outcome is Death
- n_recover = sum(outcome == "Recover", na.rm=T), # Number of rows in group where outcome is Recovered
- ) %>%
- adorn_totals() %>% # Adorn total row (sums of each numeric column)
- adorn_percentages("col") %>% # Get column proportions
- adorn_pct_formatting() %>% # Convert proportions to percents
- adorn_ns(position = "front") # display % and counts (with counts in front)
+
linelist %>%
+ group_by(gender) %>%
+ summarise(
+ known_outcome = sum(!is.na(outcome)), # Number of rows in group where outcome is not missing
+ n_death = sum(outcome == "Death", na.rm=T), # Number of rows in group where outcome is Death
+ n_recover = sum(outcome == "Recover", na.rm=T), # Number of rows in group where outcome is Recovered
+ ) %>%
+ adorn_totals() %>% # Adorn total row (sums of each numeric column)
+ adorn_percentages("col") %>% # Get column proportions
+ adorn_pct_formatting() %>% # Convert proportions to percents
+ adorn_ns(position = "front") # display % and counts (with counts in front)
gender known_outcome n_death n_recover
f 2,180 (47.8%) 1,227 (47.5%) 953 (48.1%)
@@ -2217,14 +2214,15 @@ Joining data page. Below is an example:
You can make a summary table of outcome by hospital with group_by()
and summarise()
like this:
-
by_hospital <- linelist %>%
- filter(!is.na(outcome) & hospital != "Missing") %>% # Remove cases with missing outcome or hospital
- group_by(hospital, outcome) %>% # Group data
- summarise( # Create new summary columns of indicators of interest
- N = n(), # Number of rows per hospital-outcome group
- ct_value = median(ct_blood, na.rm=T)) # median CT value per group
-
-by_hospital # print table
+
by_hospital <- linelist %>%
+ filter(!is.na(outcome) & hospital != "Missing") %>% # Remove cases with missing outcome or hospital
+ group_by(hospital, outcome) %>% # Group data
+ summarise( # Create new summary columns of indicators of interest
+ N = n(), # Number of rows per hospital-outcome group
+ ct_value = median(ct_blood, na.rm=T) # median CT value per group
+ )
+
+by_hospital # print table
# A tibble: 10 × 4
# Groups: hospital [5]
@@ -2244,14 +2242,14 @@
-totals <- linelist %>%
- filter(!is.na(outcome) & hospital != "Missing") %>%
- group_by(outcome) %>% # Grouped only by outcome, not by hospital
- summarise(
- N = n(), # These statistics are now by outcome only
- ct_value = median(ct_blood, na.rm=T))
-
-totals # print table
+totals <- linelist %>%
+ filter(!is.na(outcome) & hospital != "Missing") %>%
+ group_by(outcome) %>% # Grouped only by outcome, not by hospital
+ summarise(
+ N = n(), # These statistics are now by outcome only
+ ct_value = median(ct_blood, na.rm=T))
+
+totals # print table
# A tibble: 2 × 3
outcome N ct_value
@@ -2262,35 +2260,35 @@ Cleaning data and core functions page).
-
table_long <- bind_rows(by_hospital, totals) %>%
- mutate(hospital = replace_na(hospital, "Total"))
+
table_long <- bind_rows(by_hospital, totals) %>%
+ mutate(hospital = replace_na(hospital, "Total"))
Here is the new table with “Total” rows at the bottom.
This table is in a “long” format, which may be what you want. Optionally, you can pivot this table wider to make it more readable. See the section on pivoting wider above, and the Pivoting data page. You can also add more columns, and arrange it nicely. This code is below.
-
table_long %>%
-
- # Pivot wider and format
- ########################
- mutate(hospital = replace_na(hospital, "Total")) %>%
- pivot_wider( # Pivot from long to wide
- values_from = c(ct_value, N), # new values are from ct and count columns
- names_from = outcome) %>% # new column names are from outcomes
- mutate( # Add new columns
- N_Known = N_Death + N_Recover, # number with known outcome
- Pct_Death = scales::percent(N_Death / N_Known, 0.1), # percent cases who died (to 1 decimal)
- Pct_Recover = scales::percent(N_Recover / N_Known, 0.1)) %>% # percent who recovered (to 1 decimal)
- select( # Re-order columns
- hospital, N_Known, # Intro columns
- N_Recover, Pct_Recover, ct_value_Recover, # Recovered columns
- N_Death, Pct_Death, ct_value_Death) %>% # Death columns
- arrange(N_Known) # Arrange rows from lowest to highest (Total row at bottom)
+
table_long %>%
+
+ # Pivot wider and format
+ ########################
+ mutate(hospital = replace_na(hospital, "Total")) %>%
+ pivot_wider( # Pivot from long to wide
+ values_from = c(ct_value, N), # new values are from ct and count columns
+ names_from = outcome) %>% # new column names are from outcomes
+ mutate( # Add new columns
+ N_Known = N_Death + N_Recover, # number with known outcome
+ Pct_Death = scales::percent(N_Death / N_Known, 0.1), # percent cases who died (to 1 decimal)
+ Pct_Recover = scales::percent(N_Recover / N_Known, 0.1)) %>% # percent who recovered (to 1 decimal)
+ select( # Re-order columns
+ hospital, N_Known, # Intro columns
+ N_Recover, Pct_Recover, ct_value_Recover, # Recovered columns
+ N_Death, Pct_Death, ct_value_Death) %>% # Death columns
+ arrange(N_Known) # Arrange rows from lowest to highest (Total row at bottom)
# A tibble: 6 × 8
# Groups: hospital [6]
@@ -2308,7 +2306,7 @@ Tables for presentation page.
-
Hospital | Total cases with known outcome | Recovered | Died |
---|
Total | % of cases | Median CT values | Total | % of cases | Median CT values |
---|
St. Mark's Maternity Hospital (SMMH) | 325 | 126 | 38.8% | 22 | 199 | 61.2% | 22 |
Central Hospital | 358 | 165 | 46.1% | 22 | 193 | 53.9% | 22 |
Other | 685 | 290 | 42.3% | 21 | 395 | 57.7% | 22 |
Military Hospital | 708 | 309 | 43.6% | 22 | 399 | 56.4% | 21 |
Missing | 1,125 | 514 | 45.7% | 21 | 611 | 54.3% | 21 |
Port Hospital | 1,364 | 579 | 42.4% | 21 | 785 | 57.6% | 22 |
Total | 3,440 | 1,469 | 42.7% | 22 | 1,971 | 57.3% | 22 |
+
Hospital | Total cases with known outcome | Recovered | Died |
---|
Total | % of cases | Median CT values | Total | % of cases | Median CT values |
---|
St. Mark's Maternity Hospital (SMMH) | 325 | 126 | 38.8% | 22 | 199 | 61.2% | 22 |
Central Hospital | 358 | 165 | 46.1% | 22 | 193 | 53.9% | 22 |
Other | 685 | 290 | 42.3% | 21 | 395 | 57.7% | 22 |
Military Hospital | 708 | 309 | 43.6% | 22 | 399 | 56.4% | 21 |
Missing | 1,125 | 514 | 45.7% | 21 | 611 | 54.3% | 21 |
Port Hospital | 1,364 | 579 | 42.4% | 21 | 785 | 57.6% | 22 |
Total | 3,440 | 1,469 | 42.7% | 22 | 1,971 | 57.3% | 22 |
@@ -2323,27 +2321,27 @@ Summary table
The default behavior of tbl_summary()
is quite incredible - it takes the columns you provide and creates a summary table in one command. The function prints statistics appropriate to the column class: median and inter-quartile range (IQR) for numeric columns, and counts (%) for categorical columns. Missing values are converted to “Unknown”. Footnotes are added to the bottom to explain the statistics, while the total N is shown at the top.
-
linelist %>%
- select(age_years, gender, outcome, fever, temp, hospital) %>% # keep only the columns of interest
- tbl_summary() # default
+
linelist %>%
+ select(age_years, gender, outcome, fever, temp, hospital) %>% # keep only the columns of interest
+ tbl_summary() # default
-
-
@@ -2906,28 +2904,28 @@
Adjustments
A simple example of a statistic =
equation might look like below, to only print the mean of column age_years
:
-
linelist %>%
- select(age_years) %>% # keep only columns of interest
- tbl_summary( # create summary table
- statistic = age_years ~ "{mean}") # print mean of age
+
linelist %>%
+ select(age_years) %>% # keep only columns of interest
+ tbl_summary( # create summary table
+ statistic = age_years ~ "{mean}") # print mean of age
-
-
@@ -3398,28 +3396,28 @@
Adjustments
A slightly more complex equation might look like "({min}, {max})"
, incorporating the max and min values within parentheses and separated by a comma:
-
linelist %>%
- select(age_years) %>% # keep only columns of interest
- tbl_summary( # create summary table
- statistic = age_years ~ "({min}, {max})") # print min and max of age
+
linelist %>%
+ select(age_years) %>% # keep only columns of interest
+ tbl_summary( # create summary table
+ statistic = age_years ~ "({min}, {max})") # print min and max of age
-
-
@@ -3898,48 +3896,47 @@
Adjustments
type =
This is used to adjust how many levels of the statistics are shown. The syntax is similar to statistic =
in that you provide an equation with columns on the left and a value on the right. Two common scenarios include:
-type = all_categorical() ~ "categorical"
Forces dichotomous columns (e.g. fever
yes/no) to show all levels instead of only the “yes” row
-
-type = all_continuous() ~ "continuous2"
Allows multi-line statistics per variable, as shown in a later section
+type = all_categorical() ~ "categorical"
Forces dichotomous columns (e.g. fever
yes/no) to show all levels instead of only the “yes” row.
+type = all_continuous() ~ "continuous2"
Allows multi-line statistics per variable, as shown in a later section.
In the example below, each of these arguments is used to modify the original summary table: