Skip to content

Commit

Permalink
Iterate on age group agg example
Browse files Browse the repository at this point in the history
  • Loading branch information
brookslogan committed Dec 18, 2024
1 parent 903e736 commit 47c16fe
Showing 1 changed file with 70 additions and 62 deletions.
132 changes: 70 additions & 62 deletions vignettes/epi_df.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -211,91 +211,96 @@ Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html)
library(epidatr)
flu_data_api <- pub_flusurv(
locations = "ca",
epiweeks = epirange(201801, 202001),
epiweeks = epirange(201801, 202001)
)
```

We're interested in the age-specific rates:
```{r}
flu_data <- flu_data_api %>%
select(location, epiweek, issue, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4, rate_overall) %>%
select(location, epiweek, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>%
# Turn `rate_age_0`..`rate_age_4` columns into an `age_group` and `rate`
# column (with 5x as many rows):
tidyr::pivot_longer(
cols = starts_with("rate_age_"), names_to = "age_group",
names_prefix = "rate_age_", names_transform = function(x) paste0("age_group_", x),
values_to = "rate"
cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate",
# When converting column names to entries in `age_group`, remove this prefix:
names_prefix = "rate_age_",
# And add a better prefix:
names_transform = function(age_group) paste0("age_group_", age_group)
) %>%
inner_join(
# tibble(
# location = "CA",
# age_group = paste0("age_group_", 0:4),
# # population_proportion = c(0.06187, 0.16343, 0.43359, 0.19666, 0.14445)
# # ^ (estimated via single regression)
# # population_proportion = c(201265, 520077, 1725382, 699145, 551243) %>% {. / sum(.)}
# # ^ (from https://www.cdc.gov/nchs/nvss/bridged_race.htm vintage 2020 year 2018 data alone)
# population_proportion = c(198705, 518211, 1724467, 696784, 568237) %>% {. / sum(.)}
# # ^ (from https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-detail.html, only taking estimates for one year)
# ),
# by = c("location", "age_group"),
# prepared from https://www.cdc.gov/nchs/nvss/bridged_race.htm vintage 2020(?) years 2017 and 2018 data:
# tribble(
# ~location, ~age_group, ~epiweek, ~pop,
# "CA", "age_group_0", as.Date("2017-07-01"), 203813,
# "CA", "age_group_1", as.Date("2017-07-01"), 521827,
# "CA", "age_group_2", as.Date("2017-07-01"), 1722399,
# "CA", "age_group_3", as.Date("2017-07-01"), 700090,
# "CA", "age_group_4", as.Date("2017-07-01"), 534789,
# "CA", "age_group_0", as.Date("2018-07-01"), 201265,
# "CA", "age_group_1", as.Date("2018-07-01"), 520077,
# "CA", "age_group_2", as.Date("2018-07-01"), 1725382,
# "CA", "age_group_3", as.Date("2018-07-01"), 699145,
# "CA", "age_group_4", as.Date("2018-07-01"), 551243,
# ),
# join_by(location, closest(y$epiweek <= x$epiweek), age_group),
# suffix = c("", "_for_pop"),
# relationship = "many-to-one", unmatched = "error"
# prepared from https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html vintage 2020(?) years 2017 and 2018 data (identical to above)
tribble(
~location, ~age_group, ~epiweek, ~pop,
"CA", "age_group_0", as.Date("2017-07-01"), 203813,
"CA", "age_group_1", as.Date("2017-07-01"), 521827,
"CA", "age_group_2", as.Date("2017-07-01"), 1722399,
"CA", "age_group_3", as.Date("2017-07-01"), 700090,
"CA", "age_group_4", as.Date("2017-07-01"), 534789,
"CA", "age_group_0", as.Date("2018-07-01"), 201265,
"CA", "age_group_1", as.Date("2018-07-01"), 520077,
"CA", "age_group_2", as.Date("2018-07-01"), 1725382,
"CA", "age_group_3", as.Date("2018-07-01"), 699145,
"CA", "age_group_4", as.Date("2018-07-01"), 551243,
),
join_by(location, closest(y$epiweek <= x$epiweek), age_group),
suffix = c("", "_for_pop"),
relationship = "many-to-one", unmatched = "error"
# Improve `age_group` labels a bit more:
mutate(
age_group = case_match(
age_group,
"age_group_0" ~ "0--4 yr",
"age_group_1" ~ "5--17 yr",
"age_group_2" ~ "18--49 yr",
"age_group_3" ~ "50--64 yr",
"age_group_4" ~ ">= 65 yr",
# Make this a factor with appropriate level ordering:
.ptype = factor(levels = c("0--4 yr", "5--17 yr", "18--49 yr", "50--64 yr", ">= 65 yr"))
)
) %>%
select(-epiweek_for_pop)
# The API now outputs `epiweek` in Date format (the constituent Sunday);
# rename it to remind us that it's not in YYYYww format:
rename(time_value = epiweek)
flu_data
```

We can now convert this data to an `epi_df` object and set the `age_group`
column as an additional group key:

```{r}
flu_data <- flu_data %>% as_epi_df(other_keys = "age_group", as_of = as.Date("2024-03-20"))
flu_data <- flu_data %>% as_epi_df(other_keys = "age_group")
flu_data
```

Note that the `epi_df` object now has an additional key column `age_group`. This
means that there should only be one row for each combination of `geo_value`,
`time_value`, and `age_group` in the dataset (this is enforced at construction
`age_group`, and `time_value` in the dataset (this is enforced at construction
time).

Now we can aggregate the data by `age_group`, if we want to compute the total:

Now we can aggregate the data by `age_group`, if we want to compute the total.
Since we are working with rates, we need to attach some population data in order
to do this aggregation. We have overall rates available from the original
source, so we can check our work.
```{r}
group_cols <- key_colnames(flu_data, exclude = "age_group")
rate_overall_recalc_edf <-
flu_data %>%
# mutate(weighted_rate = rate * population_proportion) %>%
inner_join(
# Population estimates derived from vintage 2020 estimates for 2017-07-01
# and 2018-07-01 (https://www.cdc.gov/nchs/nvss/bridged_race.htm and
# https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html

Check warning on line 274 in vignettes/epi_df.Rmd

View workflow job for this annotation

GitHub Actions / lint

file=vignettes/epi_df.Rmd,line=274,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 157 characters.
# give the same results here).
tribble(
~location, ~age_group, ~time_value, ~pop,
"CA", "0--4 yr", as.Date("2017-07-01"), 203813,
"CA", "5--17 yr", as.Date("2017-07-01"), 521827,
"CA", "18--49 yr", as.Date("2017-07-01"), 1722399,
"CA", "50--64 yr", as.Date("2017-07-01"), 700090,
"CA", ">= 65 yr", as.Date("2017-07-01"), 534789,
"CA", "0--4 yr", as.Date("2018-07-01"), 201265,
"CA", "5--17 yr", as.Date("2018-07-01"), 520077,
"CA", "18--49 yr", as.Date("2018-07-01"), 1725382,
"CA", "50--64 yr", as.Date("2018-07-01"), 699145,
"CA", ">= 65 yr", as.Date("2018-07-01"), 551243,
),

Check warning on line 288 in vignettes/epi_df.Rmd

View workflow job for this annotation

GitHub Actions / lint

file=vignettes/epi_df.Rmd,line=288,col=6,[indentation_linter] Indentation should be 4 spaces but is 6 spaces.
# Simple population interpolation/extrapolation scheme: last observation
# carried forward. Use the estimated population on 2017-07-01 for
# time_values 2017-07-01 through 2018-06-30, and the estimated population on
# 2018-07-01 for all subsequent time_values:
join_by(location, closest(y$time_value <= x$time_value), age_group),
suffix = c("", "_for_pop"),
relationship = "many-to-one", unmatched = "error"
) %>%
select(-time_value_for_pop)
group_by(geo_value, time_value) %>%

Check warning on line 298 in vignettes/epi_df.Rmd

View workflow job for this annotation

GitHub Actions / lint

file=vignettes/epi_df.Rmd,line=298,col=2,[indentation_linter] Indentation should be 0 spaces but is 2 spaces.
mutate(weighted_rate = rate * pop / sum(pop)) %>%
mutate(count = rate * pop / 100e3) %>%
ungroup() %>%
sum_groups_epi_df("weighted_rate", group_cols = group_cols) %>%
rename(rate_overall_recalc = weighted_rate) %>%
sum_groups_epi_df(c("count", "pop"), group_cols = group_cols) %>%
mutate(rate_overall_recalc = count / pop * 100e3) %>%
# match rounding of original data:
mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) %>%
# compare to published overall rates:
inner_join(
Expand All @@ -304,10 +309,13 @@ rate_overall_recalc_edf <-
by = c("geo_value", "time_value"),
relationship = "one-to-one", unmatched = "error"
)
# What's our maximum error vs. the official overall estimates?
max(abs(rate_overall_recalc_edf$rate_overall - rate_overall_recalc_edf$rate_overall_recalc))
rate_overall_recalc_edf %>%
slice_max(abs(rate_overall_recalc_edf$rate_overall - rate_overall_recalc_edf$rate_overall_recalc))
```
This small amount of difference is expected, since all the age-specific rates
were rounded to the first decimal place, and population data might have been
interpolated and extrapolated a bit differently in the official source, limiting
our ability to precisely recreate its estimates from an age group breakdown.

## Detecting and filling time gaps with `complete.epi_df`

Expand Down

0 comments on commit 47c16fe

Please sign in to comment.