cmu-delphi · rachlobay · Apr 27, 2024 · Aug 2, 2022 · Aug 3, 2022 · Aug 12, 2022
@@ -56,3 +56,32 @@
 #'   \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
 #'   and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
 "state_census"
+
+#' Subset of Statistics Canada median employment income for postsecondary graduates
+#'
+#' @format An [epiprocess::epi_df] with 10193 rows and 8 variables:
+#' \describe{
+#'   \item{geo_value}{The province in Canada associated with each
+#'      row of measurements.}
+#'   \item{time_value}{The time value, a year integer in YYYY format}
+#'   \item{edu_qual}{The education qualification}
+#'   \item{fos}{The field of study}
+#'   \item{age_group}{The age group; either 15 to 34 or 35 to 64}
+#'   \item{num_graduates}{The number of graduates for the given row of characteristics}
+#'   \item{med_income_2y}{The median employment income two years after graduation}
+#'   \item{med_income_5y}{The median employment income five years after graduation}
+#' }
+#' @source This object contains modified data from the following Statistics Canada
+#' data table: \href{https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501}{
+#'  Characteristics and median employment income of longitudinal cohorts of postsecondary
+#'  graduates two and five years after graduation, by educational qualification and
+#'  field of study (primary groupings)
+#' }
+#'
+#' Modifications:
+#' * Only provincial-level geo_values are kept
+#' * Only age group and educational qualification are kept as covariates.
+#'   For the remaining covariates (field of study, gender, and student
+#'   status), we keep the aggregated values and drop the level-specific rows.
+#' * Rows with a non-missing status flag, or with a missing number of
+#'   graduates or median employment income, are dropped
+#' * No modifications were made to the time range of the data
+"grad_employ_subset"
@@ -97,6 +97,7 @@ reference:
     contents:
     - case_death_rate_subset
     - state_census
+    - grad_employ_subset
 
 
 
@@ -0,0 +1,106 @@
+library(epipredict)
+library(epiprocess)
+library(cansim)
+library(dplyr)
+library(stringr)
+library(tidyr)
+
+# Source table (Statistics Canada, table 37-10-0115-01):
+# https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501
+statcan_grad_employ <- get_cansim("37-10-0115-01")
+
+# Column names that were considered but are deliberately NOT carried forward:
+#   DGUID, UOM, UOM_ID, SCALAR_FACTOR, SCALAR_ID, VECTOR, COORDINATE, SYMBOL,
+#   TERMINATED, DECIMALS, GeoUID, val_norm, Date, "Hierarchy for GEO", and the
+#   "Classification Code for <X>" / "Hierarchy for <X>" pair for each <X> in:
+#   Educational qualification, Field of study, Gender, Age group,
+#   Status of student in Canada, Characteristics after graduation,
+#   Graduate statistics.
+
+# Step 1: pick the columns we need and give them short names in one go.
+# The order below fixes the column order of the result.
+gemploy <- statcan_grad_employ %>%
+  select(
+    time_value = REF_DATE,
+    geo_value = GEO,
+    value = VALUE,
+    status = STATUS,
+    edu_qual = `Educational qualification`,
+    fos = `Field of study`,
+    gender = Gender,
+    age_group = `Age group`,
+    student_status = `Status of student in Canada`,
+    grad_charac = `Characteristics after graduation`,
+    grad_stat = `Graduate statistics`
+  )
+
+# Step 2: years become integers, and the statistic labels become the short
+# names used as column names after widening.
+gemploy <- gemploy %>%
+  mutate(
+    time_value = as.integer(time_value),
+    grad_stat = recode_factor(
+      grad_stat,
+      `Number of graduates` = "num_graduates",
+      `Median employment income two years after graduation` = "med_income_2y",
+      `Median employment income five years after graduation` = "med_income_5y"
+    )
+  )
+
+# Step 3: one column per statistic, then keep only the rows of interest.
+# Conditions listed separately in filter() are combined with a logical AND.
+gemploy <- gemploy %>%
+  pivot_wider(names_from = grad_stat, values_from = value) %>%
+  filter(
+    # For the keys that stay in the data, drop the aggregate levels
+    geo_value != "Canada",
+    age_group != "15 to 64 years",
+    edu_qual != "Total, educational qualification",
+    # For the keys that get removed below, keep only the aggregate level
+    fos == "Total, field of study",
+    gender == "Total, gender",
+    student_status == "Canadian and international students",
+    # With 2y and 5y employment income, the remaining characteristics are
+    # "Graduates reporting employment income" and
+    # "Graduates reporting wages, salaries, and commissions only";
+    # only the former is kept, for simplicity
+    grad_charac == "Graduates reporting employment income",
+    # Rows carrying a status flag are not considered "good" data
+    is.na(status),
+    # Every one of the three statistics must be present
+    !(is.na(num_graduates) | is.na(med_income_2y) | is.na(med_income_5y))
+  ) %>%
+  select(-c(fos, gender, student_status, grad_charac, status))
+
+nrow(gemploy)
+ncol(gemploy)
+
+# Convert to an epi_df keyed additionally by age group and qualification,
+# then store it as package data.
+grad_employ_subset <- as_epi_df(
+  gemploy,
+  as_of = "2022-07-19",
+  additional_metadata = list(other_keys = c("age_group", "edu_qual"))
+)
+usethis::use_data(grad_employ_subset, overwrite = TRUE)
@@ -1,3 +1,3 @@
 *.html
-*.R
 *_cache/
+*.R
@@ -61,10 +61,7 @@ versions for the less up-to-date input archive.
 ```{r grab-epi-data}
 theme_set(theme_bw())
 
-y <- readRDS(system.file(
-  "extdata", "all_states_covidcast_signals.rds",
-  package = "epipredict", mustWork = TRUE
-))
+y <- readRDS("all_states_covidcast_signals.rds")
 
 y <- purrr::map(y, ~ select(.x, geo_value, time_value, version = issue, value))