From 2243d36219a9af6c92ceb194b50e42b01dc6d337 Mon Sep 17 00:00:00 2001 From: beanumber Date: Mon, 1 Jun 2020 10:40:34 -0400 Subject: [PATCH] progress towards #53 --- R/utils.R | 36 +++++++++++++++++-------------- man/match_files_by_year_months.Rd | 4 ++-- tests/testthat/test-etl.R | 15 ++++++++++--- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/R/utils.R b/R/utils.R index 3bc1288..1146e23 100644 --- a/R/utils.R +++ b/R/utils.R @@ -66,15 +66,17 @@ valid_year_month <- function(years, months, begin <- as.Date(begin) end <- as.Date(end) - valid_months <- data.frame(expand.grid(years, months)) %>% + valid_months <- tibble::tibble(expand.grid(years, months)) %>% rename(year = Var1, month = Var2) %>% - mutate(month_begin = lubridate::ymd(paste(year, month, - "01", sep = "/"))) %>% - mutate(month_end = lubridate::ymd( - ifelse(month == 12, paste(year + 1, "01/01", sep = "/"), - paste(year, month + 1, "01", sep = "/"))) - 1) %>% - filter(year > 0 & month >= 1 & month <= 12) %>% - filter(month_begin >= begin & month_begin <= end) %>% + mutate( + month_begin = lubridate::ymd(paste(year, month, "01", sep = "/")), + month_end = lubridate::ymd( + ifelse(month == 12, paste(year + 1, "01/01", sep = "/"), + paste(year, month + 1, "01", sep = "/"))) - 1) %>% + filter( + year > 0 & month >= 1 & month <= 12, + month_begin >= begin & month_begin <= end + ) %>% arrange(month_begin) return(valid_months) } @@ -90,11 +92,11 @@ valid_year_month <- function(years, months, #' @examples #' \dontrun{ #' if (require(airlines)) { -#' airlines <- etl("airlines", dir = "~/dumps/airlines") %>% +#' airlines <- etl("airlines", dir = "~/Data/airlines") %>% #' etl_extract(year = 1987) #' summary(airlines) #' match_files_by_year_months(list.files(attr(airlines, "raw_dir")), -#' pattern = "On_Time_On_Time_Performance_%Y_%m.zip"), year = 1987) +#' pattern = "On_Time_On_Time_Performance_%Y_%m.zip", year = 1987) #' } #' } @@ -105,16 +107,18 @@ match_files_by_year_months <- function(files, 
pattern, if (length(files) < 1) { return(NULL) } - file_df <- data.frame(filename = files, - file_date = extract_date_from_filename(files, - pattern)) %>% - mutate(file_year = lubridate::year(file_date), - file_month = lubridate::month(file_date)) + file_df <- tibble::tibble( + filename = files, + file_date = extract_date_from_filename(files, pattern)) %>% + mutate( + file_year = lubridate::year(file_date), + file_month = lubridate::month(file_date) + ) valid <- valid_year_month(years, months) good <- file_df %>% left_join(valid, by = c("file_year" = "year", "file_month" = "month")) %>% filter(!is.na(month_begin)) - return(as.character(good$filename)) + return(fs::as_fs_path(good$filename)) } #' @description Extracts a date from filenames diff --git a/man/match_files_by_year_months.Rd b/man/match_files_by_year_months.Rd index 006ca14..fb45b96 100644 --- a/man/match_files_by_year_months.Rd +++ b/man/match_files_by_year_months.Rd @@ -39,11 +39,11 @@ Extracts a date from filenames \examples{ \dontrun{ if (require(airlines)) { - airlines <- etl("airlines", dir = "~/dumps/airlines") \%>\% + airlines <- etl("airlines", dir = "~/Data/airlines") \%>\% etl_extract(year = 1987) summary(airlines) match_files_by_year_months(list.files(attr(airlines, "raw_dir")), - pattern = "On_Time_On_Time_Performance_\%Y_\%m.zip"), year = 1987) + pattern = "On_Time_On_Time_Performance_\%Y_\%m.zip", year = 1987) } } } diff --git a/tests/testthat/test-etl.R b/tests/testthat/test-etl.R index 8d140e8..d306ed9 100644 --- a/tests/testthat/test-etl.R +++ b/tests/testthat/test-etl.R @@ -59,8 +59,9 @@ test_that("mysql works", { test_that("valid_year_month works", { - expect_equal( - nrow(valid_year_month(years = 1999:2001, months = c(1:3, 7))), 12) + dates <- valid_year_month(years = 1999:2001, months = c(1:3, 7)) + expect_is(dates, "tbl_df") + expect_equal(nrow(dates), 12) }) test_that("extract_date_from_filename works", { @@ -68,8 +69,16 @@ test_that("extract_date_from_filename works", { 
mutate(filename = paste0("myfile_", year, "_", month, ".ext")) expect_is( extract_date_from_filename(test$filename, pattern = "myfile_%Y_%m.ext"), - "Date") + "Date" + ) expect_null(extract_date_from_filename(list.files("/cdrom"), pattern = "*")) + skip_if_not(require(airlines) && dir.exists("~/Data/airlines") && length(list.files("~/Data/airlines", recursive = TRUE)) >= 12) + airlines <- etl("airlines", dir = "~/Data/airlines") %>% + etl_extract(year = 1987) + summary(airlines) + res <- match_files_by_year_months(list.files(attr(airlines, "raw_dir")), + pattern = "On_Time_On_Time_Performance_%Y_%m.zip", year = 1987) + expect_is(res, "fs_path") }) test_that("etl works", {