From e99b9d9bf630321ae66a313a1a32ba9fa81e4986 Mon Sep 17 00:00:00 2001 From: Sima Najafzadehkhoei Date: Tue, 10 Dec 2024 17:37:01 -0700 Subject: [PATCH] no warning no error --- DESCRIPTION | 3 +- NAMESPACE | 4 +- R/build_cnn_model.R | 6 +- R/calibrate_sir.R | 63 ++------------- R/dataprep.R | 146 +++++++++------------------------- R/evaluate_model.R | 2 +- R/filter_non_null.R | 2 +- R/filter_non_null_infected.R | 3 +- R/generate_theta.R | 9 +-- R/george_calibrate.R | 35 -------- R/plot_results.R | 56 +++++-------- R/prepare_data_infected.R | 25 +----- R/preprocessing_data.R | 12 +-- R/run_simulations.R | 36 ++++----- R/simulate_calibrate_sir.R | 44 +++------- R/train_model.R | 5 +- README.Rmd | 42 ++-------- man/build_cnn_model.Rd | 2 +- man/calibrate_sir.Rd | 22 +---- man/prepare_data.Rd | 3 +- man/preprocessing_data.Rd | 2 - man/simulate_calibrate_sir.Rd | 4 +- man/train_model.Rd | 4 +- 23 files changed, 123 insertions(+), 407 deletions(-) delete mode 100644 R/george_calibrate.R diff --git a/DESCRIPTION b/DESCRIPTION index e5d4a38..5be5582 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,5 +24,6 @@ Imports: keras3, dplyr, ggplot2 (>= 3.4.0), - stats + stats, + reticulate VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 13071c7..ff1b2c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,8 +19,10 @@ importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,dcast) importFrom(data.table,melt) +importFrom(data.table,setnames) importFrom(dplyr,mutate) importFrom(dplyr,row_number) +importFrom(epiworldR,ModelSIRCONN) importFrom(epiworldR,run) importFrom(epiworldR,verbose_off) importFrom(ggplot2,aes) @@ -29,7 +31,6 @@ importFrom(ggplot2,geom_abline) importFrom(ggplot2,geom_boxplot) importFrom(ggplot2,geom_point) importFrom(ggplot2,labs) -importFrom(keras3,array_reshape) importFrom(parallel,clusterEvalQ) importFrom(parallel,clusterExport) importFrom(parallel,makeCluster) @@ -37,7 +38,6 @@ importFrom(parallel,mclapply) importFrom(parallel,parLapply) importFrom(parallel,stopCluster) importFrom(stats,plogis) -importFrom(stats,predict) importFrom(stats,qlogis) importFrom(stats,rbeta) importFrom(stats,rgamma) diff --git a/R/build_cnn_model.R b/R/build_cnn_model.R index aee84e1..c046ffc 100644 --- a/R/build_cnn_model.R +++ b/R/build_cnn_model.R @@ -1,7 +1,7 @@ #' Build a Convolutional Neural Network Model #' #' @description -#' Constructs and compiles a CNN model using the `keras` package. +#' Constructs and compiles a CNN model using the `keras3` package. #' #' @param input_shape Integer vector. The shape of the input data (excluding batch size). #' @param output_units Integer. The number of output units (number of output variables). @@ -21,9 +21,5 @@ build_cnn_model <- function(input_shape, output_units) { model |> keras3::compile(optimizer = 'adam', loss = 'mse', metrics = 'accuracy') - # Save the model in native Keras format - # model$save('/home/u1418987/epiworld-benchmark-oct15/calibration/sir-cnn.keras') - - return(model) } diff --git a/R/calibrate_sir.R b/R/calibrate_sir.R index fc84081..d053f86 100644 --- a/R/calibrate_sir.R +++ b/R/calibrate_sir.R @@ -1,82 +1,31 @@ - -#' Predict Parameters Using a CNN Model -#' Generates predictions for input test data using a pre-trained Convolutional Neural Network (CNN) model. -#' -#' This function loads a specific Keras CNN model based on the length of the input data and uses it to make predictions. The predicted values are returned as a `data.table` with standardized column names. +#' Calibrate SIR Model Predictions #' -#' @param data A numeric matrix or array containing the input test data. The function determines which model to load based on the number of columns in `data` (expects either 30 or 60). +#' @description +#' Generates predictions for input test data using a pre-trained CNN model. #' -#' @return A list containing: -#' \describe{ -#' \item{pred}{A `data.table` of predicted values with the following columns: -#' \describe{ -#' \item{\code{preval}}{Predicted prevalence.} -#' \item{\code{crate}}{Predicted case rate.} -#' \item{\code{ptran}}{Predicted transmission probability.} -#' \item{\code{prec}}{Predicted precision.} -#' } -#' } -#' } -#' -#' @details -#' The function determines which pre-trained CNN model to load based on the number of features (columns) in the input `data`. If `data` has 30 columns, it loads the `sir30-cnn.keras` model; if it has 60 columns, it loads the `sir60-cnn.keras` model. Ensure that the input data matches one of these expected formats to avoid errors. +#' @param data A numeric matrix or array containing the input test data. +#' @return A list containing a `data.table` of predicted values. #' @export -# calibrate_sir <- function(data) { -# library(keras3) -# ans=preprocessing_data(data) -# a=length(ans) -# ans <- tensorflow::array_reshape(ans, dim = c(1, 1, a, 1)) -# -# if(a <=30){ -# model <- keras3::load_model( -# system.file("models", "sir30-cnn.keras", package = "epiworldRcalibrate") -# ) -# } -# else{ -# model <- keras3::load_model( -# system.file("models", "sir60-cnn.keras", package = "epiworldRcalibrate") -# ) -# } -# pred <- predict(model, x =ans ) |> -# data.table::as.data.table() |> -# data.table::setnames(c("preval","crate","ptran","prec")) -# pred$crate=qlogis(pred$crate) -# -# return(list(pred = pred)) -# } calibrate_sir <- function(data) { - # Load required libraries - library(tensorflow) - library(data.table) - - # Preprocess the data ans <- preprocessing_data(data) a <- length(ans) - ans <- tensorflow::array_reshape(ans, dim = c(1, 1, a, 1)) # Reshape for the model + ans <- tensorflow::tf$reshape(ans, shape = c(1L, 1L, a, 1L)) - # Determine model file path model_path <- if (a <= 31) { system.file("models", "sir30-cnn.keras", package = "epiworldRcalibrate") } else { system.file("models", "sir60-cnn.keras", package = "epiworldRcalibrate") } - # Check if the model file exists if (model_path == "") { stop("Model file not found. Please ensure the models are included in the 'epiworldRcalibrate' package.") } - # Load the model using tensorflow model <- tensorflow::tf$keras$models$load_model(model_path) - # Make predictions pred <- model$predict(ans) |> data.table::as.data.table() |> data.table::setnames(c("preval", "crate", "ptran", "prec")) - - # Return predictions as a list return(list(pred = pred)) } - - diff --git a/R/dataprep.R b/R/dataprep.R index fe84973..9028d7f 100644 --- a/R/dataprep.R +++ b/R/dataprep.R @@ -2,12 +2,11 @@ #' #' @param m An `epiworldR` model object. #' @param max_days The maximum number of days to consider for data preparation. Defaults to 50. -#' -#' @return A reshaped array of processed data suitable for TensorFlow models. If an error occurs, it returns the error object. +#' @return A reshaped array of processed data suitable for TensorFlow models. +#' If an error occurs, it returns the error object. #' #' @export - -prepare_data <- function(m, max_days =max_days) { +prepare_data <- function(m, max_days = max_days) { err <- tryCatch({ ans <- list( @@ -16,32 +15,25 @@ prepare_data <- function(m, max_days =max_days) { gentime = epiworldR::plot_generation_time(m, plot = FALSE) ) - # Filling + # Convert all elements to data.table ans <- lapply(ans, data.table::as.data.table) - # Replacing NaN and NAs with the previous value - # in each element in the list - # Replace NA values with the last observed value - ans$repnum$avg <- data.table::nafill(ans$repnum$avg, type = "locf") - - # Replace NA values with the last observed value - ans$gentime$avg <- data.table::nafill(ans$gentime$avg, type = "locf") + # Replace NA values in repnum$avg and gentime$avg with the last observed value + ans[["repnum"]][, avg := data.table::nafill(avg, type = "locf")] + ans[["gentime"]][, avg := data.table::nafill(avg, type = "locf")] + # Filter up to max_days + ans[["repnum"]] <- ans[["repnum"]][date <= max_days, ] + ans[["gentime"]] <- ans[["gentime"]][date <= max_days, ] - # Filtering up to max_days - ans$repnum <- ans$repnum[ans$repnum$date <= max_days,] - ans$gentime <- ans$gentime[ans$gentime$date <= max_days,] - ans$incidence <- ans$incidence[as.integer(rownames(ans$incidence)) <= (max_days + 1),] + # incidence is indexed by row number since date is not explicitly in that data + # We assume the first row represents day 0, hence (max_days + 1) rows total. + ans[["incidence"]] <- ans[["incidence"]][as.integer(.I) <= (max_days + 1), ] - # Reference table for merging - # ndays <- epiworldR::get_ndays(m) + # Create a reference table for merging + ref_table <- data.table::data.table(date = 0:max_days) - ref_table <- data.table::data.table( - date = 0:max_days - ) - - # Replace the $ with the [[ ]] to avoid the warning in the next - # two lines + # Merge repnum and gentime with the reference table to ensure consistent length ans[["repnum"]] <- data.table::merge.data.table( ref_table, ans[["repnum"]], by = "date", all.x = TRUE ) @@ -49,106 +41,42 @@ prepare_data <- function(m, max_days =max_days) { ref_table, ans[["gentime"]], by = "date", all.x = TRUE ) - - # Generating the data.table with necessary columns + # Create a data.table with all required columns ans <- data.table::data.table( - infected = ans[["incidence"]][["Infected"]], - recovered = ans[["incidence"]][["Recovered"]], - repnum = ans[["repnum"]][["avg"]], - gentime = ans[["gentime"]][["avg"]], - repnum_sd = ans[["repnum"]][["sd"]], + infected = ans[["incidence"]][["Infected"]], + recovered = ans[["incidence"]][["Recovered"]], + repnum = ans[["repnum"]][["avg"]], + gentime = ans[["gentime"]][["avg"]], + repnum_sd = ans[["repnum"]][["sd"]], gentime_sd = ans[["gentime"]][["sd"]] ) - # Replace NA values with the last observed value for all columns + # Replace NA values in all relevant columns using locf nafill_cols <- c("infected", "recovered", "repnum", "gentime", "repnum_sd", "gentime_sd") - for (col in nafill_cols) { ans[[col]] <- data.table::nafill(ans[[col]], type = "locf") } + # Return ans as processed data + ans }, error = function(e) e) - # If there is an error, return NULL + # If there was an error, return it if (inherits(err, "error")) { return(err) } - # Returning without the first observation (which is mostly zero) - dprep <- t(diff(as.matrix(ans[-1,]))) - - ans <- array(dim = c(1, dim(dprep))) - ans[1,,] <- dprep - abm_hist_feat <- ans + # Remove the first observation (often zero) and take differences + # ans is now a data.table with rows representing days + # ans[-1, ] removes the first row + dprep <- t(diff(as.matrix(err[-1,]))) - array_reshape( - abm_hist_feat, - dim = c(1, dim(dprep)) - ) + # Construct a 3D array with shape (1, n_features, n_timesteps) + # Here n_features = number of variables (rows of dprep after transpose) + # and n_timesteps = number of columns (days-1) + ans_array <- array(dim = c(1, dim(dprep)[1], dim(dprep)[2])) + ans_array[1,,] <- dprep + # Reshape for TensorFlow input using keras3 (adjust if using another keras interface) + keras3::array_reshape(ans_array, dim = c(1, dim(dprep))) } - - - -# -# prepare_data_infections_only <- function(m, ...) { -# UseMethod("prepare_data_infectios_only") -# } -# -# prepare_data_infections_only.epiworld_model <- function(m, ...) { -# ans <- epiworldR::plot_incidence(m, plot = FALSE) |> -# data.table::as.data.table() -# -# prepare_data_infections_only.data.table( -# m = ans, -# ... -# ) -# } -# -# prepare_data_infections_only.default <- function(m, max_days = 50, ...) { -# -# err <- tryCatch({ -# ans <- list( -# incidence = data.table(Infected=m) -# ) -# -# # Replacing NaN and NAs with the previous value -# # in each element in the list -# # Filtering up to max_days -# ans$incidence <- ans$incidence[as.integer(rownames(ans$incidence)) <= (max_days + 1),] -# -# # Reference table for merging -# # ndays <- epiworldR::get_ndays(m) -# ref_table <- data.table::data.table( -# date = 0:max_days -# ) -# -# # Generating the arrays -# ans <- data.table::data.table( -# infected = ans[["incidence"]][["Infected"]] -# ) -# -# # Filling NAs with last obs -# ans[, "infected" := data.table::nafill(.SD[[1]], "locf"), -# .SDcols = "infected"] -# -# }, error = function(e) e) -# -# # If there is an error, return NULL -# if (inherits(err, "error")) { -# return(err) -# } -# -# # Returning without the first observation (which is mostly zero) -# dprep <- t(diff(as.matrix(ans[-1,]))) -# -# ans <- array(dim = c(1, dim(dprep))) -# ans[1,,] <- dprep -# abm_hist_feat <- ans -# -# array_reshape( -# abm_hist_feat, -# dim = c(1, dim(dprep)) -# ) -# -# } diff --git a/R/evaluate_model.R b/R/evaluate_model.R index a260e17..72afad6 100644 --- a/R/evaluate_model.R +++ b/R/evaluate_model.R @@ -13,7 +13,7 @@ #' } #' @export evaluate_model <- function(model, test_data, theta) { - pred <- predict(model, x = test_data$x) |> + pred <- model$predict(test_data$x) |> data.table::as.data.table() |> data.table::setnames(colnames(theta)) diff --git a/R/filter_non_null.R b/R/filter_non_null.R index e8d7d53..db688c8 100644 --- a/R/filter_non_null.R +++ b/R/filter_non_null.R @@ -15,7 +15,7 @@ filter_non_null <- function(matrices, theta) { is_not_null <- intersect( which(!sapply(matrices, inherits, what = "error")), - which(!sapply(matrices, \(x) any(is.na(x)))) + which(!sapply(matrices, function(x) any(is.na(x)))) ) matrices <- matrices[is_not_null] diff --git a/R/filter_non_null_infected.R b/R/filter_non_null_infected.R index db5ab7d..7288977 100644 --- a/R/filter_non_null_infected.R +++ b/R/filter_non_null_infected.R @@ -16,11 +16,10 @@ #' The function then returns the filtered matrices and their count. #' #' @export - filter_non_null_infected <- function(matrices) { is_not_null <- intersect( which(!sapply(matrices, inherits, what = "error")), - which(!sapply(matrices, \(x) any(is.na(x)))) + which(!sapply(matrices, function(x) any(is.na(x)))) ) matrices <- matrices[is_not_null] diff --git a/R/generate_theta.R b/R/generate_theta.R index ea4d2f6..e2732fc 100644 --- a/R/generate_theta.R +++ b/R/generate_theta.R @@ -5,17 +5,16 @@ #' #' @param N Integer. The number of parameter sets to generate. #' @param n Integer. The population size for each simulation. -#' @importFrom stats plogis predict qlogis rbeta rgamma +#' @importFrom stats plogis qlogis rbeta rgamma #' @return A data.table containing the generated parameters. #' @export generate_theta <- function(N, n) { - library(data.table) set.seed(1231) theta <- data.table::data.table( preval = sample((100:2000) / n, N, TRUE), - crate = rgamma(N, 5, 1), # Mean 5 - ptran = rbeta(N, 3, 7), # Mean 0.3 - prec = rbeta(N, 10, 10) # Mean 0.5 + crate = stats::rgamma(N, 5, 1), + ptran = stats::rbeta(N, 3, 7), + prec = stats::rbeta(N, 10, 10) ) return(theta) } diff --git a/R/george_calibrate.R b/R/george_calibrate.R deleted file mode 100644 index 456608d..0000000 --- a/R/george_calibrate.R +++ /dev/null @@ -1,35 +0,0 @@ -# calibrate <- function(y, fitted_model) { -# -# if (!length(fitted_model)) -# stop("You need to specify a fitted model") -# -# -# # Read the model -# model <- keras3::read_my_model(fitted_mode) -# -# prepro <- preprocess_data(y) -# -# predict(y, model) -# -# -# -# } -# -# calibrate_sir <- function(y) { -# -# calibrate( -# y, -# fitted_model = system.file("models/sir_30.keras", package="epiworldRcalibrate") -# ) -# -# } -#' #' THis is myu function -#' #' @param model -#' #' @param ndays -#' #' @noRd -#' verbose_off_and_run <- function(model, ndays) { -#' -#' verbose_off(model) -#' run(model, ndays) -#' } - diff --git a/R/plot_results.R b/R/plot_results.R index b7789bb..b471c0a 100644 --- a/R/plot_results.R +++ b/R/plot_results.R @@ -1,4 +1,3 @@ - #' Plot Results of Model Predictions #' #' @description @@ -12,52 +11,38 @@ #' @param N_train Integer. Number of training examples. #' @importFrom dplyr mutate row_number #' @importFrom ggplot2 aes geom_boxplot geom_point facet_wrap labs geom_abline -#' @importFrom keras3 array_reshape -#' @importFrom data.table dcast melt +#' @importFrom data.table dcast melt setnames as.data.table #' @return Generates plots (displayed on the active graphics device). #' @export - plot_results <- function(pred, test_data, theta, MAEs, N, N_train) { - # Prepare the data for plotting - library(dplyr) - library(ggplot2) - - # Add an 'id' column - pred <- pred |> - mutate(id = row_number()) - - # Modify the 'crate' column - pred <- pred |> - mutate(crate = qlogis(crate) * 10) + pred <- dplyr::mutate(pred, id = dplyr::row_number()) + pred <- dplyr::mutate(pred, crate = stats::qlogis(crate) * 10) - pred_long <- melt(pred, id.vars = "id") - theta_long <- test_data$y |> data.table::as.data.table() + pred_long <- data.table::melt(pred, id.vars = "id") + theta_long <- data.table::as.data.table(test_data$y) data.table::setnames(theta_long, names(theta)) theta_long$id <- seq_len(nrow(theta_long)) - # Modify the 'crate' column in theta_long - theta_long$crate <- qlogis(theta_long$crate) * 10 - theta_long <- melt(theta_long, id.vars = "id") + theta_long$crate <- stats::qlogis(theta_long$crate) * 10 + theta_long <- data.table::melt(theta_long, id.vars = "id") alldat <- rbind( cbind(pred_long, Type = "Predicted"), cbind(theta_long, Type = "Observed") ) - # Plot 1: Boxplot of Predicted vs Observed values - p1 <- ggplot2::ggplot(alldat, aes(x = value, colour = Type)) + - facet_wrap(~variable, scales = "free") + - geom_boxplot() + - labs(title = "Boxplot: Predicted vs Observed") + p1 <- ggplot2::ggplot(alldat, ggplot2::aes(x = value, colour = Type)) + + ggplot2::facet_wrap(~variable, scales = "free") + + ggplot2::geom_boxplot() + + ggplot2::labs(title = "Boxplot: Predicted vs Observed") - print(p1) # Display the first plot + print(p1) - # Prepare data for second plot - alldat_wide <- dcast(alldat, id + variable ~ Type, value.var = "value") + alldat_wide <- data.table::dcast(alldat, id + variable ~ Type, value.var = "value") vnames <- data.table::data.table( variable = c("preval", "crate", "ptran", "prec"), - Name = paste( + Name = paste( c("Init. state", "Contact Rate", "P(transmit)", "P(recover)"), sprintf("(MAE: %.2f)", MAEs) ) @@ -65,12 +50,11 @@ plot_results <- function(pred, test_data, theta, MAEs, N, N_train) { alldat_wide <- merge(alldat_wide, vnames, by = "variable") - # Plot 2: Observed vs Predicted with MAE labels - p2 <- ggplot2::ggplot(alldat_wide, aes(x = Observed, y = Predicted)) + - facet_wrap(~ Name, scales = "free") + - geom_abline(slope = 1, intercept = 0) + - geom_point(alpha = .2) + - labs( + p2 <- ggplot2::ggplot(alldat_wide, ggplot2::aes(x = Observed, y = Predicted)) + + ggplot2::facet_wrap(~ Name, scales = "free") + + ggplot2::geom_abline(slope = 1, intercept = 0) + + ggplot2::geom_point(alpha = .2) + + ggplot2::labs( title = "Observed vs Predicted (validation set)", subtitle = sprintf( "The model includes %i simulated datasets, of which %i were used for training.", @@ -79,5 +63,5 @@ plot_results <- function(pred, test_data, theta, MAEs, N, N_train) { caption = "Predictions made using a CNN as implemented with loss function MAE." ) - print(p2) # Display the second plot + print(p2) } diff --git a/R/prepare_data_infected.R b/R/prepare_data_infected.R index 90f9241..9d06b14 100644 --- a/R/prepare_data_infected.R +++ b/R/prepare_data_infected.R @@ -1,15 +1,10 @@ - #' Prepare Data for TensorFlow Model: Infected Cases #' #' @param data A numeric vector representing the number of infected cases over time (e.g., daily incidence). #' #' @return A reshaped array of processed data suitable for TensorFlow models. If an error occurs, it returns the error object. -#' #' @export prepare_data_infected <- function(data) { - library(reticulate) - reticulate::import("numpy") - library(keras3) if (!is.numeric(data)) { stop("Input 'data' must be a numeric vector.") } @@ -17,39 +12,27 @@ prepare_data_infected <- function(data) { max_days <- length(data) err <- tryCatch({ - # Explicitly use data.table namespace to create the table incidence_table <- data.table::data.table( day = seq_len(length(data)), Infected = data ) - - - # Explicitly use data.table subsetting with namespace incidence_table <- incidence_table[incidence_table$day <= max_days, ] - # Replace NA values with the last observed value incidence_table$Infected <- data.table::nafill( incidence_table$Infected, type = "locf" - ) - - # Debug: Print after NA replacement - print("After NA replacement:") - print(incidence_table) + ) - # Prepare array for TensorFlow dprep <- diff(as.matrix(incidence_table$Infected)) - dprep <- t(dprep) # Transpose to ensure correct shape + dprep <- t(dprep) - # Create a 3D array ans_array <- array(dim = c(1, length(dprep), 1)) ans_array[1, , 1] <- dprep - # Reshape for TensorFlow result <- tensorflow::array_reshape(ans_array, dim = c(1, length(dprep), 1)) return(result) - } , error = function(e) e) + }, error = function(e) e) if (inherits(err, "error")) { return(err) @@ -57,5 +40,3 @@ prepare_data_infected <- function(data) { return(err) } - - diff --git a/R/preprocessing_data.R b/R/preprocessing_data.R index 9ac6cf6..0dec6a2 100644 --- a/R/preprocessing_data.R +++ b/R/preprocessing_data.R @@ -1,3 +1,4 @@ + #' Preprocess Data for Machine Learning Models #' #' This function preprocesses input data for use in machine learning models, focusing on filtering, @@ -13,37 +14,28 @@ #' 2. Filters the data using `filter_non_null_infected()` to exclude missing or invalid values. #' 3. Extracts and reshapes the data into a one-column matrix format. #' -#' The function is designed to handle sequential data and prepare it for further processing in machine learning workflows. -#' #' @export preprocessing_data <- function(data) { - # Call prepare_data_infected ans <- list(prepare_data_infected(data)) - # Debugging: Check if 'ans' is valid if (is.null(ans[[1]]) || inherits(ans[[1]], "error")) { stop("Error in prepare_data_infected: The data could not be prepared.") } - # Call filter_non_null_infected filtered_data <- filter_non_null_infected(ans) - # Debugging: Check if filtered_data is valid if (is.null(filtered_data$matrices) || length(filtered_data$matrices) == 0) { stop("Error in filter_non_null_infected: No valid matrices found.") } - # Unlist and reshape matrices matr <- unlist(filtered_data$matrices) - # Debugging: Check if matr is valid if (is.null(matr) || length(matr) == 0) { stop("Error: Unlisted matrices are NULL or empty.") } - # Convert to matrix data2 <- as.vector(matr) - matrix_a <- as.matrix(data2, ncol = 1) # Ensure it's reshaped correctly + matrix_a <- as.matrix(data2, ncol = 1) return(matrix_a) } diff --git a/R/run_simulations.R b/R/run_simulations.R index 61ad924..05c6bc2 100644 --- a/R/run_simulations.R +++ b/R/run_simulations.R @@ -1,4 +1,3 @@ - #' Run SIR Model Simulations #' #' @description @@ -11,27 +10,24 @@ #' @param theta `data.table`. The parameters for the simulations. #' @param seeds Integer vector. Random seeds for each simulation. #' @importFrom parallel makeCluster stopCluster clusterExport clusterEvalQ parLapply mclapply -#' @importFrom epiworldR run verbose_off +#' @importFrom epiworldR run verbose_off ModelSIRCONN #' @importFrom data.table as.data.table dcast melt #' @return A list containing the simulation results as matrices. #' @export - run_simulations <- function(N, n, ndays, ncores, theta, seeds) { - library(epiworldR) - library(parallel) - - # Detect the operating system os_type <- .Platform$OS.type if (os_type == "windows") { - # Use parLapply for Windows - cl <- makeCluster(ncores) - on.exit(stopCluster(cl)) # Ensure the cluster is stopped after use + cl <- parallel::makeCluster(ncores) + on.exit(parallel::stopCluster(cl)) - clusterExport(cl, varlist = c("theta", "n", "ndays", "seeds", "prepare_data"), envir = environment()) - clusterEvalQ(cl, library(epiworldR)) # Load necessary libraries on workers + parallel::clusterExport(cl, varlist = c("theta", "n", "ndays", "seeds", "prepare_data"), envir = environment()) + parallel::clusterEvalQ(cl, { + # Load needed packages on workers if required (if not already loaded) + # library(epiworldR) # Not allowed, we rely on namespaces now + }) - matrices <- parLapply(cl, 1:N, function(i) { + matrices <- parallel::parLapply(cl, 1:N, function(i) { set.seed(seeds[i]) m <- epiworldR::ModelSIRCONN( "mycon", @@ -42,16 +38,14 @@ run_simulations <- function(N, n, ndays, ncores, theta, seeds) { n = n ) - verbose_off(m) - run(m, ndays = ndays) + epiworldR::verbose_off(m) + epiworldR::run(m, ndays = ndays) ans <- prepare_data(m, max_days = ndays) - return(ans) }) } else { - # Use mclapply for macOS/Linux - matrices <- mclapply(1:N, function(i) { + matrices <- parallel::mclapply(1:N, function(i) { set.seed(seeds[i]) m <- epiworldR::ModelSIRCONN( "mycon", @@ -62,14 +56,12 @@ run_simulations <- function(N, n, ndays, ncores, theta, seeds) { n = n ) - verbose_off(m) - run(m, ndays = ndays) + epiworldR::verbose_off(m) + epiworldR::run(m, ndays = ndays) ans <- prepare_data(m, max_days = ndays) - return(ans) }, mc.cores = ncores) } return(matrices) } - diff --git a/R/simulate_calibrate_sir.R b/R/simulate_calibrate_sir.R index 7d7b74c..4d740a7 100644 --- a/R/simulate_calibrate_sir.R +++ b/R/simulate_calibrate_sir.R @@ -1,4 +1,3 @@ - #' simulate calibrate sir Function #' #' @description @@ -8,37 +7,19 @@ #' @param n Integer. The population size for each simulation. #' @param ndays Integer. The number of days to simulate. #' @param ncores Integer. The number of cores to use for parallel processing. -#' @param epochs Integer. The number of training. -#' @param verbose Integer. 0 shows the result of training and 2 doesnt. +#' @param epochs Integer. The number of training epochs. +#' @param verbose Integer. Verbosity mode for training. 0 shows training output, 2 doesn't. #' @importFrom data.table copy #' @return Executes the pipeline and generates plots. #' @export -# N=2e4 -# n=5000 -# ncores=20 -# ndays=50 -simulate_calibrate_sir<- function(N,n,ndays,ncores,epochs,verbose) { - library(keras3) - library(data.table) - library(tensorflow) - -# Generate theta and seeds +simulate_calibrate_sir <- function(N, n, ndays, ncores, epochs, verbose) { + # Generate theta and seeds theta <- generate_theta(N, n) seeds <- sample.int(.Machine$integer.max, N, TRUE) + path = "~/epiworldRcalibrate/misc/simulated_data/sir-%06i.rds" - # Creating a temporary path to store the data - # path_dir <- tempfile("simulated_data") - # if (dir.exists(path_dir)) { - # file.remove(list.files(path_dir)) - # file.remove(path_dir) - # } - # dir.create(path_dir, recursive = TRUE) - # - # path <- file.path(path_dir, "sir-%06i.rds") - - path="~/epiworldRcalibrate/misc/simulated_data/sir-%06i.rds" # Run simulations - matrices <- run_simulations(N, n, ndays, ncores, theta,seeds) + matrices <- run_simulations(N, n, ndays, ncores, theta, seeds) # Filter non-null elements filtered_data <- filter_non_null(matrices, theta) @@ -50,12 +31,8 @@ simulate_calibrate_sir<- function(N,n,ndays,ncores,epochs,verbose) { arrays_1d <- prepare_data_for_tensorflow(matrices, N) # Save theta and simulations data - theta2 <-as.data.table(copy(theta)) - theta2$crate <- plogis(theta2$crate / 10) - - # theta2[, crate := plogis(crate / 10)] - - # saveRDS(list(theta = theta2, simulations = arrays_1d), file = "R/sir.rds", compress = TRUE) + theta2 <- data.table::as.data.table(data.table::copy(theta)) + theta2$crate <- stats::plogis(theta2$crate / 10) # Split data into training and testing sets data_split <- split_data(arrays_1d, theta2, N) @@ -64,7 +41,7 @@ simulate_calibrate_sir<- function(N,n,ndays,ncores,epochs,verbose) { # Build and train the CNN model model <- build_cnn_model(dim(arrays_1d)[-1], ncol(theta2)) - train_model(model, train,epochs=epochs,verbose=verbose) + train_model(model, train, epochs = epochs, verbose = verbose) # Evaluate the model eval_results <- evaluate_model(model, test, theta) @@ -73,6 +50,5 @@ simulate_calibrate_sir<- function(N,n,ndays,ncores,epochs,verbose) { # Plot the results plot_results(pred, test, theta, MAEs, N, floor(N * 0.7)) - return(list(pred=pred,MAEs=MAEs)) + return(list(pred = pred, MAEs = MAEs)) } - diff --git a/R/train_model.R b/R/train_model.R index 693673f..5f7cea5 100644 --- a/R/train_model.R +++ b/R/train_model.R @@ -1,3 +1,4 @@ + #' Train the CNN Model #' #' @description @@ -6,10 +7,10 @@ #' @param model Keras model. The compiled model to be trained. #' @param train_data List. A list containing training data (`x` and `y`). #' @param epochs Integer. The number of epochs for training. -#' @param verbose Integer. 0 shows the result of training and 2 doesnt. +#' @param verbose Integer. Verbosity mode for training output. #' @return The trained model (updated in place). #' @export -train_model <- function(model, train_data, epochs = epochs,verbose=verbose) { +train_model <- function(model, train_data, epochs, verbose) { tensorflow::set_random_seed(331) model |> keras3::fit(train_data$x, train_data$y, epochs = epochs, verbose = verbose) } diff --git a/README.Rmd b/README.Rmd index 9621d72..08f2123 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,6 +1,11 @@ ## 📊 simulate_calibrate_sir() and 🔧 calibrate_sir() ### ✍️ Authors: George Vega Yon, Sima NJF +# 🌍 Introduction + +Predicting the trajectory of infectious diseases lies at the heart of public health preparedness. With the epiworldRcalibrate package, you can effortlessly simulate and calibrate SIR (Susceptible-Infected-Recovered) epidemic models, blending the power of R, TensorFlow, and Keras. By generating realistic epidemic scenarios and training advanced Convolutional Neural Networks (CNNs) to identify key parameters—like prevalence, contact rate, transmission probability, and recovery precision—you can swiftly bridge the gap between theory and practice. + +The 🚀 simulate_calibrate_sir() function empowers you to produce synthetic incidence data and refine your model’s parameters through deep learning. Complementing this, 🔧 calibrate_sir() takes observed data, returning carefully tuned parameters ready for further analysis. Together, these tools streamline the modeling-to-inference pipeline, guiding you toward more informed, data-driven decisions in epidemiological research and public health policy. ### 🚀 simulate_calibrate_sir() @@ -27,43 +32,6 @@ ndays <- 50 ncores <- 20 epochs <- 2 verbose <- 2 - -simulate_calibrate_sir <- function(N, n, ndays, ncores, epochs, verbose) { - # ⚙️ Generate Theta and Seeds - theta <- generate_theta(N, n) - seeds <- sample.int(.Machine$integer.max, N, TRUE) - - # 🧪 Run Simulations - matrices <- run_simulations(N, n, ndays, ncores, theta, seeds) - - # 🔍 Filter Non-Null Data - filtered_data <- filter_non_null(matrices, theta) - matrices <- filtered_data$matrices - theta <- filtered_data$theta - N <- filtered_data$N - - # 📊 Prepare Data for TensorFlow - arrays_1d <- prepare_data_for_tensorflow(matrices, N) - theta2 <- as.data.table(copy(theta)) - theta2$crate <- plogis(theta2$crate / 10) - - # 🔀 Split Data into Training and Testing Sets - data_split <- split_data(arrays_1d, theta2, N) - train <- data_split$train - test <- data_split$test - - # 🏗️ Build and Train the CNN Model - model <- build_cnn_model(dim(arrays_1d)[-1], ncol(theta)) - train_model(model, train, epochs = epochs, verbose = verbose) - - # 📈 Evaluate the Model - eval_results <- evaluate_model(model, test, theta) - pred <- eval_results$pred - MAEs <- eval_results$MAEs - plot_results(pred, test, theta, MAEs, N, floor(N * 0.7)) - return(list(pred = pred, MAEs = MAEs)) -} - # ▶️ Call the function simulate_calibrate_sir(N, n, ndays, ncores, epochs, verbose) ``` diff --git a/man/build_cnn_model.Rd b/man/build_cnn_model.Rd index 1388946..cff994d 100644 --- a/man/build_cnn_model.Rd +++ b/man/build_cnn_model.Rd @@ -15,5 +15,5 @@ build_cnn_model(input_shape, output_units) A compiled Keras model ready for training. } \description{ -Constructs and compiles a CNN model using the \code{keras} package. +Constructs and compiles a CNN model using the \code{keras3} package. } diff --git a/man/calibrate_sir.Rd b/man/calibrate_sir.Rd index 38518d1..5816db3 100644 --- a/man/calibrate_sir.Rd +++ b/man/calibrate_sir.Rd @@ -2,30 +2,16 @@ % Please edit documentation in R/calibrate_sir.R \name{calibrate_sir} \alias{calibrate_sir} -\title{Predict Parameters Using a CNN Model -Generates predictions for input test data using a pre-trained Convolutional Neural Network (CNN) model.} +\title{Calibrate SIR Model Predictions} \usage{ calibrate_sir(data) } \arguments{ -\item{data}{A numeric matrix or array containing the input test data. The function determines which model to load based on the number of columns in \code{data} (expects either 30 or 60).} +\item{data}{A numeric matrix or array containing the input test data.} } \value{ -A list containing: -\describe{ -\item{pred}{A \code{data.table} of predicted values with the following columns: -\describe{ -\item{\code{preval}}{Predicted prevalence.} -\item{\code{crate}}{Predicted case rate.} -\item{\code{ptran}}{Predicted transmission probability.} -\item{\code{prec}}{Predicted precision.} -} -} -} +A list containing a \code{data.table} of predicted values. } \description{ -This function loads a specific Keras CNN model based on the length of the input data and uses it to make predictions. The predicted values are returned as a \code{data.table} with standardized column names. -} -\details{ -The function determines which pre-trained CNN model to load based on the number of features (columns) in the input \code{data}. If \code{data} has 30 columns, it loads the \code{sir30-cnn.keras} model; if it has 60 columns, it loads the \code{sir60-cnn.keras} model. Ensure that the input data matches one of these expected formats to avoid errors. +Generates predictions for input test data using a pre-trained CNN model. } diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd index b623611..894fb36 100644 --- a/man/prepare_data.Rd +++ b/man/prepare_data.Rd @@ -12,7 +12,8 @@ prepare_data(m, max_days = max_days) \item{max_days}{The maximum number of days to consider for data preparation. Defaults to 50.} } \value{ -A reshaped array of processed data suitable for TensorFlow models. If an error occurs, it returns the error object. +A reshaped array of processed data suitable for TensorFlow models. +If an error occurs, it returns the error object. } \description{ Prepare Data for TensorFlow Model: General SIR Data Preparation diff --git a/man/preprocessing_data.Rd b/man/preprocessing_data.Rd index e95183f..8e0190e 100644 --- a/man/preprocessing_data.Rd +++ b/man/preprocessing_data.Rd @@ -23,6 +23,4 @@ The function performs the following steps: \item Filters the data using \code{filter_non_null_infected()} to exclude missing or invalid values. \item Extracts and reshapes the data into a one-column matrix format. } - -The function is designed to handle sequential data and prepare it for further processing in machine learning workflows. } diff --git a/man/simulate_calibrate_sir.Rd b/man/simulate_calibrate_sir.Rd index 9502c3c..042fb4e 100644 --- a/man/simulate_calibrate_sir.Rd +++ b/man/simulate_calibrate_sir.Rd @@ -15,9 +15,9 @@ simulate_calibrate_sir(N, n, ndays, ncores, epochs, verbose) \item{ncores}{Integer. The number of cores to use for parallel processing.} -\item{epochs}{Integer. The number of training.} +\item{epochs}{Integer. The number of training epochs.} -\item{verbose}{Integer. 0 shows the result of training and 2 doesnt.} +\item{verbose}{Integer. Verbosity mode for training. 0 shows training output, 2 doesn't.} } \value{ Executes the pipeline and generates plots. diff --git a/man/train_model.Rd b/man/train_model.Rd index 2f7ecb1..182bd9f 100644 --- a/man/train_model.Rd +++ b/man/train_model.Rd @@ -4,7 +4,7 @@ \alias{train_model} \title{Train the CNN Model} \usage{ -train_model(model, train_data, epochs = epochs, verbose = verbose) +train_model(model, train_data, epochs, verbose) } \arguments{ \item{model}{Keras model. The compiled model to be trained.} @@ -13,7 +13,7 @@ train_model(model, train_data, epochs = epochs, verbose = verbose) \item{epochs}{Integer. The number of epochs for training.} -\item{verbose}{Integer. 0 shows the result of training and 2 doesnt.} +\item{verbose}{Integer. Verbosity mode for training output.} } \value{ The trained model (updated in place).