diff --git a/.Rbuildignore b/.Rbuildignore
index 6d1f529f..504e9311 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -17,3 +17,4 @@
 ^_pkgdown\.yml$
 ^docs$
 ^pkgdown$
+^Dockerfile$
diff --git a/.github/workflows/build-check.yml b/.github/workflows/build-check.yml
index 37a402f7..3b4b3fd5 100644
--- a/.github/workflows/build-check.yml
+++ b/.github/workflows/build-check.yml
@@ -28,9 +28,10 @@ jobs:
         config:
           - {os: macOS-latest, r: 'release'}
           - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest, r: 'release'}
           #- {os: windows-latest, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/latest"}
           #- {os: ubuntu-18.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest", http-user-agent: "R/4.0.0 (ubuntu-18.04) R (4.0.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" }
-          - {os: ubuntu-18.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
+          #- {os: ubuntu-18.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
           #- {os: ubuntu-18.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
           #- {os: ubuntu-18.04, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
           #- {os: ubuntu-18.04, r: '3.5', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
@@ -42,16 +43,18 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: recursive
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
         with:
+          rtools-version: '42'
           r-version: ${{ matrix.config.r }}
           http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
 
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-pandoc@v2
 
       - name: Query dependencies
         run: |
diff --git a/.github/workflows/lint-project.yaml b/.github/workflows/lint-project.yaml
index c3a2ce49..846da74f 100644
--- a/.github/workflows/lint-project.yaml
+++ b/.github/workflows/lint-project.yaml
@@ -16,11 +16,11 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: recursive
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
 
      - name: Install lintr
        run: install.packages("lintr")
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index 51c30d39..f52b568c 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -14,13 +14,13 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: recursive
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
 
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-pandoc@v2
 
      - name: Query dependencies
        run: |
diff --git a/.vscode/settings.json b/.vscode/settings.json
index e4cb716d..f3b9c150 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -102,6 +102,8 @@
         "geodaweight.h": "c",
         "filesystem": "cpp",
         "numbers": "cpp",
-        "semaphore": "cpp"
+        "semaphore": "cpp",
+        "__bits": "cpp",
+        "__verbose_abort": "cpp"
     }
 }
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index 658071ee..09771901 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: rgeoda
 Type: Package
 Title: R Library for Spatial Data Analysis
-Version: 0.0.9
-Date: 2022-04-09
+Version: 0.0.10-4
+Date: 2023-07-01
 Authors@R: c(person(given = "Xun", family = "Li",
email="lixun910@gmail.com", role=c("aut","cre")), person(given = "Luc", family = "Anselin", email="anselin@uchicago.edu", role="aut")) @@ -39,4 +39,4 @@ Encoding: UTF-8 Suggests: wkb, sp -SystemRequirements: C++14 +SystemRequirements: C++17 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..8648e641 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM rocker/r-base + +ENV DEBIAN_FRONTEND noninteractive + +RUN apt-get update && apt-get install -y git libssl-dev libgeos-dev libgeos++-dev gdal-bin libproj-dev libgdal-dev libudunits2-dev + +RUN install2.r --error proxy Rcpp wk sp digest sf BH wkb TinyTex diff --git a/R/clustering.R b/R/clustering.R index a33b45d6..8f82a405 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -4,15 +4,26 @@ #' similar values for features of interest. #' @param k The number of clusters #' @param w An instance of Weight class -#' @param df A data frame with selected variables only. E.g. guerry[c("Crm_prs", "Crm_prp", "Litercy")] +#' @param df A data frame with selected variables only. +#' E.g. guerry[c("Crm_prs", "Crm_prp", "Litercy")] #' @param bound_variable (optional) A data frame with selected bound variable -#' @param min_bound (optional) A minimum bound value that applies to all clusters -#' @param scale_method One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization). -#' @param distance_method (optional) The distance method used to compute the distance betwen observation i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan" -#' @param random_seed (int,optional) The seed for random number generator. Defaults to 123456789. -#' @param cpu_threads (optional) The number of cpu threads used for parallel computation -#' @param rdist (optional) The distance matrix (lower triangular matrix, column wise storage) -#' @return A names list with names "Clusters", "Total sum of squares", "Within-cluster sum of squares", "Total within-cluster sum of squares", and "The ratio of between to total sum of squares". +#' @param min_bound (optional) A minimum bound value that applies to all +#' clusters +#' @param scale_method One of the scaling methods {'raw', 'standardize', +#' 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. +#' Default is 'standardize' (Z-score normalization). +#' @param distance_method (optional) The distance method used to compute the +#' distance betwen observation i and j. Defaults to "euclidean". Options are +#' "euclidean" and "manhattan" +#' @param random_seed (int,optional) The seed for random number generator. +#' Defaults to 123456789. +#' @param cpu_threads (optional) The number of cpu threads used for parallel +#' computation +#' @param rdist (optional) The distance matrix (lower triangular matrix, +#' column wise storage) +#' @return A names list with names "Clusters", "Total sum of squares", +#' "Within-cluster sum of squares", "Total within-cluster sum of squares", +#' and "The ratio of between to total sum of squares". 
 #' @examples
 #' library(sf)
 #' guerry_path <- system.file("extdata", "Guerry.shp", package = "rgeoda")
@@ -22,12 +33,15 @@
 #' guerry_clusters <- skater(4, queen_w, data)
 #' guerry_clusters
 #' @export
-skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0, scale_method="standardize", distance_method="euclidean", random_seed=123456789, cpu_threads=6, rdist=numeric()) {
+skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0,
+                   scale_method="standardize", distance_method="euclidean",
+                   random_seed=123456789, cpu_threads=6, rdist=numeric()) {
   if (w$num_obs < 1) {
     stop("The weights is not valid.")
   }
   if (k <1 && k > w$num_obs) {
-    stop("The number of clusters should be a positive integer number, which is less than the number of observations.")
+    stop("The number of clusters should be a positive integer number, which is
+         less than the number of observations.")
   }
   if (inherits(df, "data.frame") == FALSE) {
     stop("The input data needs to be a data.frame.")
@@ -43,9 +57,11 @@ skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0, scale_met
     stop("The data.frame is empty.")
   }
 
-  scale_methods <- c('raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust')
+  scale_methods <- c('raw', 'standardize', 'demean', 'mad', 'range_standardize',
+                     'range_adjust')
   if (!(scale_method %in% scale_methods)) {
-    stop("The scale_method has to be one of {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'}")
+    stop("The scale_method has to be one of {'raw', 'standardize', 'demean',
+         'mad', 'range_standardize', 'range_adjust'}")
   }
 
   if (distance_method != "euclidean" && distance_method != "manhattan") {
@@ -56,7 +72,8 @@ skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0, scale_met
   if (length(bound_variable) > 0) {
     bound_values <- bound_variable[[1]]
   }
-  return(p_skater(k, w$GetPointer(), df, n_vars, scale_method, distance_method, bound_values, min_bound, random_seed, cpu_threads, rdist))
+  return(p_skater(k, w$GetPointer(), df, n_vars, scale_method, distance_method,
+                  bound_values, min_bound, random_seed, cpu_threads, rdist))
 }
diff --git a/R/sf_geoda.R b/R/sf_geoda.R
index b8bbf332..55cbe1f7 100644
--- a/R/sf_geoda.R
+++ b/R/sf_geoda.R
@@ -1,8 +1,8 @@
 # Create a random string (internally used)
 # The input is a positive number, indicating the number of items to choose from.
 random_string <- function(n = 5000) {
-  a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
-  paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
+  a <- do.call(paste0, replicate(10, sample(LETTERS, n, TRUE), FALSE))
+  return(a)
 }
diff --git a/man/skater.Rd b/man/skater.Rd
index 0efd2eed..4674e456 100644
--- a/man/skater.Rd
+++ b/man/skater.Rd
@@ -22,24 +22,35 @@ skater(
 
 \item{w}{An instance of Weight class}
 
-\item{df}{A data frame with selected variables only. E.g. guerry[c("Crm_prs", "Crm_prp", "Litercy")]}
+\item{df}{A data frame with selected variables only.
+E.g. guerry[c("Crm_prs", "Crm_prp", "Litercy")]}
 
 \item{bound_variable}{(optional) A data frame with selected bound variable}
 
-\item{min_bound}{(optional) A minimum bound value that applies to all clusters}
+\item{min_bound}{(optional) A minimum bound value that applies to all
+clusters}
 
-\item{scale_method}{One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).}
+\item{scale_method}{One of the scaling methods {'raw', 'standardize',
+'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data.
+Default is 'standardize' (Z-score normalization).}
 
-\item{distance_method}{(optional) The distance method used to compute the distance betwen observation i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"}
+\item{distance_method}{(optional) The distance method used to compute the
+distance between observation i and j. Defaults to "euclidean". Options are
+"euclidean" and "manhattan"}
 
-\item{random_seed}{(int,optional) The seed for random number generator. Defaults to 123456789.}
+\item{random_seed}{(int, optional) The seed for random number generator.
+Defaults to 123456789.}
 
-\item{cpu_threads}{(optional) The number of cpu threads used for parallel computation}
+\item{cpu_threads}{(optional) The number of cpu threads used for parallel
+computation}
 
-\item{rdist}{(optional) The distance matrix (lower triangular matrix, column wise storage)}
+\item{rdist}{(optional) The distance matrix (lower triangular matrix,
+column wise storage)}
 }
 \value{
-A names list with names "Clusters", "Total sum of squares", "Within-cluster sum of squares", "Total within-cluster sum of squares", and "The ratio of between to total sum of squares".
+A named list with names "Clusters", "Total sum of squares",
+"Within-cluster sum of squares", "Total within-cluster sum of squares",
+and "The ratio of between to total sum of squares".
 }
 \description{
 SKATER forms clusters by spatially partitioning data that has
diff --git a/src/Makevars b/src/Makevars
index d5b92124..53609ac2 100644
--- a/src/Makevars
+++ b/src/Makevars
@@ -8,7 +8,7 @@ PKG_CPPFLAGS=\
 PKG_LIBS=\
 	-pthread
 
-CXX_STD=CXX14
+CXX_STD=CXX17
 
 CPP_SRC_FILES = \
 	$(RGEODALIB)/libgeoda.cpp \
diff --git a/src/Makevars.win b/src/Makevars.win
index 6643db43..3450d0bb 100644
--- a/src/Makevars.win
+++ b/src/Makevars.win
@@ -12,7 +12,7 @@ PKG_LIBS=\
 	-L$(RWINLIB)/lib$(R_ARCH) \
 	-pthread
 
-CXX_STD = CXX14
+CXX_STD = CXX17
 
 CPP_SRC_FILES = \
 	$(RGEODALIB)/libgeoda.cpp \
diff --git a/src/libgeoda b/src/libgeoda
index a9a3d0a3..b512f742 160000
--- a/src/libgeoda
+++ b/src/libgeoda
@@ -1 +1 @@
-Subproject commit a9a3d0a3212dfc7cdc63d41f5e8033ba54063693
+Subproject commit b512f742ff95fba89bbc9958972667c7643edba5
diff --git a/src/rcpp_clustering.cpp b/src/rcpp_clustering.cpp
index eb7a23e4..d02e2381 100644
--- a/src/rcpp_clustering.cpp
+++ b/src/rcpp_clustering.cpp
@@ -81,10 +81,8 @@ Rcpp::List p_skater(int k, SEXP xp_w, Rcpp::List& data, int n_vars, std::string
   int num_obs = w->GetNumObs();
   double** dist_matrix = rdist_matrix(num_obs, rdist);
-  Rcout << "aaa" << dist_matrix;
 
   std::vector<std::vector<int> > cluster_ids = gda_skater(k, w, raw_data, scale_method, distance_method, raw_bound, min_bound, seed, cpu_threads, dist_matrix);
-  Rcout << "after gda_skater";
 
   if (dist_matrix) {
     for (int i = 1; i < num_obs; i++) {
       free(dist_matrix[i]);
diff --git a/tests/testthat/test-clustering.R b/tests/testthat/test-clustering.R
index 355d47d3..2bf38b17 100644
--- a/tests/testthat/test-clustering.R
+++ b/tests/testthat/test-clustering.R
@@ -63,10 +63,14 @@ testthat::test_that("schc", {
   testthat::expect_equal(clusters[[5]], 0.2147711255)
 })
 
-# NOTE!!!!!!!!!
-# The results are computed using Boost library 1.58.0.
-# To pass the following test cases
-# , please install BH package version==1.58.0
+# NOTE
+# The previous results were computed using Boost library 1.58.0.
+# The new results are computed using Boost library 1.81.0.1.
+# The differences are caused by the different implementation of
+# boost::unordered_map: the keys in boost::unordered_map are not ordered and
+# have different orders in the two Boost versions. This involves a different
+# mechanism of randomness in the max-p algorithm when picking which area or
+# region to process.
 
 testthat::test_that("azp_greedy", {
   library(sf)
   guerry_path <- system.file("extdata", "Guerry.shp", package = "rgeoda")
   guerry <- st_read(guerry_path)
   data <- guerry[c("Crm_prs", "Crm_prp", "Litercy", "Donatns", "Infants", "Suicids")]
   queen_w <- queen_weights(guerry)
   azp_clusters <- azp_greedy(5, queen_w, data)
 
-  testthat::expect_equal(azp_clusters[[5]], 0.3598541)
+  testthat::expect_equal(azp_clusters[[5]], 0.36, tolerance = 1e-3)
 
   bound_variable <- guerry["Pop1831"]
   min_bound <- 3236.67 # 10% of Pop1831
   azp_clusters <- azp_greedy(5, queen_w, data, bound_variable = bound_variable, min_bound = min_bound)
 
-  testthat::expect_equal(azp_clusters[[5]], 0.3980921835)
+  testthat::expect_equal(azp_clusters[[5]], 0.417, tolerance = 1e-3)
 })
@@ -101,7 +105,7 @@ testthat::test_that("azp_sa", {
   azp_clusters <- azp_sa(5, queen_w, data, cooling_rate = 0.85, sa_maxit = 1)
 
-  testthat::expect_equal(azp_clusters[[5]], 0.4211363)
+  testthat::expect_equal(azp_clusters[[5]], 0.359, tolerance = 1e-3)
 })
 
 testthat::test_that("azp_tabu", {
@@ -129,9 +133,9 @@ testthat::test_that("maxp_greedy", {
   bound_vals <- guerry["Pop1831"]
   min_bound <- 3236.67 # 10% of Pop1831
 
-  #clusters <- maxp_greedy(queen_w, data, bound_vals, min_bound)
+  clusters <- maxp_greedy(queen_w, data, bound_vals, min_bound)
 
-  #testthat::expect_equal(clusters[[5]], 0.4499671068)
+  testthat::expect_equal(clusters[[5]], 0.484, tolerance = 1e-3)
 })
 
 testthat::test_that("maxp_sa", {
@@ -145,10 +149,10 @@ testthat::test_that("maxp_sa", {
   bound_vals <- guerry["Pop1831"]
   min_bound <- 3236.67 # 10% of Pop1831
 
-  #clusters <- maxp_sa(queen_w, data, bound_vals, min_bound,
-  #                    cooling_rate = 0.85, sa_maxit = 1)
+  clusters <- maxp_sa(queen_w, data, bound_vals, min_bound,
+                      cooling_rate = 0.85, sa_maxit = 1)
 
-  #testthat::expect_equal(clusters[[5]], 0.4585352223)
+  testthat::expect_equal(clusters[[5]], 0.496, tolerance = 1e-3)
 })
 
 testthat::test_that("maxp_tabu", {
@@ -163,9 +167,9 @@
   min_bound <- 3236.67 # 10% of Pop1831
 
-  #clusters <- maxp_tabu(queen_w, data, bound_vals, min_bound,
-  #                      tabu_length = 10, conv_tabu = 10)
+  clusters <- maxp_tabu(queen_w, data, bound_vals, min_bound,
+                        tabu_length = 10, conv_tabu = 10)
 
-  #testthat::expect_equal(clusters[[5]], 0.4893668149)
+  testthat::expect_equal(clusters[[5]], 0.478, tolerance = 1e-3)
 })
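The re-enabled max-p checks above can also be run interactively outside of testthat. The sketch below is not part of the patch: it simply mirrors the updated maxp_greedy test, assumes rgeoda (with its bundled Guerry data) and sf are installed, and uses the 0.484 reference ratio only because that is the value in the new test; it may shift slightly under other Boost/BH builds, hence the tolerance.

# Sketch only (not part of the diff): reproduce the re-enabled maxp_greedy check.
library(sf)
library(rgeoda)
library(testthat)

guerry_path <- system.file("extdata", "Guerry.shp", package = "rgeoda")
guerry <- st_read(guerry_path)
data <- guerry[c("Crm_prs", "Crm_prp", "Litercy", "Donatns", "Infants", "Suicids")]
queen_w <- queen_weights(guerry)

bound_vals <- guerry["Pop1831"]
min_bound <- 3236.67  # 10% of Pop1831

clusters <- maxp_greedy(queen_w, data, bound_vals, min_bound)

# clusters[[5]] is "The ratio of between to total sum of squares"
expect_equal(clusters[[5]], 0.484, tolerance = 1e-3)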