From be4007666f1c8a082dc67311d643678fea2a1115 Mon Sep 17 00:00:00 2001 From: Yigong Hu Date: Sun, 21 Jul 2024 15:52:05 +0100 Subject: [PATCH] Feature: multiprocess support based on MPI (#88) * edit: MPI with indep var selection * fix: compile errors * edit: further rename functions * add: fit mpi * fix: runtime errors * fix: unused variables * add: bw sel mpi * fix: bugs caused in non mpi mode * fix: indep var sel mpi runtime error * fix: bw sel runtime errors * edit: activate all tests * add: compile condition around mpi code * edit: mpi with omp * add: mpi with cuda * fix: fitMpi S send error * edit: call GWRBasic in MGWR * fix: split fit core functions * add: fit core omp and cuda * edit: use min distance * add: cubase static methods to create and destory handle * edit: CRSDistance use cumat * fix: GWR cuda bug * edit: MGWR move global fit earlier * add: mpi mat mul * add: MGWR mpi fit * fix: mpi mat mul error * edit: test matrix size * edit: use scatter instead of cast * fix: mgwr mpi run error * add: bw criterion cv aic mpi mode * fix: mgwr mpi omp * edit: mpi with cuda * edit: MPI code conditional compile * edit: mgwr set golden bounds of gwr basic * edit: pre compute b_rows_i * fix: setGroupSize use std::size_t * fix: syntax error * fix(test): MGWR * fix: type mismatch on armadillo and RcppArmadillo * fix: use MY_MPI_UWORD to gether sizes It is unsigned long when ARMA_32BIT_WORD is defined; otherwise, unsigned long long. * edit: rename MY_MPI_UWORD * edit: use int to record process status * add(workflow): MPI tests * Revert "add(workflow): MPI tests" This reverts commit 5bbb210e311cd0372b4654712a0b20cc043f9181. --------- Co-authored-by: rd21411 --- CMakeLists.txt | 1 + include/gwmodelpp/GWRBasic.h | 228 +-- include/gwmodelpp/GWRMultiscale.h | 314 +--- include/gwmodelpp/IParallelizable.h | 21 +- include/gwmodelpp/spatialweight/CRSDistance.h | 12 +- .../gwmodelpp/spatialweight/SpatialWeight.h | 4 +- include/gwmodelpp/utils/armampi.h | 10 + include/gwmodelpp/utils/cumat.hpp | 43 + src/CMakeLists.txt | 26 + src/gwmodelpp/GWRBasic.cpp | 901 +++++++----- src/gwmodelpp/GWRMultiscale.cpp | 1280 ++++++----------- .../spatialweight/BandwidthWeight.cpp | 2 +- src/gwmodelpp/spatialweight/CRSDistance.cpp | 21 +- .../cuda/BandwidthWeightKernel.cu | 17 +- src/gwmodelpp/utils/armampi.cpp | 25 + src/gwmodelpp/utils/cumat.cpp | 7 + test/CMakeLists.txt | 10 +- test/include/TerminateCheckTelegram.h | 6 +- test/mpi/CMakeLists.txt | 30 + test/mpi/testGWRBasicMpi.cpp | 310 ++++ test/mpi/testGWRMultiscaleMpi.cpp | 304 ++++ test/mpi/testMpiMatMul.cpp | 68 + test/testGWRBasic.cpp | 18 +- test/testGWRMultiscale.cpp | 4 +- 24 files changed, 2067 insertions(+), 1595 deletions(-) create mode 100644 include/gwmodelpp/utils/armampi.h create mode 100644 src/gwmodelpp/utils/armampi.cpp create mode 100644 test/mpi/CMakeLists.txt create mode 100644 test/mpi/testGWRBasicMpi.cpp create mode 100644 test/mpi/testGWRMultiscaleMpi.cpp create mode 100644 test/mpi/testMpiMatMul.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c58d004..675f4862 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ set(CMAKE_MODULE_PATH cmake) option(ENABLE_OpenMP "Determines whether OpemMP support should be built" ON) option(ENABLE_CUDA "Determines whether CUDA support should be built" OFF) +option(ENABLE_MPI "Determines whether MPI support should be built" OFF) option(WITH_TESTS "Determines whether to build and run tests" ON) if(ENABLE_CUDA) diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h index 50daa284..30665d24 100644 --- a/include/gwmodelpp/GWRBasic.h +++ b/include/gwmodelpp/GWRBasic.h @@ -26,7 +26,7 @@ namespace gwm * 该算法可以通过 OpenMP 加速。 * */ -class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSelectable, public IParallelizable, public IParallelOpenmpEnabled, public IParallelCudaEnabled +class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSelectable, public IParallelizable, public IParallelOpenmpEnabled, public IParallelCudaEnabled, public IParallelMpiEnabled { public: @@ -47,10 +47,13 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe static std::unordered_map BandwidthSelectionCriterionTypeNameMapper; typedef arma::mat (GWRBasic::*PredictCalculator)(const arma::mat&, const arma::mat&, const arma::vec&); //!< \~english Predict function declaration. \~chinese 预测函数声明。 - typedef arma::mat (GWRBasic::*FitCalculator)(const arma::mat&, const arma::vec&, arma::mat&, arma::vec&, arma::vec&, arma::mat&); //!< \~english Fit function declaration. \~chinese 拟合函数声明。 + typedef arma::mat (GWRBasic::*FitCalculator)(); //!< \~english Fit function declaration. \~chinese 拟合函数声明。 + typedef arma::mat (GWRBasic::*FitCoreCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&); //!< \~english Fit function declaration. \~chinese 拟合函数声明。 + typedef arma::mat (GWRBasic::*FitCoreSHatCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&, arma::vec&); //!< \~english Fit function declaration. \~chinese 拟合函数声明。 + typedef arma::mat (GWRBasic::*FitCoreCVCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&); //!< \~english Fit function declaration. \~chinese 拟合函数声明。 typedef double (GWRBasic::*BandwidthSelectionCriterionCalculator)(BandwidthWeight*); //!< \~english Declaration of criterion calculator for bandwidth selection. \~chinese 带宽优选指标计算函数声明。 - typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。 + typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。 private: @@ -361,6 +364,29 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe */ const arma::mat& s() { return mS; } + const arma::cube& c() { return mC; } + + /** + * \~english + * @brief Whether to store hat-matrix \f$S\f$. + * + * @return true if store hat-matrix. + * @return false if not to store hat-matrix. + * + * \~chinese + * @brief 是否保存帽子矩阵 \f$S\f$. + * + * @return true 如果保存帽子矩阵。 + * @return false 如果不保存帽子矩阵。 + * + */ + bool isStoreS() { return mStoreS ? true : (mHasHatMatrix && (mCoords.n_rows < 8192)); } + + bool isStoreC() { return mStoreC; } + + void setStoreS(bool flag) { mStoreS = flag; } + + void setStoreC(bool flag) { mStoreC = flag; } public: // Implement Algorithm bool isValid() override; @@ -381,6 +407,22 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe { return mSelectedIndepVars; } + + /** + * \~english + * @brief Get AIC value with given variables for variable optimization (serial implementation). + * + * @param indepVars Given variables + * @return double Criterion value + * + * \~chinese + * @brief 根据指定的变量计算变量优选的AIC值(串行实现)。 + * + * @param indepVars 指定的变量。 + * @return double 变量优选的指标值。 + */ + double indepVarsSelectionCriterion(const std::vector& indepVars); + public: // Implement IBandwidthSelectable Status getCriterion(BandwidthWeight* weight, double& criterion) override @@ -388,6 +430,36 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe criterion = (this->*mBandwidthSelectionCriterionFunction)(weight); return mStatus; } + + /** + * \~english + * @brief Get CV value with given bandwidth for bandwidth optimization (serial implementation). + * + * @param bandwidthWeight Given bandwidth + * @return double Criterion value + * + * \~chinese + * @brief 根据指定的带宽计算带宽优选的CV值(串行实现)。 + * + * @param bandwidthWeight 指定的带宽。 + * @return double 带宽优选的指标值。 + */ + double bandwidthSizeCriterionCV(BandwidthWeight* bandwidthWeight); + + /** + * \~english + * @brief Get AIC value with given bandwidth for bandwidth optimization (serial implementation). + * + * @param bandwidthWeight Given bandwidth + * @return double Criterion value + * + * \~chinese + * @brief 根据指定的带宽计算带宽优选的AIC值(串行实现)。 + * + * @param bandwidthWeight 指定的带宽。 + * @return double 带宽优选的指标值。 + */ + double bandwidthSizeCriterionAIC(BandwidthWeight* bandwidthWeight); private: @@ -434,52 +506,16 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe * @param S [out] 帽子矩阵 \f$S\f$。 * @return mat 回归系数估计值 */ - arma::mat fitSerial(const arma::mat& x, const arma::vec& y, arma::mat& betasSE, arma::vec& shat, arma::vec& qDiag, arma::mat& S); - - /** - * \~english - * @brief Get CV value with given bandwidth for bandwidth optimization (serial implementation). - * - * @param bandwidthWeight Given bandwidth - * @return double Criterion value - * - * \~chinese - * @brief 根据指定的带宽计算带宽优选的CV值(串行实现)。 - * - * @param bandwidthWeight 指定的带宽。 - * @return double 带宽优选的指标值。 - */ - double bandwidthSizeCriterionCVSerial(BandwidthWeight* bandwidthWeight); - - /** - * \~english - * @brief Get AIC value with given bandwidth for bandwidth optimization (serial implementation). - * - * @param bandwidthWeight Given bandwidth - * @return double Criterion value - * - * \~chinese - * @brief 根据指定的带宽计算带宽优选的AIC值(串行实现)。 - * - * @param bandwidthWeight 指定的带宽。 - * @return double 带宽优选的指标值。 - */ - double bandwidthSizeCriterionAICSerial(BandwidthWeight* bandwidthWeight); - - /** - * \~english - * @brief Get AIC value with given variables for variable optimization (serial implementation). - * - * @param indepVars Given variables - * @return double Criterion value - * - * \~chinese - * @brief 根据指定的变量计算变量优选的AIC值(串行实现)。 - * - * @param indepVars 指定的变量。 - * @return double 变量优选的指标值。 - */ - double indepVarsSelectionCriterionSerial(const std::vector& indepVars); + arma::mat fitBase(); + + +private: + + arma::mat fitCoreSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw); + + arma::mat fitCoreSHatSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat); + + arma::mat fitCoreCVSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw); #ifdef ENABLE_OPENMP @@ -525,7 +561,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe * @param S [out] 帽子矩阵 \f$S\f$。 * @return mat 回归系数估计值 */ - arma::mat fitOmp(const arma::mat& x, const arma::vec& y, arma::mat& betasSE, arma::vec& shat, arma::vec& qDiag, arma::mat& S); + arma::mat fitCoreOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw); /** * \~english @@ -540,22 +576,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe * @param bandwidthWeight 指定的带宽。 * @return double 带宽优选的指标值。 */ - double bandwidthSizeCriterionCVOmp(BandwidthWeight* bandwidthWeight); - - /** - * \~english - * @brief Get AIC value with given bandwidth for bandwidth optimization (OpenMP implementation). - * - * @param bandwidthWeight Given bandwidth - * @return double Criterion value - * - * \~chinese - * @brief 根据指定的带宽计算带宽优选的AIC值(OpenMP 实现)。 - * - * @param bandwidthWeight 指定的带宽。 - * @return double 带宽优选的指标值。 - */ - double bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight); + arma::mat fitCoreCVOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw); /** * \~english @@ -570,7 +591,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe * @param indepVars 指定的变量。 * @return double 变量优选的指标值。 */ - double indepVarsSelectionCriterionOmp(const std::vector& indepVars); + arma::mat fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat); #endif @@ -593,7 +614,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe * @param y 因变量。 * @return mat 回归系数预测值。 */ - arma::mat fitCuda(const arma::mat& x, const arma::vec& y, arma::mat& betasSE, arma::vec& shat, arma::vec& qDiag, arma::mat& S); + arma::mat fitCoreCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw); /** * \~english @@ -633,22 +654,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe * @param bandwidthWeight 指定的带宽。 * @return double 带宽优选的指标值。 */ - double bandwidthSizeCriterionCVCuda(BandwidthWeight* bandwidthWeight); - - /** - * \~english - * @brief Get AIC value with given bandwidth for bandwidth optimization (CUDA implementation). - * - * @param bandwidthWeight Given bandwidth - * @return double Criterion value - * - * \~chinese - * @brief 根据指定的带宽计算带宽优选的AIC值(CUDA实现)。 - * - * @param bandwidthWeight 指定的带宽。 - * @return double 带宽优选的指标值。 - */ - double bandwidthSizeCriterionAICCuda(BandwidthWeight* bandwidthWeight); + arma::mat fitCoreCVCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw); /** * \~english @@ -663,10 +669,17 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe * @param indepVars 指定的变量。 * @return double 变量优选的指标值。 */ - double indepVarsSelectionCriterionCuda(const std::vector& indepVars); + arma::mat fitCoreSHatCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat); #endif +#ifdef ENABLE_MPI + double indepVarsSelectionCriterionMpi(const std::vector& indepVars); + double bandwidthSizeCriterionCVMpi(BandwidthWeight* bandwidthWeight); + double bandwidthSizeCriterionAICMpi(BandwidthWeight* bandwidthWeight); + arma::mat fitMpi(); +#endif // ENABLE_MPI + public: // Implement IParallelizable int parallelAbility() const override { @@ -677,6 +690,14 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe #ifdef ENABLE_CUDA | ParallelType::CUDA #endif // ENABLE_CUDA +#ifdef ENABLE_MPI +#ifdef ENABLE_OPENMP + | ParallelType::OpenMP +#endif // ENABLE_OPENMP +#ifdef ENABLE_CUDA + | ParallelType::CUDA +#endif // ENABLE_CUDA +#endif // ENABLE_MPI ; } @@ -687,26 +708,13 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe public: // Implement IGwmParallelOpenmpEnabled void setOmpThreadNum(const int threadNum) override { mOmpThreadNum = threadNum; } void setGPUId(const int gpuId) override { mGpuId = gpuId; }; - void setGroupSize(const size_t size) override { mGroupLength = size; }; + void setGroupSize(const std::size_t size) override { mGroupLength = size; }; + int workerId() override { return mWorkerId; } + void setWorkerId(int id) override { mWorkerId = id; }; + void setWorkerNum(int size) override { mWorkerNum = size; }; protected: - /** - * \~english - * @brief Whether to store hat-matrix \f$S\f$. - * - * @return true if store hat-matrix. - * @return false if not to store hat-matrix. - * - * \~chinese - * @brief 是否保存帽子矩阵 \f$S\f$. - * - * @return true 如果保存帽子矩阵。 - * @return false 如果不保存帽子矩阵。 - * - */ - bool isStoreS() { return mHasHatMatrix && (mCoords.n_rows < 8192); } - /** * \~english * @brief Create distance parameters for prediction. @@ -728,7 +736,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe bool mIsAutoselectIndepVars = false; //!< \~english Whether to auto select variables. \~chinese 是否自动优选变量。 double mIndepVarSelectionThreshold = 3.0; //!< \~english The threshold for variable selection. \~chinese 变量优选的阈值。 - IndepVarsSelectCriterionCalculator mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionSerial; //!< \~english Criterion calculator for variable selection. \~chinese 变量优选的指标计算函数。 + IndepVarsSelectCriterionCalculator mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterion; //!< \~english Criterion calculator for variable selection. \~chinese 变量优选的指标计算函数。 VariablesCriterionList mIndepVarsSelectionCriterionList; //!< \~english Criterion list of each variable combination. \~chinese 每种变量组合对应的指标值。 std::vector mSelectedIndepVars; //!< \~english Selected variables. \~chinese 优选得到的变量。 std::size_t mIndepVarSelectionProgressTotal = 0; //!< \~english Total number of independent variable combination. \~chinese 自变量所有组合总数。 @@ -736,24 +744,34 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe bool mIsAutoselectBandwidth = false; //!< \~english Whether to auto select bandwidth. \~chinese 是否自动优选带宽。 BandwidthSelectionCriterionType mBandwidthSelectionCriterion = BandwidthSelectionCriterionType::AIC; //!< \~english Type criterion for bandwidth selection. \~chinese 带宽优选的指标值类型。 - BandwidthSelectionCriterionCalculator mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCVSerial; //!< \~english Criterion calculator for bandwidth selection. \~chinese 带宽优选的指标计算函数。 + BandwidthSelectionCriterionCalculator mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCV; //!< \~english Criterion calculator for bandwidth selection. \~chinese 带宽优选的指标计算函数。 BandwidthCriterionList mBandwidthSelectionCriterionList; //!< \~english Criterion list of each bandwidth. \~chinese 每种带宽组合对应的指标值。 double mBandwidthLastCriterion = DBL_MAX; //!< \~english Last criterion for bandwidth selection. \~chinese 上一次带宽优选的有效指标值。 std::optional mGoldenUpperBounds; std::optional mGoldenLowerBounds; PredictCalculator mPredictFunction = &GWRBasic::predictSerial; //!< \~english Implementation of predict function. \~chinese 预测的具体实现函数。 - FitCalculator mFitFunction = &GWRBasic::fitSerial; //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。 + FitCalculator mFitFunction = &GWRBasic::fitBase; //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。 + FitCoreCalculator mFitCoreFunction = &GWRBasic::fitCoreSerial; //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。 + FitCoreSHatCalculator mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial; //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。 + FitCoreCVCalculator mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial; //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。 ParallelType mParallelType = ParallelType::SerialOnly; //!< \~english Type of parallel method. \~chinese 并行方法类型。 int mOmpThreadNum = 8; //!< \~english Number of threads to create. \~chinese 并行计算创建的线程数。 size_t mGroupLength = 64; //!< \~english Size of a group computing together. \~chinese 同时计算的一组的大小。 int mGpuId = 0; //!< \~english The ID of selected GPU. \~chinese 选择的 GPU 的 ID。 + int mWorkerId = 0; + int mWorkerNum = 1; + arma::uword mWorkRangeSize = 0; + std::optional> mWorkRange; arma::mat mBetasSE; //!< \~english Standard errors of coefficient estimates. \~chinese 回归系数估计值的标准差。 arma::vec mSHat; //!< \~english A vector of \f$tr(S)\f$ and \f$tr(SS^T)\f$. \~chinese 由 \f$tr(S)\f$ 和 \f$tr(SS^T)\f$ 组成的向量。 arma::vec mQDiag; //!< \~english The diagonal elements of matrix \f$Q\f$. \~chinese 矩阵 \f$Q\f$ 的对角线元素。 arma::mat mS; //!< \~english The hat-matrix \f$S\f$. \~chinese 帽子矩阵 \f$S\f$。 + arma::cube mC;//!< \~english All \f$S\f$ matrices. \~chinese 所有 \f$C\f$ 矩阵。 + bool mStoreS = false; //!< \~english Whether to save S \~chinese 是否保存 S 矩阵 + bool mStoreC = false; //!< \~english Whether to save C \~chinese 是否保存 C 矩阵 }; } diff --git a/include/gwmodelpp/GWRMultiscale.h b/include/gwmodelpp/GWRMultiscale.h index 6bec0067..af2e77f8 100644 --- a/include/gwmodelpp/GWRMultiscale.h +++ b/include/gwmodelpp/GWRMultiscale.h @@ -32,7 +32,13 @@ namespace gwm * \~chinese * @brief 多尺度GWR算法 */ -class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelectable, public IRegressionAnalysis, public IParallelizable, public IParallelOpenmpEnabled, public IParallelCudaEnabled +class GWRMultiscale : public SpatialMultiscaleAlgorithm, + public IBandwidthSelectable, + public IRegressionAnalysis, + public IParallelizable, + public IParallelOpenmpEnabled, + public IParallelCudaEnabled, + public IParallelMpiEnabled { public: @@ -83,10 +89,14 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect static std::unordered_map BackFittingCriterionTypeNameMapper; //!< \~english A mapper from backfitting convergence criterion types to their names. \~chinese 后向迭代算法收敛指标类型到其名称的映射表。 typedef double (GWRMultiscale::*BandwidthSizeCriterionFunction)(BandwidthWeight*); //!< \~english Function to calculate the criterion for given bandwidth size. \~chinese 根据指定带宽大小计算对应指标值的函数。 - - typedef arma::mat (GWRMultiscale::*FitAllFunction)(const arma::mat&, const arma::vec&); //!< \~english Function to fit a model for all variables. \~chinese 根据所有变量拟合模型的函数。 - - typedef arma::vec (GWRMultiscale::*FitVarFunction)(const arma::vec&, const arma::vec&, const arma::uword, arma::mat&); //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。 + + typedef arma::vec (GWRMultiscale::*FitVarFunction)(const size_t); //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。 + + typedef arma::vec (GWRMultiscale::*FitVarCoreFunction)(const arma::vec&, const arma::vec&, const SpatialWeight&, arma::mat&); //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。 + + typedef arma::vec (GWRMultiscale::*FitVarCoreCVFunction)(const arma::vec&, const arma::vec&, const SpatialWeight&); //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。 + + typedef arma::vec (GWRMultiscale::*FitVarCoreSHatFunction)(const arma::vec&, const arma::vec&, const SpatialWeight&, arma::vec&); //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。 private: @@ -539,21 +549,6 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect * @return arma::mat 回归系数估计值 \f$\beta\f$。 */ const arma::mat& betas() const { return mBetas; } - - /** - * \~english - * @brief Get criterion calculator function for optimize bandwidth size for all variables. - * - * @param type The criterion type for optimize bandwidth size for all variables. - * @return BandwidthSizeCriterionFunction The criterion calculator for optimize bandwidth size for all variables. - * - * \~chinese - * @brief 获取所有变量带宽优选的指标值计算函数。 - * - * @param type 所有变量带宽优选的指标值类型。 - * @return BandwidthSizeCriterionFunction 所有变量带宽优选的指标值计算函数。 - */ - BandwidthSizeCriterionFunction bandwidthSizeCriterionAll(BandwidthSelectionCriterionType type); /** * \~english @@ -625,7 +620,10 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect public: // IParallelCudaEnabled interface void setGPUId(const int gpuId) override { mGpuId = gpuId; } - void setGroupSize(const size_t size) override { mGroupLength = size; } + void setGroupSize(const std::size_t size) override { mGroupLength = size; } + int workerId() override { return mWorkerId; } + void setWorkerId(int id) override { mWorkerId = id; }; + void setWorkerNum(int size) override { mWorkerNum = size; }; protected: @@ -662,76 +660,10 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect * @param y 因变量 \f$y\f$。 * @return arma::mat 回归系数估计值 \f$\beta\f$。 */ - arma::mat backfitting(const arma::mat &x, const arma::vec &y); - - /** - * \~english - * @brief The serial implementation of fit function for all variables. - * - * @param x Independent variables \f$X\f$. - * @param y Dependent variable \f$y\f$. - * @return arma::mat Coefficient estimates \f$\beta\f$. - * - * \~chinese - * @brief 拟合所有变量的非并行实现。 - * - * @param x 自变量矩阵 \f$X\f$。 - * @param y 因变量 \f$y\f$。 - * @return arma::mat 回归系数估计值 \f$\beta\f$。 - */ - arma::mat fitAllSerial(const arma::mat& x, const arma::vec& y); - - /** - * \~english - * @brief The serial implementation of fit function for one variable. - * - * @param x Independent variables \f$X\f$. - * @param y Dependent variable \f$y\f$. - * @param var The index of this variable. - * @param S The hat matrix \f$S\f$. - * @return arma::vec The coefficient estimates corresponding to this variable. - * - * \~chinese - * @brief 拟合单个变量的非并行实现。 - * - * @param x 自变量矩阵 \f$X\f$。 - * @param y 因变量 \f$y\f$。 - * @param var 当前变量的索引值。 - * @param S 帽子矩阵 \f$S\f$ - * @return arma::vec 该变量对应的回归系数估计值。 - */ - arma::vec fitVarSerial(const arma::vec& x, const arma::vec& y, const arma::uword var, arma::mat& S); - - /** - * \~english - * @brief The serial implementation of CV criterion calculator for given bandwidth size and all variables. - * - * @param bandwidthWeight Badwidth weight. - * @return double CV criterion value. - * - * \~chinese - * @brief 为指定带宽值和所有变量计算CV指标值函数的非并行实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double CV指标值。 - */ - double bandwidthSizeCriterionAllCVSerial(BandwidthWeight* bandwidthWeight); - - /** - * \~english - * @brief The serial implementation of AIC criterion calculator for given bandwidth size and all variables. - * - * @param bandwidthWeight Badwidth weight. - * @return double AIC criterion value. - * - * \~chinese - * @brief 为指定带宽值和所有变量计算AIC指标值函数的非并行实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double AIC指标值。 - */ - double bandwidthSizeCriterionAllAICSerial(BandwidthWeight* bandwidthWeight); + arma::mat backfitting(const arma::mat& betas0); + arma::vec fitVarBase(const size_t var); + /** * \~english * @brief The serial implementation of CV criterion calculator for given bandwidth size and one variable. @@ -745,8 +677,8 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect * @param bandwidthWeight 带宽值。 * @return double CV指标值。 */ - double bandwidthSizeCriterionVarCVSerial(BandwidthWeight* bandwidthWeight); - + double bandwidthSizeCriterionVarCVBase(BandwidthWeight* bandwidthWeight); + /** * \~english * @brief The serial implementation of AIC criterion calculator for given bandwidth size and one variable. @@ -760,29 +692,11 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect * @param bandwidthWeight 带宽值。 * @return double AIC指标值。 */ - double bandwidthSizeCriterionVarAICSerial(BandwidthWeight* bandwidthWeight); + double bandwidthSizeCriterionVarAICBase(BandwidthWeight* bandwidthWeight); -#ifdef ENABLE_OPENMP - /** - * \~english - * @brief The openmp parallel implementation of fit function for all variables. - * - * @param x Independent variables \f$X\f$. - * @param y Dependent variable \f$y\f$. - * @return arma::mat Coefficient estimates \f$\beta\f$. - * - * \~chinese - * @brief 拟合所有变量的多线程实现。 - * - * @param x 自变量矩阵 \f$X\f$。 - * @param y 因变量 \f$y\f$。 - * @return arma::mat 回归系数估计值 \f$\beta\f$。 - */ - arma::mat fitAllOmp(const arma::mat& x, const arma::vec& y); - /** * \~english - * @brief The openmp parallel implementation of fit function for one variable. + * @brief The serial implementation of fit function for one variable. * * @param x Independent variables \f$X\f$. * @param y Dependent variable \f$y\f$. @@ -791,7 +705,7 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect * @return arma::vec The coefficient estimates corresponding to this variable. * * \~chinese - * @brief 拟合单个变量的多线程实现。 + * @brief 拟合单个变量的非并行实现。 * * @param x 自变量矩阵 \f$X\f$。 * @param y 因变量 \f$y\f$。 @@ -799,88 +713,22 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect * @param S 帽子矩阵 \f$S\f$ * @return arma::vec 该变量对应的回归系数估计值。 */ - arma::vec fitVarOmp(const arma::vec& x, const arma::vec& y, const arma::uword var, arma::mat& S); - - /** - * \~english - * @brief The openmp parallel implementation of CV criterion calculator for given bandwidth size and all variables. - * - * @param bandwidthWeight Badwidth weight. - * @return double CV criterion value. - * - * \~chinese - * @brief 为指定带宽值和所有变量计算CV指标值函数的多线程实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double CV指标值。 - */ - double bandwidthSizeCriterionAllCVOmp(BandwidthWeight* bandwidthWeight); + arma::vec fitVarCoreSerial(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::mat& S); - /** - * \~english - * @brief The openmp parallel implementation of AIC criterion calculator for given bandwidth size and all variables. - * - * @param bandwidthWeight Badwidth weight. - * @return double AIC criterion value. - * - * \~chinese - * @brief 为指定带宽值和所有变量计算AIC指标值函数的多线程实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double AIC指标值。 - */ - double bandwidthSizeCriterionAllAICOmp(BandwidthWeight* bandwidthWeight); + arma::vec fitVarCoreCVSerial(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw); - /** - * \~english - * @brief The openmp parallel implementation of CV criterion calculator for given bandwidth size and one variable. - * - * @param bandwidthWeight Badwidth weight. - * @return double CV criterion value. - * - * \~chinese - * @brief 为指定带宽值和某个变量计算CV指标值函数的多线程实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double CV指标值。 - */ - double bandwidthSizeCriterionVarCVOmp(BandwidthWeight* bandwidthWeight); + arma::vec fitVarCoreSHatSerial(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat); - /** - * \~english - * @brief The openmp parallel implementation of AIC criterion calculator for given bandwidth size and one variable. - * - * @param bandwidthWeight Badwidth weight. - * @return double AIC criterion value. - * - * \~chinese - * @brief 为指定带宽值和某个变量计算AIC指标值函数的多线程实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double AIC指标值。 - */ - double bandwidthSizeCriterionVarAICOmp(BandwidthWeight* bandwidthWeight); +#ifdef ENABLE_OPENMP + arma::vec fitVarCoreOmp(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::mat& S); + + arma::vec fitVarCoreCVOmp(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw); + + arma::vec fitVarCoreSHatOmp(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat); #endif #ifdef ENABLE_CUDA - /** - * \~english - * @brief The CUDA implementation of fit function for all variables. - * - * @param x Independent variables \f$X\f$. - * @param y Dependent variable \f$y\f$. - * @return arma::mat Coefficient estimates \f$\beta\f$. - * - * \~chinese - * @brief 拟合所有变量的CUDA实现。 - * - * @param x 自变量矩阵 \f$X\f$。 - * @param y 因变量 \f$y\f$。 - * @return arma::mat 回归系数估计值 \f$\beta\f$。 - */ - arma::mat fitAllCuda(const arma::mat& x, const arma::vec& y); - /** * \~english * @brief The CUDA implementation of fit function for one variable. @@ -900,90 +748,34 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect * @param S 帽子矩阵 \f$S\f$ * @return arma::vec 该变量对应的回归系数估计值。 */ - arma::vec fitVarCuda(const arma::vec& x, const arma::vec& y, const arma::uword var, arma::mat& S); - - /** - * \~english - * @brief The CUDA implementation of CV criterion calculator for given bandwidth size and all variables. - * - * @param bandwidthWeight Badwidth weight. - * @return double CV criterion value. - * - * \~chinese - * @brief 为指定带宽值和所有变量计算CV指标值函数的CUDA实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double CV指标值。 - */ - double bandwidthSizeCriterionAllCVCuda(BandwidthWeight* bandwidthWeight); + arma::vec fitVarCoreCuda(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::mat& S); - /** - * \~english - * @brief The CUDA implementation of AIC criterion calculator for given bandwidth size and all variables. - * - * @param bandwidthWeight Badwidth weight. - * @return double AIC criterion value. - * - * \~chinese - * @brief 为指定带宽值和所有变量计算AIC指标值函数的CUDA实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double AIC指标值。 - */ - double bandwidthSizeCriterionAllAICCuda(BandwidthWeight* bandwidthWeight); - - /** - * \~english - * @brief The CUDA implementation of CV criterion calculator for given bandwidth size and one variable. - * - * @param bandwidthWeight Badwidth weight. - * @return double CV criterion value. - * - * \~chinese - * @brief 为指定带宽值和某个变量计算CV指标值函数的CUDA实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double CV指标值。 - */ - double bandwidthSizeCriterionVarCVCuda(BandwidthWeight* bandwidthWeight); + arma::vec fitVarCoreCVCuda(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw); - /** - * \~english - * @brief The CUDA implementation of AIC criterion calculator for given bandwidth size and one variable. - * - * @param bandwidthWeight Badwidth weight. - * @return double AIC criterion value. - * - * \~chinese - * @brief 为指定带宽值和某个变量计算AIC指标值函数的CUDA实现。 - * - * @param bandwidthWeight 带宽值。 - * @return double AIC指标值。 - */ - double bandwidthSizeCriterionVarAICCuda(BandwidthWeight* bandwidthWeight); + arma::vec fitVarCoreSHatCuda(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat); #endif // ENABLE_CUDA - /** - * \~english - * @brief Create a Initial Distance Parameter object - * - * \~chinese - * @brief 创建初始距离参数对象。 - */ - void createInitialDistanceParameter(); +#ifdef ENABLE_MPI + arma::vec fitVarMpi(const size_t var); + double bandwidthSizeCriterionVarCVMpi(BandwidthWeight* bandwidthWeight); + double bandwidthSizeCriterionVarAICMpi(BandwidthWeight* bandwidthWeight); +#endif // ENABLE_MPI private: - FitAllFunction mFitAll = &GWRMultiscale::fitAllSerial; //!< \~english Calculator to fit a model for all variables. \~chinese 为所有变量拟合模型的函数。 - FitVarFunction mFitVar = &GWRMultiscale::fitVarSerial; //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。 + FitVarFunction mFitVar = &GWRMultiscale::fitVarBase; //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。 + FitVarCoreFunction mFitVarCore = &GWRMultiscale::fitVarCoreSerial; //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。 + FitVarCoreCVFunction mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVSerial; //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。 + FitVarCoreSHatFunction mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatSerial; //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。 SpatialWeight mInitSpatialWeight; //!< \~english Spatial weighting sheme for initializing bandwidth. \~chinese 计算初始带宽值时所用的空间权重配置。 - BandwidthSizeCriterionFunction mBandwidthSizeCriterion = &GWRMultiscale::bandwidthSizeCriterionAllCVSerial; //!< \~english The criterion calculator for given bandwidth size. \~chinese 根据指定带宽值计算指标值的函数。 + BandwidthSizeCriterionFunction mBandwidthSizeCriterion = &GWRMultiscale::bandwidthSizeCriterionVarCVBase; //!< \~english The criterion calculator for given bandwidth size. \~chinese 根据指定带宽值计算指标值的函数。 size_t mBandwidthSelectionCurrentIndex = 0; //!< \~english The index of variable which currently the algorithm select bandwidth for. \~chinese 当前正在选带宽的变量索引值。 double mBandwidthLastCriterion = DBL_MAX; //!< \~english Last criterion for bandwidth selection. \~chinese 上一次带宽优选的有效指标值。 std::optional mGoldenUpperBounds; std::optional mGoldenLowerBounds; std::vector mMaxDistances; + std::vector mMinDistances; std::vector mBandwidthInitilize; //!< \~english Type of bandwidth initilization values. \~chinese 带宽初始值类型。 std::vector mBandwidthSelectionApproach; //!< \~english Type of bandwidth selection approach. \~chinese 带宽选择方法类型。 @@ -1020,6 +812,12 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect int mOmpThreadNum = 8; //!< \~english Number of threads. \~chinese 并行线程数。 size_t mGroupLength = 64; //!< \~english Size of a group computing together. \~chinese 同时计算的一组的大小。 int mGpuId = 0; //!< \~english The ID of selected GPU. \~chinese 选择的 GPU 的 ID。 + int mWorkerId = 0; + int mWorkerNum = 1; +#ifdef ENABLE_MPI + arma::uword mWorkRangeSize = 0; +#endif // ENABLE_MPI + std::optional> mWorkRange; public: static int treeChildCount; //!< \~english \~chinese diff --git a/include/gwmodelpp/IParallelizable.h b/include/gwmodelpp/IParallelizable.h index ec5a4fff..19158bd8 100644 --- a/include/gwmodelpp/IParallelizable.h +++ b/include/gwmodelpp/IParallelizable.h @@ -16,7 +16,11 @@ enum ParallelType { SerialOnly = 1 << 0, //!< \~english Use no parallel methods. \~chinese 不并行。 OpenMP = 1 << 1, //!< \~english Use multithread methods. \~chinese 多线程并行。 - CUDA = 1 << 2 //!< \~english Use CUDA accelerated methods. \~chinese CUDA加速。 + CUDA = 1 << 2, //!< \~english Use CUDA accelerated methods. \~chinese CUDA加速。 + MPI = (1 << 3), + MPI_Serial = (1 << 3) | (1 << 0), + MPI_MP = (1 << 3) | (1 << 1), + MPI_CUDA = (1 << 3) | (1 << 2) }; /** @@ -137,10 +141,23 @@ struct IParallelCudaEnabled * 对于大多数 GPU 可选择值 64。 * */ - virtual void setGroupSize(const size_t size) = 0; + virtual void setGroupSize(const std::size_t size) = 0; }; +struct IParallelMpiEnabled +{ + virtual int workerId() = 0; + virtual void setWorkerId(int id) = 0; + virtual void setWorkerNum(int size) = 0; +}; + +#define GWM_MPI_MASTER_BEGIN if (workerId() == 0) { +#define GWM_MPI_MASTER_END } +#define GWM_MPI_WORKER_BEGIN if (workerId() != 0) { +#define GWM_MPI_WORKER_END } +#define GWM_MPI_MASTER_WORKER_SWITCH } else { + } #endif // IPARALLELIZABLE_H \ No newline at end of file diff --git a/include/gwmodelpp/spatialweight/CRSDistance.h b/include/gwmodelpp/spatialweight/CRSDistance.h index 3f953bfb..461f0ead 100644 --- a/include/gwmodelpp/spatialweight/CRSDistance.h +++ b/include/gwmodelpp/spatialweight/CRSDistance.h @@ -4,6 +4,7 @@ #include "Distance.h" #ifdef ENABLE_CUDA +#include "gwmodelpp/utils/cumat.hpp" #include "gwmodelpp/spatialweight/cuda/CRSDistanceKernel.h" #include "gwmodelpp/spatialweight/cuda/ISpatialCudaEnabled.h" #endif // ENABLE_CUDA @@ -146,13 +147,6 @@ class CRSDistance : public Distance virtual ~CRSDistance() { -#ifdef ENABLE_CUDA - if (mCudaPrepared) - { - cudaFree(mCudaDp); - cudaFree(mCudaFp); - } -#endif } virtual Distance * clone() const override @@ -222,8 +216,8 @@ class CRSDistance : public Distance CalculatorType mCalculator = &EuclideanDistance; //!< \~english Calculator \~chinese 距离计算方法 #ifdef ENABLE_CUDA - double* mCudaDp = 0; //!< \~english Device pointer to data points \~chinese 指向数据点的设备指针 - double* mCudaFp = 0; //!< \~english Device pointer to focus points \~chinese 指向关注点的设备指针 + cumat mCudaDp; //!< \~english Device pointer to data points \~chinese 指向数据点的设备指针 + cumat mCudaFp; //!< \~english Device pointer to focus points \~chinese 指向关注点的设备指针 CalculatorCudaType mCalculatorCuda = &eu_dist_cuda; //!< \~english CUDA based Calculator \~chinese 基于 CUDA 的距离计算方法 #endif diff --git a/include/gwmodelpp/spatialweight/SpatialWeight.h b/include/gwmodelpp/spatialweight/SpatialWeight.h index 8bd8901c..5a35e69c 100644 --- a/include/gwmodelpp/spatialweight/SpatialWeight.h +++ b/include/gwmodelpp/spatialweight/SpatialWeight.h @@ -377,7 +377,7 @@ class SpatialWeight * @param focus 当前样本的索引值。 * @return vec 当前样本到其他所有样本的空间权重向量。 */ - virtual arma::vec weightVector(arma::uword focus) + virtual arma::vec weightVector(arma::uword focus) const { return mWeight->weight(mDistance->distance(focus)); } @@ -412,7 +412,7 @@ class SpatialWeight * @param d_weights \~english Device pointer to distances \~chinese 指向输出权重的设备指针 * @return cudaError_t \~english CUDA error or success \~chinese CUDA 错误或成功 */ - virtual cudaError_t weightVector(arma::uword focus, double* d_dists, double* d_weights) + virtual cudaError_t weightVector(arma::uword focus, double* d_dists, double* d_weights) const { cudaError_t error; size_t elems = 0; diff --git a/include/gwmodelpp/utils/armampi.h b/include/gwmodelpp/utils/armampi.h new file mode 100644 index 00000000..d1866a0a --- /dev/null +++ b/include/gwmodelpp/utils/armampi.h @@ -0,0 +1,10 @@ +#include "armadillo_config.h" +#include "mpi.h" + +#ifdef ARMA_32BIT_WORD +#define GWM_MPI_UWORD MPI_UNSIGNED_LONG +#else // ARMA_32BIT_WORD +#define GWM_MPI_UWORD MPI_UNSIGNED_LONG_LONG +#endif // ARMA_32BIT_WORD + +void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np, const size_t range); diff --git a/include/gwmodelpp/utils/cumat.hpp b/include/gwmodelpp/utils/cumat.hpp index 9fc51cb2..3e8af06e 100644 --- a/include/gwmodelpp/utils/cumat.hpp +++ b/include/gwmodelpp/utils/cumat.hpp @@ -47,6 +47,21 @@ class cubase { public: static cublasHandle_t handle; //!< Save handle for cublas + static auto create_handle() + { + if (handle == nullptr) return cublasCreate(&handle); + else return CUBLAS_STATUS_SUCCESS; + } + static auto destory_handle() + { + if (handle != nullptr) + { + auto error = cublasDestroy(handle); + handle = nullptr; + return error; + } + else return CUBLAS_STATUS_SUCCESS; + } constexpr static const double alpha1 = 1.0; constexpr static const double beta0 = 0.0; constexpr static const double beta1 = 1.0; @@ -245,6 +260,32 @@ class cumat : public cubase */ const cuop_trans t() const; + void resize(size_t rows, size_t cols) + { + if (dMem != nullptr && mIsRelease) + { + cudaFree(dMem); + } + cudaMalloc(&dMem, sizeof(double) * rows * cols); + } + + cumat& operator=(const cumat& right) + { + resize(right.mRows, right.mCols); + cudaMemcpy(dMem, right.dMem, nbytes(), cudaMemcpyDeviceToDevice); + return *this; + } + + cumat& operator=(cumat&& right) + { + mRows = right.mRows; + mCols = right.mCols; + dMem = right.dMem; + mIsRelease = true; + right.mIsRelease = false; + return *this; + } + cumat& operator=(const cuop_trans& right); template @@ -282,6 +323,8 @@ class cumat : public cubase size_t mCols = 0; }; +void print(const cumat& mat); + /** * @brief \~english Strided matrix. \~chinese 条带矩阵。 * diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e93a8ce1..c4cdb921 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,15 @@ if(OpenMP_FOUND AND OpenMP_C_FOUND AND OpenMP_CXX_FOUND) endif(OpenMP_FOUND AND OpenMP_C_FOUND AND OpenMP_CXX_FOUND) endif() +if(ENABLE_MPI) + find_package(MPI REQUIRED) + add_definitions(-DENABLE_MPI) + include_directories(${MPI_CXX_HEADER_DIR}) + add_link_options(${MPI_CXX_LINK_FLAGS}) + add_compile_options(${MPI_CXX_COMPILE_OPTIONS}) + add_definitions(${MPI_CXX_COMPILE_DEFINITIONS}) +endif(ENABLE_MPI) + find_package(GSL REQUIRED) if(GSL_FOUND) include_directories(${GSL_INCLUDE_DIRS}) @@ -137,6 +146,17 @@ if(ENABLE_CUDA) list(PREPEND SOURCES_ALL ${SOURCES_CUDA}) endif(ENABLE_CUDA) +if(ENABLE_MPI) + set(HEADERS_MPI + ../include/gwmodelpp/utils/armampi.h + ) + set(SOURCES_MPI + gwmodelpp/utils/armampi.cpp + ) + list(PREPEND HEADERS_ALL ${HEADERS_MPI}) + list(PREPEND SOURCES_ALL ${SOURCES_MPI}) +endif(ENABLE_MPI) + add_library(gwmodel STATIC ${HEADERS_ALL} ${SOURCES_ALL}) set_property(TARGET gwmodel PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -194,6 +214,12 @@ if(OpenMP_FOUND) ) endif(OpenMP_FOUND) +if(ENABLE_MPI AND MPI_FOUND) + target_link_libraries(gwmodel + ${MPI_CXX_LIBRARIES} + ) +endif() + if(USE_CUDA_SHARED) set(HEADERS_CUDA_SHARED ../include/gwmodelcuda/StdTelegram.h diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp index 28c34649..3c300b23 100644 --- a/src/gwmodelpp/GWRBasic.cpp +++ b/src/gwmodelpp/GWRBasic.cpp @@ -14,10 +14,36 @@ #include "cumat.hpp" #endif +#ifdef ENABLE_MPI +#include "mpi.h" +#include "gwmodelpp/utils/armampi.h" +#endif + using namespace std; using namespace arma; using namespace gwm; +enum class GWRBasicFitMpiTags +{ + Betas = 1 << 0, + BetasSE, + SHat, + QDiag, + SMat, + Finished +}; + +enum class GWRBasicAICMpiTags +{ + SHat = 1 << 4, + Betas +}; + +enum class GWRBasicCVMpiTags +{ + Betas = 1 << 8 +}; + RegressionDiagnostic GWRBasic::CalcDiagnostic(const mat& x, const vec& y, const mat& betas, const vec& shat) { vec r = y - sum(betas % x, 1); @@ -37,11 +63,47 @@ mat GWRBasic::fit() { GWM_LOG_STAGE("Initializing"); uword nDp = mCoords.n_rows, nVars = mX.n_cols; +#ifdef ENABLE_MPI + // Sync x, y, coords with other processes + if (mParallelType & ParallelType::MPI) + { + uword aShape[4]; + if (mWorkerId == 0) // master process + { + // sync matrix dimensions + uword nDim = mCoords.n_cols; + mWorkRangeSize = nDp / uword(mWorkerNum) + ((nDp % uword(mWorkerNum) == 0) ? 0 : 1); + aShape[0] = nDp; + aShape[1] = nVars; + aShape[2] = nDim; + aShape[3] = mWorkRangeSize; + // cout << mWorkerId << " process send size: [" << aShape[0] << "," << aShape[1] << "," << aShape[2] << "," << aShape[3] << "]\n"; + } + MPI_Bcast(aShape, 4, GWM_MPI_UWORD, 0, MPI_COMM_WORLD); + if (mWorkerId > 0) // slave process + { + // cout << mWorkerId << " process recv size: [" << aShape[0] << "," << aShape[1] << "," << aShape[2] << "," << aShape[3] << "]\n"; + nDp = aShape[0]; + nVars = aShape[1]; + mX.resize(nDp, nVars); + mY.resize(nDp); + mCoords.resize(aShape[0], aShape[2]); + mWorkRangeSize = aShape[3]; + } + MPI_Bcast(mX.memptr(), mX.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(mY.memptr(), mY.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(mCoords.memptr(), mCoords.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + uword workRangeFrom = uword(mWorkerId) * mWorkRangeSize; + // cout << mWorkerId << " process work range: [" << workRangeFrom << "," << min(workRangeFrom + mWorkRangeSize, nDp) << "]\n"; + mWorkRange = make_pair(workRangeFrom, min(workRangeFrom + mWorkRangeSize, nDp)); + // cout << mWorkerId << " process work range: [" << mWorkRange.value().first << "," << mWorkRange.value().second << "]\n"; + } +#endif // ENABLE_MPI createDistanceParameter(); #ifdef ENABLE_CUDA - if (mParallelType == ParallelType::CUDA) + if (mParallelType & ParallelType::CUDA) { - cublasCreate(&cubase::handle); + cubase::create_handle(); mSpatialWeight.prepareCuda(mGpuId); } #endif // ENABLE_CUDA @@ -58,16 +120,15 @@ mat GWRBasic::fit() size_t k = indep_vars.size(); mIndepVarSelectionProgressTotal = (k + 1) * k / 2; mIndepVarSelectionProgressCurrent = 0; - GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion()); VariableForwardSelector selector(indep_vars, mIndepVarSelectionThreshold); mSelectedIndepVars = selector.optimize(this); if (mSelectedIndepVars.size() > 0) { mX = mX.cols(VariableForwardSelector::index2uvec(mSelectedIndepVars, mHasIntercept)); - nVars = mX.n_cols; mIndepVarsSelectionCriterionList = selector.indepVarsCriterion(); } + nVars = mX.n_cols; GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVars, arma::fill::zeros)); } @@ -85,7 +146,7 @@ mat GWRBasic::fit() { mSpatialWeight.setWeight(bw); #ifdef ENABLE_CUDA - if (mParallelType == ParallelType::CUDA) + if (mParallelType & ParallelType::CUDA) { mSpatialWeight.prepareCuda(mGpuId); } @@ -96,34 +157,15 @@ mat GWRBasic::fit() } GWM_LOG_STAGE("Model fitting"); - mBetas = (this->*mFitFunction)(mX, mY, mBetasSE, mSHat, mQDiag, mS); - GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVars, arma::fill::zeros)); - - GWM_LOG_STAGE("Model Diagnostic"); - mDiagnostic = CalcDiagnostic(mX, mY, mBetas, mSHat); - double trS = mSHat(0), trStS = mSHat(1); - double sigmaHat = mDiagnostic.RSS / (nDp - 2 * trS + trStS); - mBetasSE = sqrt(sigmaHat * mBetasSE); - vec yhat = Fitted(mX, mBetas); - vec res = mY - yhat; - vec stu_res = res / sqrt(sigmaHat * mQDiag); - mat betasTV = mBetas / mBetasSE; - vec dybar2 = (mY - mean(mY)) % (mY - mean(mY)); - vec dyhat2 = (mY - yhat) % (mY - yhat); - vec localR2 = vec(nDp, fill::zeros); - for (uword i = 0; i < nDp; i++) - { - vec w = mSpatialWeight.weightVector(i); - double tss = sum(dybar2 % w); - double rss = sum(dyhat2 % w); - localR2(i) = (tss - rss) / tss; - } + mBetas = (this->*mFitFunction)(); + #ifdef ENABLE_CUDA - if (mParallelType == ParallelType::CUDA) + if (mParallelType & ParallelType::CUDA) { - cublasDestroy(cubase::handle); + cubase::destory_handle(); } #endif // ENABLE_CUDA + return mBetas; } @@ -133,9 +175,9 @@ mat GWRBasic::predict(const mat& locations) uword nDp = mCoords.n_rows, nVars = mX.n_cols; createPredictionDistanceParameter(locations); #ifdef ENABLE_CUDA - if (mParallelType == ParallelType::CUDA) + if (mParallelType & ParallelType::CUDA) { - cublasCreate(&cubase::handle); + cubase::create_handle(); mSpatialWeight.prepareCuda(mGpuId); } #endif // ENABLE_CUDA @@ -146,9 +188,9 @@ mat GWRBasic::predict(const mat& locations) GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVars, arma::fill::zeros)); #ifdef ENABLE_CUDA - if (mParallelType == ParallelType::CUDA) + if (mParallelType & ParallelType::CUDA) { - cublasDestroy(cubase::handle); + cubase::destory_handle(); } #endif // ENABLE_CUDA @@ -190,15 +232,30 @@ mat GWRBasic::predictSerial(const mat& locations, const mat& x, const vec& y) return betas.t(); } -mat GWRBasic::fitSerial(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& qDiag, mat& S) +arma::mat gwm::GWRBasic::fitBase() +{ + mBetas = (this->*mFitCoreFunction)(mX, mY, mSpatialWeight); + mDiagnostic = CalcDiagnostic(mX, mY, mBetas, mSHat); + double trS = mSHat(0), trStS = mSHat(1); + double nDp = double(mCoords.n_rows); + double sigmaHat = mDiagnostic.RSS / (nDp - 2 * trS + trStS); + mBetasSE = sqrt(sigmaHat * mBetasSE); + return mBetas; +} + +mat GWRBasic::fitCoreSerial(const mat &x, const vec &y, const SpatialWeight &sw) { uword nDp = mCoords.n_rows, nVar = x.n_cols; mat betas(nVar, nDp, fill::zeros); - betasSE = mat(nVar, nDp, fill::zeros); - shat = vec(2, fill::zeros); - qDiag = vec(nDp, fill::zeros); - S = mat(isStoreS() ? nDp : 1, nDp, fill::zeros); - for (uword i = 0; i < nDp; i++) + mBetasSE = mat(nVar, nDp, fill::zeros); + mSHat = vec(2, fill::zeros); + mQDiag = vec(nDp, fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + mS = mat(isStoreS() ? rangeSize : 1, nDp, fill::zeros); + mC = cube(nVar, nDp, isStoreC() ? rangeSize : 1, fill::zeros); + // cout << mWorkerId << " process work range: [" << workRange.first << "," << workRange.second << "]\n"; + for (uword i = workRange.first; i < workRange.second; i++) { GWM_LOG_STOP_BREAK(mStatus); vec w = mSpatialWeight.weightVector(i); @@ -210,14 +267,15 @@ mat GWRBasic::fitSerial(const mat& x, const vec& y, mat& betasSE, vec& shat, vec mat xtwx_inv = inv_sympd(xtwx); betas.col(i) = xtwx_inv * xtwy; mat ci = xtwx_inv * xtw; - betasSE.col(i) = sum(ci % ci, 1); + mBetasSE.col(i) = sum(ci % ci, 1); mat si = x.row(i) * ci; - shat(0) += si(0, i); - shat(1) += det(si * si.t()); + mSHat(0) += si(0, i); + mSHat(1) += det(si * si.t()); vec p = - si.t(); p(i) += 1.0; - qDiag += p % p; - S.row(isStoreS() ? i : 0) = si; + mQDiag += p % p; + mS.row(isStoreS() ? (i - workRange.first) : 0) = si; + mC.slice(isStoreC() ? (i - workRange.first) : 0) = ci; } catch (const exception& e) { @@ -226,123 +284,137 @@ mat GWRBasic::fitSerial(const mat& x, const vec& y, mat& betasSE, vec& shat, vec } GWM_LOG_PROGRESS(i + 1, nDp); } - betasSE = betasSE.t(); + mBetasSE = mBetasSE.t(); return betas.t(); } -double GWRBasic::bandwidthSizeCriterionCVSerial(BandwidthWeight* bandwidthWeight) +mat GWRBasic::fitCoreCVSerial(const mat& x, const vec& y, const SpatialWeight& sw) { - uword nDp = mCoords.n_rows; - vec shat(2, fill::zeros); - double cv = 0.0; - for (uword i = 0; i < nDp; i++) + uword nDp = mCoords.n_rows, nVar = x.n_cols; + mat betas(nVar, nDp, fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + for (uword i = workRange.first; i < workRange.second; i++) { GWM_LOG_STOP_BREAK(mStatus); - vec d = mSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); + vec w = sw.weightVector(i); w(i) = 0.0; - mat xtw = trans(mX.each_col() % w); - mat xtwx = xtw * mX; - mat xtwy = xtw * mY; + mat xtw = trans(x.each_col() % w); + mat xtwx = xtw * x; + mat xtwy = xtw * y; try { mat xtwx_inv = inv_sympd(xtwx); - vec beta = xtwx_inv * xtwy; - double res = mY(i) - det(mX.row(i) * beta); - cv += res * res; + betas.col(i) = xtwx_inv * xtwy; } catch (const exception& e) { GWM_LOG_ERROR(e.what()); - return DBL_MAX; + throw e; } } - if (mStatus == Status::Success && isfinite(cv)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; + return betas.t(); } -double GWRBasic::bandwidthSizeCriterionAICSerial(BandwidthWeight* bandwidthWeight) +mat GWRBasic::fitCoreSHatSerial(const mat& x, const vec& y, const SpatialWeight& sw, vec& shat) { - uword nDp = mCoords.n_rows, nVar = mX.n_cols; + uword nDp = mCoords.n_rows, nVar = x.n_cols; mat betas(nVar, nDp, fill::zeros); - vec shat(2, fill::zeros); - for (uword i = 0; i < nDp; i++) + shat = vec(2, arma::fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + for (uword i = workRange.first; i < workRange.second; i++) { GWM_LOG_STOP_BREAK(mStatus); - vec d = mSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); - mat xtw = trans(mX.each_col() % w); - mat xtwx = xtw * mX; - mat xtwy = xtw * mY; + vec w = sw.weightVector(i); + mat xtw = trans(x.each_col() % w); + mat xtwx = xtw * x; + mat xtwy = xtw * y; try { mat xtwx_inv = inv_sympd(xtwx); betas.col(i) = xtwx_inv * xtwy; mat ci = xtwx_inv * xtw; - mat si = mX.row(i) * ci; + mat si = x.row(i) * ci; shat(0) += si(0, i); shat(1) += det(si * si.t()); } catch (const exception& e) { GWM_LOG_ERROR(e.what()); - return DBL_MAX; + throw e; } } - double value = GWRBase::AICc(mX, mY, betas.t(), shat); - if (mStatus == Status::Success && isfinite(value)) + return betas.t(); +} + +double GWRBasic::bandwidthSizeCriterionCV(BandwidthWeight* bandwidthWeight) +{ + SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance()); + try { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); - mBandwidthLastCriterion = value; - return value; + mat betas = (this->*mFitCoreCVFunction)(mX, mY, sw); + vec res = mY - sum(mX % betas, 1); + double cv = sum(res % res); + if (mStatus == Status::Success && isfinite(cv)) + { + GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); + GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); + mBandwidthLastCriterion = cv; + return cv; + } + else return DBL_MAX; + } + catch(const std::exception& e) + { + return DBL_MAX; } - else return DBL_MAX; + } -double GWRBasic::indepVarsSelectionCriterionSerial(const vector& indepVars) +double GWRBasic::bandwidthSizeCriterionAIC(BandwidthWeight* bandwidthWeight) { - mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept)); - vec y = mY; - uword nDp = mCoords.n_rows, nVar = x.n_cols; - mat betas(nVar, nDp, fill::zeros); - vec shat(2, fill::zeros); - for (uword i = 0; i < nDp; i++) + SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance()); + try { - GWM_LOG_STOP_BREAK(mStatus); - vec w(nDp, fill::ones); - mat xtw = trans(x.each_col() % w); - mat xtwx = xtw * x; - mat xtwy = xtw * y; - try + vec shat; + mat betas = (this->*mFitCoreSHatFunction)(mX, mY, sw, shat); + double value = GWRBase::AICc(mX, mY, betas, shat); + if (mStatus == Status::Success && isfinite(value)) { - mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - mat ci = xtwx_inv * xtw; - mat si = x.row(i) * ci; - shat(0) += si(0, i); - shat(1) += det(si * si.t()); + GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); + GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); + mBandwidthLastCriterion = value; + return value; } - catch (const exception& e) + else return DBL_MAX; + } + catch(const std::exception& e) + { + return DBL_MAX; + } +} + +double gwm::GWRBasic::indepVarsSelectionCriterion(const vector& indepVars) +{ + mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept)); + SpatialWeight sw = mSpatialWeight; + sw.weight()->setBandwidth(DBL_MAX); + try + { + vec shat; + mat betas = (this->*mFitCoreSHatFunction)(x, mY, sw, shat); + GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal); + if (mStatus == Status::Success) { - GWM_LOG_ERROR(e.what()); - return DBL_MAX; + double aic = GWRBase::AICc(x, mY, betas, shat); + GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, aic)); + return aic; } + else return DBL_MAX; } - GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal); - if (mStatus == Status::Success) + catch(const std::exception& e) { - double value = GWRBase::AICc(x, y, betas.t(), shat); - GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, value)); - return value; + return DBL_MAX; } - else return DBL_MAX; } #ifdef ENABLE_OPENMP @@ -383,18 +455,21 @@ mat GWRBasic::predictOmp(const mat& locations, const mat& x, const vec& y) return betas.t(); } -mat GWRBasic::fitOmp(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& qDiag, mat& S) +mat GWRBasic::fitCoreOmp(const mat& x, const vec& y, const SpatialWeight& sw) { uword nDp = mCoords.n_rows, nVar = x.n_cols; mat betas(nVar, nDp, fill::zeros); - betasSE = mat(nVar, nDp, fill::zeros); - S = mat(isStoreS() ? nDp : 1, nDp, fill::zeros); + mBetasSE = mat(nVar, nDp, fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + mS = mat(isStoreS() ? rangeSize : 1, nDp, fill::zeros); + mC = cube(nVar, nDp, isStoreC() ? rangeSize : 1, fill::zeros); mat shat_all(2, mOmpThreadNum, fill::zeros); mat qDiag_all(nDp, mOmpThreadNum, fill::zeros); bool success = true; std::exception except; #pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) + for (int i = int(workRange.first); (uword)i < workRange.second; i++) { GWM_LOG_STOP_CONTINUE(mStatus); if (success) @@ -409,14 +484,15 @@ mat GWRBasic::fitOmp(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& q mat xtwx_inv = inv_sympd(xtwx); betas.col(i) = xtwx_inv * xtwy; mat ci = xtwx_inv * xtw; - betasSE.col(i) = sum(ci % ci, 1); + mBetasSE.col(i) = sum(ci % ci, 1); mat si = x.row(i) * ci; shat_all(0, thread) += si(0, i); shat_all(1, thread) += det(si * si.t()); vec p = - si.t(); p(i) += 1.0; qDiag_all.col(thread) += p % p; - S.row(isStoreS() ? i : 0) = si; + mS.row(isStoreS() ? (i - workRange.first) : 0) = si; + mC.slice(isStoreC() ? (i - workRange.first) : 0) = ci; } catch (const exception& e) { @@ -431,74 +507,26 @@ mat GWRBasic::fitOmp(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& q { throw except; } - shat = sum(shat_all, 1); - qDiag = sum(qDiag_all, 1); - betasSE = betasSE.t(); + mSHat = sum(shat_all, 1); + mQDiag = sum(qDiag_all, 1); + mBetasSE = mBetasSE.t(); return betas.t(); } -double GWRBasic::bandwidthSizeCriterionCVOmp(BandwidthWeight* bandwidthWeight) +arma::mat GWRBasic::fitCoreCVOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw) { - uword nDp = mCoords.n_rows; - vec shat(2, fill::zeros); - vec cv_all(mOmpThreadNum, fill::zeros); - bool flag = true; -#pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) - { - GWM_LOG_STOP_CONTINUE(mStatus); - if (flag) - { - int thread = omp_get_thread_num(); - vec d = mSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); - w(i) = 0.0; - mat xtw = trans(mX.each_col() % w); - mat xtwx = xtw * mX; - mat xtwy = xtw * mY; - try - { - mat xtwx_inv = inv_sympd(xtwx); - vec beta = xtwx_inv * xtwy; - double res = mY(i) - det(mX.row(i) * beta); - if (isfinite(res)) - cv_all(thread) += res * res; - else - flag = false; - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - flag = false; - } - } - } - if (mStatus == Status::Success && flag) - { - double cv = sum(cv_all); - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; -} - -double GWRBasic::bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight) -{ - uword nDp = mCoords.n_rows, nVar = mX.n_cols; + uword nDp = mCoords.n_rows, nVar = x.n_cols; mat betas(nVar, nDp, fill::zeros); - mat shat_all(2, mOmpThreadNum, fill::zeros); bool flag = true; + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); #pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) + for (int i = int(workRange.first); (uword)i < workRange.second; i++) { GWM_LOG_STOP_CONTINUE(mStatus); if (flag) { - int thread = omp_get_thread_num(); - vec d = mSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); + vec w = sw.weightVector(i); + w(i) = 0.0; mat xtw = trans(mX.each_col() % w); mat xtwx = xtw * mX; mat xtwy = xtw * mY; @@ -506,10 +534,6 @@ double GWRBasic::bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight) { mat xtwx_inv = inv_sympd(xtwx); betas.col(i) = xtwx_inv * xtwy; - mat ci = xtwx_inv * xtw; - mat si = mX.row(i) * ci; - shat_all(0, thread) += si(0, i); - shat_all(1, thread) += det(si * si.t()); } catch (const exception& e) { @@ -518,38 +542,24 @@ double GWRBasic::bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight) } } } - if (mStatus == Status::Success && flag) - { - vec shat = sum(shat_all, 1); - double value = GWRBase::AICc(mX, mY, betas.t(), shat); - if (isfinite(value)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); - mBandwidthLastCriterion = value; - return value; - } - else return DBL_MAX; - } - else return DBL_MAX; + return betas.t(); } -double GWRBasic::indepVarsSelectionCriterionOmp(const vector& indepVars) +arma::mat GWRBasic::fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat) { - mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept)); - vec y = mY; uword nDp = mCoords.n_rows, nVar = x.n_cols; mat betas(nVar, nDp, fill::zeros); - mat shat(2, mOmpThreadNum, fill::zeros); + mat shat_all(2, mOmpThreadNum, fill::zeros); int flag = true; + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); #pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) + for (int i = int(workRange.first); (uword)i < workRange.second; i++) { GWM_LOG_STOP_CONTINUE(mStatus); if (flag) { int thread = omp_get_thread_num(); - vec w(nDp, fill::ones); + vec w = sw.weightVector(i); mat xtw = trans(x.each_col() % w); mat xtwx = xtw * x; mat xtwy = xtw * y; @@ -559,8 +569,8 @@ double GWRBasic::indepVarsSelectionCriterionOmp(const vector& indepVars) betas.col(i) = xtwx_inv * xtwy; mat ci = xtwx_inv * xtw; mat si = x.row(i) * ci; - shat(0, thread) += si(0, i); - shat(1, thread) += det(si * si.t()); + shat_all(0, thread) += si(0, i); + shat_all(1, thread) += as_scalar(si * si.t()); } catch (const exception& e) { @@ -569,29 +579,27 @@ double GWRBasic::indepVarsSelectionCriterionOmp(const vector& indepVars) } } } - GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal); - if (mStatus == Status::Success && flag) - { - double value = GWRBase::AICc(x, y, betas.t(), sum(shat, 1)); - GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, value)); - return value; - } - else return DBL_MAX; + shat = sum(shat_all, 1); + return betas.t(); } #endif #ifdef ENABLE_CUDA -mat GWRBasic::fitCuda(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& qDiag, mat& S) +mat GWRBasic::fitCoreCuda(const mat& x, const vec& y, const SpatialWeight& sw) { - uword nDp = mCoords.n_rows, nVar = x.n_cols; + uword nDp = x.n_rows, nVar = x.n_cols; mat xt = trans(x); mat betas = mat(nVar, nDp, arma::fill::zeros); - betasSE = mat(nVar, nDp, arma::fill::zeros); - shat = vec(2, arma::fill::zeros); - qDiag = vec(nDp, arma::fill::zeros); - S = mat(isStoreS() ? nDp : 1, nDp, arma::fill::zeros); - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); + mat ci(nVar, nDp, arma::fill::zeros); + mBetasSE = mat(nVar, nDp, arma::fill::zeros); + mSHat = vec(2, arma::fill::zeros); + mQDiag = vec(nDp, arma::fill::zeros); + mS = mat(isStoreS() ? nDp : 1, nDp, arma::fill::zeros); + mC = cube(nVar, nDp, isStoreC() ? nDp : 1, arma::fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1); cumat u_xt(xt), u_y(y); cumat u_betas(nVar, nDp), u_betasSE(nVar, nDp); cumat u_dists(nDp, 1), u_weights(nDp, 1); @@ -608,10 +616,10 @@ mat GWRBasic::fitCuda(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& for (size_t i = 0; i < groups; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { - checkCudaErrors(mSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem())); + checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem())); u_xtw.strides(j) = u_xt.diagmul(u_weights); } // xtwx and xtwy @@ -638,26 +646,28 @@ mat GWRBasic::fitCuda(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& // si = t(xti) * ci [1*k,k*n] u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c; u_s.get(sg.memptr()); - shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length))); + mSHat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length))); // cct = ci * cit [k*n,t(k*n)] u_cct = u_c * u_c.t(); u_sst = u_s * u_s.t(); u_cct.get(cct.memptr()); u_sst.get(sst.memptr()); - shat(1) += sum(sst.head(length)); + mSHat(1) += sum(sst.head(length)); // Transfer to cpu Perform further diagnostic for (size_t j = 0, e = begin + j; j < length; j++, e++) { - betasSE.col(e) = diagvec(cct.slice(j)); + mBetasSE.col(e) = diagvec(cct.slice(j)); vec p = -sg.col(j); p(e) += 1.0; - qDiag += p % p; - S.row(isStoreS() ? e : 0) = sg.col(j).t(); + mQDiag += p % p; + mS.row(isStoreS() ? e : 0) = sg.col(j).t(); + u_c.strides(j).get(ci.memptr()); + mC.slice(isStoreC() ? e : 0) = ci; } GWM_LOG_PROGRESS(begin + length, nDp); } u_betas.get(betas.memptr()); - betasSE = betasSE.t(); + mBetasSE = mBetasSE.t(); cudaFree(d_info); return betas.t(); } @@ -666,7 +676,9 @@ arma::mat GWRBasic::predictCuda(const mat& locations, const mat& x, const vec& y { uword nRp = locations.n_rows, nDp = mCoords.n_rows, nVar = x.n_cols; mat betas(nVar, nRp, fill::zeros); - size_t groups = nRp / mGroupLength + (nRp % mGroupLength == 0 ? 0 : 1); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1); cumat u_xt(x.t()), u_y(y); custride u_xtwx(nVar, nVar, mGroupLength), u_xtwy(nVar, 1, mGroupLength), u_xtwxI(nVar, nVar, mGroupLength); cumat u_betas(nVar, nRp); @@ -678,7 +690,7 @@ arma::mat GWRBasic::predictCuda(const mat& locations, const mat& x, const vec& y for (size_t i = 0; i < groups; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nRp) ? (nRp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { checkCudaErrors(mSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem())); @@ -706,29 +718,29 @@ arma::mat GWRBasic::predictCuda(const mat& locations, const mat& x, const vec& y return betas.t(); } -double GWRBasic::bandwidthSizeCriterionCVCuda(BandwidthWeight* bandwidthWeight) +arma::mat GWRBasic::fitCoreCVCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw) { uword nDp = mCoords.n_rows, nVar = mX.n_cols; - size_t elems = nDp; + mat betas(nVar, nDp, fill::zeros); cumat u_xt(mX.t()), u_y(mY); cumat u_dists(nDp, 1), u_weights(nDp, 1); custride u_xtw(nVar, nDp, mGroupLength); custride u_xtwx(nVar, nVar, mGroupLength), u_xtwy(nVar, 1, mGroupLength), u_xtwxI(nVar, nVar, mGroupLength); - custride u_betas(nVar, 1, mGroupLength), u_yhat(1, 1, mGroupLength); - vec yhat(mGroupLength), yhat_all(nDp); + cumat u_betas(nVar, nDp); int *d_info, *p_info; p_info = new int[mGroupLength]; checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); bool success = true; - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1); for (size_t i = 0; i < groups && success; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { - checkCudaErrors(mSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems)); - checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems)); + checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem())); checkCudaErrors(cudaMemcpy(u_weights.dmem() + e, &cubase::beta0, sizeof(double), cudaMemcpyHostToDevice)); u_xtw.strides(j) = u_xt.diagmul(u_weights); } @@ -747,57 +759,44 @@ double GWRBasic::bandwidthSizeCriterionCVCuda(BandwidthWeight* bandwidthWeight) } if (success) { - u_betas = u_xtwxI * u_xtwy; - u_yhat = u_xt.as_stride().strides(begin, begin + length).t() * u_betas; - u_yhat.get(yhat.memptr()); - yhat_all.rows(begin, begin + length - 1) = yhat.head_rows(length); + u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy; } } checkCudaErrors(cudaFree(d_info)); delete[] p_info; - if (!success) return DBL_MAX; - double cv = as_scalar((mY - yhat_all).t() * (mY - yhat_all)); - if (isfinite(cv)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; + u_betas.get(betas.memptr()); + return betas.t(); } -double GWRBasic::bandwidthSizeCriterionAICCuda(BandwidthWeight* bandwidthWeight) +arma::mat GWRBasic::fitCoreSHatCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, vec& shat) { - uword nDp = mCoords.n_rows, nVar = mX.n_cols; - size_t elems = nDp; - cumat u_xt(mX.t()), u_y(mY), u_betas(nVar, nDp); + uword nDp = mCoords.n_rows, nVar = x.n_cols; + mat betas(nVar, nDp, fill::zeros); + shat = vec(2, fill::zeros); + cumat u_xt(x.t()), u_y(y), u_betas(nVar, nDp); cumat u_dists(nDp, 1), u_weights(nDp, 1); custride u_xtw(nVar, nDp, mGroupLength); - custride u_xtwx(nVar, nVar, mGroupLength), u_xtwy(nVar, 1, mGroupLength), u_xtwxI(nVar, nVar, mGroupLength); - custride u_c(nVar, nDp, mGroupLength), u_s(1, nDp, mGroupLength); - custride u_sst(1, 1, mGroupLength); - mat betas(nVar, nDp); - vec shat(2), sst(mGroupLength); + vec sst(mGroupLength); mat sg(nDp, mGroupLength); int *d_info, *p_info; p_info = new int[mGroupLength]; checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); bool success = true; - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1); for (size_t i = 0; i < groups && success; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { - checkCudaErrors(mSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems)); - checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems)); + checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem())); u_xtw.strides(j) = u_xt.diagmul(u_weights); } - u_xtwx = u_xtw * u_xt.t(); - u_xtwy = u_xtw * u_y; - u_xtwxI = u_xtwx.inv(d_info); + custride u_xtwx = u_xtw * u_xt.t(); + custride u_xtwy = u_xtw * u_y; + custride u_xtwxI = u_xtwx.inv(d_info); checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost)); for (size_t j = 0; j < mGroupLength; j++) { @@ -811,132 +810,269 @@ double GWRBasic::bandwidthSizeCriterionAICCuda(BandwidthWeight* bandwidthWeight) if (success) { u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy; - u_c = u_xtwxI * u_xtw; - u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c; + custride u_c = u_xtwxI * u_xtw; + custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c; + custride u_sst = u_s * u_s.t(); u_s.get(sg.memptr()); - shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length))); - u_sst = u_s * u_s.t(); u_sst.get(sst.memptr()); + shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length))); shat(1) += sum(sst); } } checkCudaErrors(cudaFree(d_info)); delete[] p_info; - if (!success) return DBL_MAX; u_betas.get(betas.memptr()); - double aic = GWRBasic::AICc(mX, mY, betas.t(), shat); - if (isfinite(aic)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, aic)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - aic))); - mBandwidthLastCriterion = aic; - return aic; - } - else return DBL_MAX; + return betas.t(); } -double GWRBasic::indepVarsSelectionCriterionCuda(const std::vector& indepVars) +#endif + +#ifdef ENABLE_MPI +double GWRBasic::indepVarsSelectionCriterionMpi(const vector& indepVars) { mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept)); mat y = mY; - uword nDp = mCoords.n_rows, nVar = x.n_cols; - cumat u_xt(x.t()), u_y(y), u_betas(nVar, nDp); - cumat u_weights(vec(nDp, fill::ones)); - custride u_xtw(nVar, nDp, mGroupLength); - mat betas(nVar, nDp); - vec shat(2), sst(mGroupLength); - mat sg(nDp, mGroupLength); - int *d_info, *p_info; - p_info = new int[mGroupLength]; - checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); - bool success = true; - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); - for (size_t i = 0; i < groups && success; i++) + SpatialWeight sw = mSpatialWeight; + sw.weight()->setBandwidth(DBL_MAX); + vec shat; + double aic; + mat betas = (this->*mFitCoreSHatFunction)(x, y, sw, shat); +GWM_MPI_MASTER_BEGIN + umat received(mWorkerNum, 2, arma::fill::zeros); + received.row(0).fill(1); + unique_ptr> buf(new double[betas.n_elem]); + while (!all(all(received == 1))) { - GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; - for (size_t j = 0, e = begin + j; j < length; j++, e++) + MPI_Status status; + MPI_Recv(buf.get(), betas.n_elem, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + // printf("0 process received message with %d from %d\n", status.MPI_TAG, status.MPI_SOURCE); + received(status.MPI_SOURCE, status.MPI_TAG - int(GWRBasicAICMpiTags::SHat)) = 1; + switch (GWRBasicAICMpiTags(status.MPI_TAG)) { - u_xtw.strides(j) = u_xt.diagmul(u_weights); - } - custride u_xtwx = u_xtw * u_xt.t(); - custride u_xtwy = u_xtw * u_y; - custride u_xtwxI = u_xtwx.inv(d_info); - checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost)); - for (size_t j = 0; j < mGroupLength; j++) - { - if (p_info[j] != 0) - { - GWM_LOG_ERROR("Cuda failed to get the inverse of matrix"); - success = false; - break; - } + case GWRBasicAICMpiTags::SHat: + shat += vec(buf.get(), 2); + break; + case GWRBasicAICMpiTags::Betas: + betas += mat(buf.get(), betas.n_rows, betas.n_cols); + break; + default: + break; } - if (success) + } + aic = GWRBasic::AICc(x, y, betas, shat); +GWM_MPI_MASTER_END +GWM_MPI_WORKER_BEGIN + MPI_Send(shat.memptr(), 2, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::SHat), MPI_COMM_WORLD); + MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::Betas), MPI_COMM_WORLD); +GWM_MPI_WORKER_END + MPI_Bcast(&aic, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + return aic; +} + +double GWRBasic::bandwidthSizeCriterionCVMpi(BandwidthWeight* bandwidthWeight) +{ + uword nDp = mX.n_rows, nVar = mX.n_cols; + SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance()); + double cv; + mat betas = (this->*mFitCoreCVFunction)(mX, mY, sw).t(); +GWM_MPI_MASTER_BEGIN + uvec received(mWorkerNum, arma::fill::zeros); + received.row(0).fill(1); + unique_ptr> buf(new double[betas.n_elem]); + while (!all(received)) + { + MPI_Status status; + MPI_Recv(buf.get(), betas.n_elem, MPI_DOUBLE, MPI_ANY_SOURCE, int(GWRBasicCVMpiTags::Betas), MPI_COMM_WORLD, &status); + received(status.MPI_SOURCE) = 1; + betas += mat(buf.get(), nVar, nDp); + } + vec residual = mY - sum(mX % betas.t(), 1); + cv = sum(residual % residual); +GWM_MPI_MASTER_END +GWM_MPI_WORKER_BEGIN + MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicCVMpiTags::Betas), MPI_COMM_WORLD); +GWM_MPI_WORKER_END + MPI_Bcast(&cv, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + return cv; +} + +double GWRBasic::bandwidthSizeCriterionAICMpi(BandwidthWeight* bandwidthWeight) +{ + SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance()); + double aic; + vec shat(2, fill::zeros); + mat betas = (this->*mFitCoreSHatFunction)(mX, mY, sw, shat); +GWM_MPI_MASTER_BEGIN + umat received(mWorkerNum, 2, arma::fill::zeros); + received.row(0).fill(1); + unique_ptr> buf(new double[betas.n_elem]); + while (!all(all(received))) + { + MPI_Status status; + MPI_Recv(buf.get(), betas.n_elem, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + // printf("0 process received message with %d from %d\n", status.MPI_TAG, status.MPI_SOURCE); + received(status.MPI_SOURCE, status.MPI_TAG - int(GWRBasicAICMpiTags::SHat)) = 1; + switch (GWRBasicAICMpiTags(status.MPI_TAG)) { - u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy; - custride u_c = u_xtwxI * u_xtw; - custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c; - custride u_sst = u_s * u_s.t(); - u_s.get(sg.memptr()); - u_sst.get(sst.memptr()); - shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length))); - shat(1) += sum(sst); + case GWRBasicAICMpiTags::SHat: + shat += vec(buf.get(), 2); + break; + case GWRBasicAICMpiTags::Betas: + betas += mat(buf.get(), betas.n_rows, betas.n_cols); + break; + default: + break; } } - checkCudaErrors(cudaFree(d_info)); - delete[] p_info; - if (!success) return DBL_MAX; - u_betas.get(betas.memptr()); - GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal); - double aic = GWRBasic::AICc(x, y, betas.t(), shat); - if (isfinite(aic)) + aic = GWRBasic::AICc(mX, mY, betas, shat); +GWM_MPI_MASTER_END +GWM_MPI_WORKER_BEGIN + MPI_Send(shat.memptr(), shat.n_elem, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::SHat), MPI_COMM_WORLD); + MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::Betas), MPI_COMM_WORLD); +GWM_MPI_WORKER_END + MPI_Bcast(&aic, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + return aic; +} + +arma::mat gwm::GWRBasic::fitMpi() +{ + // fit GWR on each process + uword nDp = mCoords.n_rows, nVar = mX.n_cols; + mBetas = (this->*mFitCoreFunction)(mX, mY, mSpatialWeight).t(); + mBetasSE = mBetasSE.t(); + mS = mS.t(); + // gather results to master process + GWM_MPI_MASTER_BEGIN + mat shat_all(2, mWorkerNum); + mat qdiag_all(nDp, mWorkerNum); + shat_all.col(0) = mSHat; + qdiag_all.col(0) = mQDiag; + // prepare to receive data + umat received(mWorkerNum, 4, fill::zeros); + received.row(0).fill(1); + int bufSize = nDp * nVar; + unique_ptr> buf(new double[bufSize]); + while (!all(all(received))) { - GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, aic)); - mBandwidthLastCriterion = aic; - return aic; + MPI_Status status; + MPI_Recv(buf.get(), bufSize, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + // printf("0 process received message with %d from %d\n", status.MPI_TAG, status.MPI_SOURCE); + received(status.MPI_SOURCE, status.MPI_TAG - int(GWRBasicFitMpiTags::Betas)) = 1; + uword rangeFrom = status.MPI_SOURCE * mWorkRangeSize, rangeTo = min(rangeFrom + mWorkRangeSize, nDp), rangeSize = rangeTo - rangeFrom; + switch (GWRBasicFitMpiTags(status.MPI_TAG)) + { + case GWRBasicFitMpiTags::Betas: + mBetas.cols(rangeFrom, rangeTo - 1) = mat(buf.get(), nVar, rangeSize); + break; + case GWRBasicFitMpiTags::BetasSE: + mBetasSE.cols(rangeFrom, rangeTo - 1) = mat(buf.get(), nVar, rangeSize); + break; + case GWRBasicFitMpiTags::SHat: + shat_all.col(status.MPI_SOURCE) = vec(buf.get(), 2); + break; + case GWRBasicFitMpiTags::QDiag: + qdiag_all.col(status.MPI_SOURCE) = vec(buf.get(), nDp); + break; + default: + break; + } } - else return DBL_MAX; + mSHat = sum(shat_all, 1); + mQDiag = sum(qdiag_all, 1); + // vec dybar2 = (mY - mean(mY)) % (mY - mean(mY)); + // vec dyhat2 = (mY - yhat) % (mY - yhat); + // vec localR2 = vec(nDp, fill::zeros); + // for (uword i = 0; i < nDp; i++) + // { + // vec w = mSpatialWeight.weightVector(i); + // double tss = sum(dybar2 % w); + // double rss = sum(dyhat2 % w); + // localR2(i) = (tss - rss) / tss; + // } + GWM_MPI_MASTER_END + GWM_MPI_WORKER_BEGIN + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + // printf("%d process work range: [%lld, %lld]\n", mWorkerId, workRange.first, workRange.second); + // cout << mWorkerId << " process work range: [" << workRange.first << "," << workRange.second << "]\n"; + mat betas = mBetas.cols(workRange.first, workRange.second - 1); + mat betasSE = mBetasSE.cols(workRange.first, workRange.second - 1); + MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::Betas), MPI_COMM_WORLD); + MPI_Send(betasSE.memptr(), betasSE.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::BetasSE), MPI_COMM_WORLD); + MPI_Send(mSHat.memptr(), mSHat.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::SHat), MPI_COMM_WORLD); + MPI_Send(mQDiag.memptr(), mQDiag.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::QDiag), MPI_COMM_WORLD); + GWM_MPI_WORKER_END + MPI_Bcast(mBetas.memptr(), mBetas.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(mBetasSE.memptr(), mBetasSE.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(mSHat.memptr(), mSHat.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(mQDiag.memptr(), mQDiag.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + mBetas = mBetas.t(); + mBetasSE = mBetasSE.t(); + mS = mS.t(); + // printf("shat [%lf,%lf]", mSHat(0), mSHat(1)); + // diagnostic in master process + GWM_LOG_STAGE("Model Diagnostic"); + mDiagnostic = CalcDiagnostic(mX, mY, mBetas, mSHat); + double trS = mSHat(0), trStS = mSHat(1); + double sigmaHat = mDiagnostic.RSS / (nDp - 2 * trS + trStS); + mBetasSE = sqrt(sigmaHat * mBetasSE); + // check cancel status + GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); + return mBetas; } -#endif +// double GWRBasic::indepVarsSelectionCriterion(const mat& x, const vec& y, vec& shat) +// { +// try +// { +// mat S; +// mat betas = (this->*mFitCoreSHatFunction)(x, y, mSpatialWeight, shat, S); +// GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal); +// if (mStatus == Status::Success) +// { +// double rss = GWRBase::RSS(x, y, betas); +// return rss; +// } +// else return DBL_MAX; +// } +// catch(const std::exception& e) +// { +// return DBL_MAX; +// } + +// } +#endif // ENABLE_MPI void GWRBasic::setBandwidthSelectionCriterion(const BandwidthSelectionCriterionType& criterion) { +#ifdef ENABLE_MPI + if (mParallelType & ParallelType::MPI) + { + switch (criterion) + { + case BandwidthSelectionCriterionType::CV: + mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCVMpi; + break; + default: + mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionAICMpi; + break; + } + } + else + { +#endif // ENABLE_MPI + switch (criterion) + { + case BandwidthSelectionCriterionType::CV: + mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCV; + break; + default: + mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionAIC; + break; + } +#ifdef ENABLE_MPI + } +#endif // ENABLE_MPI mBandwidthSelectionCriterion = criterion; - unordered_map mapper; - switch (mParallelType) - { - case ParallelType::SerialOnly: - mapper = { - make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVSerial), - make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICSerial) - }; - break; -#ifdef ENABLE_OPENMP - case ParallelType::OpenMP: - mapper = { - make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVOmp), - make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICOmp) - }; - break; -#endif -#ifdef ENABLE_CUDA - case ParallelType::CUDA: - mapper = { - make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVCuda), - make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICCuda) - }; - break; -#endif - default: - mapper = { - make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVSerial), - make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICSerial) - }; - break; - } - mBandwidthSelectionCriterionFunction = mapper[mBandwidthSelectionCriterion]; } void GWRBasic::setParallelType(const ParallelType& type) @@ -944,32 +1080,51 @@ void GWRBasic::setParallelType(const ParallelType& type) if (type & parallelAbility()) { mParallelType = type; - switch (type) { + switch (mParallelType) { case ParallelType::SerialOnly: + case ParallelType::MPI_Serial: mPredictFunction = &GWRBasic::predictSerial; - mFitFunction = &GWRBasic::fitSerial; - mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionSerial; + mFitCoreFunction = &GWRBasic::fitCoreSerial; + mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial; + mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial; break; #ifdef ENABLE_OPENMP case ParallelType::OpenMP: + case ParallelType::MPI_MP: mPredictFunction = &GWRBasic::predictOmp; - mFitFunction = &GWRBasic::fitOmp; - mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionOmp; + mFitCoreFunction = &GWRBasic::fitCoreOmp; + mFitCoreCVFunction = &GWRBasic::fitCoreCVOmp; + mFitCoreSHatFunction = &GWRBasic::fitCoreSHatOmp; break; #endif // ENABLE_OPENMP #ifdef ENABLE_CUDA case ParallelType::CUDA: + case ParallelType::MPI_CUDA: mPredictFunction = &GWRBasic::predictCuda; - mFitFunction = &GWRBasic::fitCuda; - mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionCuda; + mFitCoreFunction = &GWRBasic::fitCoreCuda; + mFitCoreCVFunction = &GWRBasic::fitCoreCVCuda; + mFitCoreSHatFunction = &GWRBasic::fitCoreSHatCuda; break; #endif // ENABLE_CUDA default: mPredictFunction = &GWRBasic::predictSerial; - mFitFunction = &GWRBasic::fitSerial; - mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionSerial; + mFitCoreFunction = &GWRBasic::fitCoreSerial; + mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial; + mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial; break; } +#ifdef ENABLE_MPI + if (type & ParallelType::MPI) + { + mFitFunction = &GWRBasic::fitMpi; + mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionMpi; + } + else + { + mFitFunction = &GWRBasic::fitBase; + mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterion; + } +#endif setBandwidthSelectionCriterion(mBandwidthSelectionCriterion); } } diff --git a/src/gwmodelpp/GWRMultiscale.cpp b/src/gwmodelpp/GWRMultiscale.cpp index ed16facd..5c255eaf 100644 --- a/src/gwmodelpp/GWRMultiscale.cpp +++ b/src/gwmodelpp/GWRMultiscale.cpp @@ -9,6 +9,7 @@ #include "BandwidthSelector.h" #include "VariableForwardSelector.h" #include "Logger.h" +#include "GWRBasic.h" #ifdef ENABLE_CUDA #include @@ -16,12 +17,23 @@ #include "CudaUtils.h" #endif // ENABLE_CUDA +#ifdef ENABLE_MPI +#include +#include "gwmodelpp/utils/armampi.h" +#endif + using namespace std; using namespace arma; using namespace gwm; int GWRMultiscale::treeChildCount = 0; +enum class GWRMultiscaleFitMPITags +{ + Betas = 1 << 0, + SMat +}; + unordered_map GWRMultiscale::BandwidthInitilizeTypeNameMapper = { make_pair(GWRMultiscale::BandwidthInitilizeType::Null, ("Not initilized, not specified")), make_pair(GWRMultiscale::BandwidthInitilizeType::Initial, ("Initilized")), @@ -67,27 +79,11 @@ RegressionDiagnostic GWRMultiscale::CalcDiagnostic(const mat &x, const vec &y, c mat GWRMultiscale::fit() { - GWM_LOG_STAGE("Initializing"); uword nDp = mX.n_rows, nVar = mX.n_cols; - createDistanceParameter(nVar); - createInitialDistanceParameter(); - mMaxDistances.resize(nVar); -#ifdef ENABLE_CUDA - if (mParallelType == ParallelType::CUDA) - { - cublasCreate(&cubase::handle); - mInitSpatialWeight.prepareCuda(mGpuId); - for (size_t i = 0; i < nVar; i++) - { - mSpatialWeights[i].prepareCuda(mGpuId); - } - } -#endif // ENABLE_CUDA - GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); - // ******************************** // Centering and scaling predictors // ******************************** + GWM_LOG_STAGE("Centering and scaling predictors"); mX0 = mX; mY0 = mY; for (uword i = (mHasIntercept ? 1 : 0); i < nVar ; i++) @@ -99,6 +95,100 @@ mat GWRMultiscale::fit() } GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); + // ***************************************************** + // Calculate the initial beta0 from the above bandwidths + // ***************************************************** + GWM_LOG_STAGE("Calculating initial betas"); + GWRBasic gwr; + gwr.setCoords(mCoords); + gwr.setDependentVariable(mY); + gwr.setIndependentVariables(mX); + gwr.setSpatialWeight(mInitSpatialWeight); + gwr.setIsAutoselectBandwidth(true); + switch (mBandwidthSelectionApproach[0]) + { + case GWRMultiscale::BandwidthSelectionCriterionType::CV: + gwr.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::CV); + break; + case GWRMultiscale::BandwidthSelectionCriterionType::AIC: + gwr.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::AIC); + default: + break; + } + gwr.setParallelType(mParallelType); + switch (mParallelType) + { + case ParallelType::OpenMP: + case ParallelType::MPI_MP: + gwr.setOmpThreadNum(mOmpThreadNum); + break; + case ParallelType::CUDA: + case ParallelType::MPI_CUDA: + gwr.setGroupSize(mGroupLength); + gwr.setGPUId(mGpuId); + default: + break; + } + if (mParallelType & ParallelType::MPI) + { + gwr.setWorkerId(mWorkerId); + gwr.setWorkerNum(mWorkerNum); + } + gwr.setStoreS(mHasHatMatrix); + gwr.setStoreC(mHasHatMatrix); + if (mGoldenLowerBounds.has_value()) gwr.setGoldenLowerBounds(mGoldenLowerBounds.value()); + if (mGoldenUpperBounds.has_value()) gwr.setGoldenUpperBounds(mGoldenUpperBounds.value()); + mat betas = gwr.fit(); + mBetasSE = gwr.betasSE(); + GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); + + GWM_LOG_STAGE("Initializing"); + createDistanceParameter(nVar); + mMaxDistances.resize(nVar); + mMinDistances.resize(nVar); +#ifdef ENABLE_MPI + if (mParallelType & ParallelType::MPI) + { + uword aShape[4]; + if (mWorkerId == 0) // master process + { + // sync matrix dimensions + uword nDim = mCoords.n_cols; + mWorkRangeSize = nDp / uword(mWorkerNum) + ((nDp % uword(mWorkerNum) == 0) ? 0 : 1); + aShape[0] = nDp; + aShape[1] = nVar; + aShape[2] = nDim; + aShape[3] = mWorkRangeSize; + } + MPI_Bcast(aShape, 4, GWM_MPI_UWORD, 0, MPI_COMM_WORLD); + if (mWorkerId > 0) // slave process + { + nDp = aShape[0]; + nVar = aShape[1]; + mX.resize(nDp, nVar); + mY.resize(nDp); + mCoords.resize(aShape[0], aShape[2]); + mWorkRangeSize = aShape[3]; + } + MPI_Bcast(mX.memptr(), mX.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(mY.memptr(), mY.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(mCoords.memptr(), mCoords.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD); + uword workRangeFrom = uword(mWorkerId) * mWorkRangeSize; + mWorkRange = make_pair(workRangeFrom, min(workRangeFrom + mWorkRangeSize, nDp)); + } +#endif // ENABLE_MPI +#ifdef ENABLE_CUDA + if (mParallelType & ParallelType::CUDA) + { + cubase::create_handle(); + for (size_t i = 0; i < nVar; i++) + { + mSpatialWeights[i].prepareCuda(mGpuId); + } + } +#endif // ENABLE_CUDA + GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); + // *********************** // Intialize the bandwidth // *********************** @@ -115,18 +205,20 @@ mat GWRMultiscale::fit() BandwidthWeight* bw0 = bandwidth(i); bool adaptive = bw0->adaptive(); mMaxDistances[i] = mSpatialWeights[i].distance()->maxDistance(); + mMinDistances[i] = mSpatialWeights[i].distance()->minDistance(); GWM_LOG_INFO(string(GWM_LOG_TAG_MGWR_INITIAL_BW) + to_string(i)); BandwidthSelector selector; selector.setBandwidth(bw0); - selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : mMaxDistances[i] / 5000.0)); + selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : mMinDistances[i])); selector.setUpper(mGoldenUpperBounds.value_or(adaptive ? mCoords.n_rows : mMaxDistances[i])); BandwidthWeight* bw = selector.optimize(this); if (bw) { mSpatialWeights[i].setWeight(bw); #ifdef ENABLE_CUDA - mSpatialWeights[i].prepareCuda(mGpuId); + if (mParallelType & ParallelType::CUDA) + mSpatialWeights[i].prepareCuda(mGpuId); #endif // ENABLE_CUDA } GWM_LOG_INFO(string(GWM_LOG_TAG_MGWR_INITIAL_BW) + to_string(i) + "," + to_string(bw->bandwidth())); @@ -134,102 +226,83 @@ mat GWRMultiscale::fit() GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); } - // ***************************************************** - // Calculate the initial beta0 from the above bandwidths - // ***************************************************** - BandwidthWeight* bw0 = bandwidth(0); - bool adaptive = bw0->adaptive(); - mBandwidthSizeCriterion = bandwidthSizeCriterionAll(mBandwidthSelectionApproach[0]); - - GWM_LOG_STAGE("Calculating initial bandwidth"); - BandwidthSelector initBwSelector; - initBwSelector.setBandwidth(bw0); - double maxDist = mSpatialWeights[0].distance()->maxDistance(); - initBwSelector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : maxDist / 5000.0)); - initBwSelector.setUpper(mGoldenUpperBounds.value_or(adaptive ? mCoords.n_rows : maxDist)); - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bw0)); - BandwidthWeight* initBw = initBwSelector.optimize(this); - if (!initBw) - { - throw std::runtime_error("Cannot select initial bandwidth."); - } - mInitSpatialWeight.setWeight(initBw); -#ifdef ENABLE_CUDA - mInitSpatialWeight.prepareCuda(mGpuId); -#endif // ENABLE_CUDA - GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); - // 初始化诊断信息矩阵 + GWM_LOG_STAGE("Initializing diagnostic matrices"); + mat idm = eye(nVar, nVar); if (mHasHatMatrix) { - mS0 = mat(nDp, nDp, fill::zeros); - mSArray = cube(nDp, nDp, nVar, fill::zeros); - mC = cube(nVar, nDp, nDp, fill::zeros); + mS0 = gwr.s(); + mSArray = cube(mS0.n_rows, mS0.n_cols, nVar, fill::zeros); + mC = gwr.c(); + for (uword i = 0; i < nVar; ++i) + { + for (uword j = workRange.first; j < workRange.second ; ++j) + { + uword e = j - workRange.first; + mSArray.slice(i).row(e) = mX(j, i) * (idm.row(i) * mC.slice(e)); + } + } } GWM_LOG_STAGE("Model fitting"); - mBetas = backfitting(mX, mY); + mBetas = backfitting(betas); GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); // Diagnostic GWM_LOG_STAGE("Model Diagnostic"); - vec shat = { - mHasHatMatrix ? trace(mS0) : 0, - mHasHatMatrix ? trace(mS0.t() * mS0) : 0 - }; - mDiagnostic = CalcDiagnostic(mX, mY, shat, mRSS0); + vec shat(2, arma::fill::zeros); if (mHasHatMatrix) { +#ifdef ENABLE_MPI + if (mParallelType & ParallelType::MPI) + { + vec shati(2); + shati(0) = trace(mS0.cols(workRange.first, workRange.second - 1)); + shati(1) = trace(mS0 * mS0.t()); + MPI_Reduce(shati.memptr(), shat.memptr(), 2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Bcast(shat.memptr(), 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); + } + else + { +#endif // ENABLE_MPI + shat = { + mHasHatMatrix ? trace(mS0) : 0, + mHasHatMatrix ? trace(mS0 * mS0.t()) : 0 + }; +#ifdef ENABLE_MPI + } +#endif // ENABLE_MPI mBetasTV = mBetas / mBetasSE; } + mDiagnostic = CalcDiagnostic(mX, mY, shat, mRSS0); vec yhat = Fitted(mX, mBetas); vec residual = mY - yhat; // Cleaning #ifdef ENABLE_CUDA - if (mParallelType == ParallelType::CUDA) + if (mParallelType & ParallelType::CUDA) { - cublasDestroy(cubase::handle); + cubase::destory_handle(); } #endif // ENABLE_CUDA return mBetas; } -void GWRMultiscale::createInitialDistanceParameter() -{//回归距离计算 - if (mInitSpatialWeight.distance()->type() == Distance::DistanceType::CRSDistance || - mInitSpatialWeight.distance()->type() == Distance::DistanceType::MinkwoskiDistance) - { - mInitSpatialWeight.distance()->makeParameter({ mCoords, mCoords }); - } -} - -mat GWRMultiscale::backfitting(const mat &x, const vec &y) +mat GWRMultiscale::backfitting(const mat& betas0) { - GWM_LOG_MGWR_BACKFITTING("Model fitting with inital bandwidth"); uword nDp = mCoords.n_rows, nVar = mX.n_cols; - mat betas = (this->*mFitAll)(x, y); + mat betas = betas0; + // cout << "[backfitting] Process " << mWorkerId << " betas\n"; GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros)); - mat idm = eye(nVar, nVar); - if (mHasHatMatrix) - { - for (uword i = 0; i < nVar; ++i) - { - for (uword j = 0; j < nDp ; ++j) - { - mSArray.slice(i).row(j) = x(j, i) * (idm.row(i) * mC.slice(j)); - } - } - } - // *********************************************************** // Select the optimum bandwidths for each independent variable // *********************************************************** GWM_LOG_MGWR_BACKFITTING("Selecting the optimum bandwidths for each independent variable"); uvec bwChangeNo(nVar, fill::zeros); - vec resid = y - Fitted(x, betas); + vec resid = mY - Fitted(mX, betas); double RSS0 = sum(resid % resid), RSS1 = DBL_MAX; double criterion = DBL_MAX; for (size_t iteration = 1; iteration <= mMaxIteration && criterion > mCriterionThreshold; iteration++) @@ -239,21 +312,20 @@ mat GWRMultiscale::backfitting(const mat &x, const vec &y) for (uword i = 0; i < nVar ; i++) { GWM_LOG_STOP_BREAK(mStatus); - vec fi = betas.col(i) % x.col(i); - vec yi = resid + fi; + mXi = mX.col(i); + vec fi = betas.col(i) % mX.col(i); + mYi = resid + fi; if (mBandwidthInitilize[i] != BandwidthInitilizeType::Specified) { GWM_LOG_MGWR_BACKFITTING("#variable-bandwidth-selection " + to_string(i)); mBandwidthSizeCriterion = bandwidthSizeCriterionVar(mBandwidthSelectionApproach[i]); mBandwidthSelectionCurrentIndex = i; - mYi = yi; - mXi = mX.col(i); BandwidthWeight* bwi0 = bandwidth(i); bool adaptive = bwi0->adaptive(); BandwidthSelector selector; selector.setBandwidth(bwi0); - double maxDist = mMaxDistances[i]; - selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : maxDist / 5000.0)); + double maxDist = mMaxDistances[i], minDist = mMinDistances[i]; + selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : minDist)); selector.setUpper(mGoldenUpperBounds.value_or(adaptive ? mCoords.n_rows : maxDist)); BandwidthWeight* bwi = selector.optimize(this); double bwi0s = bwi0->bandwidth(), bwi1s = bwi->bandwidth(); @@ -291,17 +363,13 @@ mat GWRMultiscale::backfitting(const mat &x, const vec &y) } GWM_LOG_STOP_BREAK(mStatus); - mat S; - betas.col(i) = (this->*mFitVar)(x.col(i), yi, i, S); - if (mHasHatMatrix) - { - mat SArrayi = mSArray.slice(i) - mS0; - mSArray.slice(i) = S * SArrayi + S; - mS0 = mSArray.slice(i) - SArrayi; - } - resid = y - Fitted(x, betas); + vec betai = (this->*mFitVar)(i); + // betas.brief_print("betas"); + // betai.brief_print("betai"); + betas.col(i) = betai; + resid = mY - Fitted(mX, betas); } - RSS1 = RSS(x, y, betas); + RSS1 = RSS(mX, mY, betas); criterion = (mCriterionType == BackFittingCriterionType::CVR) ? abs(RSS1 - RSS0) : sqrt(abs(RSS1 - RSS0) / RSS1); @@ -316,6 +384,65 @@ mat GWRMultiscale::backfitting(const mat &x, const vec &y) return betas; } +arma::vec gwm::GWRMultiscale::fitVarBase(const size_t var) +{ + mat S; + vec beta = (this->*mFitVarCore)(mXi, mYi, mSpatialWeights[var], S); + if (mHasHatMatrix) + { + mat SArrayi = mSArray.slice(var) - mS0; + mSArray.slice(var) = S * SArrayi + S; + mS0 = mSArray.slice(var) - SArrayi; + } + return beta; +} + +double gwm::GWRMultiscale::bandwidthSizeCriterionVarCVBase(BandwidthWeight* bandwidthWeight) +{ + SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance()); + try + { + vec betas = (this->*mFitVarCoreCV)(mXi, mYi, sw); + vec res = mYi - mXi % betas; + double cv = sum(res % res); + if (mStatus == Status::Success && isfinite(cv)) + { + GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); + GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); + mBandwidthLastCriterion = cv; + return cv; + } + else return DBL_MAX; + } + catch(const std::exception& e) + { + return DBL_MAX; + } +} + +double gwm::GWRMultiscale::bandwidthSizeCriterionVarAICBase(BandwidthWeight* bandwidthWeight) +{ + SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance()); + try + { + vec shat; + mat betas = (this->*mFitVarCoreSHat)(mXi, mYi, sw, shat); + double value = GWRBase::AICc(mXi, mYi, betas, shat); + if (mStatus == Status::Success && isfinite(value)) + { + GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); + GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); + mBandwidthLastCriterion = value; + return value; + } + else return DBL_MAX; + } + catch(const std::exception& e) + { + return DBL_MAX; + } +} + bool GWRMultiscale::isValid() { if (!(mX.n_cols > 0)) @@ -359,78 +486,22 @@ bool GWRMultiscale::isValid() return true; } -mat GWRMultiscale::fitAllSerial(const mat& x, const vec& y) -{ - uword nDp = mCoords.n_rows, nVar = x.n_cols; - mat betas(nVar, nDp, fill::zeros); - if (mHasHatMatrix ) - { - mat betasSE(nVar, nDp, fill::zeros); - for (uword i = 0; i < nDp ; i++) - { - GWM_LOG_STOP_BREAK(mStatus); - vec w = mInitSpatialWeight.weightVector(i); - mat xtw = trans(x.each_col() % w); - mat xtwx = xtw * x; - mat xtwy = xtw * y; - try - { - mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - mat ci = xtwx_inv * xtw; - betasSE.col(i) = sum(ci % ci, 1); - mat si = x.row(i) * ci; - mS0.row(i) = si; - mC.slice(i) = ci; - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - throw e; - } - GWM_LOG_PROGRESS(i + 1, nDp); - } - mBetasSE = betasSE.t(); - } - else - { - for (int i = 0; (uword)i < nDp ; i++) - { - GWM_LOG_STOP_BREAK(mStatus); - vec w = mInitSpatialWeight.weightVector(i); - mat xtw = trans(x.each_col() % w); - mat xtwx = xtw * x; - mat xtwy = xtw * y; - try - { - mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - throw e; - } - GWM_LOG_PROGRESS(i + 1, nDp); - } - } - return betas.t(); -} - -vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat &S) +vec GWRMultiscale::fitVarCoreSerial(const vec &x, const vec &y, const SpatialWeight& sw, mat &S) { uword nDp = mCoords.n_rows; mat betas(1, nDp, fill::zeros); bool success = true; std::exception except; + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); if (mHasHatMatrix) { mat ci, si; - S = mat(mHasHatMatrix ? nDp : 1, nDp, fill::zeros); - for (uword i = 0; i < nDp ; i++) + uword rangeSize = workRange.second - workRange.first; + S = mat(rangeSize, nDp, fill::zeros); + for (uword i = workRange.first; i < workRange.second; i++) { GWM_LOG_STOP_BREAK(mStatus); - vec w = mSpatialWeights[var].weightVector(i); + vec w = sw.weightVector(i); mat xtw = trans(x % w); mat xtwx = xtw * x; mat xtwy = xtw * y; @@ -440,7 +511,7 @@ vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat betas.col(i) = xtwx_inv * xtwy; ci = xtwx_inv * xtw; si = x(i) * ci; - S.row(mHasHatMatrix ? i : 0) = si; + S.row(i - workRange.first) = si; } catch (const exception& e) { @@ -453,10 +524,10 @@ vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat } else { - for (int i = 0; (uword)i < nDp ; i++) + for (uword i = workRange.first; i < workRange.second; i++) { GWM_LOG_STOP_BREAK(mStatus); - vec w = mSpatialWeights[var].weightVector(i); + vec w = sw.weightVector(i); mat xtw = trans(x % w); mat xtwx = xtw * x; mat xtwy = xtw * y; @@ -481,178 +552,83 @@ vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat return betas.t(); } -double GWRMultiscale::bandwidthSizeCriterionAllCVSerial(BandwidthWeight *bandwidthWeight) +vec GWRMultiscale::fitVarCoreCVSerial(const vec &x, const vec &y, const SpatialWeight& sw) { - uword nDp = mCoords.n_rows; - vec shat(2, fill::zeros); - double cv = 0.0; - for (uword i = 0; i < nDp; i++) + uword nDp = x.n_rows; + vec beta(nDp, fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + for (uword i = workRange.first; i < workRange.second; i++) { GWM_LOG_STOP_BREAK(mStatus); - vec d = mInitSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); + vec w = sw.weightVector(i); w(i) = 0.0; - mat xtw = trans(mX.each_col() % w); - mat xtwx = xtw * mX; - mat xtwy = xtw * mY; + mat xtw = trans(x % w); + mat xtwx = xtw * x; + mat xtwy = xtw * y; try { mat xtwx_inv = inv_sympd(xtwx); - vec beta = xtwx_inv * xtwy; - double res = mY(i) - det(mX.row(i) * beta); - cv += res * res; + beta(i) = as_scalar(xtwx_inv * xtwy); } catch (const exception& e) { GWM_LOG_ERROR(e.what()); - return DBL_MAX; + throw e; } } - if (mStatus == Status::Success && isfinite(cv)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; + return beta; } -double GWRMultiscale::bandwidthSizeCriterionAllAICSerial(BandwidthWeight *bandwidthWeight) +vec GWRMultiscale::fitVarCoreSHatSerial(const vec &x, const vec &y, const SpatialWeight& sw, vec& shat) { - uword nDp = mCoords.n_rows, nVar = mX.n_cols; - mat betas(nVar, nDp, fill::zeros); - vec shat(2, fill::zeros); - for (uword i = 0; i < nDp ; i++) + uword nDp = x.n_rows; + vec betas(nDp, fill::zeros); + shat = vec(2, fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + for (uword i = workRange.first; i < workRange.second; i++) { GWM_LOG_STOP_BREAK(mStatus); - vec d = mInitSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); - mat xtw = trans(mX.each_col() % w); - mat xtwx = xtw * mX; - mat xtwy = xtw * mY; + vec w = sw.weightVector(i); + mat xtw = trans(x % w); + mat xtwx = xtw * x; + mat xtwy = xtw * y; try { mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; + betas(i) = as_scalar(xtwx_inv * xtwy); mat ci = xtwx_inv * xtw; - mat si = mX.row(i) * ci; + mat si = x(i) * ci; shat(0) += si(0, i); - shat(1) += det(si * si.t()); + shat(1) += as_scalar(si * si.t()); } catch (const exception& e) { GWM_LOG_ERROR(e.what()); - return DBL_MAX; + throw e; } } - double value = GWRMultiscale::AICc(mX, mY, betas.t(), shat); - if (mStatus == Status::Success && isfinite(value)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); - mBandwidthLastCriterion = value; - return value; - } - else return DBL_MAX; + return betas; } -double GWRMultiscale::bandwidthSizeCriterionVarCVSerial(BandwidthWeight *bandwidthWeight) -{ - size_t var = mBandwidthSelectionCurrentIndex; - uword nDp = mCoords.n_rows; - vec shat(2, fill::zeros); - double cv = 0.0; - for (uword i = 0; i < nDp; i++) - { - GWM_LOG_STOP_BREAK(mStatus); - vec d = mSpatialWeights[var].distance()->distance(i); - vec w = bandwidthWeight->weight(d); - w(i) = 0.0; - mat xtw = trans(mXi % w); - mat xtwx = xtw * mXi; - mat xtwy = xtw * mYi; - try - { - mat xtwx_inv = inv_sympd(xtwx); - vec beta = xtwx_inv * xtwy; - double res = mYi(i) - det(mXi(i) * beta); - cv += res * res; - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - return DBL_MAX; - } - } - if (mStatus == Status::Success && isfinite(cv)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; -} -double GWRMultiscale::bandwidthSizeCriterionVarAICSerial(BandwidthWeight *bandwidthWeight) +#ifdef ENABLE_OPENMP +vec GWRMultiscale::fitVarCoreOmp(const vec &x, const vec &y, const SpatialWeight& sw, mat &S) { - size_t var = mBandwidthSelectionCurrentIndex; uword nDp = mCoords.n_rows; mat betas(1, nDp, fill::zeros); - vec shat(2, fill::zeros); - for (uword i = 0; i < nDp ; i++) - { - GWM_LOG_STOP_BREAK(mStatus); - vec d = mSpatialWeights[var].distance()->distance(i); - vec w = bandwidthWeight->weight(d); - mat xtw = trans(mXi % w); - mat xtwx = xtw * mXi; - mat xtwy = xtw * mYi; - try - { - mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - mat ci = xtwx_inv * xtw; - mat si = mXi(i) * ci; - shat(0) += si(0, i); - shat(1) += det(si * si.t()); - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - return DBL_MAX; - } - } - double value = GWRMultiscale::AICc(mXi, mYi, betas.t(), shat); - if (mStatus == Status::Success && isfinite(value)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); - mBandwidthLastCriterion = value; - return value; - } - return isfinite(value) ? value : DBL_MAX; -} - - -#ifdef ENABLE_OPENMP - -mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y) -{ - uword nDp = mCoords.n_rows, nVar = x.n_cols; - mat betas(nVar, nDp, fill::zeros); bool success = true; std::exception except; - if (mHasHatMatrix ) + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + if (mHasHatMatrix) { - mat betasSE(nVar, nDp, fill::zeros); + uword rangeSize = workRange.second - workRange.first; + S = mat(rangeSize, nDp, fill::zeros); #pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) + for (int i = int(workRange.first); (uword)i < workRange.second; i++) { GWM_LOG_STOP_CONTINUE(mStatus); - vec w = mInitSpatialWeight.weightVector(i); - mat xtw = trans(x.each_col() % w); + vec w = sw.weightVector(i); + mat xtw = trans(x % w); mat xtwx = xtw * x; mat xtwy = xtw * y; try @@ -660,10 +636,8 @@ mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y) mat xtwx_inv = inv_sympd(xtwx); betas.col(i) = xtwx_inv * xtwy; mat ci = xtwx_inv * xtw; - betasSE.col(i) = sum(ci % ci, 1); - mat si = x.row(i) * ci; - mS0.row(i) = si; - mC.slice(i) = ci; + mat si = x(i) * ci; + S.row(i - workRange.first) = si; } catch (const exception& e) { @@ -673,16 +647,15 @@ mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y) } GWM_LOG_PROGRESS(i + 1, nDp); } - mBetasSE = betasSE.t(); } else { #pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) + for (int i = int(workRange.first); (uword)i < workRange.second; i++) { GWM_LOG_STOP_CONTINUE(mStatus); - vec w = mInitSpatialWeight.weightVector(i); - mat xtw = trans(x.each_col() % w); + vec w = sw.weightVector(i); + mat xtw = trans(x % w); mat xtwx = xtw * x; mat xtwy = xtw * y; try @@ -706,141 +679,64 @@ mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y) return betas.t(); } -vec GWRMultiscale::fitVarOmp(const vec &x, const vec &y, const uword var, mat &S) +vec GWRMultiscale::fitVarCoreCVOmp(const vec &x, const vec &y, const SpatialWeight& sw) { uword nDp = mCoords.n_rows; - mat betas(1, nDp, fill::zeros); - bool success = true; - std::exception except; - if (mHasHatMatrix) - { - S = mat(mHasHatMatrix ? nDp : 1, nDp, fill::zeros); + vec beta(nDp, fill::zeros); + bool flag = true; + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); #pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) + for (int i = workRange.first; (uword)i < workRange.second; i++) + { + GWM_LOG_STOP_CONTINUE(mStatus); + if (flag) { - GWM_LOG_STOP_CONTINUE(mStatus); - vec w = mSpatialWeights[var].weightVector(i); + vec w = sw.weightVector(i); + w(i) = 0.0; mat xtw = trans(x % w); mat xtwx = xtw * x; mat xtwy = xtw * y; try { mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - mat ci = xtwx_inv * xtw; - mat si = x(i) * ci; - S.row(mHasHatMatrix ? i : 0) = si; + beta(i) = as_scalar(xtwx_inv * xtwy); } catch (const exception& e) { GWM_LOG_ERROR(e.what()); - except = e; - success = false; + flag = false; } - GWM_LOG_PROGRESS(i + 1, nDp); } } - else - { + return beta; +} + +vec GWRMultiscale::fitVarCoreSHatOmp(const vec &x, const vec &y, const SpatialWeight& sw, vec& shat) +{ + uword nDp = mCoords.n_rows; + vec betas(nDp, fill::zeros); + mat shat_all(2, mOmpThreadNum, fill::zeros); + bool flag = true; + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); #pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) + for (int i = workRange.first; (uword)i < workRange.second; i++) + { + GWM_LOG_STOP_CONTINUE(mStatus); + if (flag) { - GWM_LOG_STOP_CONTINUE(mStatus); - vec w = mSpatialWeights[var].weightVector(i); + int thread = omp_get_thread_num(); + vec w = sw.weightVector(i); mat xtw = trans(x % w); mat xtwx = xtw * x; mat xtwy = xtw * y; try { mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - except = e; - success = false; - } - GWM_LOG_PROGRESS(i + 1, nDp); - } - } - if (!success) - { - throw except; - } - return betas.t(); -} - -double GWRMultiscale::bandwidthSizeCriterionAllCVOmp(BandwidthWeight *bandwidthWeight) -{ - uword nDp = mCoords.n_rows; - vec shat(2, fill::zeros); - vec cv_all(mOmpThreadNum, fill::zeros); - bool flag = true; -#pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) - { - GWM_LOG_STOP_CONTINUE(mStatus); - if (flag) - { - int thread = omp_get_thread_num(); - vec d = mInitSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); - w(i) = 0.0; - mat xtw = trans(mX.each_col() % w); - mat xtwx = xtw * mX; - mat xtwy = xtw * mY; - try - { - mat xtwx_inv = inv_sympd(xtwx); - vec beta = xtwx_inv * xtwy; - double res = mY(i) - det(mX.row(i) * beta); - cv_all(thread) += res * res; - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - flag = false; - } - } - } - if (mStatus == Status::Success && flag) - { - double cv = sum(cv_all); - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; -} - -double GWRMultiscale::bandwidthSizeCriterionAllAICOmp(BandwidthWeight *bandwidthWeight) -{ - uword nDp = mCoords.n_rows, nVar = mX.n_cols; - mat betas(nVar, nDp, fill::zeros); - mat shat_all(2, mOmpThreadNum, fill::zeros); - bool flag = true; -#pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) - { - GWM_LOG_STOP_CONTINUE(mStatus); - if (flag) - { - int thread = omp_get_thread_num(); - vec d = mInitSpatialWeight.distance()->distance(i); - vec w = bandwidthWeight->weight(d); - mat xtw = trans(mX.each_col() % w); - mat xtwx = xtw * mX; - mat xtwy = xtw * mY; - try - { - mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - mat ci = xtwx_inv * xtw; - mat si = mX.row(i) * ci; - shat_all(0, thread) += si(0, i); - shat_all(1, thread) += det(si * si.t()); + betas(i) = as_scalar(xtwx_inv * xtwy); + mat ci = xtwx_inv * xtw; + mat si = x(i) * ci; + shat_all(0, thread) += si(0, i); + shat_all(1, thread) += det(si * si.t()); } catch (const exception& e) { @@ -849,217 +745,14 @@ double GWRMultiscale::bandwidthSizeCriterionAllAICOmp(BandwidthWeight *bandwidth } } } - if (mStatus == Status::Success && flag) - { - vec shat = sum(shat_all, 1); - double value = GWRMultiscale::AICc(mX, mY, betas.t(), shat); - if (isfinite(value)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); - mBandwidthLastCriterion = value; - return value; - } - else return DBL_MAX; - } - else return DBL_MAX; -} - -double GWRMultiscale::bandwidthSizeCriterionVarCVOmp(BandwidthWeight *bandwidthWeight) -{ - size_t var = mBandwidthSelectionCurrentIndex; - uword nDp = mCoords.n_rows; - vec shat(2, fill::zeros); - vec cv_all(mOmpThreadNum, fill::zeros); - bool flag = true; -#pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) - { - GWM_LOG_STOP_CONTINUE(mStatus); - if (flag) - { - int thread = omp_get_thread_num(); - vec d = mSpatialWeights[var].distance()->distance(i); - vec w = bandwidthWeight->weight(d); - w(i) = 0.0; - mat xtw = trans(mXi % w); - mat xtwx = xtw * mXi; - mat xtwy = xtw * mYi; - try - { - mat xtwx_inv = inv_sympd(xtwx); - vec beta = xtwx_inv * xtwy; - double res = mYi(i) - det(mXi(i) * beta); - cv_all(thread) += res * res; - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - flag = false; - } - } - } - if (mStatus == Status::Success && flag) - { - double cv = sum(cv_all); - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; -} - -double GWRMultiscale::bandwidthSizeCriterionVarAICOmp(BandwidthWeight *bandwidthWeight) -{ - size_t var = mBandwidthSelectionCurrentIndex; - uword nDp = mCoords.n_rows; - mat betas(1, nDp, fill::zeros); - mat shat_all(2, mOmpThreadNum, fill::zeros); - bool flag = true; -#pragma omp parallel for num_threads(mOmpThreadNum) - for (int i = 0; (uword)i < nDp; i++) - { - GWM_LOG_STOP_CONTINUE(mStatus); - if (flag) - { - int thread = omp_get_thread_num(); - vec d = mSpatialWeights[var].distance()->distance(i); - vec w = bandwidthWeight->weight(d); - mat xtw = trans(mXi % w); - mat xtwx = xtw * mXi; - mat xtwy = xtw * mYi; - try - { - mat xtwx_inv = inv_sympd(xtwx); - betas.col(i) = xtwx_inv * xtwy; - mat ci = xtwx_inv * xtw; - mat si = mXi(i) * ci; - shat_all(0, thread) += si(0, i); - shat_all(1, thread) += det(si * si.t()); - } - catch (const exception& e) - { - GWM_LOG_ERROR(e.what()); - flag = false; - } - } - } - if (flag) - { - vec shat = sum(shat_all, 1); - double value = GWRMultiscale::AICc(mXi, mYi, betas.t(), shat); - if (isfinite(value)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value))); - mBandwidthLastCriterion = value; - return value; - } - else return DBL_MAX; - } - return DBL_MAX; + shat = sum(shat_all, 1); + return betas; } #endif #ifdef ENABLE_CUDA - -mat GWRMultiscale::fitAllCuda(const mat& x, const vec& y) -{ - uword nDp = mCoords.n_rows, nVar = x.n_cols; - mat betas(nVar, nDp, fill::zeros); - mat xt = trans(x); - cumat u_xt(xt), u_y(y); - cumat u_betas(nVar, nDp); - cumat u_dists(nDp, 1), u_weights(nDp, 1); - custride u_xtw(nVar, nDp, mGroupLength); - mat si(nDp, mGroupLength, fill::zeros); - cube ci(nVar, nDp, mGroupLength, fill::zeros); - cube cct(nVar, nVar, mGroupLength, fill::zeros); - int *d_info, *p_info; - p_info = new int[mGroupLength]; - checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); - if (mHasHatMatrix) - { - mat betasSE(nVar, nDp); - cumat u_betasSE(nVar, nDp); - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); - for (size_t i = 0; i < groups; i++) - { - GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; - for (size_t j = 0, e = begin + j; j < length; j++, e++) - { - checkCudaErrors(mInitSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem())); - u_xtw.strides(j) = u_xt.diagmul(u_weights); - } - custride u_xtwx = u_xtw * u_xt.t(); - custride u_xtwy = u_xtw * u_y; - custride u_xtwxI = u_xtwx.inv(d_info); - checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost)); - for (size_t j = 0; j < mGroupLength; j++) - { - if (p_info[j] != 0) - { - std::runtime_error e("Cuda failed to get the inverse of matrix"); - GWM_LOG_ERROR(e.what()); - throw e; - } - } - u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy; - custride u_c = u_xtwxI * u_xtw; - custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c; - custride u_cct = u_c * u_c.t(); - u_s.get(si.memptr()); - mS0.rows(begin, begin + length - 1) = si.head_cols(length).t(); - u_c.get(ci.memptr()); - mC.slices(begin, begin + length - 1) = ci.head_slices(length); - u_cct.get(cct.memptr()); - for (size_t j = 0, e = i * mGroupLength + j; j < mGroupLength && e < nDp; j++, e++) - { - u_s.strides(j).get(si.memptr()); - betasSE.col(e) = diagvec(cct.slice(j)); - } - GWM_LOG_PROGRESS(begin + length, nDp); - } - u_betas.get(betas.memptr()); - mBetasSE = betasSE.t(); - } - else - { - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); - for (size_t i = 0; i < groups; i++) - { - GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; - for (size_t j = 0, e = begin + j; j < length; j++, e++) - { - checkCudaErrors(mInitSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem())); - u_xtw.strides(j) = u_xt.diagmul(u_weights); - } - custride u_xtwx = u_xtw * u_xt.t(); - custride u_xtwy = u_xtw * u_y; - custride u_xtwxI = u_xtwx.inv(d_info); - checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost)); - for (size_t j = 0; j < mGroupLength; j++) - { - if (p_info[j] != 0) - { - std::runtime_error e("Cuda failed to get the inverse of matrix"); - GWM_LOG_ERROR(e.what()); - throw e; - } - } - u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy; - GWM_LOG_PROGRESS(begin + length, nDp); - } - u_betas.get(betas.memptr()); - } - return betas.t(); -} - -vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat &S) +vec GWRMultiscale::fitVarCoreCuda(const vec &x, const vec &y, const SpatialWeight& sw, mat &S) { uword nDp = mCoords.n_rows; mat betas(1, nDp, fill::zeros); @@ -1071,18 +764,20 @@ vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat & int *d_info, *p_info; p_info = new int[mGroupLength]; checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); - S = mat(mHasHatMatrix ? nDp : 1, nDp, fill::zeros); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1); + S = mat(mHasHatMatrix ? rangeSize : 1, nDp, fill::zeros); if (mHasHatMatrix) { mat sg(nDp, mGroupLength, fill::zeros); for (size_t i = 0; i < groups; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { - checkCudaErrors(mSpatialWeights[var].weightVector(e, u_dists.dmem(), u_weights.dmem())); + checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem())); u_xtw.strides(j) = u_xt.diagmul(u_weights); } custride u_xtwx = u_xtw * u_xt.t(); @@ -1111,10 +806,10 @@ vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat & for (size_t i = 0; i < groups; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { - checkCudaErrors(mSpatialWeights[var].weightVector(e, u_dists.dmem(), u_weights.dmem())); + checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem())); u_xtw.strides(j) = u_xt.diagmul(u_weights); } custride u_xtwx = u_xtw * u_xt.t(); @@ -1139,27 +834,28 @@ vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat & return betas.t(); } -double GWRMultiscale::bandwidthSizeCriterionAllCVCuda(BandwidthWeight* bandwidthWeight) +vec GWRMultiscale::fitVarCoreCVCuda(const vec &x, const vec &y, const SpatialWeight& sw) { - uword nDp = mCoords.n_rows, nVar = mX.n_cols; - size_t elems = nDp; - cumat u_xt(mX.t()), u_y(mY); + uword nDp = mCoords.n_rows; + constexpr size_t nVar = 1; + cumat u_xt(mXi.t()), u_y(mYi); cumat u_dists(nDp, 1), u_weights(nDp, 1); + cumat u_betas(1, nDp); custride u_xtw(nVar, nDp, mGroupLength); - vec yhat(mGroupLength), yhat_all(nDp); int *d_info, *p_info; p_info = new int[mGroupLength]; checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); bool success = true; - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1); for (size_t i = 0; i < groups && success; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { - checkCudaErrors(mInitSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems)); - checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems)); + checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem())); checkCudaErrors(cudaMemcpy(u_weights.dmem() + e, &cubase::beta0, sizeof(double), cudaMemcpyHostToDevice)); u_xtw.strides(j) = u_xt.diagmul(u_weights); } @@ -1178,49 +874,41 @@ double GWRMultiscale::bandwidthSizeCriterionAllCVCuda(BandwidthWeight* bandwidth } if (success) { - custride u_betas = u_xtwxI * u_xtwy; - custride u_yhat = u_xt.as_stride().strides(begin, begin + length).t() * u_betas; - u_yhat.get(yhat.memptr()); - yhat_all.rows(begin, begin + length - 1) = yhat.head_rows(length); + u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy; } } checkCudaErrors(cudaFree(d_info)); delete[] p_info; - if (!success) return DBL_MAX; - double cv = as_scalar((mY - yhat_all).t() * (mY - yhat_all)); - if (isfinite(cv)) - { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); - mBandwidthLastCriterion = cv; - return cv; - } - else return DBL_MAX; + vec betas(nDp, fill::zeros); + u_betas.get(betas.memptr()); + return betas; } -double GWRMultiscale::bandwidthSizeCriterionAllAICCuda(BandwidthWeight* bandwidthWeight) +vec GWRMultiscale::fitVarCoreSHatCuda(const vec &x, const vec &y, const SpatialWeight& sw, vec& shat) { - uword nDp = mCoords.n_rows, nVar = mX.n_cols; - size_t elems = nDp; - cumat u_xt(mX.t()), u_y(mY), u_betas(nVar, nDp); + uword nDp = mCoords.n_rows; + constexpr size_t nVar = 1; + cumat u_xt(mXi.t()), u_y(mYi), u_betas(nVar, nDp); cumat u_dists(nDp, 1), u_weights(nDp, 1); custride u_xtw(nVar, nDp, mGroupLength); mat betas(nVar, nDp); - vec shat(2), sst(mGroupLength); + shat = vec(2); + vec sst(mGroupLength); mat sg(nDp, mGroupLength); int *d_info, *p_info; p_info = new int[mGroupLength]; checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); bool success = true; - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); + std::pair workRange = mWorkRange.value_or(make_pair(0, nDp)); + uword rangeSize = workRange.second - workRange.first; + size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1); for (size_t i = 0; i < groups && success; i++) { GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; + size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength; for (size_t j = 0, e = begin + j; j < length; j++, e++) { - checkCudaErrors(mInitSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems)); - checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems)); + checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem())); u_xtw.strides(j) = u_xt.diagmul(u_weights); } custride u_xtwx = u_xtw * u_xt.t(); @@ -1250,71 +938,60 @@ double GWRMultiscale::bandwidthSizeCriterionAllAICCuda(BandwidthWeight* bandwidt } checkCudaErrors(cudaFree(d_info)); delete[] p_info; - if (!success) return DBL_MAX; u_betas.get(betas.memptr()); - double aic = GWRMultiscale::AICc(mX, mY, betas.t(), shat); - if (isfinite(aic)) + return betas.t(); +} +#endif // ENABLE_CUDA + +#ifdef ENABLE_MPI +vec GWRMultiscale::fitVarMpi(const size_t var) +{ + uword nDp = mXi.n_rows; + mat S; + vec beta_p = (this->*mFitVarCore)(mXi, mYi, mSpatialWeights[var], S); + vec beta(nDp); + MPI_Allreduce(beta_p.memptr(), beta.memptr(), nDp, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + if (mHasHatMatrix) { - GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, aic)); - GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - aic))); - mBandwidthLastCriterion = aic; - return aic; + mat SArrayi = mSArray.slice(var) - mS0; + mat_mul_mpi(S, SArrayi, mSArray.slice(var), mWorkerId, mWorkerNum, mWorkRangeSize); + mSArray.slice(var) += S; + mS0 = mSArray.slice(var) - SArrayi; } - else return DBL_MAX; + return beta; } -double GWRMultiscale::bandwidthSizeCriterionVarCVCuda(BandwidthWeight* bandwidthWeight) +double GWRMultiscale::bandwidthSizeCriterionVarCVMpi(BandwidthWeight* bandwidthWeight) { - size_t var = mBandwidthSelectionCurrentIndex; - uword nDp = mCoords.n_rows; - size_t elems = nDp; - constexpr size_t nVar = 1; - cumat u_xt(mXi.t()), u_y(mYi); - cumat u_dists(nDp, 1), u_weights(nDp, 1); - custride u_xtw(nVar, nDp, mGroupLength); - vec yhat(mGroupLength), yhat_all(nDp); - int *d_info, *p_info; - p_info = new int[mGroupLength]; - checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); - bool success = true; - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); - for (size_t i = 0; i < groups && success; i++) + SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance()); + int status = 1; + arma::Col status_all(mWorkerNum); + vec betas; + try { - GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; - for (size_t j = 0, e = begin + j; j < length; j++, e++) - { - checkCudaErrors(mSpatialWeights[var].distance()->distance(e, u_dists.dmem(), &elems)); - checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems)); - checkCudaErrors(cudaMemcpy(u_weights.dmem() + e, &cubase::beta0, sizeof(double), cudaMemcpyHostToDevice)); - u_xtw.strides(j) = u_xt.diagmul(u_weights); - } - custride u_xtwx = u_xtw * u_xt.t(); - custride u_xtwy = u_xtw * u_y; - custride u_xtwxI = u_xtwx.inv(d_info); - checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost)); - for (size_t j = 0; j < mGroupLength; j++) - { - if (p_info[j] != 0) - { - GWM_LOG_ERROR("Cuda failed to get the inverse of matrix"); - success = false; - break; - } - } - if (success) - { - custride u_betas = u_xtwxI * u_xtwy; - custride u_yhat = u_xt.as_stride().strides(begin, begin + length).t() * u_betas.strides(0, length); - u_yhat.get(yhat.memptr()); - yhat_all.rows(begin, begin + length - 1) = yhat.head_rows(length); - } + betas = (this->*mFitVarCoreCV)(mXi, mYi, sw); } - checkCudaErrors(cudaFree(d_info)); - delete[] p_info; - if (!success) return DBL_MAX; - double cv = as_scalar((mYi - yhat_all).t() * (mYi - yhat_all)); - if (isfinite(cv)) + catch(const std::exception& e) + { + status = 0; + } + MPI_Allgather(&status, 1, MPI_INT, status_all.memptr(), 1, MPI_INT, MPI_COMM_WORLD); + if (!all(status_all)) + return DBL_MAX; + + // If all right, calculate cv; + double cv; + vec betas_all; + GWM_MPI_MASTER_BEGIN + betas_all = vec(size(betas)); + GWM_MPI_MASTER_END + MPI_Reduce(betas.memptr(), betas_all.memptr(), betas.n_elem, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + GWM_MPI_MASTER_BEGIN + vec residual = mYi - mXi % betas_all; + cv = sum(residual % residual); + GWM_MPI_MASTER_END + MPI_Bcast(&cv, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (mStatus == Status::Success && isfinite(cv)) { GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv)); GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv))); @@ -1324,64 +1001,38 @@ double GWRMultiscale::bandwidthSizeCriterionVarCVCuda(BandwidthWeight* bandwidth else return DBL_MAX; } -double GWRMultiscale::bandwidthSizeCriterionVarAICCuda(BandwidthWeight* bandwidthWeight) +double GWRMultiscale::bandwidthSizeCriterionVarAICMpi(BandwidthWeight* bandwidthWeight) { - size_t var = mBandwidthSelectionCurrentIndex; - uword nDp = mCoords.n_rows; - size_t elems = nDp; - constexpr size_t nVar = 1; - cumat u_xt(mXi.t()), u_y(mYi), u_betas(nVar, nDp); - cumat u_dists(nDp, 1), u_weights(nDp, 1); - custride u_xtw(nVar, nDp, mGroupLength); - mat betas(nVar, nDp); - vec shat(2), sst(mGroupLength); - mat sg(nDp, mGroupLength); - int *d_info, *p_info; - p_info = new int[mGroupLength]; - checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength)); - bool success = true; - size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1); - for (size_t i = 0; i < groups && success; i++) + SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance()); + int status = 1; + arma::Col status_all(mWorkerNum); + vec betas, shat; + try { - GWM_LOG_STOP_BREAK(mStatus); - size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength; - for (size_t j = 0, e = begin + j; j < length; j++, e++) - { - checkCudaErrors(mSpatialWeights[var].distance()->distance(e, u_dists.dmem(), &elems)); - checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems)); - u_xtw.strides(j) = u_xt.diagmul(u_weights); - } - custride u_xtwx = u_xtw * u_xt.t(); - custride u_xtwy = u_xtw * u_y; - custride u_xtwxI = u_xtwx.inv(d_info); - checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost)); - for (size_t j = 0; j < mGroupLength; j++) - { - if (p_info[j] != 0) - { - GWM_LOG_ERROR("Cuda failed to get the inverse of matrix"); - success = false; - break; - } - } - if (success) - { - u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy; - custride u_c = u_xtwxI * u_xtw; - custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c; - custride u_sst = u_s * u_s.t(); - u_s.get(sg.memptr()); - u_sst.get(sst.memptr()); - shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length))); - shat(1) += sum(sst); - } + betas = (this->*mFitVarCoreSHat)(mXi, mYi, sw, shat); } - checkCudaErrors(cudaFree(d_info)); - delete[] p_info; - if (!success) return DBL_MAX; - u_betas.get(betas.memptr()); - double aic = GWRMultiscale::AICc(mXi, mYi, betas.t(), shat); - if (isfinite(aic)) + catch(const std::exception& e) + { + status = 0; + } + MPI_Allgather(&status, 1, MPI_INT, status_all.memptr(), 1, MPI_INT, MPI_COMM_WORLD); + if (!all(status_all)) + return DBL_MAX; + + // If all right, calculate cv; + double aic; + vec betas_all, shat_all; + GWM_MPI_MASTER_BEGIN + betas_all = vec(size(betas)); + shat_all = vec(size(shat)); + GWM_MPI_MASTER_END + MPI_Reduce(betas.memptr(), betas_all.memptr(), betas.n_elem, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(shat.memptr(), shat_all.memptr(), shat.n_elem, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + GWM_MPI_MASTER_BEGIN + aic = GWRBasic::AICc(mXi, mYi, betas_all, shat_all); + GWM_MPI_MASTER_END + MPI_Bcast(&aic, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (mStatus == Status::Success && isfinite(aic)) { GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, aic)); GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - aic))); @@ -1390,57 +1041,29 @@ double GWRMultiscale::bandwidthSizeCriterionVarAICCuda(BandwidthWeight* bandwidt } else return DBL_MAX; } - -#endif // ENABLE_CUDA - -GWRMultiscale::BandwidthSizeCriterionFunction GWRMultiscale::bandwidthSizeCriterionAll(GWRMultiscale::BandwidthSelectionCriterionType type) -{ - unordered_map > mapper = { - std::make_pair >(BandwidthSelectionCriterionType::CV, { - #ifdef ENABLE_OPENMP - std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionAllCVOmp), - #endif - #ifdef ENABLE_CUDA - std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionAllCVCuda), - #endif - std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionAllCVSerial) - }), - std::make_pair >(BandwidthSelectionCriterionType::AIC, { - #ifdef ENABLE_OPENMP - std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionAllAICOmp), - #endif - #ifdef ENABLE_CUDA - std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionAllAICCuda), - #endif - std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionAllAICSerial) - }) - }; - return mapper[type][mParallelType]; -} +#endif // ENABLE_MPI GWRMultiscale::BandwidthSizeCriterionFunction GWRMultiscale::bandwidthSizeCriterionVar(GWRMultiscale::BandwidthSelectionCriterionType type) { - unordered_map > mapper = { - std::make_pair >(BandwidthSelectionCriterionType::CV, { - #ifdef ENABLE_OPENMP - std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionVarCVOmp), - #endif - #ifdef ENABLE_CUDA - std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionVarCVCuda), - #endif - std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionVarCVSerial) - }), - std::make_pair >(BandwidthSelectionCriterionType::AIC, { - #ifdef ENABLE_OPENMP - std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionVarAICOmp), - #endif - #ifdef ENABLE_CUDA - std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionVarAICCuda), - #endif - std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionVarAICSerial) - }) - }; - return mapper[type][mParallelType]; +#ifdef ENABLE_MPI + if (mParallelType & ParallelType::MPI) + { + switch (type) + { + case BandwidthSelectionCriterionType::AIC: + return &GWRMultiscale::bandwidthSizeCriterionVarAICMpi; + default: + return &GWRMultiscale::bandwidthSizeCriterionVarCVMpi; + } + } +#endif // ENABLE_MPI + switch (type) + { + case BandwidthSelectionCriterionType::AIC: + return &GWRMultiscale::bandwidthSizeCriterionVarAICBase; + default: + return &GWRMultiscale::bandwidthSizeCriterionVarCVBase; + } } void GWRMultiscale::setParallelType(const ParallelType &type) @@ -1449,27 +1072,36 @@ void GWRMultiscale::setParallelType(const ParallelType &type) { mParallelType = type; switch (type) { - case ParallelType::SerialOnly: - mFitAll = &GWRMultiscale::fitAllSerial; - mFitVar = &GWRMultiscale::fitVarSerial; - break; #ifdef ENABLE_OPENMP case ParallelType::OpenMP: - mFitAll = &GWRMultiscale::fitAllOmp; - mFitVar = &GWRMultiscale::fitVarOmp; + mFitVarCore = &GWRMultiscale::fitVarCoreOmp; + mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVOmp; + mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatOmp; break; #endif #ifdef ENABLE_CUDA case ParallelType::CUDA: - mFitAll = &GWRMultiscale::fitAllCuda; - mFitVar = &GWRMultiscale::fitVarCuda; + mFitVarCore = &GWRMultiscale::fitVarCoreCuda; + mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVCuda; + mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatCuda; break; #endif default: - mFitAll = &GWRMultiscale::fitAllSerial; - mFitVar = &GWRMultiscale::fitVarSerial; + mFitVarCore = &GWRMultiscale::fitVarCoreSerial; + mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVSerial; + mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatSerial; break; } +#ifdef ENABLE_MPI + if (mParallelType & ParallelType::MPI) + { + mFitVar = &GWRMultiscale::fitVarMpi; + } + else + { + mFitVar = &GWRMultiscale::fitVarBase; + } +#endif // ENABLE_MPI } } diff --git a/src/gwmodelpp/spatialweight/BandwidthWeight.cpp b/src/gwmodelpp/spatialweight/BandwidthWeight.cpp index 13a62779..d2ba1e4d 100644 --- a/src/gwmodelpp/spatialweight/BandwidthWeight.cpp +++ b/src/gwmodelpp/spatialweight/BandwidthWeight.cpp @@ -63,7 +63,7 @@ vec BandwidthWeight::weight(vec dist) #ifdef ENABLE_CUDA cudaError_t BandwidthWeight::weight(double* d_dists, double* d_weights, size_t elems) { - if (!mCudaPrepared) throw std::logic_error("Cuda has not been prepared."); + if (!mCudaPrepared) throw std::logic_error("[BandwidthWeight] Cuda has not been prepared."); return gw_weight_cuda(mBandwidth, (int)mKernel, mAdaptive, d_dists, d_weights, elems, mCudaThreads); } #endif // ENABLE_CUDA diff --git a/src/gwmodelpp/spatialweight/CRSDistance.cpp b/src/gwmodelpp/spatialweight/CRSDistance.cpp index c3b97050..0c00116f 100644 --- a/src/gwmodelpp/spatialweight/CRSDistance.cpp +++ b/src/gwmodelpp/spatialweight/CRSDistance.cpp @@ -90,7 +90,8 @@ CRSDistance::CRSDistance(const CRSDistance &distance) : Distance(distance) mParameter = make_unique(fp, dp); #ifdef ENABLE_CUDA mUseCuda = distance.mUseCuda; - if (distance.mCudaPrepared) + mCudaPrepared = false; + if (distance.mUseCuda) { prepareCuda(distance.mGpuID); } @@ -158,25 +159,25 @@ double CRSDistance::minDistance() cudaError_t CRSDistance::prepareCuda(size_t gpuId) { checkCudaErrors(Distance::prepareCuda(gpuId)); - checkCudaErrors(cudaMalloc(&mCudaDp, sizeof(double) * mParameter->dataPoints.n_elem)); - checkCudaErrors(cudaMalloc(&mCudaFp, sizeof(double) * mParameter->focusPoints.n_elem)); - mat dpt = mParameter->dataPoints.t(), fpt = mParameter->focusPoints.t(); - checkCudaErrors(cudaMemcpy(mCudaDp, dpt.mem, sizeof(double) * dpt.n_elem, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(mCudaFp, fpt.mem, sizeof(double) * fpt.n_elem, cudaMemcpyHostToDevice)); - mCudaPrepared = true; + if (!mCudaPrepared) + { + mCudaDp = std::move(cumat(mParameter->dataPoints.t())); + mCudaFp = std::move(cumat(mParameter->focusPoints.t())); + mCudaPrepared = true; + } return cudaSuccess; } cudaError_t CRSDistance::distance(uword focus, double *d_dists, size_t *elems) { if (mParameter == nullptr) throw std::runtime_error("Parameter is nullptr."); - if (!mCudaPrepared) throw std::logic_error("Cuda has not been prepared."); - if (mCudaDp == 0 || mCudaFp == 0 || mCudaThreads == 0) throw std::logic_error("Cuda has not been correctly prepared."); + if (!mCudaPrepared) throw std::logic_error("[CRSDistance] Cuda has not been prepared."); + if (mCudaDp.dmem() == nullptr || mCudaFp.dmem() == nullptr || mCudaThreads == 0) throw std::logic_error("Cuda has not been correctly prepared."); if (focus < mParameter->total) { size_t fbias = focus * mParameter->focusPoints.n_cols; *elems = mParameter->total; - return mCalculatorCuda(mCudaDp, mCudaFp + fbias, mParameter->total, mCudaThreads, d_dists); + return mCalculatorCuda(mCudaDp.dmem(), mCudaFp.dmem() + fbias, mParameter->total, mCudaThreads, d_dists); } else throw std::runtime_error("Target is out of bounds of data points."); } diff --git a/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu b/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu index 0769b0c5..b7c06688 100644 --- a/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu +++ b/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu @@ -56,6 +56,14 @@ __global__ void gw_weight_boxcar_kernel(const double *d_dists, double bw, double } +__global__ void gw_weight_ones(const double *d_dists, double bw, double* d_weights, int n) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + d_weights[index] = 1; +} + + typedef void(*WEIGHT_KERNEL_CUDA)(const double*, double, double*, int); @@ -72,11 +80,17 @@ cudaError_t gw_weight_cuda(double bw, int kernel, bool adaptive, double *d_dists { cudaError_t error; const WEIGHT_KERNEL_CUDA *kerf = GWRKernelCuda + kernel; + dim3 blockSize(threads), gridSize((ndp + blockSize.x - 1) / blockSize.x); + if (bw == DBL_MAX) + { + gw_weight_ones<< > > (d_dists, bw, d_weight, ndp); + error = cudaGetLastError(); + return error; + } switch (adaptive) { case true: { - dim3 blockSize(threads), gridSize((ndp + blockSize.x - 1) / blockSize.x); // Backup d_dists, used for sort double *d_dists_bak; cudaMalloc((void **)&d_dists_bak, sizeof(double) * ndp); @@ -95,7 +109,6 @@ cudaError_t gw_weight_cuda(double bw, int kernel, bool adaptive, double *d_dists } default: { - dim3 blockSize(threads), gridSize((ndp + blockSize.x - 1) / blockSize.x); (*kerf) << > > (d_dists, bw, d_weight, ndp); error = cudaGetLastError(); break; diff --git a/src/gwmodelpp/utils/armampi.cpp b/src/gwmodelpp/utils/armampi.cpp new file mode 100644 index 00000000..024db9b9 --- /dev/null +++ b/src/gwmodelpp/utils/armampi.cpp @@ -0,0 +1,25 @@ +#include "gwmodelpp/utils/armampi.h" +#include +#include + +using namespace std; +using namespace arma; + +void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np, const size_t range) +{ + arma::uword m = a.n_rows, n = b.n_cols; + int k = (int) b.n_rows; + arma::Col b_rows(np, arma::fill::zeros); + MPI_Allgather(&k, 1, MPI_INT, b_rows.memptr(), 1, MPI_INT, MPI_COMM_WORLD); + c = mat(m, n, arma::fill::zeros); + mat a_buf; + for (int pi = 0; pi < np; pi++) + { + arma::Col a_counts = b_rows(pi) * b_rows; + arma::Col a_disp = arma::cumsum(a_counts) - a_counts; + a_buf.resize(b_rows(pi), b_rows(ip)); + MPI_Scatterv(a.memptr(), a_counts.mem, a_disp.mem, MPI_DOUBLE, a_buf.memptr(), a_buf.n_elem, MPI_DOUBLE, pi, MPI_COMM_WORLD); + mat ci = a_buf * b; + MPI_Reduce(ci.memptr(), c.memptr(), ci.n_elem, MPI_DOUBLE, MPI_SUM, pi, MPI_COMM_WORLD); + } +} diff --git a/src/gwmodelpp/utils/cumat.cpp b/src/gwmodelpp/utils/cumat.cpp index f03bbf18..f3f31d72 100644 --- a/src/gwmodelpp/utils/cumat.cpp +++ b/src/gwmodelpp/utils/cumat.cpp @@ -77,3 +77,10 @@ cumat &cumat::operator=(cuop_diagmul &&op) op.eval(*this); return *this; } + +void print(cumat &mat) +{ + arma::mat am(mat.nrows(), mat.ncols()); + mat.get(am.memptr()); + am.brief_print(); +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f5fe3bc2..f7cafaad 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,9 +29,9 @@ set(SAMPLE_DATA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/data) add_definitions(-DSAMPLE_DATA_DIR="${SAMPLE_DATA_DIR}") set(ADDON_SOURCES - data/londonhp100.cpp - TerminateCheckTelegram.cpp - FileTelegram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/data/londonhp100.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/TerminateCheckTelegram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/FileTelegram.cpp ) add_definitions(-DSAMPLE_DATA_DIR="${SAMPLE_DATA_DIR}") @@ -129,3 +129,7 @@ add_test( COMMAND $ --success --skip-benchmarks ) endif() + +if (ENABLE_MPI) +add_subdirectory("./mpi") +endif() diff --git a/test/include/TerminateCheckTelegram.h b/test/include/TerminateCheckTelegram.h index edb588f5..3b803eac 100644 --- a/test/include/TerminateCheckTelegram.h +++ b/test/include/TerminateCheckTelegram.h @@ -56,5 +56,9 @@ class TerminateCheckTelegram : public Logger const std::map ParallelTypeDict = { std::make_pair(ParallelType::SerialOnly, "SerialOnly"), std::make_pair(ParallelType::OpenMP, "OpenMP"), - std::make_pair(ParallelType::CUDA, "CUDA") + std::make_pair(ParallelType::CUDA, "CUDA"), + std::make_pair(ParallelType::MPI, "MPI"), + std::make_pair(ParallelType::MPI_Serial, "MPI_Serial"), + std::make_pair(ParallelType::MPI_MP, "MPI_MP"), + std::make_pair(ParallelType::MPI_CUDA, "MPI_CUDA") }; diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt new file mode 100644 index 00000000..255a667a --- /dev/null +++ b/test/mpi/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 3.17) + +if(ENABLE_MPI) +find_package(MPI REQUIRED) +include_directories(${MPI_CXX_HEADER_DIR}) +add_link_options(${MPI_CXX_LINK_FLAGS}) +add_compile_options(${MPI_CXX_COMPILE_OPTIONS}) +add_definitions(${MPI_CXX_COMPILE_DEFINITIONS}) +endif(ENABLE_MPI) + +add_executable(testGWRBasicMpi testGWRBasicMpi.cpp ${ADDON_SOURCES}) +target_link_libraries(testGWRBasicMpi PRIVATE gwmodel ${ARMADILLO_LIBRARIES} ${MPI_CXX_LIBRARIES} Catch2::Catch2WithMain) +add_test( + NAME testGWRBasicMpi + COMMAND ${MPIEXEC_EXECUTABLE} -np 4 $ --success --skip-benchmarks +) + +add_executable(testMpiMatMul testMpiMatMul.cpp) +target_link_libraries(testMpiMatMul PRIVATE gwmodel ${ARMADILLO_LIBRARIES} Catch2::Catch2WithMain) +add_test( + NAME testMpiMatMul + COMMAND ${MPIEXEC_EXECUTABLE} -np 4 $ --success --skip-benchmarks +) + +add_executable(testGWRMultiscaleMpi testGWRMultiscaleMpi.cpp ${ADDON_SOURCES}) +target_link_libraries(testGWRMultiscaleMpi PRIVATE gwmodel ${ARMADILLO_LIBRARIES} ${MPI_CXX_LIBRARIES} Catch2::Catch2WithMain) +add_test( + NAME testGWRMultiscaleMpi + COMMAND ${MPIEXEC_EXECUTABLE} -np 4 $ --success --skip-benchmarks +) diff --git a/test/mpi/testGWRBasicMpi.cpp b/test/mpi/testGWRBasicMpi.cpp new file mode 100644 index 00000000..c5b59934 --- /dev/null +++ b/test/mpi/testGWRBasicMpi.cpp @@ -0,0 +1,310 @@ +#define CATCH_CONFIG_MAIN +#include + +#include +#include +#include +#include +#include "gwmodelpp/GWRBasic.h" +#include "gwmodelpp/spatialweight/CRSDistance.h" +#include "gwmodelpp/spatialweight/BandwidthWeight.h" +#include "gwmodelpp/spatialweight/SpatialWeight.h" +#include "londonhp100.h" +#include "TerminateCheckTelegram.h" +#include "FileTelegram.h" + +using namespace std; +using namespace arma; +using namespace gwm; + +TEST_CASE("BasicGWR: LondonHP") +{ + int iProcess, nProcess; + MPI_Comm_size(MPI_COMM_WORLD, &nProcess); + MPI_Comm_rank(MPI_COMM_WORLD, &iProcess); + + mat londonhp100_coord, londonhp100_data, x; + vec y; + vector londonhp100_fields; + + if (iProcess == 0) + { + if (!read_londonhp100(londonhp100_coord, londonhp100_data, londonhp100_fields)) + { + FAIL("Cannot load londonhp100 data."); + } + y = londonhp100_data.col(0); + x = join_rows(ones(londonhp100_coord.n_rows), londonhp100_data.cols(1, 3)); + } + + const initializer_list parallel_list = { + ParallelType::MPI_Serial +#ifdef ENABLE_OPENMP + , ParallelType::MPI_MP +#endif // ENABLE_OPENMP +#ifdef ENABLE_CUDA + , ParallelType::MPI_CUDA +#endif // ENABLE_CUDA + }; + + SECTION("adaptive bandwidth | no bandwidth optimization | no variable optimization") { + auto parallel = GENERATE_REF(values(parallel_list)); + INFO("Parallel:" << ParallelTypeDict.at(parallel)); + + CRSDistance distance(false); + BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian); + SpatialWeight spatial(&bandwidth, &distance); + + GWRBasic algorithm; + if (iProcess == 0) + { + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + } + algorithm.setSpatialWeight(spatial); + algorithm.setParallelType(parallel); + algorithm.setWorkerNum(nProcess); + algorithm.setWorkerId(iProcess); +#ifdef ENABLE_CUDA + if (parallel == ParallelType::MPI_CUDA) + { + algorithm.setGPUId(0); + algorithm.setGroupSize(16); + } +#endif // ENABLE_CUDA + REQUIRE_NOTHROW(algorithm.fit()); + if (iProcess == 0) + { + REQUIRE(algorithm.hasIntercept() == true); + RegressionDiagnostic diagnostic = algorithm.diagnostic(); + REQUIRE_THAT(diagnostic.AIC, Catch::Matchers::WithinAbs(2436.60445730413, 1e-8)); + REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2448.27206524754, 1e-8)); + REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.708010632044736, 1e-8)); + REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.674975341723766, 1e-8)); + } + } + + SECTION("fixed bandwidth | no bandwidth optimization | no variable optimization") { + auto parallel = GENERATE_REF(values(parallel_list)); + INFO("Parallel:" << ParallelTypeDict.at(parallel)); + + CRSDistance distance(false); + BandwidthWeight bandwidth(5000, false, BandwidthWeight::Gaussian); + SpatialWeight spatial(&bandwidth, &distance); + + GWRBasic algorithm; + if (iProcess == 0) + { + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + } + algorithm.setSpatialWeight(spatial); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); +#ifdef ENABLE_OPENMP + if (parallel == ParallelType::MPI_MP) + { + algorithm.setOmpThreadNum(2); + } +#endif // ENABLE_OPENMP +#ifdef ENABLE_CUDA + if (parallel == ParallelType::MPI_CUDA) + { + algorithm.setGPUId(0); + algorithm.setGroupSize(16); + } +#endif // ENABLE_CUDA + REQUIRE_NOTHROW(algorithm.fit()); + if (iProcess == 0) + { + RegressionDiagnostic diagnostic = algorithm.diagnostic(); + REQUIRE_THAT(diagnostic.AIC, Catch::Matchers::WithinAbs(2437.649574267587, 1e-8)); + REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2447.676281164379, 1e-8)); + REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.701466295457, 1e-8)); + REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.673691594464, 1e-8)); + } + } + + SECTION("adaptive bandwidth | bandwidth optimization | no variable optimization") { + auto parallel = GENERATE_REF(values(parallel_list)); + auto criterion = GENERATE(GWRBasic::BandwidthSelectionCriterionType::CV, GWRBasic::BandwidthSelectionCriterionType::AIC); + INFO("Parallel:" << ParallelTypeDict.at(parallel) << " ; Criterion:" << criterion); + + CRSDistance distance(false); + BandwidthWeight bandwidth(0, true, BandwidthWeight::Gaussian); + SpatialWeight spatial(&bandwidth, &distance); + + GWRBasic algorithm; + if (iProcess == 0) + { + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + } + algorithm.setSpatialWeight(spatial); + algorithm.setIsAutoselectBandwidth(true); + algorithm.setBandwidthSelectionCriterion(criterion); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); +#ifdef ENABLE_OPENMP + if (parallel == ParallelType::MPI_MP) + { + algorithm.setOmpThreadNum(2); + } +#endif // ENABLE_OPENMP +#ifdef ENABLE_CUDA + if (parallel == ParallelType::MPI_CUDA) + { + algorithm.setGPUId(0); + algorithm.setGroupSize(16); + } +#endif // ENABLE_CUDA + REQUIRE_NOTHROW(algorithm.fit()); + if (iProcess == 0) + { + size_t bw = (size_t)algorithm.spatialWeight().weight()->bandwidth(); + size_t bw0 = 67; + switch (criterion) + { + case GWRBasic::BandwidthSelectionCriterionType::AIC: + bw0 = 31; + break; + default: + break; + } + REQUIRE(bw == bw0); + } + } + + SECTION("adaptive bandwidth | no bandwidth optimization | AIC variable optimization") { + auto parallel = GENERATE_REF(values(parallel_list)); + INFO("Parallel:" << ParallelTypeDict.at(parallel)); + + CRSDistance distance(false); + BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian); + SpatialWeight spatial(&bandwidth, &distance); + + GWRBasic algorithm; + if (iProcess == 0) + { + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + } + algorithm.setSpatialWeight(spatial); + algorithm.setIsAutoselectIndepVars(true); + algorithm.setIndepVarSelectionThreshold(3.0); + algorithm.setParallelType(parallel); + algorithm.setWorkerNum(nProcess); + algorithm.setWorkerId(iProcess); +#ifdef ENABLE_OPENMP + if (parallel == ParallelType::MPI_MP) + { + algorithm.setOmpThreadNum(2); + } +#endif // ENABLE_OPENMP +#ifdef ENABLE_CUDA + if (parallel == ParallelType::MPI_CUDA) + { + algorithm.setGPUId(0); + algorithm.setGroupSize(16); + } +#endif // ENABLE_CUDA + REQUIRE_NOTHROW(algorithm.fit()); + if (iProcess == 0) + { + VariablesCriterionList criterions = algorithm.indepVarsSelectionCriterionList(); + REQUIRE_THAT(criterions[0].first, Catch::Matchers::Equals(vector({ 2 }))); + REQUIRE_THAT(criterions[0].second, Catch::Matchers::WithinAbs(2551.61359020599, 1e-8)); + REQUIRE_THAT(criterions[1].first, Catch::Matchers::Equals(vector({ 3 }))); + REQUIRE_THAT(criterions[1].second, Catch::Matchers::WithinAbs(2551.30032201349, 1e-8)); + REQUIRE_THAT(criterions[2].first, Catch::Matchers::Equals(vector({ 1 }))); + REQUIRE_THAT(criterions[2].second, Catch::Matchers::WithinAbs(2468.93236280013, 1e-8)); + REQUIRE_THAT(criterions[3].first, Catch::Matchers::Equals(vector({ 1, 3 }))); + REQUIRE_THAT(criterions[3].second, Catch::Matchers::WithinAbs(2452.86447942033, 1e-8)); + REQUIRE_THAT(criterions[4].first, Catch::Matchers::Equals(vector({ 1, 2 }))); + REQUIRE_THAT(criterions[4].second, Catch::Matchers::WithinAbs(2450.59642666509, 1e-8)); + REQUIRE_THAT(criterions[5].first, Catch::Matchers::Equals(vector({ 1, 2, 3 }))); + REQUIRE_THAT(criterions[5].second, Catch::Matchers::WithinAbs(2452.80388934625, 1e-8)); + vector selectedVariables = algorithm.selectedVariables(); + REQUIRE_THAT(selectedVariables, Catch::Matchers::Equals(vector({1, 3}))); + } + } + + SECTION("adaptive bandwidth | CV bandwidth optimization | AIC variable optimization") { + auto parallel = GENERATE_REF(values(parallel_list)); + INFO("Parallel:" << ParallelTypeDict.at(parallel)); + + CRSDistance distance(false); + BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian); + SpatialWeight spatial(&bandwidth, &distance); + + GWRBasic algorithm; + if (iProcess == 0) + { + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + } + algorithm.setSpatialWeight(spatial); + algorithm.setIsAutoselectBandwidth(true); + algorithm.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::CV); + algorithm.setIsAutoselectIndepVars(true); + algorithm.setIndepVarSelectionThreshold(3.0); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); +#ifdef ENABLE_OPENMP + if (parallel == ParallelType::MPI_MP) + { + algorithm.setOmpThreadNum(2); + } +#endif // ENABLE_OPENMP +#ifdef ENABLE_CUDA + if (parallel == ParallelType::MPI_CUDA) + { + algorithm.setGPUId(0); + algorithm.setGroupSize(16); + } +#endif // ENABLE_CUDA + REQUIRE_NOTHROW(algorithm.fit()); + if (iProcess == 0) + { + VariablesCriterionList criterions = algorithm.indepVarsSelectionCriterionList(); + REQUIRE_THAT(criterions[0].first, Catch::Matchers::Equals(vector({ 2 }))); + REQUIRE_THAT(criterions[0].second, Catch::Matchers::WithinAbs(2551.61359020599, 1e-8)); + REQUIRE_THAT(criterions[1].first, Catch::Matchers::Equals(vector({ 3 }))); + REQUIRE_THAT(criterions[1].second, Catch::Matchers::WithinAbs(2551.30032201349, 1e-8)); + REQUIRE_THAT(criterions[2].first, Catch::Matchers::Equals(vector({ 1 }))); + REQUIRE_THAT(criterions[2].second, Catch::Matchers::WithinAbs(2468.93236280013, 1e-8)); + REQUIRE_THAT(criterions[3].first, Catch::Matchers::Equals(vector({ 1, 3 }))); + REQUIRE_THAT(criterions[3].second, Catch::Matchers::WithinAbs(2452.86447942033, 1e-8)); + REQUIRE_THAT(criterions[4].first, Catch::Matchers::Equals(vector({ 1, 2 }))); + REQUIRE_THAT(criterions[4].second, Catch::Matchers::WithinAbs(2450.59642666509, 1e-8)); + REQUIRE_THAT(criterions[5].first, Catch::Matchers::Equals(vector({ 1, 2, 3 }))); + REQUIRE_THAT(criterions[5].second, Catch::Matchers::WithinAbs(2452.80388934625, 1e-8)); + vector selectedVariables = algorithm.selectedVariables(); + REQUIRE_THAT(selectedVariables, Catch::Matchers::Equals(vector({1, 3}))); + double bw = algorithm.spatialWeight().weight()->bandwidth(); + REQUIRE(bw == 31.0); + RegressionDiagnostic diagnostic = algorithm.diagnostic(); + REQUIRE_THAT(diagnostic.AIC, Catch::Matchers::WithinAbs(2435.8161441795, 1e-8)); + REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2445.49629974057, 1e-8)); + REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.706143867720706, 1e-8)); + REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.678982114793865, 1e-8)); + } + } + +} + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + int result = Catch::Session().run( argc, argv ); + MPI_Finalize(); + return result; +} diff --git a/test/mpi/testGWRMultiscaleMpi.cpp b/test/mpi/testGWRMultiscaleMpi.cpp new file mode 100644 index 00000000..dc6c4af2 --- /dev/null +++ b/test/mpi/testGWRMultiscaleMpi.cpp @@ -0,0 +1,304 @@ +#define CATCH_CONFIG_MAIN + +#include + +#include +#include +#include +#include "gwmodelpp/GWRMultiscale.h" + +#include "gwmodelpp/spatialweight/CRSDistance.h" +#include "gwmodelpp/spatialweight/BandwidthWeight.h" +#include "gwmodelpp/spatialweight/SpatialWeight.h" +#include "londonhp100.h" +#include "TerminateCheckTelegram.h" +#include "FileTelegram.h" + +#include + +using namespace std; +using namespace arma; +using namespace gwm; + + +TEST_CASE("MGWR: basic flow") +{ + int iProcess, nProcess; + MPI_Comm_size(MPI_COMM_WORLD, &nProcess); + MPI_Comm_rank(MPI_COMM_WORLD, &iProcess); + + mat londonhp100_coord, londonhp100_data; + vector londonhp100_fields; + if (!read_londonhp100(londonhp100_coord, londonhp100_data, londonhp100_fields)) + { + FAIL("Cannot load londonhp100 data."); + } + vec y = londonhp100_data.col(0); + mat x = join_rows(ones(londonhp100_data.n_rows), londonhp100_data.cols(uvec({1, 3}))); + + uword nVar = 3; + + const initializer_list parallelTypes = { + ParallelType::MPI_Serial, +#ifdef ENABLE_OPENMP + ParallelType::MPI_MP, +#endif // ENABLE_OPENMP +#ifdef ENABLE_CUDA + ParallelType::MPI_CUDA +#endif // ENABLE_CUDA + }; + + SECTION("optim bw cv | null init bw | fixed | bisquare kernel | with hatmatrix") + { + auto parallel = GENERATE_REF(values(parallelTypes)); + INFO("Parallel type: " << ParallelTypeDict.at(parallel)); + + vector spatials; + vector preditorCentered; + vector bandwidthInitialize; + vector bandwidthSelectionApproach; + for (size_t i = 0; i < nVar; i++) + { + CRSDistance distance; + BandwidthWeight bandwidth(0, false, BandwidthWeight::Bisquare); + spatials.push_back(SpatialWeight(&bandwidth, &distance)); + preditorCentered.push_back(i != 0); + bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Null); + bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV); + } + + GWRMultiscale algorithm; + // algorithm.setTelegram(std::move(make_unique())); + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + algorithm.setSpatialWeights(spatials); + algorithm.setHasHatMatrix(true); + algorithm.setPreditorCentered(preditorCentered); + algorithm.setBandwidthInitilize(bandwidthInitialize); + algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach); + algorithm.setBandwidthSelectThreshold(vector(3, 1e-5)); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); + REQUIRE_NOTHROW(algorithm.fit()); + + if (iProcess == 0) + { + const vector& spatialWeights = algorithm.spatialWeights(); + REQUIRE_THAT(spatialWeights[0].weight()->bandwidth(), Catch::Matchers::WithinAbs(4623.78, 0.1)); + REQUIRE_THAT(spatialWeights[1].weight()->bandwidth(), Catch::Matchers::WithinAbs(12665.70, 0.1)); + REQUIRE_THAT(spatialWeights[2].weight()->bandwidth(), Catch::Matchers::WithinAbs(12665.70, 0.1)); + + RegressionDiagnostic diagnostic = algorithm.diagnostic(); + REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2437.09277417389, 1e-6)); + REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.744649364494, 1e-6)); + REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.712344894394, 1e-6)); + + REQUIRE(algorithm.hasIntercept() == true); + } + } + + SECTION("optim bw AIC | null init bw | adaptive | bisquare kernel | with hatmatrix") + { + auto parallel = GENERATE_REF(values(parallelTypes)); + INFO("Parallel type: " << ParallelTypeDict.at(parallel)); + + vector spatials; + vector preditorCentered; + vector bandwidthInitialize; + vector bandwidthSelectionApproach; + for (size_t i = 0; i < nVar; i++) + { + CRSDistance distance; + BandwidthWeight bandwidth(36, true, BandwidthWeight::Bisquare); + spatials.push_back(SpatialWeight(&bandwidth, &distance)); + preditorCentered.push_back(i != 0); + bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Initial); + bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::AIC); + } + + GWRMultiscale algorithm; + // algorithm.setTelegram(std::move(make_unique())); + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + algorithm.setSpatialWeights(spatials); + algorithm.setHasHatMatrix(true); + algorithm.setCriterionType(GWRMultiscale::BackFittingCriterionType::dCVR); + algorithm.setPreditorCentered(preditorCentered); + algorithm.setBandwidthInitilize(bandwidthInitialize); + algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach); + algorithm.setBandwidthSelectRetryTimes(5); + algorithm.setBandwidthSelectThreshold(vector(3, 1e-5)); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); + REQUIRE_NOTHROW(algorithm.fit()); + + if (iProcess == 0) + { + const vector& spatialWeights = algorithm.spatialWeights(); + REQUIRE_THAT(spatialWeights[0].weight()->bandwidth(), Catch::Matchers::WithinAbs(45, 0.1)); + REQUIRE_THAT(spatialWeights[1].weight()->bandwidth(), Catch::Matchers::WithinAbs(98, 0.1)); + REQUIRE_THAT(spatialWeights[2].weight()->bandwidth(), Catch::Matchers::WithinAbs(98, 0.1)); + + RegressionDiagnostic diagnostic = algorithm.diagnostic(); + REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2437.935218705351, 1e-6)); + REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.7486787930045755, 1e-6)); + REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.7118919517893492, 1e-6)); + } + } + + SECTION("optim bw cv | null init bw | adaptive | bisquare kernel | without hatmatrix") + { + auto parallel = GENERATE_REF(values(parallelTypes)); + INFO("Parallel type: " << ParallelTypeDict.at(parallel)); + + vector spatials; + vector preditorCentered; + vector bandwidthInitialize; + vector bandwidthSelectionApproach; + for (size_t i = 0; i < nVar; i++) + { + CRSDistance distance; + BandwidthWeight bandwidth(0, true, BandwidthWeight::Bisquare); + spatials.push_back(SpatialWeight(&bandwidth, &distance)); + preditorCentered.push_back(i != 0); + bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Null); + bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV); + } + + GWRMultiscale algorithm; + // algorithm.setTelegram(std::move(make_unique())); + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + algorithm.setSpatialWeights(spatials); + algorithm.setHasHatMatrix(false); + algorithm.setPreditorCentered(preditorCentered); + algorithm.setBandwidthInitilize(bandwidthInitialize); + algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach); + algorithm.setBandwidthSelectThreshold(vector(3, 1e-5)); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); + REQUIRE_NOTHROW(algorithm.fit()); + + if (iProcess == 0) + { + const vector& spatialWeights = algorithm.spatialWeights(); + REQUIRE(spatialWeights[0].weight()->bandwidth() == 35); + REQUIRE(spatialWeights[1].weight()->bandwidth() == 98); + REQUIRE(spatialWeights[2].weight()->bandwidth() == 98); + RegressionDiagnostic diagnostic = algorithm.diagnostic(); + REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.757377391669, 1e-6)); + REQUIRE(algorithm.hasIntercept() == true); + } + } + + SECTION("optim bw cv | null init bw | adaptive | bisquare | CVR | without hatmatrix") + { + auto parallel = GENERATE_REF(values(parallelTypes)); + INFO("Parallel type: " << ParallelTypeDict.at(parallel)); + + vector spatials; + vector preditorCentered; + vector bandwidthInitialize; + vector bandwidthSelectionApproach; + for (size_t i = 0; i < nVar; i++) + { + CRSDistance distance; + BandwidthWeight bandwidth(36, true, BandwidthWeight::Bisquare); + spatials.push_back(SpatialWeight(&bandwidth, &distance)); + preditorCentered.push_back(i != 0); + bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Initial); + bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV); + } + + GWRMultiscale algorithm; + // algorithm.setTelegram(std::move(make_unique())); + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + algorithm.setSpatialWeights(spatials); + algorithm.setHasHatMatrix(true); + algorithm.setCriterionType(GWRMultiscale::BackFittingCriterionType::CVR); + algorithm.setPreditorCentered(preditorCentered); + algorithm.setBandwidthInitilize(bandwidthInitialize); + algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach); + algorithm.setBandwidthSelectRetryTimes(5); + algorithm.setBandwidthSelectThreshold(vector(3, 1e-5)); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); + REQUIRE_NOTHROW(algorithm.fit()); + + if (iProcess == 0) + { + const vector& spatialWeights = algorithm.spatialWeights(); + REQUIRE(spatialWeights[0].weight()->bandwidth() == 35); + REQUIRE(spatialWeights[1].weight()->bandwidth() == 98); + REQUIRE(spatialWeights[2].weight()->bandwidth() == 98); + + RegressionDiagnostic diagnostic = algorithm.diagnostic(); + REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.757377391669, 1e-6)); + } + } + + SECTION("optim bw cv | null init bw | adaptive | bisquare kernel | with hatmatrix | with bounds") + { + auto parallel = GENERATE_REF(values(parallelTypes)); + INFO("Parallel type: " << ParallelTypeDict.at(parallel)); + + vector spatials; + vector preditorCentered; + vector bandwidthInitialize; + vector bandwidthSelectionApproach; + for (size_t i = 0; i < nVar; i++) + { + CRSDistance distance; + BandwidthWeight bandwidth(36, true, BandwidthWeight::Bisquare); + spatials.push_back(SpatialWeight(&bandwidth, &distance)); + preditorCentered.push_back(i != 0); + bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Null); + bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV); + } + + GWRMultiscale algorithm; + // algorithm.setTelegram(std::move(make_unique())); + algorithm.setCoords(londonhp100_coord); + algorithm.setDependentVariable(y); + algorithm.setIndependentVariables(x); + algorithm.setSpatialWeights(spatials); + algorithm.setHasHatMatrix(true); + algorithm.setCriterionType(GWRMultiscale::BackFittingCriterionType::CVR); + algorithm.setPreditorCentered(preditorCentered); + algorithm.setBandwidthInitilize(bandwidthInitialize); + algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach); + algorithm.setBandwidthSelectRetryTimes(5); + algorithm.setBandwidthSelectThreshold(vector(3, 1e-5)); + algorithm.setParallelType(parallel); + algorithm.setWorkerId(iProcess); + algorithm.setWorkerNum(nProcess); + algorithm.setGoldenLowerBounds(50); + algorithm.setGoldenUpperBounds(100); + REQUIRE_NOTHROW(algorithm.fit()); + + if (iProcess == 0) + { + const vector& spatialWeights = algorithm.spatialWeights(); + REQUIRE(spatialWeights[0].weight()->bandwidth() == 52); + REQUIRE(spatialWeights[1].weight()->bandwidth() == 99); + REQUIRE(spatialWeights[2].weight()->bandwidth() == 99); + } + } +} + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + int result = Catch::Session().run( argc, argv ); + MPI_Finalize(); + return result; +} diff --git a/test/mpi/testMpiMatMul.cpp b/test/mpi/testMpiMatMul.cpp new file mode 100644 index 00000000..dc70b5ca --- /dev/null +++ b/test/mpi/testMpiMatMul.cpp @@ -0,0 +1,68 @@ +#include +#include +#include + +#include +#include "gwmodelpp/utils/armampi.h" + +using namespace std; +using namespace arma; + +TEST_CASE("mat mul mpi") +{ + int ip, np; + MPI_Comm_size(MPI_COMM_WORLD, &np); + MPI_Comm_rank(MPI_COMM_WORLD, &ip); + + uword n = 100; + uword size = n / np + (n % np == 0 ? 0 : 1); + uword from = ip * size, to = min(from + size, n); + + // printf("range from %llu to %llu\n", from, to); + + mat A_all, B_all, C_all; + A_all = mat(n, n, arma::fill::randn); + B_all = mat(n, n, arma::fill::randn); + if (ip == 0) + { + C_all = A_all * B_all; + } + + mat A, B, C; + A = A_all.rows(from, to - 1); + B = B_all.rows(from, to - 1); + // printf("process %d begin mat mul\n", ip); + REQUIRE_NOTHROW(mat_mul_mpi(A, B, C, ip, np, size)); + // printf("process %d end mat mul\n", ip); + + if (ip == 0) + { + mat C_gather(n, n); + C_gather.rows(from, to - 1) = C; + uvec received(np, fill::zeros); + received(0) = 1; + auto bufsize = size * n; + double *buf = new double[size * n]; + while (!all(received == 1)) + { + MPI_Status status; + MPI_Recv(buf, bufsize, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status); + received(status.MPI_SOURCE) = 1; + uword ifrom = status.MPI_SOURCE * size, ito = min(ifrom + size, n), irange = ito - ifrom; + C_gather.rows(ifrom, ito - 1) = mat(buf, irange, n); + } + REQUIRE(approx_equal(C_gather, C_all, "absdiff", 1e-6)); + } + else + { + MPI_Send(C.memptr(), C.n_elem, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); + } +} + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + int result = Catch::Session().run( argc, argv ); + MPI_Finalize(); + return result; +} \ No newline at end of file diff --git a/test/testGWRBasic.cpp b/test/testGWRBasic.cpp index 8b3bd80d..3abaa815 100644 --- a/test/testGWRBasic.cpp +++ b/test/testGWRBasic.cpp @@ -102,8 +102,9 @@ TEST_CASE("BasicGWR: LondonHP") REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.673691594464, 1e-8)); } - SECTION("adaptive bandwidth | CV bandwidth optimization | no variable optimization") { + SECTION("adaptive bandwidth | bandwidth optimization | no variable optimization") { auto parallel = GENERATE_REF(values(parallel_list)); + auto criterion = GENERATE(GWRBasic::BandwidthSelectionCriterionType::CV, GWRBasic::BandwidthSelectionCriterionType::AIC); INFO("Parallel:" << ParallelTypeDict.at(parallel)); CRSDistance distance(false); @@ -116,7 +117,7 @@ TEST_CASE("BasicGWR: LondonHP") algorithm.setIndependentVariables(x); algorithm.setSpatialWeight(spatial); algorithm.setIsAutoselectBandwidth(true); - algorithm.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::CV); + algorithm.setBandwidthSelectionCriterion(criterion); algorithm.setParallelType(parallel); #ifdef ENABLE_OPENMP if (parallel == ParallelType::OpenMP) @@ -133,7 +134,16 @@ TEST_CASE("BasicGWR: LondonHP") #endif // ENABLE_CUDA REQUIRE_NOTHROW(algorithm.fit()); size_t bw = (size_t)algorithm.spatialWeight().weight()->bandwidth(); - REQUIRE(bw == 67); + size_t bw0 = 67; + switch (criterion) + { + case GWRBasic::BandwidthSelectionCriterionType::AIC: + bw0 = 31; + break; + default: + break; + } + REQUIRE(bw == bw0); } SECTION("adaptive bandwidth | no bandwidth optimization | AIC variable optimization") { @@ -377,6 +387,7 @@ TEST_CASE("Basic GWR: cancel") } +#ifdef ENABLE_CUDA TEST_CASE("BasicGWR: Benchmark") { size_t n = 50000, k = 3; @@ -417,3 +428,4 @@ TEST_CASE("BasicGWR: Benchmark") return algorithm.betas(); }; } +#endif // ENABLE_CUDA diff --git a/test/testGWRMultiscale.cpp b/test/testGWRMultiscale.cpp index 5d70a997..22fc19c4 100644 --- a/test/testGWRMultiscale.cpp +++ b/test/testGWRMultiscale.cpp @@ -35,7 +35,7 @@ TEST_CASE("MGWR: basic flow") const initializer_list parallelTypes = { ParallelType::SerialOnly, #ifdef ENABLE_OPENMP - ParallelType::OpenMP, + // ParallelType::OpenMP, #endif // ENABLE_OPENMP #ifdef ENABLE_CUDA ParallelType::CUDA @@ -87,7 +87,7 @@ TEST_CASE("MGWR: basic flow") REQUIRE(algorithm.hasIntercept() == true); } - SECTION("optim bw cv | null init bw | adaptive | bisquare kernel | with hatmatrix") + SECTION("optim bw AIC | null init bw | adaptive | bisquare kernel | with hatmatrix") { auto parallel = GENERATE_REF(values(parallelTypes)); INFO("Parallel type: " << ParallelTypeDict.at(parallel));