From be4007666f1c8a082dc67311d643678fea2a1115 Mon Sep 17 00:00:00 2001
From: Yigong Hu <huyg0180110559@outlook.com>
Date: Sun, 21 Jul 2024 15:52:05 +0100
Subject: [PATCH] Feature: multiprocess support based on MPI (#88)

* edit: MPI with indep var selection

* fix: compile errors

* edit: further rename functions

* add: fit mpi

* fix: runtime errors

* fix: unused variables

* add: bw sel mpi

* fix: bugs caused in non mpi mode

* fix: indep var sel mpi runtime error

* fix: bw sel runtime errors

* edit: activate all tests

* add: compile condition around mpi code

* edit: mpi with omp

* add: mpi with cuda

* fix: fitMpi S send error

* edit: call GWRBasic in MGWR

* fix: split fit core functions

* add: fit core omp and cuda

* edit: use min distance

* add: cubase static methods to create and destory handle

* edit: CRSDistance use cumat

* fix: GWR cuda bug

* edit: MGWR move global fit earlier

* add: mpi mat mul

* add: MGWR mpi fit

* fix: mpi mat mul error

* edit: test matrix size

* edit: use scatter instead of cast

* fix: mgwr mpi run error

* add: bw criterion cv aic mpi mode

* fix: mgwr mpi omp

* edit: mpi with cuda

* edit: MPI code conditional compile

* edit: mgwr set golden bounds of gwr basic

* edit: pre compute b_rows_i

* fix: setGroupSize use std::size_t

* fix: syntax error

* fix(test): MGWR

* fix: type mismatch on armadillo and RcppArmadillo

* fix: use MY_MPI_UWORD to gether sizes

It is unsigned long when  ARMA_32BIT_WORD is defined; otherwise, unsigned long long.

* edit: rename MY_MPI_UWORD

* edit: use int to record process status

* add(workflow): MPI tests

* Revert "add(workflow): MPI tests"

This reverts commit 5bbb210e311cd0372b4654712a0b20cc043f9181.

---------

Co-authored-by: rd21411 <rd21411@bp1-login01.data.bp.acrc.priv>
---
 CMakeLists.txt                                |    1 +
 include/gwmodelpp/GWRBasic.h                  |  228 +--
 include/gwmodelpp/GWRMultiscale.h             |  314 +---
 include/gwmodelpp/IParallelizable.h           |   21 +-
 include/gwmodelpp/spatialweight/CRSDistance.h |   12 +-
 .../gwmodelpp/spatialweight/SpatialWeight.h   |    4 +-
 include/gwmodelpp/utils/armampi.h             |   10 +
 include/gwmodelpp/utils/cumat.hpp             |   43 +
 src/CMakeLists.txt                            |   26 +
 src/gwmodelpp/GWRBasic.cpp                    |  901 +++++++-----
 src/gwmodelpp/GWRMultiscale.cpp               | 1280 ++++++-----------
 .../spatialweight/BandwidthWeight.cpp         |    2 +-
 src/gwmodelpp/spatialweight/CRSDistance.cpp   |   21 +-
 .../cuda/BandwidthWeightKernel.cu             |   17 +-
 src/gwmodelpp/utils/armampi.cpp               |   25 +
 src/gwmodelpp/utils/cumat.cpp                 |    7 +
 test/CMakeLists.txt                           |   10 +-
 test/include/TerminateCheckTelegram.h         |    6 +-
 test/mpi/CMakeLists.txt                       |   30 +
 test/mpi/testGWRBasicMpi.cpp                  |  310 ++++
 test/mpi/testGWRMultiscaleMpi.cpp             |  304 ++++
 test/mpi/testMpiMatMul.cpp                    |   68 +
 test/testGWRBasic.cpp                         |   18 +-
 test/testGWRMultiscale.cpp                    |    4 +-
 24 files changed, 2067 insertions(+), 1595 deletions(-)
 create mode 100644 include/gwmodelpp/utils/armampi.h
 create mode 100644 src/gwmodelpp/utils/armampi.cpp
 create mode 100644 test/mpi/CMakeLists.txt
 create mode 100644 test/mpi/testGWRBasicMpi.cpp
 create mode 100644 test/mpi/testGWRMultiscaleMpi.cpp
 create mode 100644 test/mpi/testMpiMatMul.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c58d004..675f4862 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,7 @@ set(CMAKE_MODULE_PATH cmake)
 
 option(ENABLE_OpenMP "Determines whether OpemMP support should be built" ON)
 option(ENABLE_CUDA "Determines whether CUDA support should be built" OFF)
+option(ENABLE_MPI "Determines whether MPI support should be built" OFF)
 option(WITH_TESTS "Determines whether to build and run tests" ON)
 
 if(ENABLE_CUDA)
diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h
index 50daa284..30665d24 100644
--- a/include/gwmodelpp/GWRBasic.h
+++ b/include/gwmodelpp/GWRBasic.h
@@ -26,7 +26,7 @@ namespace gwm
  * 该算法可以通过 OpenMP 加速。
  * 
  */
-class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSelectable, public IParallelizable, public IParallelOpenmpEnabled, public IParallelCudaEnabled
+class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSelectable, public IParallelizable, public IParallelOpenmpEnabled, public IParallelCudaEnabled, public IParallelMpiEnabled
 {
 public:
 
@@ -47,10 +47,13 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     static std::unordered_map<BandwidthSelectionCriterionType, std::string> BandwidthSelectionCriterionTypeNameMapper;
     
     typedef arma::mat (GWRBasic::*PredictCalculator)(const arma::mat&, const arma::mat&, const arma::vec&);                             //!< \~english Predict function declaration. \~chinese 预测函数声明。
-    typedef arma::mat (GWRBasic::*FitCalculator)(const arma::mat&, const arma::vec&, arma::mat&, arma::vec&, arma::vec&, arma::mat&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
+    typedef arma::mat (GWRBasic::*FitCalculator)();   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
+    typedef arma::mat (GWRBasic::*FitCoreCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
+    typedef arma::mat (GWRBasic::*FitCoreSHatCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&, arma::vec&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
+    typedef arma::mat (GWRBasic::*FitCoreCVCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
 
     typedef double (GWRBasic::*BandwidthSelectionCriterionCalculator)(BandwidthWeight*);        //!< \~english Declaration of criterion calculator for bandwidth selection. \~chinese 带宽优选指标计算函数声明。
-    typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector<size_t>&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。
+    typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector<std::size_t>&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。
 
 private:
 
@@ -361,6 +364,29 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      */
     const arma::mat& s() { return mS; }
 
+    const arma::cube& c() { return mC; }
+
+    /**
+     * \~english
+     * @brief Whether to store hat-matrix \f$S\f$.
+     * 
+     * @return true if store hat-matrix.
+     * @return false if not to store hat-matrix.
+     * 
+     * \~chinese
+     * @brief 是否保存帽子矩阵 \f$S\f$.
+     * 
+     * @return true 如果保存帽子矩阵。
+     * @return false 如果不保存帽子矩阵。
+     * 
+     */
+    bool isStoreS() { return mStoreS ? true : (mHasHatMatrix && (mCoords.n_rows < 8192)); }
+
+    bool isStoreC() { return mStoreC; }
+
+    void setStoreS(bool flag) { mStoreS = flag; }
+
+    void setStoreC(bool flag) { mStoreC = flag; }
 
 public:     // Implement Algorithm
     bool isValid() override;
@@ -381,6 +407,22 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     {
         return mSelectedIndepVars;
     }
+    
+    /**
+     * \~english
+     * @brief Get AIC value with given variables for variable optimization (serial implementation).
+     * 
+     * @param indepVars Given variables
+     * @return double Criterion value
+     * 
+     * \~chinese
+     * @brief 根据指定的变量计算变量优选的AIC值（串行实现）。
+     * 
+     * @param indepVars 指定的变量。
+     * @return double 变量优选的指标值。
+     */
+    double indepVarsSelectionCriterion(const std::vector<std::size_t>& indepVars);
+
 
 public:     // Implement IBandwidthSelectable
     Status getCriterion(BandwidthWeight* weight, double& criterion) override
@@ -388,6 +430,36 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
         criterion = (this->*mBandwidthSelectionCriterionFunction)(weight);
         return mStatus;
     }
+    
+    /**
+     * \~english
+     * @brief Get CV value with given bandwidth for bandwidth optimization (serial implementation).
+     * 
+     * @param bandwidthWeight Given bandwidth
+     * @return double Criterion value
+     * 
+     * \~chinese
+     * @brief 根据指定的带宽计算带宽优选的CV值（串行实现）。
+     * 
+     * @param bandwidthWeight 指定的带宽。
+     * @return double 带宽优选的指标值。
+     */
+    double bandwidthSizeCriterionCV(BandwidthWeight* bandwidthWeight);
+    
+    /**
+     * \~english
+     * @brief Get AIC value with given bandwidth for bandwidth optimization (serial implementation).
+     * 
+     * @param bandwidthWeight Given bandwidth
+     * @return double Criterion value
+     * 
+     * \~chinese
+     * @brief 根据指定的带宽计算带宽优选的AIC值（串行实现）。
+     * 
+     * @param bandwidthWeight 指定的带宽。
+     * @return double 带宽优选的指标值。
+     */
+    double bandwidthSizeCriterionAIC(BandwidthWeight* bandwidthWeight);
 
 
 private:
@@ -434,52 +506,16 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      * @param S [out] 帽子矩阵 \f$S\f$。
      * @return mat 回归系数估计值
      */
-    arma::mat fitSerial(const arma::mat& x, const arma::vec& y, arma::mat& betasSE, arma::vec& shat, arma::vec& qDiag, arma::mat& S);
-    
-    /**
-     * \~english
-     * @brief Get CV value with given bandwidth for bandwidth optimization (serial implementation).
-     * 
-     * @param bandwidthWeight Given bandwidth
-     * @return double Criterion value
-     * 
-     * \~chinese
-     * @brief 根据指定的带宽计算带宽优选的CV值（串行实现）。
-     * 
-     * @param bandwidthWeight 指定的带宽。
-     * @return double 带宽优选的指标值。
-     */
-    double bandwidthSizeCriterionCVSerial(BandwidthWeight* bandwidthWeight);
-    
-    /**
-     * \~english
-     * @brief Get AIC value with given bandwidth for bandwidth optimization (serial implementation).
-     * 
-     * @param bandwidthWeight Given bandwidth
-     * @return double Criterion value
-     * 
-     * \~chinese
-     * @brief 根据指定的带宽计算带宽优选的AIC值（串行实现）。
-     * 
-     * @param bandwidthWeight 指定的带宽。
-     * @return double 带宽优选的指标值。
-     */
-    double bandwidthSizeCriterionAICSerial(BandwidthWeight* bandwidthWeight);
-    
-    /**
-     * \~english
-     * @brief Get AIC value with given variables for variable optimization (serial implementation).
-     * 
-     * @param indepVars Given variables
-     * @return double Criterion value
-     * 
-     * \~chinese
-     * @brief 根据指定的变量计算变量优选的AIC值（串行实现）。
-     * 
-     * @param indepVars 指定的变量。
-     * @return double 变量优选的指标值。
-     */
-    double indepVarsSelectionCriterionSerial(const std::vector<size_t>& indepVars);
+    arma::mat fitBase();
+
+
+private:
+
+    arma::mat fitCoreSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
+
+    arma::mat fitCoreSHatSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
+
+    arma::mat fitCoreCVSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
 
 #ifdef ENABLE_OPENMP
 
@@ -525,7 +561,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      * @param S [out] 帽子矩阵 \f$S\f$。
      * @return mat 回归系数估计值
      */
-    arma::mat fitOmp(const arma::mat& x, const arma::vec& y, arma::mat& betasSE, arma::vec& shat, arma::vec& qDiag, arma::mat& S);
+    arma::mat fitCoreOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
 
     /**
      * \~english
@@ -540,22 +576,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      * @param bandwidthWeight 指定的带宽。
      * @return double 带宽优选的指标值。
      */
-    double bandwidthSizeCriterionCVOmp(BandwidthWeight* bandwidthWeight);
-    
-    /**
-     * \~english
-     * @brief Get AIC value with given bandwidth for bandwidth optimization (OpenMP implementation).
-     * 
-     * @param bandwidthWeight Given bandwidth
-     * @return double Criterion value
-     * 
-     * \~chinese
-     * @brief 根据指定的带宽计算带宽优选的AIC值（OpenMP 实现）。
-     * 
-     * @param bandwidthWeight 指定的带宽。
-     * @return double 带宽优选的指标值。
-     */
-    double bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight);
+    arma::mat fitCoreCVOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
 
     /**
      * \~english
@@ -570,7 +591,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      * @param indepVars 指定的变量。
      * @return double 变量优选的指标值。
      */
-    double indepVarsSelectionCriterionOmp(const std::vector<size_t>& indepVars);
+    arma::mat fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
 
 #endif
 
@@ -593,7 +614,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      * @param y 因变量。
      * @return mat 回归系数预测值。
      */
-    arma::mat fitCuda(const arma::mat& x, const arma::vec& y, arma::mat& betasSE, arma::vec& shat, arma::vec& qDiag, arma::mat& S);
+    arma::mat fitCoreCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
 
     /**
      * \~english
@@ -633,22 +654,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      * @param bandwidthWeight 指定的带宽。
      * @return double 带宽优选的指标值。
      */
-    double bandwidthSizeCriterionCVCuda(BandwidthWeight* bandwidthWeight);
-
-    /**
-     * \~english
-     * @brief Get AIC value with given bandwidth for bandwidth optimization (CUDA implementation).
-     * 
-     * @param bandwidthWeight Given bandwidth
-     * @return double Criterion value
-     * 
-     * \~chinese
-     * @brief 根据指定的带宽计算带宽优选的AIC值（CUDA实现）。
-     * 
-     * @param bandwidthWeight 指定的带宽。
-     * @return double 带宽优选的指标值。
-     */
-    double bandwidthSizeCriterionAICCuda(BandwidthWeight* bandwidthWeight);
+    arma::mat fitCoreCVCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
 
     /**
      * \~english
@@ -663,10 +669,17 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      * @param indepVars 指定的变量。
      * @return double 变量优选的指标值。
      */
-    double indepVarsSelectionCriterionCuda(const std::vector<size_t>& indepVars);
+    arma::mat fitCoreSHatCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
 
 #endif
 
+#ifdef ENABLE_MPI
+    double indepVarsSelectionCriterionMpi(const std::vector<std::size_t>& indepVars);
+    double bandwidthSizeCriterionCVMpi(BandwidthWeight* bandwidthWeight);
+    double bandwidthSizeCriterionAICMpi(BandwidthWeight* bandwidthWeight);
+    arma::mat fitMpi();
+#endif // ENABLE_MPI
+
 public:     // Implement IParallelizable
     int parallelAbility() const override
     {
@@ -677,6 +690,14 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 #ifdef ENABLE_CUDA
             | ParallelType::CUDA
 #endif // ENABLE_CUDA
+#ifdef ENABLE_MPI
+#ifdef ENABLE_OPENMP
+            | ParallelType::OpenMP
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+            | ParallelType::CUDA
+#endif // ENABLE_CUDA
+#endif // ENABLE_MPI
         ;
     }
 
@@ -687,26 +708,13 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 public:     // Implement IGwmParallelOpenmpEnabled
     void setOmpThreadNum(const int threadNum) override { mOmpThreadNum = threadNum; }
     void setGPUId(const int gpuId) override { mGpuId = gpuId; };
-    void setGroupSize(const size_t size) override { mGroupLength = size; };
+    void setGroupSize(const std::size_t size) override { mGroupLength = size; };
+    int workerId() override { return mWorkerId; }
+    void setWorkerId(int id) override { mWorkerId = id; };
+    void setWorkerNum(int size) override { mWorkerNum = size; };
 
 protected:
 
-    /**
-     * \~english
-     * @brief Whether to store hat-matrix \f$S\f$.
-     * 
-     * @return true if store hat-matrix.
-     * @return false if not to store hat-matrix.
-     * 
-     * \~chinese
-     * @brief 是否保存帽子矩阵 \f$S\f$.
-     * 
-     * @return true 如果保存帽子矩阵。
-     * @return false 如果不保存帽子矩阵。
-     * 
-     */
-    bool isStoreS() { return mHasHatMatrix && (mCoords.n_rows < 8192); }
-
     /**
      * \~english
      * @brief Create distance parameters for prediction.
@@ -728,7 +736,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     
     bool mIsAutoselectIndepVars = false;    //!< \~english Whether to auto select variables. \~chinese 是否自动优选变量。
     double mIndepVarSelectionThreshold = 3.0;   //!< \~english The threshold for variable selection. \~chinese 变量优选的阈值。
-    IndepVarsSelectCriterionCalculator mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionSerial; //!< \~english Criterion calculator for variable selection. \~chinese 变量优选的指标计算函数。
+    IndepVarsSelectCriterionCalculator mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterion; //!< \~english Criterion calculator for variable selection. \~chinese 变量优选的指标计算函数。
     VariablesCriterionList mIndepVarsSelectionCriterionList;    //!< \~english Criterion list of each variable combination. \~chinese 每种变量组合对应的指标值。
     std::vector<std::size_t> mSelectedIndepVars;    //!< \~english Selected variables. \~chinese 优选得到的变量。
     std::size_t mIndepVarSelectionProgressTotal = 0; //!< \~english Total number of independent variable combination. \~chinese 自变量所有组合总数。
@@ -736,24 +744,34 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     bool mIsAutoselectBandwidth = false;    //!< \~english Whether to auto select bandwidth. \~chinese 是否自动优选带宽。
     BandwidthSelectionCriterionType mBandwidthSelectionCriterion = BandwidthSelectionCriterionType::AIC;    //!< \~english Type criterion for bandwidth selection. \~chinese 带宽优选的指标值类型。
-    BandwidthSelectionCriterionCalculator mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCVSerial; //!< \~english Criterion calculator for bandwidth selection. \~chinese 带宽优选的指标计算函数。
+    BandwidthSelectionCriterionCalculator mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCV; //!< \~english Criterion calculator for bandwidth selection. \~chinese 带宽优选的指标计算函数。
     BandwidthCriterionList mBandwidthSelectionCriterionList;    //!< \~english Criterion list of each bandwidth. \~chinese 每种带宽组合对应的指标值。
     double mBandwidthLastCriterion = DBL_MAX;   //!< \~english Last criterion for bandwidth selection. \~chinese 上一次带宽优选的有效指标值。
     std::optional<double> mGoldenUpperBounds;
     std::optional<double> mGoldenLowerBounds;
 
     PredictCalculator mPredictFunction = &GWRBasic::predictSerial;  //!< \~english Implementation of predict function. \~chinese 预测的具体实现函数。
-    FitCalculator mFitFunction = &GWRBasic::fitSerial;  //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。
+    FitCalculator mFitFunction = &GWRBasic::fitBase;  //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。
+    FitCoreCalculator mFitCoreFunction = &GWRBasic::fitCoreSerial;  //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。
+    FitCoreSHatCalculator mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;  //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。
+    FitCoreCVCalculator mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;  //!< \~english Implementation of fit function. \~chinese 拟合的具体实现函数。
 
     ParallelType mParallelType = ParallelType::SerialOnly;  //!< \~english Type of parallel method. \~chinese 并行方法类型。
     int mOmpThreadNum = 8;  //!< \~english Number of threads to create. \~chinese 并行计算创建的线程数。
     size_t mGroupLength = 64;   //!< \~english Size of a group computing together. \~chinese 同时计算的一组的大小。
     int mGpuId = 0; //!< \~english The ID of selected GPU. \~chinese 选择的 GPU 的 ID。
+    int mWorkerId = 0;
+    int mWorkerNum = 1;
+    arma::uword mWorkRangeSize = 0;
+    std::optional<std::pair<arma::uword, arma::uword>> mWorkRange;
 
     arma::mat mBetasSE;  //!< \~english Standard errors of coefficient estimates. \~chinese 回归系数估计值的标准差。
     arma::vec mSHat;  //!< \~english A vector of \f$tr(S)\f$ and \f$tr(SS^T)\f$. \~chinese 由 \f$tr(S)\f$ 和 \f$tr(SS^T)\f$ 组成的向量。
     arma::vec mQDiag;  //!< \~english The diagonal elements of matrix \f$Q\f$. \~chinese 矩阵 \f$Q\f$ 的对角线元素。
     arma::mat mS;  //!< \~english The hat-matrix \f$S\f$. \~chinese 帽子矩阵 \f$S\f$。
+    arma::cube mC;//!< \~english All \f$S\f$ matrices. \~chinese 所有 \f$C\f$ 矩阵。
+    bool mStoreS = false; //!< \~english Whether to save S \~chinese 是否保存 S 矩阵
+    bool mStoreC = false; //!< \~english Whether to save C \~chinese 是否保存 C 矩阵
 };
 
 }
diff --git a/include/gwmodelpp/GWRMultiscale.h b/include/gwmodelpp/GWRMultiscale.h
index 6bec0067..af2e77f8 100644
--- a/include/gwmodelpp/GWRMultiscale.h
+++ b/include/gwmodelpp/GWRMultiscale.h
@@ -32,7 +32,13 @@ namespace gwm
  * \~chinese
  * @brief 多尺度GWR算法
  */
-class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelectable, public IRegressionAnalysis, public IParallelizable, public IParallelOpenmpEnabled, public IParallelCudaEnabled
+class GWRMultiscale : public SpatialMultiscaleAlgorithm,
+    public IBandwidthSelectable,
+    public IRegressionAnalysis,
+    public IParallelizable,
+    public IParallelOpenmpEnabled,
+    public IParallelCudaEnabled,
+    public IParallelMpiEnabled
 {
 public:
 
@@ -83,10 +89,14 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
     static std::unordered_map<BackFittingCriterionType,std::string> BackFittingCriterionTypeNameMapper; //!< \~english A mapper from backfitting convergence criterion types to their names.  \~chinese 后向迭代算法收敛指标类型到其名称的映射表。
 
     typedef double (GWRMultiscale::*BandwidthSizeCriterionFunction)(BandwidthWeight*);  //!< \~english Function to calculate the criterion for given bandwidth size. \~chinese 根据指定带宽大小计算对应指标值的函数。
-    
-    typedef arma::mat (GWRMultiscale::*FitAllFunction)(const arma::mat&, const arma::vec&); //!< \~english Function to fit a model for all variables. \~chinese 根据所有变量拟合模型的函数。
-    
-    typedef arma::vec (GWRMultiscale::*FitVarFunction)(const arma::vec&, const arma::vec&, const arma::uword, arma::mat&);  //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。
+        
+    typedef arma::vec (GWRMultiscale::*FitVarFunction)(const size_t);  //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。
+        
+    typedef arma::vec (GWRMultiscale::*FitVarCoreFunction)(const arma::vec&, const arma::vec&, const SpatialWeight&, arma::mat&);  //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。
+        
+    typedef arma::vec (GWRMultiscale::*FitVarCoreCVFunction)(const arma::vec&, const arma::vec&, const SpatialWeight&);  //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。
+        
+    typedef arma::vec (GWRMultiscale::*FitVarCoreSHatFunction)(const arma::vec&, const arma::vec&, const SpatialWeight&, arma::vec&);  //!< \~english Function to fit a model for the given variable. \~chinese 根据给定变量拟合模型的函数。
     
 private:
 
@@ -539,21 +549,6 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
      * @return arma::mat 回归系数估计值 \f$\beta\f$。
      */
     const arma::mat& betas() const { return mBetas; }
-
-    /**
-     * \~english
-     * @brief Get criterion calculator function for optimize bandwidth size for all variables.
-     * 
-     * @param type The criterion type for optimize bandwidth size for all variables.
-     * @return BandwidthSizeCriterionFunction The criterion calculator for optimize bandwidth size for all variables.
-     * 
-     * \~chinese
-     * @brief 获取所有变量带宽优选的指标值计算函数。
-     * 
-     * @param type 所有变量带宽优选的指标值类型。
-     * @return BandwidthSizeCriterionFunction 所有变量带宽优选的指标值计算函数。
-     */
-    BandwidthSizeCriterionFunction bandwidthSizeCriterionAll(BandwidthSelectionCriterionType type);
     
     /**
      * \~english
@@ -625,7 +620,10 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
 
 public:     // IParallelCudaEnabled interface
     void setGPUId(const int gpuId) override { mGpuId = gpuId; }
-    void setGroupSize(const size_t size) override { mGroupLength = size; }
+    void setGroupSize(const std::size_t size) override { mGroupLength = size; }
+    int workerId() override { return mWorkerId; }
+    void setWorkerId(int id) override { mWorkerId = id; };
+    void setWorkerNum(int size) override { mWorkerNum = size; };
 
 protected:
 
@@ -662,76 +660,10 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
      * @param y 因变量 \f$y\f$。
      * @return arma::mat 回归系数估计值 \f$\beta\f$。
      */
-    arma::mat backfitting(const arma::mat &x, const arma::vec &y);
-
-    /**
-     * \~english
-     * @brief The serial implementation of fit function for all variables.
-     * 
-     * @param x Independent variables \f$X\f$.
-     * @param y Dependent variable \f$y\f$.
-     * @return arma::mat Coefficient estimates \f$\beta\f$.
-     * 
-     * \~chinese
-     * @brief 拟合所有变量的非并行实现。
-     * 
-     * @param x 自变量矩阵 \f$X\f$。
-     * @param y 因变量 \f$y\f$。
-     * @return arma::mat 回归系数估计值 \f$\beta\f$。
-     */
-    arma::mat fitAllSerial(const arma::mat& x, const arma::vec& y);
-
-    /**
-     * \~english
-     * @brief The serial implementation of fit function for one variable.
-     * 
-     * @param x Independent variables \f$X\f$.
-     * @param y Dependent variable \f$y\f$.
-     * @param var The index of this variable.
-     * @param S The hat matrix \f$S\f$.
-     * @return arma::vec The coefficient estimates corresponding to this variable.
-     * 
-     * \~chinese
-     * @brief 拟合单个变量的非并行实现。
-     * 
-     * @param x 自变量矩阵 \f$X\f$。
-     * @param y 因变量 \f$y\f$。
-     * @param var 当前变量的索引值。
-     * @param S 帽子矩阵 \f$S\f$
-     * @return arma::vec 该变量对应的回归系数估计值。
-     */
-    arma::vec fitVarSerial(const arma::vec& x, const arma::vec& y, const arma::uword var, arma::mat& S);
-
-    /**
-     * \~english
-     * @brief The serial implementation of CV criterion calculator for given bandwidth size and all variables.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double CV criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和所有变量计算CV指标值函数的非并行实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double CV指标值。
-     */
-    double bandwidthSizeCriterionAllCVSerial(BandwidthWeight* bandwidthWeight);
-
-    /**
-     * \~english
-     * @brief The serial implementation of AIC criterion calculator for given bandwidth size and all variables.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double AIC criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和所有变量计算AIC指标值函数的非并行实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double AIC指标值。
-     */
-    double bandwidthSizeCriterionAllAICSerial(BandwidthWeight* bandwidthWeight);
+    arma::mat backfitting(const arma::mat& betas0);
 
+    arma::vec fitVarBase(const size_t var);
+    
     /**
      * \~english
      * @brief The serial implementation of CV criterion calculator for given bandwidth size and one variable.
@@ -745,8 +677,8 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
      * @param bandwidthWeight 带宽值。
      * @return double CV指标值。
      */
-    double bandwidthSizeCriterionVarCVSerial(BandwidthWeight* bandwidthWeight);
-
+    double bandwidthSizeCriterionVarCVBase(BandwidthWeight* bandwidthWeight);
+    
     /**
      * \~english
      * @brief The serial implementation of AIC criterion calculator for given bandwidth size and one variable.
@@ -760,29 +692,11 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
      * @param bandwidthWeight 带宽值。
      * @return double AIC指标值。
      */
-    double bandwidthSizeCriterionVarAICSerial(BandwidthWeight* bandwidthWeight);
+    double bandwidthSizeCriterionVarAICBase(BandwidthWeight* bandwidthWeight);
 
-#ifdef ENABLE_OPENMP
-    /**
-     * \~english
-     * @brief The openmp parallel implementation of fit function for all variables.
-     * 
-     * @param x Independent variables \f$X\f$.
-     * @param y Dependent variable \f$y\f$.
-     * @return arma::mat Coefficient estimates \f$\beta\f$.
-     * 
-     * \~chinese
-     * @brief 拟合所有变量的多线程实现。
-     * 
-     * @param x 自变量矩阵 \f$X\f$。
-     * @param y 因变量 \f$y\f$。
-     * @return arma::mat 回归系数估计值 \f$\beta\f$。
-     */
-    arma::mat fitAllOmp(const arma::mat& x, const arma::vec& y);
-    
     /**
      * \~english
-     * @brief The openmp parallel implementation of fit function for one variable.
+     * @brief The serial implementation of fit function for one variable.
      * 
      * @param x Independent variables \f$X\f$.
      * @param y Dependent variable \f$y\f$.
@@ -791,7 +705,7 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
      * @return arma::vec The coefficient estimates corresponding to this variable.
      * 
      * \~chinese
-     * @brief 拟合单个变量的多线程实现。
+     * @brief 拟合单个变量的非并行实现。
      * 
      * @param x 自变量矩阵 \f$X\f$。
      * @param y 因变量 \f$y\f$。
@@ -799,88 +713,22 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
      * @param S 帽子矩阵 \f$S\f$
      * @return arma::vec 该变量对应的回归系数估计值。
      */
-    arma::vec fitVarOmp(const arma::vec& x, const arma::vec& y, const arma::uword var, arma::mat& S);
-    
-    /**
-     * \~english
-     * @brief The openmp parallel implementation of CV criterion calculator for given bandwidth size and all variables.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double CV criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和所有变量计算CV指标值函数的多线程实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double CV指标值。
-     */
-    double bandwidthSizeCriterionAllCVOmp(BandwidthWeight* bandwidthWeight);
+    arma::vec fitVarCoreSerial(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::mat& S);
 
-    /**
-     * \~english
-     * @brief The openmp parallel implementation of AIC criterion calculator for given bandwidth size and all variables.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double AIC criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和所有变量计算AIC指标值函数的多线程实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double AIC指标值。
-     */
-    double bandwidthSizeCriterionAllAICOmp(BandwidthWeight* bandwidthWeight);
+    arma::vec fitVarCoreCVSerial(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw);
 
-    /**
-     * \~english
-     * @brief The openmp parallel implementation of CV criterion calculator for given bandwidth size and one variable.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double CV criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和某个变量计算CV指标值函数的多线程实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double CV指标值。
-     */
-    double bandwidthSizeCriterionVarCVOmp(BandwidthWeight* bandwidthWeight);
+    arma::vec fitVarCoreSHatSerial(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
 
-    /**
-     * \~english
-     * @brief The openmp parallel implementation of AIC criterion calculator for given bandwidth size and one variable.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double AIC criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和某个变量计算AIC指标值函数的多线程实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double AIC指标值。
-     */
-    double bandwidthSizeCriterionVarAICOmp(BandwidthWeight* bandwidthWeight);
+#ifdef ENABLE_OPENMP
+    arma::vec fitVarCoreOmp(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::mat& S);
+
+    arma::vec fitVarCoreCVOmp(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw);
+
+    arma::vec fitVarCoreSHatOmp(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
 #endif
 
 #ifdef ENABLE_CUDA
     
-    /**
-     * \~english
-     * @brief The CUDA implementation of fit function for all variables.
-     * 
-     * @param x Independent variables \f$X\f$.
-     * @param y Dependent variable \f$y\f$.
-     * @return arma::mat Coefficient estimates \f$\beta\f$.
-     * 
-     * \~chinese
-     * @brief 拟合所有变量的CUDA实现。
-     * 
-     * @param x 自变量矩阵 \f$X\f$。
-     * @param y 因变量 \f$y\f$。
-     * @return arma::mat 回归系数估计值 \f$\beta\f$。
-     */
-    arma::mat fitAllCuda(const arma::mat& x, const arma::vec& y);
-    
     /**
      * \~english
      * @brief The CUDA implementation of fit function for one variable.
@@ -900,90 +748,34 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
      * @param S 帽子矩阵 \f$S\f$
      * @return arma::vec 该变量对应的回归系数估计值。
      */
-    arma::vec fitVarCuda(const arma::vec& x, const arma::vec& y, const arma::uword var, arma::mat& S);
-    
-    /**
-     * \~english
-     * @brief The CUDA implementation of CV criterion calculator for given bandwidth size and all variables.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double CV criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和所有变量计算CV指标值函数的CUDA实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double CV指标值。
-     */
-    double bandwidthSizeCriterionAllCVCuda(BandwidthWeight* bandwidthWeight);
+    arma::vec fitVarCoreCuda(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::mat& S);
     
-    /**
-     * \~english
-     * @brief The CUDA implementation of AIC criterion calculator for given bandwidth size and all variables.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double AIC criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和所有变量计算AIC指标值函数的CUDA实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double AIC指标值。
-     */
-    double bandwidthSizeCriterionAllAICCuda(BandwidthWeight* bandwidthWeight);
-    
-    /**
-     * \~english
-     * @brief The CUDA implementation of CV criterion calculator for given bandwidth size and one variable.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double CV criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和某个变量计算CV指标值函数的CUDA实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double CV指标值。
-     */
-    double bandwidthSizeCriterionVarCVCuda(BandwidthWeight* bandwidthWeight);
+    arma::vec fitVarCoreCVCuda(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw);
     
-    /**
-     * \~english
-     * @brief The CUDA implementation of AIC criterion calculator for given bandwidth size and one variable.
-     * 
-     * @param bandwidthWeight Badwidth weight.
-     * @return double AIC criterion value.
-     * 
-     * \~chinese
-     * @brief 为指定带宽值和某个变量计算AIC指标值函数的CUDA实现。
-     * 
-     * @param bandwidthWeight 带宽值。
-     * @return double AIC指标值。
-     */
-    double bandwidthSizeCriterionVarAICCuda(BandwidthWeight* bandwidthWeight);
+    arma::vec fitVarCoreSHatCuda(const arma::vec& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
 
 #endif // ENABLE_CUDA
 
-    /**
-     * \~english
-     * @brief Create a Initial Distance Parameter object
-     * 
-     * \~chinese
-     * @brief 创建初始距离参数对象。
-     */
-    void createInitialDistanceParameter();
+#ifdef ENABLE_MPI
+    arma::vec fitVarMpi(const size_t var);
+    double bandwidthSizeCriterionVarCVMpi(BandwidthWeight* bandwidthWeight);
+    double bandwidthSizeCriterionVarAICMpi(BandwidthWeight* bandwidthWeight);
+#endif // ENABLE_MPI
 
 private:
-    FitAllFunction mFitAll = &GWRMultiscale::fitAllSerial;  //!< \~english Calculator to fit a model for all variables. \~chinese 为所有变量拟合模型的函数。
-    FitVarFunction mFitVar = &GWRMultiscale::fitVarSerial;  //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。
+    FitVarFunction mFitVar = &GWRMultiscale::fitVarBase;  //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。
+    FitVarCoreFunction mFitVarCore = &GWRMultiscale::fitVarCoreSerial;  //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。
+    FitVarCoreCVFunction mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVSerial;  //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。
+    FitVarCoreSHatFunction mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatSerial;  //!< \~english Calculator to fit a model for one variable. \~chinese 为单一变量拟合模型的函数。
 
     SpatialWeight mInitSpatialWeight;   //!< \~english Spatial weighting sheme for initializing bandwidth. \~chinese 计算初始带宽值时所用的空间权重配置。
-    BandwidthSizeCriterionFunction mBandwidthSizeCriterion = &GWRMultiscale::bandwidthSizeCriterionAllCVSerial; //!< \~english The criterion calculator for given bandwidth size. \~chinese 根据指定带宽值计算指标值的函数。
+    BandwidthSizeCriterionFunction mBandwidthSizeCriterion = &GWRMultiscale::bandwidthSizeCriterionVarCVBase; //!< \~english The criterion calculator for given bandwidth size. \~chinese 根据指定带宽值计算指标值的函数。
     size_t mBandwidthSelectionCurrentIndex = 0; //!< \~english The index of variable which currently the algorithm select bandwidth for. \~chinese 当前正在选带宽的变量索引值。
     double mBandwidthLastCriterion = DBL_MAX;   //!< \~english Last criterion for bandwidth selection. \~chinese 上一次带宽优选的有效指标值。
     std::optional<double> mGoldenUpperBounds;
     std::optional<double> mGoldenLowerBounds;
     std::vector<double> mMaxDistances;
+    std::vector<double> mMinDistances;
 
     std::vector<BandwidthInitilizeType> mBandwidthInitilize;    //!< \~english Type of bandwidth initilization values. \~chinese 带宽初始值类型。
     std::vector<BandwidthSelectionCriterionType> mBandwidthSelectionApproach;   //!< \~english Type of bandwidth selection approach. \~chinese 带宽选择方法类型。
@@ -1020,6 +812,12 @@ class GWRMultiscale : public SpatialMultiscaleAlgorithm, public IBandwidthSelect
     int mOmpThreadNum = 8;  //!< \~english Number of threads. \~chinese 并行线程数。
     size_t mGroupLength = 64;   //!< \~english Size of a group computing together. \~chinese 同时计算的一组的大小。
     int mGpuId = 0; //!< \~english The ID of selected GPU. \~chinese 选择的 GPU 的 ID。
+    int mWorkerId = 0;
+    int mWorkerNum = 1;
+#ifdef ENABLE_MPI
+    arma::uword mWorkRangeSize = 0;
+#endif // ENABLE_MPI
+    std::optional<std::pair<arma::uword, arma::uword>> mWorkRange;
 
 public:
     static int treeChildCount;  //!< \~english  \~chinese
diff --git a/include/gwmodelpp/IParallelizable.h b/include/gwmodelpp/IParallelizable.h
index ec5a4fff..19158bd8 100644
--- a/include/gwmodelpp/IParallelizable.h
+++ b/include/gwmodelpp/IParallelizable.h
@@ -16,7 +16,11 @@ enum ParallelType
 {
     SerialOnly = 1 << 0,    //!< \~english Use no parallel methods. \~chinese 不并行。
     OpenMP = 1 << 1,        //!< \~english Use multithread methods. \~chinese 多线程并行。
-    CUDA = 1 << 2           //!< \~english Use CUDA accelerated methods. \~chinese CUDA加速。
+    CUDA = 1 << 2,          //!< \~english Use CUDA accelerated methods. \~chinese CUDA加速。
+    MPI = (1 << 3),
+    MPI_Serial = (1 << 3) | (1 << 0),
+    MPI_MP = (1 << 3) | (1 << 1),
+    MPI_CUDA = (1 << 3) | (1 << 2)
 };
 
 /**
@@ -137,10 +141,23 @@ struct IParallelCudaEnabled
      * 对于大多数 GPU 可选择值 64。
      * 
      */
-    virtual void setGroupSize(const size_t size) = 0;
+    virtual void setGroupSize(const std::size_t size) = 0;
     
 };
 
+struct IParallelMpiEnabled
+{
+    virtual int workerId() = 0;
+    virtual void setWorkerId(int id) = 0;
+    virtual void setWorkerNum(int size) = 0;
+};
+
+#define GWM_MPI_MASTER_BEGIN if (workerId() == 0) {
+#define GWM_MPI_MASTER_END }
+#define GWM_MPI_WORKER_BEGIN if (workerId() != 0) {
+#define GWM_MPI_WORKER_END }
+#define GWM_MPI_MASTER_WORKER_SWITCH } else {
+
 }
 
 #endif  // IPARALLELIZABLE_H
\ No newline at end of file
diff --git a/include/gwmodelpp/spatialweight/CRSDistance.h b/include/gwmodelpp/spatialweight/CRSDistance.h
index 3f953bfb..461f0ead 100644
--- a/include/gwmodelpp/spatialweight/CRSDistance.h
+++ b/include/gwmodelpp/spatialweight/CRSDistance.h
@@ -4,6 +4,7 @@
 #include "Distance.h"
 
 #ifdef ENABLE_CUDA
+#include "gwmodelpp/utils/cumat.hpp"
 #include "gwmodelpp/spatialweight/cuda/CRSDistanceKernel.h"
 #include "gwmodelpp/spatialweight/cuda/ISpatialCudaEnabled.h"
 #endif // ENABLE_CUDA
@@ -146,13 +147,6 @@ class CRSDistance : public Distance
 
     virtual ~CRSDistance()
     {
-#ifdef ENABLE_CUDA
-        if (mCudaPrepared)
-        {
-            cudaFree(mCudaDp);
-            cudaFree(mCudaFp);
-        }
-#endif
     }
 
     virtual Distance * clone() const override
@@ -222,8 +216,8 @@ class CRSDistance : public Distance
     CalculatorType mCalculator = &EuclideanDistance;  //!< \~english Calculator \~chinese 距离计算方法
 
 #ifdef ENABLE_CUDA
-    double* mCudaDp = 0;    //!< \~english Device pointer to data points \~chinese 指向数据点的设备指针
-    double* mCudaFp = 0;    //!< \~english Device pointer to focus points \~chinese 指向关注点的设备指针
+    cumat mCudaDp;    //!< \~english Device pointer to data points \~chinese 指向数据点的设备指针
+    cumat mCudaFp;    //!< \~english Device pointer to focus points \~chinese 指向关注点的设备指针
     CalculatorCudaType mCalculatorCuda = &eu_dist_cuda;  //!< \~english CUDA based Calculator \~chinese 基于 CUDA 的距离计算方法
 #endif
 
diff --git a/include/gwmodelpp/spatialweight/SpatialWeight.h b/include/gwmodelpp/spatialweight/SpatialWeight.h
index 8bd8901c..5a35e69c 100644
--- a/include/gwmodelpp/spatialweight/SpatialWeight.h
+++ b/include/gwmodelpp/spatialweight/SpatialWeight.h
@@ -377,7 +377,7 @@ class SpatialWeight
      * @param focus 当前样本的索引值。
      * @return vec 当前样本到其他所有样本的空间权重向量。
      */
-    virtual arma::vec weightVector(arma::uword focus)
+    virtual arma::vec weightVector(arma::uword focus) const
     {
         return mWeight->weight(mDistance->distance(focus));
     }
@@ -412,7 +412,7 @@ class SpatialWeight
      * @param d_weights \~english Device pointer to distances \~chinese 指向输出权重的设备指针
      * @return cudaError_t \~english CUDA error or success \~chinese CUDA 错误或成功 
      */
-    virtual cudaError_t weightVector(arma::uword focus, double* d_dists, double* d_weights)
+    virtual cudaError_t weightVector(arma::uword focus, double* d_dists, double* d_weights) const
     {
         cudaError_t error;
         size_t elems = 0;
diff --git a/include/gwmodelpp/utils/armampi.h b/include/gwmodelpp/utils/armampi.h
new file mode 100644
index 00000000..d1866a0a
--- /dev/null
+++ b/include/gwmodelpp/utils/armampi.h
@@ -0,0 +1,10 @@
+#include "armadillo_config.h"
+#include "mpi.h"
+
+#ifdef ARMA_32BIT_WORD
+#define GWM_MPI_UWORD MPI_UNSIGNED_LONG
+#else   // ARMA_32BIT_WORD
+#define GWM_MPI_UWORD MPI_UNSIGNED_LONG_LONG
+#endif  // ARMA_32BIT_WORD
+
+void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np, const size_t range);
diff --git a/include/gwmodelpp/utils/cumat.hpp b/include/gwmodelpp/utils/cumat.hpp
index 9fc51cb2..3e8af06e 100644
--- a/include/gwmodelpp/utils/cumat.hpp
+++ b/include/gwmodelpp/utils/cumat.hpp
@@ -47,6 +47,21 @@ class cubase
 {
 public:
     static cublasHandle_t handle;   //!< Save handle for cublas
+    static auto create_handle()
+    {
+        if (handle == nullptr) return cublasCreate(&handle);
+        else return CUBLAS_STATUS_SUCCESS;
+    }
+    static auto destory_handle()
+    {
+        if (handle != nullptr)
+        {
+            auto error = cublasDestroy(handle);
+            handle = nullptr;
+            return error;
+        }
+        else return CUBLAS_STATUS_SUCCESS;
+    }
     constexpr static const double alpha1 = 1.0;
     constexpr static const double beta0 = 0.0;
     constexpr static const double beta1 = 1.0;
@@ -245,6 +260,32 @@ class cumat : public cubase
      */
     const cuop_trans<cumat> t() const;
 
+    void resize(size_t rows, size_t cols)
+    {
+        if (dMem != nullptr && mIsRelease)
+        {
+            cudaFree(dMem);
+        }
+        cudaMalloc(&dMem, sizeof(double) * rows * cols);
+    }
+
+    cumat& operator=(const cumat& right)
+    {
+        resize(right.mRows, right.mCols);
+        cudaMemcpy(dMem, right.dMem, nbytes(), cudaMemcpyDeviceToDevice);
+        return *this;
+    }
+
+    cumat& operator=(cumat&& right)
+    {
+        mRows = right.mRows;
+        mCols = right.mCols;
+        dMem = right.dMem;
+        mIsRelease = true;
+        right.mIsRelease = false;
+        return *this;
+    }
+
     cumat& operator=(const cuop_trans<cumat>& right);
 
     template<class L, class R>
@@ -282,6 +323,8 @@ class cumat : public cubase
     size_t mCols = 0;
 };
 
+void print(const cumat& mat);
+
 /**
  * @brief \~english Strided matrix. \~chinese 条带矩阵。
  * 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e93a8ce1..c4cdb921 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -15,6 +15,15 @@ if(OpenMP_FOUND AND OpenMP_C_FOUND AND OpenMP_CXX_FOUND)
 endif(OpenMP_FOUND AND OpenMP_C_FOUND AND OpenMP_CXX_FOUND)
 endif()
 
+if(ENABLE_MPI)
+    find_package(MPI REQUIRED)
+    add_definitions(-DENABLE_MPI)
+    include_directories(${MPI_CXX_HEADER_DIR})
+    add_link_options(${MPI_CXX_LINK_FLAGS})
+    add_compile_options(${MPI_CXX_COMPILE_OPTIONS})
+    add_definitions(${MPI_CXX_COMPILE_DEFINITIONS})
+endif(ENABLE_MPI)
+
 find_package(GSL REQUIRED)
 if(GSL_FOUND)
     include_directories(${GSL_INCLUDE_DIRS})
@@ -137,6 +146,17 @@ if(ENABLE_CUDA)
     list(PREPEND SOURCES_ALL ${SOURCES_CUDA})
 endif(ENABLE_CUDA)
 
+if(ENABLE_MPI)
+    set(HEADERS_MPI
+        ../include/gwmodelpp/utils/armampi.h
+    )
+    set(SOURCES_MPI
+        gwmodelpp/utils/armampi.cpp
+    )
+    list(PREPEND HEADERS_ALL ${HEADERS_MPI})
+    list(PREPEND SOURCES_ALL ${SOURCES_MPI})
+endif(ENABLE_MPI)
+
 add_library(gwmodel STATIC ${HEADERS_ALL} ${SOURCES_ALL})
 set_property(TARGET gwmodel PROPERTY POSITION_INDEPENDENT_CODE ON)
 
@@ -194,6 +214,12 @@ if(OpenMP_FOUND)
     )
 endif(OpenMP_FOUND)
 
+if(ENABLE_MPI AND MPI_FOUND)
+    target_link_libraries(gwmodel
+        ${MPI_CXX_LIBRARIES}
+    )
+endif()
+
 if(USE_CUDA_SHARED)
     set(HEADERS_CUDA_SHARED
         ../include/gwmodelcuda/StdTelegram.h
diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 28c34649..3c300b23 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -14,10 +14,36 @@
 #include "cumat.hpp"
 #endif
 
+#ifdef ENABLE_MPI
+#include "mpi.h"
+#include "gwmodelpp/utils/armampi.h"
+#endif
+
 using namespace std;
 using namespace arma;
 using namespace gwm;
 
+enum class GWRBasicFitMpiTags
+{
+    Betas = 1 << 0,
+    BetasSE,
+    SHat,
+    QDiag,
+    SMat,
+    Finished
+};
+
+enum class GWRBasicAICMpiTags
+{
+    SHat = 1 << 4,
+    Betas
+};
+
+enum class GWRBasicCVMpiTags
+{
+    Betas = 1 << 8
+};
+
 RegressionDiagnostic GWRBasic::CalcDiagnostic(const mat& x, const vec& y, const mat& betas, const vec& shat)
 {
     vec r = y - sum(betas % x, 1);
@@ -37,11 +63,47 @@ mat GWRBasic::fit()
 {
     GWM_LOG_STAGE("Initializing");
     uword nDp = mCoords.n_rows, nVars = mX.n_cols;
+#ifdef ENABLE_MPI
+    // Sync x, y, coords with other processes
+    if (mParallelType & ParallelType::MPI)
+    {
+        uword aShape[4];
+        if (mWorkerId == 0) // master process
+        {
+            // sync matrix dimensions
+            uword nDim = mCoords.n_cols;
+            mWorkRangeSize = nDp / uword(mWorkerNum) + ((nDp % uword(mWorkerNum) == 0) ? 0 : 1);
+            aShape[0] = nDp;
+            aShape[1] = nVars;
+            aShape[2] = nDim;
+            aShape[3] = mWorkRangeSize;
+            // cout << mWorkerId << " process send size: [" << aShape[0] << "," << aShape[1] << "," << aShape[2] << "," << aShape[3] << "]\n";
+        }
+        MPI_Bcast(aShape, 4, GWM_MPI_UWORD, 0, MPI_COMM_WORLD);
+        if (mWorkerId > 0)                // slave process
+        {
+            // cout << mWorkerId << " process recv size: [" << aShape[0] << "," << aShape[1] << "," << aShape[2] << "," << aShape[3] << "]\n";
+            nDp = aShape[0];
+            nVars = aShape[1];
+            mX.resize(nDp, nVars);
+            mY.resize(nDp);
+            mCoords.resize(aShape[0], aShape[2]);
+            mWorkRangeSize = aShape[3];
+        }
+        MPI_Bcast(mX.memptr(), mX.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        MPI_Bcast(mY.memptr(), mY.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        MPI_Bcast(mCoords.memptr(), mCoords.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        uword workRangeFrom = uword(mWorkerId) * mWorkRangeSize;
+        // cout << mWorkerId << " process work range: [" << workRangeFrom << "," << min(workRangeFrom + mWorkRangeSize, nDp) << "]\n";
+        mWorkRange = make_pair(workRangeFrom, min(workRangeFrom + mWorkRangeSize, nDp));
+        // cout << mWorkerId << " process work range: [" << mWorkRange.value().first << "," << mWorkRange.value().second << "]\n";
+    }
+#endif // ENABLE_MPI
     createDistanceParameter();
 #ifdef ENABLE_CUDA
-    if (mParallelType == ParallelType::CUDA)
+    if (mParallelType & ParallelType::CUDA)
     {
-        cublasCreate(&cubase::handle);
+        cubase::create_handle();
         mSpatialWeight.prepareCuda(mGpuId);
     }
 #endif // ENABLE_CUDA
@@ -58,16 +120,15 @@ mat GWRBasic::fit()
         size_t k = indep_vars.size();
         mIndepVarSelectionProgressTotal = (k + 1) * k / 2;
         mIndepVarSelectionProgressCurrent = 0;
-
         GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion());
         VariableForwardSelector selector(indep_vars, mIndepVarSelectionThreshold);
         mSelectedIndepVars = selector.optimize(this);
         if (mSelectedIndepVars.size() > 0)
         {
             mX = mX.cols(VariableForwardSelector::index2uvec(mSelectedIndepVars, mHasIntercept));
-            nVars = mX.n_cols;
             mIndepVarsSelectionCriterionList = selector.indepVarsCriterion();
         }
+        nVars = mX.n_cols;
         GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVars, arma::fill::zeros));
     }
 
@@ -85,7 +146,7 @@ mat GWRBasic::fit()
         {
             mSpatialWeight.setWeight(bw);
 #ifdef ENABLE_CUDA
-            if (mParallelType == ParallelType::CUDA)
+            if (mParallelType & ParallelType::CUDA)
             {
                 mSpatialWeight.prepareCuda(mGpuId);
             }
@@ -96,34 +157,15 @@ mat GWRBasic::fit()
     }
 
     GWM_LOG_STAGE("Model fitting");
-    mBetas = (this->*mFitFunction)(mX, mY, mBetasSE, mSHat, mQDiag, mS);
-    GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVars, arma::fill::zeros));
-
-    GWM_LOG_STAGE("Model Diagnostic");
-    mDiagnostic = CalcDiagnostic(mX, mY, mBetas, mSHat);
-    double trS = mSHat(0), trStS = mSHat(1);
-    double sigmaHat = mDiagnostic.RSS / (nDp - 2 * trS + trStS);
-    mBetasSE = sqrt(sigmaHat * mBetasSE);
-    vec yhat = Fitted(mX, mBetas);
-    vec res = mY - yhat;
-    vec stu_res = res / sqrt(sigmaHat * mQDiag);
-    mat betasTV = mBetas / mBetasSE;
-    vec dybar2 = (mY - mean(mY)) % (mY - mean(mY));
-    vec dyhat2 = (mY - yhat) % (mY - yhat);
-    vec localR2 = vec(nDp, fill::zeros);
-    for (uword i = 0; i < nDp; i++)
-    {
-        vec w = mSpatialWeight.weightVector(i);
-        double tss = sum(dybar2 % w);
-        double rss = sum(dyhat2 % w);
-        localR2(i) = (tss - rss) / tss;
-    }
+    mBetas = (this->*mFitFunction)();
+    
 #ifdef ENABLE_CUDA
-    if (mParallelType == ParallelType::CUDA)
+    if (mParallelType & ParallelType::CUDA)
     {
-        cublasDestroy(cubase::handle);
+        cubase::destory_handle();
     }
 #endif // ENABLE_CUDA
+
     return mBetas;
 }
 
@@ -133,9 +175,9 @@ mat GWRBasic::predict(const mat& locations)
     uword nDp = mCoords.n_rows, nVars = mX.n_cols;
     createPredictionDistanceParameter(locations);
 #ifdef ENABLE_CUDA
-    if (mParallelType == ParallelType::CUDA)
+    if (mParallelType & ParallelType::CUDA)
     {
-        cublasCreate(&cubase::handle);
+        cubase::create_handle();
         mSpatialWeight.prepareCuda(mGpuId);
     }
 #endif // ENABLE_CUDA
@@ -146,9 +188,9 @@ mat GWRBasic::predict(const mat& locations)
     GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVars, arma::fill::zeros));
 
 #ifdef ENABLE_CUDA
-    if (mParallelType == ParallelType::CUDA)
+    if (mParallelType & ParallelType::CUDA)
     {
-        cublasDestroy(cubase::handle);
+        cubase::destory_handle();
     }
 #endif // ENABLE_CUDA
 
@@ -190,15 +232,30 @@ mat GWRBasic::predictSerial(const mat& locations, const mat& x, const vec& y)
     return betas.t();
 }
 
-mat GWRBasic::fitSerial(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& qDiag, mat& S)
+arma::mat gwm::GWRBasic::fitBase()
+{
+    mBetas = (this->*mFitCoreFunction)(mX, mY, mSpatialWeight);
+    mDiagnostic = CalcDiagnostic(mX, mY, mBetas, mSHat);
+    double trS = mSHat(0), trStS = mSHat(1);
+    double nDp = double(mCoords.n_rows);
+    double sigmaHat = mDiagnostic.RSS / (nDp - 2 * trS + trStS);
+    mBetasSE = sqrt(sigmaHat * mBetasSE);
+    return mBetas;
+}
+
+mat GWRBasic::fitCoreSerial(const mat &x, const vec &y, const SpatialWeight &sw)
 {
     uword nDp = mCoords.n_rows, nVar = x.n_cols;
     mat betas(nVar, nDp, fill::zeros);
-    betasSE = mat(nVar, nDp, fill::zeros);
-    shat = vec(2, fill::zeros);
-    qDiag = vec(nDp, fill::zeros);
-    S = mat(isStoreS() ? nDp : 1, nDp, fill::zeros);
-    for (uword i = 0; i < nDp; i++)
+    mBetasSE = mat(nVar, nDp, fill::zeros);
+    mSHat = vec(2, fill::zeros);
+    mQDiag = vec(nDp, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    mS = mat(isStoreS() ? rangeSize : 1, nDp, fill::zeros);
+    mC = cube(nVar, nDp, isStoreC() ? rangeSize : 1, fill::zeros);
+    // cout << mWorkerId << " process work range: [" << workRange.first << "," << workRange.second << "]\n";
+    for (uword i = workRange.first; i < workRange.second; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
         vec w = mSpatialWeight.weightVector(i);
@@ -210,14 +267,15 @@ mat GWRBasic::fitSerial(const mat& x, const vec& y, mat& betasSE, vec& shat, vec
             mat xtwx_inv = inv_sympd(xtwx);
             betas.col(i) = xtwx_inv * xtwy;
             mat ci = xtwx_inv * xtw;
-            betasSE.col(i) = sum(ci % ci, 1);
+            mBetasSE.col(i) = sum(ci % ci, 1);
             mat si = x.row(i) * ci;
-            shat(0) += si(0, i);
-            shat(1) += det(si * si.t());
+            mSHat(0) += si(0, i);
+            mSHat(1) += det(si * si.t());
             vec p = - si.t();
             p(i) += 1.0;
-            qDiag += p % p;
-            S.row(isStoreS() ? i : 0) = si;
+            mQDiag += p % p;
+            mS.row(isStoreS() ? (i - workRange.first) : 0) = si;
+            mC.slice(isStoreC() ? (i - workRange.first) : 0) = ci;
         }
         catch (const exception& e)
         {
@@ -226,123 +284,137 @@ mat GWRBasic::fitSerial(const mat& x, const vec& y, mat& betasSE, vec& shat, vec
         }
         GWM_LOG_PROGRESS(i + 1, nDp);
     }
-    betasSE = betasSE.t();
+    mBetasSE = mBetasSE.t();
     return betas.t();
 }
 
-double GWRBasic::bandwidthSizeCriterionCVSerial(BandwidthWeight* bandwidthWeight)
+mat GWRBasic::fitCoreCVSerial(const mat& x, const vec& y, const SpatialWeight& sw)
 {
-    uword nDp = mCoords.n_rows;
-    vec shat(2, fill::zeros);
-    double cv = 0.0;
-    for (uword i = 0; i < nDp; i++)
+    uword nDp = mCoords.n_rows, nVar = x.n_cols;
+    mat betas(nVar, nDp, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (uword i = workRange.first; i < workRange.second; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        vec d = mSpatialWeight.distance()->distance(i);
-        vec w = bandwidthWeight->weight(d);
+        vec w = sw.weightVector(i);
         w(i) = 0.0;
-        mat xtw = trans(mX.each_col() % w);
-        mat xtwx = xtw * mX;
-        mat xtwy = xtw * mY;
+        mat xtw = trans(x.each_col() % w);
+        mat xtwx = xtw * x;
+        mat xtwy = xtw * y;
         try
         {
             mat xtwx_inv = inv_sympd(xtwx);
-            vec beta = xtwx_inv * xtwy;
-            double res = mY(i) - det(mX.row(i) * beta);
-            cv += res * res;
+            betas.col(i) = xtwx_inv * xtwy;
         }
         catch (const exception& e)
         {
             GWM_LOG_ERROR(e.what());
-            return DBL_MAX;
+            throw e;
         }
     }
-    if (mStatus == Status::Success && isfinite(cv))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
+    return betas.t();
 }
 
-double GWRBasic::bandwidthSizeCriterionAICSerial(BandwidthWeight* bandwidthWeight)
+mat GWRBasic::fitCoreSHatSerial(const mat& x, const vec& y, const SpatialWeight& sw, vec& shat)
 {
-    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
+    uword nDp = mCoords.n_rows, nVar = x.n_cols;
     mat betas(nVar, nDp, fill::zeros);
-    vec shat(2, fill::zeros);
-    for (uword i = 0; i < nDp; i++)
+    shat = vec(2, arma::fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (uword i = workRange.first; i < workRange.second; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        vec d = mSpatialWeight.distance()->distance(i);
-        vec w = bandwidthWeight->weight(d);
-        mat xtw = trans(mX.each_col() % w);
-        mat xtwx = xtw * mX;
-        mat xtwy = xtw * mY;
+        vec w = sw.weightVector(i);
+        mat xtw = trans(x.each_col() % w);
+        mat xtwx = xtw * x;
+        mat xtwy = xtw * y;
         try
         {
             mat xtwx_inv = inv_sympd(xtwx);
             betas.col(i) = xtwx_inv * xtwy;
             mat ci = xtwx_inv * xtw;
-            mat si = mX.row(i) * ci;
+            mat si = x.row(i) * ci;
             shat(0) += si(0, i);
             shat(1) += det(si * si.t());
         }
         catch (const exception& e)
         {
             GWM_LOG_ERROR(e.what());
-            return DBL_MAX;
+            throw e;
         }
     }
-    double value = GWRBase::AICc(mX, mY, betas.t(), shat);
-    if (mStatus == Status::Success && isfinite(value))
+    return betas.t();
+}
+
+double GWRBasic::bandwidthSizeCriterionCV(BandwidthWeight* bandwidthWeight)
+{
+    SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance());
+    try
     {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
-        mBandwidthLastCriterion = value;
-        return value;
+        mat betas = (this->*mFitCoreCVFunction)(mX, mY, sw);
+        vec res = mY - sum(mX % betas, 1);
+        double cv = sum(res % res);
+        if (mStatus == Status::Success && isfinite(cv))
+        {
+            GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
+            GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
+            mBandwidthLastCriterion = cv;
+            return cv;
+        }
+        else return DBL_MAX;
+    }
+    catch(const std::exception& e)
+    {
+        return DBL_MAX;
     }
-    else return DBL_MAX;
+    
 }
 
-double GWRBasic::indepVarsSelectionCriterionSerial(const vector<size_t>& indepVars)
+double GWRBasic::bandwidthSizeCriterionAIC(BandwidthWeight* bandwidthWeight)
 {
-    mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept));
-    vec y = mY;
-    uword nDp = mCoords.n_rows, nVar = x.n_cols;
-    mat betas(nVar, nDp, fill::zeros);
-    vec shat(2, fill::zeros);
-    for (uword i = 0; i < nDp; i++)
+    SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance());
+    try
     {
-        GWM_LOG_STOP_BREAK(mStatus);
-        vec w(nDp, fill::ones);
-        mat xtw = trans(x.each_col() % w);
-        mat xtwx = xtw * x;
-        mat xtwy = xtw * y;
-        try
+        vec shat;
+        mat betas = (this->*mFitCoreSHatFunction)(mX, mY, sw, shat);
+        double value = GWRBase::AICc(mX, mY, betas, shat);
+        if (mStatus == Status::Success && isfinite(value))
         {
-            mat xtwx_inv = inv_sympd(xtwx);
-            betas.col(i) = xtwx_inv * xtwy;
-            mat ci = xtwx_inv * xtw;
-            mat si = x.row(i) * ci;
-            shat(0) += si(0, i);
-            shat(1) += det(si * si.t());
+            GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
+            GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
+            mBandwidthLastCriterion = value;
+            return value;
         }
-        catch (const exception& e)
+        else return DBL_MAX;
+    }
+    catch(const std::exception& e)
+    {
+        return DBL_MAX;
+    }
+}
+
+double gwm::GWRBasic::indepVarsSelectionCriterion(const vector<size_t>& indepVars)
+{
+    mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept));
+    SpatialWeight sw = mSpatialWeight;
+    sw.weight<BandwidthWeight>()->setBandwidth(DBL_MAX);
+    try
+    {
+        vec shat;
+        mat betas = (this->*mFitCoreSHatFunction)(x, mY, sw, shat);
+        GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal);
+        if (mStatus == Status::Success)
         {
-            GWM_LOG_ERROR(e.what());
-            return DBL_MAX;
+            double aic = GWRBase::AICc(x, mY, betas, shat);
+            GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, aic));
+            return aic;
         }
+        else return DBL_MAX;
     }
-    GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal);
-    if (mStatus == Status::Success)
+    catch(const std::exception& e)
     {
-        double value = GWRBase::AICc(x, y, betas.t(), shat);
-        GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, value));
-        return value;
+        return DBL_MAX;
     }
-    else return DBL_MAX;
 }
 
 #ifdef ENABLE_OPENMP
@@ -383,18 +455,21 @@ mat GWRBasic::predictOmp(const mat& locations, const mat& x, const vec& y)
     return betas.t();
 }
 
-mat GWRBasic::fitOmp(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& qDiag, mat& S)
+mat GWRBasic::fitCoreOmp(const mat& x, const vec& y, const SpatialWeight& sw)
 {
     uword nDp = mCoords.n_rows, nVar = x.n_cols;
     mat betas(nVar, nDp, fill::zeros);
-    betasSE = mat(nVar, nDp, fill::zeros);
-    S = mat(isStoreS() ? nDp : 1, nDp, fill::zeros);
+    mBetasSE = mat(nVar, nDp, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    mS = mat(isStoreS() ? rangeSize : 1, nDp, fill::zeros);
+    mC = cube(nVar, nDp, isStoreC() ? rangeSize : 1, fill::zeros);
     mat shat_all(2, mOmpThreadNum, fill::zeros);
     mat qDiag_all(nDp, mOmpThreadNum, fill::zeros);
     bool success = true;
     std::exception except;
 #pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
+    for (int i = int(workRange.first); (uword)i < workRange.second; i++)
     {
         GWM_LOG_STOP_CONTINUE(mStatus);
         if (success)
@@ -409,14 +484,15 @@ mat GWRBasic::fitOmp(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& q
                 mat xtwx_inv = inv_sympd(xtwx);
                 betas.col(i) = xtwx_inv * xtwy;
                 mat ci = xtwx_inv * xtw;
-                betasSE.col(i) = sum(ci % ci, 1);
+                mBetasSE.col(i) = sum(ci % ci, 1);
                 mat si = x.row(i) * ci;
                 shat_all(0, thread) += si(0, i);
                 shat_all(1, thread) += det(si * si.t());
                 vec p = - si.t();
                 p(i) += 1.0;
                 qDiag_all.col(thread) += p % p;
-                S.row(isStoreS() ? i : 0) = si;
+                mS.row(isStoreS() ? (i - workRange.first) : 0) = si;
+                mC.slice(isStoreC() ? (i - workRange.first) : 0) = ci;
             }
             catch (const exception& e)
             {
@@ -431,74 +507,26 @@ mat GWRBasic::fitOmp(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& q
     {
         throw except;
     }
-    shat = sum(shat_all, 1);
-    qDiag = sum(qDiag_all, 1);
-    betasSE = betasSE.t();
+    mSHat = sum(shat_all, 1);
+    mQDiag = sum(qDiag_all, 1);
+    mBetasSE = mBetasSE.t();
     return betas.t();
 }
 
-double GWRBasic::bandwidthSizeCriterionCVOmp(BandwidthWeight* bandwidthWeight)
+arma::mat GWRBasic::fitCoreCVOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw)
 {
-    uword nDp = mCoords.n_rows;
-    vec shat(2, fill::zeros);
-    vec cv_all(mOmpThreadNum, fill::zeros);
-    bool flag = true;
-#pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
-    {
-        GWM_LOG_STOP_CONTINUE(mStatus);
-        if (flag)
-        {
-            int thread = omp_get_thread_num();
-            vec d = mSpatialWeight.distance()->distance(i);
-            vec w = bandwidthWeight->weight(d);
-            w(i) = 0.0;
-            mat xtw = trans(mX.each_col() % w);
-            mat xtwx = xtw * mX;
-            mat xtwy = xtw * mY;
-            try
-            {
-                mat xtwx_inv = inv_sympd(xtwx);
-                vec beta = xtwx_inv * xtwy;
-                double res = mY(i) - det(mX.row(i) * beta);
-                if (isfinite(res))
-                    cv_all(thread) += res * res;
-                else
-                    flag = false;
-            }
-            catch (const exception& e)
-            {
-                GWM_LOG_ERROR(e.what());
-                flag = false;
-            }
-        }
-    }
-    if (mStatus == Status::Success && flag)
-    {
-        double cv = sum(cv_all);
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
-}
-
-double GWRBasic::bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight)
-{
-    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
+    uword nDp = mCoords.n_rows, nVar = x.n_cols;
     mat betas(nVar, nDp, fill::zeros);
-    mat shat_all(2, mOmpThreadNum, fill::zeros);
     bool flag = true;
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
 #pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
+    for (int i = int(workRange.first); (uword)i < workRange.second; i++)
     {
         GWM_LOG_STOP_CONTINUE(mStatus);
         if (flag)
         {
-            int thread = omp_get_thread_num();
-            vec d = mSpatialWeight.distance()->distance(i);
-            vec w = bandwidthWeight->weight(d);
+            vec w = sw.weightVector(i);
+            w(i) = 0.0;
             mat xtw = trans(mX.each_col() % w);
             mat xtwx = xtw * mX;
             mat xtwy = xtw * mY;
@@ -506,10 +534,6 @@ double GWRBasic::bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight)
             {
                 mat xtwx_inv = inv_sympd(xtwx);
                 betas.col(i) = xtwx_inv * xtwy;
-                mat ci = xtwx_inv * xtw;
-                mat si = mX.row(i) * ci;
-                shat_all(0, thread) += si(0, i);
-                shat_all(1, thread) += det(si * si.t());
             }
             catch (const exception& e)
             {
@@ -518,38 +542,24 @@ double GWRBasic::bandwidthSizeCriterionAICOmp(BandwidthWeight* bandwidthWeight)
             }
         }
     }
-    if (mStatus == Status::Success && flag)
-    {
-        vec shat = sum(shat_all, 1);
-        double value = GWRBase::AICc(mX, mY, betas.t(), shat);
-        if (isfinite(value))
-        {
-            GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
-            GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
-            mBandwidthLastCriterion = value;
-            return value;
-        }
-        else return DBL_MAX;
-    }
-    else return DBL_MAX;
+    return betas.t();
 }
 
-double GWRBasic::indepVarsSelectionCriterionOmp(const vector<size_t>& indepVars)
+arma::mat GWRBasic::fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat)
 {
-    mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept));
-    vec y = mY;
     uword nDp = mCoords.n_rows, nVar = x.n_cols;
     mat betas(nVar, nDp, fill::zeros);
-    mat shat(2, mOmpThreadNum, fill::zeros);
+    mat shat_all(2, mOmpThreadNum, fill::zeros);
     int flag = true;
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
 #pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
+    for (int i = int(workRange.first); (uword)i < workRange.second; i++)
     {
         GWM_LOG_STOP_CONTINUE(mStatus);
         if (flag)
         {
             int thread = omp_get_thread_num();
-            vec w(nDp, fill::ones);
+            vec w = sw.weightVector(i);
             mat xtw = trans(x.each_col() % w);
             mat xtwx = xtw * x;
             mat xtwy = xtw * y;
@@ -559,8 +569,8 @@ double GWRBasic::indepVarsSelectionCriterionOmp(const vector<size_t>& indepVars)
                 betas.col(i) = xtwx_inv * xtwy;
                 mat ci = xtwx_inv * xtw;
                 mat si = x.row(i) * ci;
-                shat(0, thread) += si(0, i);
-                shat(1, thread) += det(si * si.t());
+                shat_all(0, thread) += si(0, i);
+                shat_all(1, thread) += as_scalar(si * si.t());
             }
             catch (const exception& e)
             {
@@ -569,29 +579,27 @@ double GWRBasic::indepVarsSelectionCriterionOmp(const vector<size_t>& indepVars)
             }
         }
     }
-    GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal);
-    if (mStatus == Status::Success && flag)
-    {
-        double value = GWRBase::AICc(x, y, betas.t(), sum(shat, 1));
-        GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, value));
-        return value;
-    }
-    else return DBL_MAX;
+    shat = sum(shat_all, 1);
+    return betas.t();
 }
 #endif
 
 #ifdef ENABLE_CUDA
 
-mat GWRBasic::fitCuda(const mat& x, const vec& y, mat& betasSE, vec& shat, vec& qDiag, mat& S)
+mat GWRBasic::fitCoreCuda(const mat& x, const vec& y, const SpatialWeight& sw)
 {
-    uword nDp = mCoords.n_rows, nVar = x.n_cols;
+    uword nDp = x.n_rows, nVar = x.n_cols;
     mat xt = trans(x);
     mat betas = mat(nVar, nDp, arma::fill::zeros);
-    betasSE = mat(nVar, nDp, arma::fill::zeros);
-    shat = vec(2, arma::fill::zeros);
-    qDiag = vec(nDp, arma::fill::zeros);
-    S = mat(isStoreS() ? nDp : 1, nDp, arma::fill::zeros);
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
+    mat ci(nVar, nDp, arma::fill::zeros);
+    mBetasSE = mat(nVar, nDp, arma::fill::zeros);
+    mSHat = vec(2, arma::fill::zeros);
+    mQDiag = vec(nDp, arma::fill::zeros);
+    mS = mat(isStoreS() ? nDp : 1, nDp, arma::fill::zeros);
+    mC = cube(nVar, nDp, isStoreC() ? nDp : 1, arma::fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1);
     cumat u_xt(xt), u_y(y);
     cumat u_betas(nVar, nDp), u_betasSE(nVar, nDp);
     cumat u_dists(nDp, 1), u_weights(nDp, 1);
@@ -608,10 +616,10 @@ mat GWRBasic::fitCuda(const mat& x, const vec& y, mat& betasSE, vec& shat, vec&
     for (size_t i = 0; i < groups; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
+        size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
         for (size_t j = 0, e = begin + j; j < length; j++, e++)
         {
-            checkCudaErrors(mSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem()));
+            checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem()));
             u_xtw.strides(j) = u_xt.diagmul(u_weights);
         }
         // xtwx and xtwy
@@ -638,26 +646,28 @@ mat GWRBasic::fitCuda(const mat& x, const vec& y, mat& betasSE, vec& shat, vec&
         // si = t(xti) * ci [1*k,k*n]
         u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c;
         u_s.get(sg.memptr());
-        shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length)));
+        mSHat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length)));
         // cct = ci * cit [k*n,t(k*n)]
         u_cct = u_c * u_c.t();
         u_sst = u_s * u_s.t();
         u_cct.get(cct.memptr());
         u_sst.get(sst.memptr());
-        shat(1) += sum(sst.head(length));
+        mSHat(1) += sum(sst.head(length));
         // Transfer to cpu Perform further diagnostic
         for (size_t j = 0, e = begin + j; j < length; j++, e++)
         {
-            betasSE.col(e) = diagvec(cct.slice(j));
+            mBetasSE.col(e) = diagvec(cct.slice(j));
             vec p = -sg.col(j);
             p(e) += 1.0;
-            qDiag += p % p;
-            S.row(isStoreS() ? e : 0) = sg.col(j).t();
+            mQDiag += p % p;
+            mS.row(isStoreS() ? e : 0) = sg.col(j).t();
+            u_c.strides(j).get(ci.memptr());
+            mC.slice(isStoreC() ? e : 0) = ci;
         }
         GWM_LOG_PROGRESS(begin + length, nDp);
     }
     u_betas.get(betas.memptr());
-    betasSE = betasSE.t();
+    mBetasSE = mBetasSE.t();
     cudaFree(d_info);
     return betas.t();
 }
@@ -666,7 +676,9 @@ arma::mat GWRBasic::predictCuda(const mat& locations, const mat& x, const vec& y
 {
     uword nRp = locations.n_rows, nDp = mCoords.n_rows, nVar = x.n_cols;
     mat betas(nVar, nRp, fill::zeros);
-    size_t groups = nRp / mGroupLength + (nRp % mGroupLength == 0 ? 0 : 1);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1);
     cumat u_xt(x.t()), u_y(y);
     custride u_xtwx(nVar, nVar, mGroupLength), u_xtwy(nVar, 1, mGroupLength), u_xtwxI(nVar, nVar, mGroupLength);
     cumat u_betas(nVar, nRp);
@@ -678,7 +690,7 @@ arma::mat GWRBasic::predictCuda(const mat& locations, const mat& x, const vec& y
     for (size_t i = 0; i < groups; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nRp) ? (nRp - begin) : mGroupLength;
+        size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
         for (size_t j = 0, e = begin + j; j < length; j++, e++)
         {
             checkCudaErrors(mSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem()));
@@ -706,29 +718,29 @@ arma::mat GWRBasic::predictCuda(const mat& locations, const mat& x, const vec& y
     return betas.t();
 }
 
-double GWRBasic::bandwidthSizeCriterionCVCuda(BandwidthWeight* bandwidthWeight)
+arma::mat GWRBasic::fitCoreCVCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw)
 {
     uword nDp = mCoords.n_rows, nVar = mX.n_cols;
-    size_t elems = nDp;
+    mat betas(nVar, nDp, fill::zeros);
     cumat u_xt(mX.t()), u_y(mY);
     cumat u_dists(nDp, 1), u_weights(nDp, 1);
     custride u_xtw(nVar, nDp, mGroupLength);
     custride u_xtwx(nVar, nVar, mGroupLength), u_xtwy(nVar, 1, mGroupLength), u_xtwxI(nVar, nVar, mGroupLength);
-    custride u_betas(nVar, 1, mGroupLength), u_yhat(1, 1, mGroupLength);
-    vec yhat(mGroupLength), yhat_all(nDp);
+    cumat u_betas(nVar, nDp);
     int *d_info, *p_info;
     p_info = new int[mGroupLength];
     checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
     bool success = true;
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1);
     for (size_t i = 0; i < groups && success; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
+        size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
         for (size_t j = 0, e = begin + j; j < length; j++, e++)
         {
-            checkCudaErrors(mSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems));
-            checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems));
+            checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem()));
             checkCudaErrors(cudaMemcpy(u_weights.dmem() + e, &cubase::beta0, sizeof(double), cudaMemcpyHostToDevice));
             u_xtw.strides(j) = u_xt.diagmul(u_weights);
         }
@@ -747,57 +759,44 @@ double GWRBasic::bandwidthSizeCriterionCVCuda(BandwidthWeight* bandwidthWeight)
         }
         if (success)
         {
-            u_betas = u_xtwxI * u_xtwy;
-            u_yhat = u_xt.as_stride().strides(begin, begin + length).t() * u_betas;
-            u_yhat.get(yhat.memptr());
-            yhat_all.rows(begin, begin + length - 1) = yhat.head_rows(length);
+            u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy;
         }
     }
     checkCudaErrors(cudaFree(d_info));
     delete[] p_info;
-    if (!success) return DBL_MAX;
-    double cv = as_scalar((mY - yhat_all).t() * (mY - yhat_all));
-    if (isfinite(cv))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
+    u_betas.get(betas.memptr());
+    return betas.t();
 }
 
-double GWRBasic::bandwidthSizeCriterionAICCuda(BandwidthWeight* bandwidthWeight)
+arma::mat GWRBasic::fitCoreSHatCuda(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, vec& shat)
 {
-    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
-    size_t elems = nDp;
-    cumat u_xt(mX.t()), u_y(mY), u_betas(nVar, nDp);
+    uword nDp = mCoords.n_rows, nVar = x.n_cols;
+    mat betas(nVar, nDp, fill::zeros);
+    shat = vec(2, fill::zeros);
+    cumat u_xt(x.t()), u_y(y), u_betas(nVar, nDp);
     cumat u_dists(nDp, 1), u_weights(nDp, 1);
     custride u_xtw(nVar, nDp, mGroupLength);
-    custride u_xtwx(nVar, nVar, mGroupLength), u_xtwy(nVar, 1, mGroupLength), u_xtwxI(nVar, nVar, mGroupLength);
-    custride u_c(nVar, nDp, mGroupLength), u_s(1, nDp, mGroupLength);
-    custride u_sst(1, 1, mGroupLength);
-    mat betas(nVar, nDp);
-    vec shat(2), sst(mGroupLength);
+    vec sst(mGroupLength);
     mat sg(nDp, mGroupLength);
     int *d_info, *p_info;
     p_info = new int[mGroupLength];
     checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
     bool success = true;
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1);
     for (size_t i = 0; i < groups && success; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
+        size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
         for (size_t j = 0, e = begin + j; j < length; j++, e++)
         {
-            checkCudaErrors(mSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems));
-            checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems));
+            checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem()));
             u_xtw.strides(j) = u_xt.diagmul(u_weights);
         }
-        u_xtwx = u_xtw * u_xt.t();
-        u_xtwy = u_xtw * u_y;
-        u_xtwxI = u_xtwx.inv(d_info);
+        custride u_xtwx = u_xtw * u_xt.t();
+        custride u_xtwy = u_xtw * u_y;
+        custride u_xtwxI = u_xtwx.inv(d_info);
         checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost));
         for (size_t j = 0; j < mGroupLength; j++)
         {
@@ -811,132 +810,269 @@ double GWRBasic::bandwidthSizeCriterionAICCuda(BandwidthWeight* bandwidthWeight)
         if (success)
         {
             u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy;
-            u_c = u_xtwxI * u_xtw;
-            u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c;
+            custride u_c = u_xtwxI * u_xtw;
+            custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c;
+            custride u_sst = u_s * u_s.t();
             u_s.get(sg.memptr());
-            shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length)));
-            u_sst = u_s * u_s.t();
             u_sst.get(sst.memptr());
+            shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length)));
             shat(1) += sum(sst);
         }
     }
     checkCudaErrors(cudaFree(d_info));
     delete[] p_info;
-    if (!success) return DBL_MAX;
     u_betas.get(betas.memptr());
-    double aic = GWRBasic::AICc(mX, mY, betas.t(), shat);
-    if (isfinite(aic))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, aic));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - aic)));
-        mBandwidthLastCriterion = aic;
-        return aic;
-    }
-    else return DBL_MAX;
+    return betas.t();
 }
 
-double GWRBasic::indepVarsSelectionCriterionCuda(const std::vector<size_t>& indepVars)
+#endif
+
+#ifdef ENABLE_MPI
+double GWRBasic::indepVarsSelectionCriterionMpi(const vector<size_t>& indepVars)
 {
     mat x = mX.cols(VariableForwardSelector::index2uvec(indepVars, mHasIntercept));
     mat y = mY;
-    uword nDp = mCoords.n_rows, nVar = x.n_cols;
-    cumat u_xt(x.t()), u_y(y), u_betas(nVar, nDp);
-    cumat u_weights(vec(nDp, fill::ones));
-    custride u_xtw(nVar, nDp, mGroupLength);
-    mat betas(nVar, nDp);
-    vec shat(2), sst(mGroupLength);
-    mat sg(nDp, mGroupLength);
-    int *d_info, *p_info;
-    p_info = new int[mGroupLength];
-    checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
-    bool success = true;
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
-    for (size_t i = 0; i < groups && success; i++)
+    SpatialWeight sw = mSpatialWeight;
+    sw.weight<BandwidthWeight>()->setBandwidth(DBL_MAX);
+    vec shat;
+    double aic;
+    mat betas = (this->*mFitCoreSHatFunction)(x, y, sw, shat);
+GWM_MPI_MASTER_BEGIN
+    umat received(mWorkerNum, 2, arma::fill::zeros);
+    received.row(0).fill(1);
+    unique_ptr<double[], default_delete<double[]>> buf(new double[betas.n_elem]);
+    while (!all(all(received == 1)))
     {
-        GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
-        for (size_t j = 0, e = begin + j; j < length; j++, e++)
+        MPI_Status status;
+        MPI_Recv(buf.get(), betas.n_elem, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+        // printf("0 process received message with %d from %d\n", status.MPI_TAG, status.MPI_SOURCE);
+        received(status.MPI_SOURCE, status.MPI_TAG - int(GWRBasicAICMpiTags::SHat)) = 1;
+        switch (GWRBasicAICMpiTags(status.MPI_TAG))
         {
-            u_xtw.strides(j) = u_xt.diagmul(u_weights);
-        }
-        custride u_xtwx = u_xtw * u_xt.t();
-        custride u_xtwy = u_xtw * u_y;
-        custride u_xtwxI = u_xtwx.inv(d_info);
-        checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost));
-        for (size_t j = 0; j < mGroupLength; j++)
-        {
-            if (p_info[j] != 0)
-            {
-                GWM_LOG_ERROR("Cuda failed to get the inverse of matrix");
-                success = false;
-                break;
-            }
+        case GWRBasicAICMpiTags::SHat:
+            shat += vec(buf.get(), 2);
+            break;
+        case GWRBasicAICMpiTags::Betas:
+            betas += mat(buf.get(), betas.n_rows, betas.n_cols);
+            break;
+        default:
+            break;
         }
-        if (success)
+    }
+    aic = GWRBasic::AICc(x, y, betas, shat);
+GWM_MPI_MASTER_END
+GWM_MPI_WORKER_BEGIN
+    MPI_Send(shat.memptr(), 2, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::SHat), MPI_COMM_WORLD);
+    MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::Betas), MPI_COMM_WORLD);
+GWM_MPI_WORKER_END
+    MPI_Bcast(&aic, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    return aic;
+}
+
+double GWRBasic::bandwidthSizeCriterionCVMpi(BandwidthWeight* bandwidthWeight)
+{
+    uword nDp = mX.n_rows, nVar = mX.n_cols;
+    SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance());
+    double cv;
+    mat betas = (this->*mFitCoreCVFunction)(mX, mY, sw).t();
+GWM_MPI_MASTER_BEGIN
+    uvec received(mWorkerNum, arma::fill::zeros);
+    received.row(0).fill(1);
+    unique_ptr<double[], default_delete<double[]>> buf(new double[betas.n_elem]);
+    while (!all(received))
+    {
+        MPI_Status status;
+        MPI_Recv(buf.get(), betas.n_elem, MPI_DOUBLE, MPI_ANY_SOURCE, int(GWRBasicCVMpiTags::Betas), MPI_COMM_WORLD, &status);
+        received(status.MPI_SOURCE) = 1;
+        betas += mat(buf.get(), nVar, nDp);
+    }
+    vec residual = mY - sum(mX % betas.t(), 1);
+    cv = sum(residual % residual);
+GWM_MPI_MASTER_END
+GWM_MPI_WORKER_BEGIN
+    MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicCVMpiTags::Betas), MPI_COMM_WORLD);
+GWM_MPI_WORKER_END
+    MPI_Bcast(&cv, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    return cv;
+}
+
+double GWRBasic::bandwidthSizeCriterionAICMpi(BandwidthWeight* bandwidthWeight)
+{
+    SpatialWeight sw(bandwidthWeight, mSpatialWeight.distance());
+    double aic;
+    vec shat(2, fill::zeros);
+    mat betas = (this->*mFitCoreSHatFunction)(mX, mY, sw, shat);
+GWM_MPI_MASTER_BEGIN
+    umat received(mWorkerNum, 2, arma::fill::zeros);
+    received.row(0).fill(1);
+    unique_ptr<double[], default_delete<double[]>> buf(new double[betas.n_elem]);
+    while (!all(all(received)))
+    {
+        MPI_Status status;
+        MPI_Recv(buf.get(), betas.n_elem, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+        // printf("0 process received message with %d from %d\n", status.MPI_TAG, status.MPI_SOURCE);
+        received(status.MPI_SOURCE, status.MPI_TAG - int(GWRBasicAICMpiTags::SHat)) = 1;
+        switch (GWRBasicAICMpiTags(status.MPI_TAG))
         {
-            u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy;
-            custride u_c = u_xtwxI * u_xtw;
-            custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c;
-            custride u_sst = u_s * u_s.t();
-            u_s.get(sg.memptr());
-            u_sst.get(sst.memptr());
-            shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length)));
-            shat(1) += sum(sst);
+        case GWRBasicAICMpiTags::SHat:
+            shat += vec(buf.get(), 2);
+            break;
+        case GWRBasicAICMpiTags::Betas:
+            betas += mat(buf.get(), betas.n_rows, betas.n_cols);
+            break;
+        default:
+            break;
         }
     }
-    checkCudaErrors(cudaFree(d_info));
-    delete[] p_info;
-    if (!success) return DBL_MAX;
-    u_betas.get(betas.memptr());
-    GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal);
-    double aic = GWRBasic::AICc(x, y, betas.t(), shat);
-    if (isfinite(aic))
+    aic = GWRBasic::AICc(mX, mY, betas, shat);
+GWM_MPI_MASTER_END
+GWM_MPI_WORKER_BEGIN
+    MPI_Send(shat.memptr(), shat.n_elem, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::SHat), MPI_COMM_WORLD);
+    MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicAICMpiTags::Betas), MPI_COMM_WORLD);
+GWM_MPI_WORKER_END
+    MPI_Bcast(&aic, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    return aic;
+}
+
+arma::mat gwm::GWRBasic::fitMpi()
+{
+    // fit GWR on each process
+    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
+    mBetas = (this->*mFitCoreFunction)(mX, mY, mSpatialWeight).t();
+    mBetasSE = mBetasSE.t();
+    mS = mS.t();
+    // gather results to master process
+    GWM_MPI_MASTER_BEGIN
+    mat shat_all(2, mWorkerNum);
+    mat qdiag_all(nDp, mWorkerNum);
+    shat_all.col(0) = mSHat;
+    qdiag_all.col(0) = mQDiag;
+    // prepare to receive data
+    umat received(mWorkerNum, 4, fill::zeros);
+    received.row(0).fill(1);
+    int bufSize = nDp * nVar;
+    unique_ptr<double[], std::default_delete<double[]>> buf(new double[bufSize]);
+    while (!all(all(received)))
     {
-        GWM_LOG_INFO(IVarialbeSelectable::infoVariableCriterion(indepVars, aic));
-        mBandwidthLastCriterion = aic;
-        return aic;
+        MPI_Status status;
+        MPI_Recv(buf.get(), bufSize, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+        // printf("0 process received message with %d from %d\n", status.MPI_TAG, status.MPI_SOURCE);
+        received(status.MPI_SOURCE, status.MPI_TAG - int(GWRBasicFitMpiTags::Betas)) = 1;
+        uword rangeFrom = status.MPI_SOURCE * mWorkRangeSize, rangeTo = min(rangeFrom + mWorkRangeSize, nDp), rangeSize = rangeTo - rangeFrom;
+        switch (GWRBasicFitMpiTags(status.MPI_TAG))
+        {
+        case GWRBasicFitMpiTags::Betas:
+            mBetas.cols(rangeFrom, rangeTo - 1) = mat(buf.get(), nVar, rangeSize);
+            break;
+        case GWRBasicFitMpiTags::BetasSE:
+            mBetasSE.cols(rangeFrom, rangeTo - 1) = mat(buf.get(), nVar, rangeSize);
+            break;
+        case GWRBasicFitMpiTags::SHat:
+            shat_all.col(status.MPI_SOURCE) = vec(buf.get(), 2);
+            break;
+        case GWRBasicFitMpiTags::QDiag:
+            qdiag_all.col(status.MPI_SOURCE) = vec(buf.get(), nDp);
+            break;
+        default:
+            break;
+        }
     }
-    else return DBL_MAX;
+    mSHat = sum(shat_all, 1);
+    mQDiag = sum(qdiag_all, 1);
+    // vec dybar2 = (mY - mean(mY)) % (mY - mean(mY));
+    // vec dyhat2 = (mY - yhat) % (mY - yhat);
+    // vec localR2 = vec(nDp, fill::zeros);
+    // for (uword i = 0; i < nDp; i++)
+    // {
+    //     vec w = mSpatialWeight.weightVector(i);
+    //     double tss = sum(dybar2 % w);
+    //     double rss = sum(dyhat2 % w);
+    //     localR2(i) = (tss - rss) / tss;
+    // }
+    GWM_MPI_MASTER_END
+    GWM_MPI_WORKER_BEGIN
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    // printf("%d process work range: [%lld, %lld]\n", mWorkerId, workRange.first, workRange.second);
+    // cout << mWorkerId << " process work range: [" << workRange.first << "," << workRange.second << "]\n";
+    mat betas = mBetas.cols(workRange.first, workRange.second - 1);
+    mat betasSE = mBetasSE.cols(workRange.first, workRange.second - 1);
+    MPI_Send(betas.memptr(), betas.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::Betas), MPI_COMM_WORLD);
+    MPI_Send(betasSE.memptr(), betasSE.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::BetasSE), MPI_COMM_WORLD);
+    MPI_Send(mSHat.memptr(), mSHat.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::SHat), MPI_COMM_WORLD);
+    MPI_Send(mQDiag.memptr(), mQDiag.n_elem, MPI_DOUBLE, 0, int(GWRBasicFitMpiTags::QDiag), MPI_COMM_WORLD);
+    GWM_MPI_WORKER_END
+    MPI_Bcast(mBetas.memptr(), mBetas.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    MPI_Bcast(mBetasSE.memptr(), mBetasSE.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    MPI_Bcast(mSHat.memptr(), mSHat.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    MPI_Bcast(mQDiag.memptr(), mQDiag.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    mBetas = mBetas.t();
+    mBetasSE = mBetasSE.t();
+    mS = mS.t();
+    // printf("shat [%lf,%lf]", mSHat(0), mSHat(1));
+    // diagnostic in master process
+    GWM_LOG_STAGE("Model Diagnostic");
+    mDiagnostic = CalcDiagnostic(mX, mY, mBetas, mSHat);
+    double trS = mSHat(0), trStS = mSHat(1);
+    double sigmaHat = mDiagnostic.RSS / (nDp - 2 * trS + trStS);
+    mBetasSE = sqrt(sigmaHat * mBetasSE);    
+    // check cancel status
+    GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
+    return mBetas;
 }
 
-#endif
+// double GWRBasic::indepVarsSelectionCriterion(const mat& x, const vec& y, vec& shat)
+// {
+//     try
+//     {
+//         mat S;
+//         mat betas = (this->*mFitCoreSHatFunction)(x, y, mSpatialWeight, shat, S);
+//         GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal);
+//         if (mStatus == Status::Success)
+//         {
+//             double rss = GWRBase::RSS(x, y, betas);
+//             return rss;
+//         }
+//         else return DBL_MAX;
+//     }
+//     catch(const std::exception& e)
+//     {
+//         return DBL_MAX;
+//     }
+    
+// }
+#endif // ENABLE_MPI
 
 void GWRBasic::setBandwidthSelectionCriterion(const BandwidthSelectionCriterionType& criterion)
 {
+#ifdef ENABLE_MPI
+    if (mParallelType & ParallelType::MPI)
+    {
+        switch (criterion)
+        {
+        case BandwidthSelectionCriterionType::CV:
+            mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCVMpi;
+            break;
+        default:
+            mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionAICMpi;
+            break;
+        }
+    }
+    else
+    {
+#endif // ENABLE_MPI
+        switch (criterion)
+        {
+        case BandwidthSelectionCriterionType::CV:
+            mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionCV;
+            break;
+        default:
+            mBandwidthSelectionCriterionFunction = &GWRBasic::bandwidthSizeCriterionAIC;
+            break;
+        }
+#ifdef ENABLE_MPI
+    }
+#endif // ENABLE_MPI
     mBandwidthSelectionCriterion = criterion;
-    unordered_map<BandwidthSelectionCriterionType, BandwidthSelectionCriterionCalculator> mapper;
-    switch (mParallelType)
-    {
-    case ParallelType::SerialOnly:
-        mapper = {
-            make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVSerial),
-            make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICSerial)
-        };
-        break;
-#ifdef ENABLE_OPENMP
-    case ParallelType::OpenMP:
-        mapper = {
-            make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVOmp),
-            make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICOmp)
-        };
-        break;
-#endif
-#ifdef ENABLE_CUDA
-    case ParallelType::CUDA:
-        mapper = {
-            make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVCuda),
-            make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICCuda)
-        };
-        break;
-#endif
-    default:
-        mapper = {
-            make_pair(BandwidthSelectionCriterionType::CV, &GWRBasic::bandwidthSizeCriterionCVSerial),
-            make_pair(BandwidthSelectionCriterionType::AIC, &GWRBasic::bandwidthSizeCriterionAICSerial)
-        };
-        break;
-    }
-    mBandwidthSelectionCriterionFunction = mapper[mBandwidthSelectionCriterion];
 }
 
 void GWRBasic::setParallelType(const ParallelType& type)
@@ -944,32 +1080,51 @@ void GWRBasic::setParallelType(const ParallelType& type)
     if (type & parallelAbility())
     {
         mParallelType = type;
-        switch (type) {
+        switch (mParallelType) {
         case ParallelType::SerialOnly:
+        case ParallelType::MPI_Serial:
             mPredictFunction = &GWRBasic::predictSerial;
-            mFitFunction = &GWRBasic::fitSerial;
-            mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionSerial;
+            mFitCoreFunction = &GWRBasic::fitCoreSerial;
+            mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
+            mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
             break;
 #ifdef ENABLE_OPENMP
         case ParallelType::OpenMP:
+        case ParallelType::MPI_MP:
             mPredictFunction = &GWRBasic::predictOmp;
-            mFitFunction = &GWRBasic::fitOmp;
-            mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionOmp;
+            mFitCoreFunction = &GWRBasic::fitCoreOmp;
+            mFitCoreCVFunction = &GWRBasic::fitCoreCVOmp;
+            mFitCoreSHatFunction = &GWRBasic::fitCoreSHatOmp;
             break;
 #endif // ENABLE_OPENMP
 #ifdef ENABLE_CUDA
         case ParallelType::CUDA:
+        case ParallelType::MPI_CUDA:
             mPredictFunction = &GWRBasic::predictCuda;
-            mFitFunction = &GWRBasic::fitCuda;
-            mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionCuda;
+            mFitCoreFunction = &GWRBasic::fitCoreCuda;
+            mFitCoreCVFunction = &GWRBasic::fitCoreCVCuda;
+            mFitCoreSHatFunction = &GWRBasic::fitCoreSHatCuda;
             break;
 #endif // ENABLE_CUDA
         default:
             mPredictFunction = &GWRBasic::predictSerial;
-            mFitFunction = &GWRBasic::fitSerial;
-            mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionSerial;
+            mFitCoreFunction = &GWRBasic::fitCoreSerial;
+            mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
+            mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
             break;
         }
+#ifdef ENABLE_MPI
+        if (type & ParallelType::MPI)
+        {
+            mFitFunction = &GWRBasic::fitMpi;
+            mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionMpi;
+        }
+        else
+        {
+            mFitFunction = &GWRBasic::fitBase;
+            mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterion;
+        }
+#endif
         setBandwidthSelectionCriterion(mBandwidthSelectionCriterion);
     }
 }
diff --git a/src/gwmodelpp/GWRMultiscale.cpp b/src/gwmodelpp/GWRMultiscale.cpp
index ed16facd..5c255eaf 100644
--- a/src/gwmodelpp/GWRMultiscale.cpp
+++ b/src/gwmodelpp/GWRMultiscale.cpp
@@ -9,6 +9,7 @@
 #include "BandwidthSelector.h"
 #include "VariableForwardSelector.h"
 #include "Logger.h"
+#include "GWRBasic.h"
 
 #ifdef ENABLE_CUDA
 #include <cuda_runtime.h>
@@ -16,12 +17,23 @@
 #include "CudaUtils.h"
 #endif // ENABLE_CUDA
 
+#ifdef ENABLE_MPI
+#include <mpi.h>
+#include "gwmodelpp/utils/armampi.h"
+#endif
+
 using namespace std;
 using namespace arma;
 using namespace gwm;
 
 int GWRMultiscale::treeChildCount = 0;
 
+enum class GWRMultiscaleFitMPITags
+{
+    Betas = 1 << 0,
+    SMat
+};
+
 unordered_map<GWRMultiscale::BandwidthInitilizeType,string> GWRMultiscale::BandwidthInitilizeTypeNameMapper = {
     make_pair(GWRMultiscale::BandwidthInitilizeType::Null, ("Not initilized, not specified")),
     make_pair(GWRMultiscale::BandwidthInitilizeType::Initial, ("Initilized")),
@@ -67,27 +79,11 @@ RegressionDiagnostic GWRMultiscale::CalcDiagnostic(const mat &x, const vec &y, c
 
 mat GWRMultiscale::fit()
 {
-    GWM_LOG_STAGE("Initializing");
     uword nDp = mX.n_rows, nVar = mX.n_cols;
-    createDistanceParameter(nVar);
-    createInitialDistanceParameter();
-    mMaxDistances.resize(nVar);
-#ifdef ENABLE_CUDA
-    if (mParallelType == ParallelType::CUDA)
-    {
-        cublasCreate(&cubase::handle);
-        mInitSpatialWeight.prepareCuda(mGpuId);
-        for (size_t i = 0; i < nVar; i++)
-        {
-            mSpatialWeights[i].prepareCuda(mGpuId);
-        }
-    }
-#endif // ENABLE_CUDA
-    GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
-
     // ********************************
     // Centering and scaling predictors
     // ********************************
+    GWM_LOG_STAGE("Centering and scaling predictors");
     mX0 = mX;
     mY0 = mY;
     for (uword i = (mHasIntercept ? 1 : 0); i < nVar ; i++)
@@ -99,6 +95,100 @@ mat GWRMultiscale::fit()
     }
     GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
 
+    // *****************************************************
+    // Calculate the initial beta0 from the above bandwidths
+    // *****************************************************
+    GWM_LOG_STAGE("Calculating initial betas");
+    GWRBasic gwr;
+    gwr.setCoords(mCoords);
+    gwr.setDependentVariable(mY);
+    gwr.setIndependentVariables(mX);
+    gwr.setSpatialWeight(mInitSpatialWeight);
+    gwr.setIsAutoselectBandwidth(true);
+    switch (mBandwidthSelectionApproach[0])
+    {
+    case GWRMultiscale::BandwidthSelectionCriterionType::CV:
+        gwr.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::CV);
+        break;
+    case GWRMultiscale::BandwidthSelectionCriterionType::AIC:
+        gwr.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::AIC);
+    default:
+        break;
+    }
+    gwr.setParallelType(mParallelType);
+    switch (mParallelType)
+    {
+    case ParallelType::OpenMP:
+    case ParallelType::MPI_MP:
+        gwr.setOmpThreadNum(mOmpThreadNum);
+        break;
+    case ParallelType::CUDA:
+    case ParallelType::MPI_CUDA:
+        gwr.setGroupSize(mGroupLength);
+        gwr.setGPUId(mGpuId);
+    default:
+        break;
+    }
+    if (mParallelType & ParallelType::MPI)
+    {
+        gwr.setWorkerId(mWorkerId);
+        gwr.setWorkerNum(mWorkerNum);
+    }
+    gwr.setStoreS(mHasHatMatrix);
+    gwr.setStoreC(mHasHatMatrix);
+    if (mGoldenLowerBounds.has_value()) gwr.setGoldenLowerBounds(mGoldenLowerBounds.value());
+    if (mGoldenUpperBounds.has_value()) gwr.setGoldenUpperBounds(mGoldenUpperBounds.value());
+    mat betas = gwr.fit();
+    mBetasSE = gwr.betasSE();
+    GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
+
+    GWM_LOG_STAGE("Initializing");
+    createDistanceParameter(nVar);
+    mMaxDistances.resize(nVar);
+    mMinDistances.resize(nVar);
+#ifdef ENABLE_MPI
+    if (mParallelType & ParallelType::MPI)
+    {
+        uword aShape[4];
+        if (mWorkerId == 0) // master process
+        {
+            // sync matrix dimensions
+            uword nDim = mCoords.n_cols;
+            mWorkRangeSize = nDp / uword(mWorkerNum) + ((nDp % uword(mWorkerNum) == 0) ? 0 : 1);
+            aShape[0] = nDp;
+            aShape[1] = nVar;
+            aShape[2] = nDim;
+            aShape[3] = mWorkRangeSize;
+        }
+        MPI_Bcast(aShape, 4, GWM_MPI_UWORD, 0, MPI_COMM_WORLD);
+        if (mWorkerId > 0)                // slave process
+        {
+            nDp = aShape[0];
+            nVar = aShape[1];
+            mX.resize(nDp, nVar);
+            mY.resize(nDp);
+            mCoords.resize(aShape[0], aShape[2]);
+            mWorkRangeSize = aShape[3];
+        }
+        MPI_Bcast(mX.memptr(), mX.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        MPI_Bcast(mY.memptr(), mY.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        MPI_Bcast(mCoords.memptr(), mCoords.n_elem, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        uword workRangeFrom = uword(mWorkerId) * mWorkRangeSize;
+        mWorkRange = make_pair(workRangeFrom, min(workRangeFrom + mWorkRangeSize, nDp));
+    }
+#endif // ENABLE_MPI
+#ifdef ENABLE_CUDA
+    if (mParallelType & ParallelType::CUDA)
+    {
+        cubase::create_handle();
+        for (size_t i = 0; i < nVar; i++)
+        {
+            mSpatialWeights[i].prepareCuda(mGpuId);
+        }
+    }
+#endif // ENABLE_CUDA
+    GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
+
     // ***********************
     // Intialize the bandwidth
     // ***********************
@@ -115,18 +205,20 @@ mat GWRMultiscale::fit()
             BandwidthWeight* bw0 = bandwidth(i);
             bool adaptive = bw0->adaptive();
             mMaxDistances[i] = mSpatialWeights[i].distance()->maxDistance();
+            mMinDistances[i] = mSpatialWeights[i].distance()->minDistance();
 
             GWM_LOG_INFO(string(GWM_LOG_TAG_MGWR_INITIAL_BW) + to_string(i));
             BandwidthSelector selector;
             selector.setBandwidth(bw0);
-            selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : mMaxDistances[i] / 5000.0));
+            selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : mMinDistances[i]));
             selector.setUpper(mGoldenUpperBounds.value_or(adaptive ? mCoords.n_rows : mMaxDistances[i]));
             BandwidthWeight* bw = selector.optimize(this);
             if (bw)
             {
                 mSpatialWeights[i].setWeight(bw);
 #ifdef ENABLE_CUDA
-                mSpatialWeights[i].prepareCuda(mGpuId);
+                if (mParallelType & ParallelType::CUDA)
+                    mSpatialWeights[i].prepareCuda(mGpuId);
 #endif // ENABLE_CUDA
             }
             GWM_LOG_INFO(string(GWM_LOG_TAG_MGWR_INITIAL_BW)  + to_string(i)  + "," + to_string(bw->bandwidth()));
@@ -134,102 +226,83 @@ mat GWRMultiscale::fit()
         GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
     }
 
-    // *****************************************************
-    // Calculate the initial beta0 from the above bandwidths
-    // *****************************************************
-    BandwidthWeight* bw0 = bandwidth(0);
-    bool adaptive = bw0->adaptive();
-    mBandwidthSizeCriterion = bandwidthSizeCriterionAll(mBandwidthSelectionApproach[0]);
-    
-    GWM_LOG_STAGE("Calculating initial bandwidth");
-    BandwidthSelector initBwSelector;
-    initBwSelector.setBandwidth(bw0);
-    double maxDist = mSpatialWeights[0].distance()->maxDistance();
-    initBwSelector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : maxDist / 5000.0));
-    initBwSelector.setUpper(mGoldenUpperBounds.value_or(adaptive ? mCoords.n_rows : maxDist));
-    GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bw0));
-    BandwidthWeight* initBw = initBwSelector.optimize(this);
-    if (!initBw)
-    {
-        throw std::runtime_error("Cannot select initial bandwidth.");
-    }
-    mInitSpatialWeight.setWeight(initBw);
-#ifdef ENABLE_CUDA
-    mInitSpatialWeight.prepareCuda(mGpuId);
-#endif // ENABLE_CUDA
-    GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
 
-    // 初始化诊断信息矩阵
+    GWM_LOG_STAGE("Initializing diagnostic matrices");
+    mat idm = eye(nVar, nVar);
     if (mHasHatMatrix)
     {
-        mS0 = mat(nDp, nDp, fill::zeros);
-        mSArray = cube(nDp, nDp, nVar, fill::zeros);
-        mC = cube(nVar, nDp, nDp, fill::zeros);
+        mS0 = gwr.s();
+        mSArray = cube(mS0.n_rows, mS0.n_cols, nVar, fill::zeros);
+        mC = gwr.c();
+        for (uword i = 0; i < nVar; ++i)
+        {
+            for (uword j = workRange.first; j < workRange.second ; ++j)
+            {
+                uword e = j - workRange.first;
+                mSArray.slice(i).row(e) = mX(j, i) * (idm.row(i) * mC.slice(e));
+            }
+        }
     }
 
     GWM_LOG_STAGE("Model fitting");
-    mBetas = backfitting(mX, mY);
+    mBetas = backfitting(betas);
     GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
 
     // Diagnostic
     GWM_LOG_STAGE("Model Diagnostic");
-    vec shat = { 
-        mHasHatMatrix ? trace(mS0) : 0,
-        mHasHatMatrix ? trace(mS0.t() * mS0) : 0
-    };
-    mDiagnostic = CalcDiagnostic(mX, mY, shat, mRSS0);
+    vec shat(2, arma::fill::zeros);
     if (mHasHatMatrix)
     {
+#ifdef ENABLE_MPI
+        if (mParallelType & ParallelType::MPI)
+        {
+            vec shati(2);
+            shati(0) = trace(mS0.cols(workRange.first, workRange.second - 1));
+            shati(1) = trace(mS0 * mS0.t());
+            MPI_Reduce(shati.memptr(), shat.memptr(), 2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+            MPI_Bcast(shat.memptr(), 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        }
+        else
+        {
+#endif // ENABLE_MPI
+            shat = {
+                mHasHatMatrix ? trace(mS0) : 0,
+                mHasHatMatrix ? trace(mS0 * mS0.t()) : 0
+            };
+#ifdef ENABLE_MPI
+        }
+#endif // ENABLE_MPI
         mBetasTV = mBetas / mBetasSE;
     }
+    mDiagnostic = CalcDiagnostic(mX, mY, shat, mRSS0);
     vec yhat = Fitted(mX, mBetas);
     vec residual = mY - yhat;
 
     // Cleaning
 #ifdef ENABLE_CUDA
-    if (mParallelType == ParallelType::CUDA)
+    if (mParallelType & ParallelType::CUDA)
     {
-        cublasDestroy(cubase::handle);
+        cubase::destory_handle();
     }
 #endif // ENABLE_CUDA
 
     return mBetas;
 }
 
-void GWRMultiscale::createInitialDistanceParameter()
-{//回归距离计算
-    if (mInitSpatialWeight.distance()->type() == Distance::DistanceType::CRSDistance || 
-        mInitSpatialWeight.distance()->type() == Distance::DistanceType::MinkwoskiDistance)
-    {
-        mInitSpatialWeight.distance()->makeParameter({ mCoords, mCoords });
-    }
-}
-
-mat GWRMultiscale::backfitting(const mat &x, const vec &y)
+mat GWRMultiscale::backfitting(const mat& betas0)
 {
-    GWM_LOG_MGWR_BACKFITTING("Model fitting with inital bandwidth");
     uword nDp = mCoords.n_rows, nVar = mX.n_cols;
-    mat betas = (this->*mFitAll)(x, y);
+    mat betas = betas0;
+    // cout << "[backfitting] Process " << mWorkerId << " betas\n";
     GWM_LOG_STOP_RETURN(mStatus, mat(nDp, nVar, arma::fill::zeros));
 
-    mat idm = eye(nVar, nVar);
-    if (mHasHatMatrix)
-    {
-        for (uword i = 0; i < nVar; ++i)
-        {
-            for (uword j = 0; j < nDp ; ++j)
-            {
-                mSArray.slice(i).row(j) = x(j, i) * (idm.row(i) * mC.slice(j));
-            }
-        }
-    }
-
     // ***********************************************************
     // Select the optimum bandwidths for each independent variable
     // ***********************************************************
     GWM_LOG_MGWR_BACKFITTING("Selecting the optimum bandwidths for each independent variable");
     uvec bwChangeNo(nVar, fill::zeros);
-    vec resid = y - Fitted(x, betas);
+    vec resid = mY - Fitted(mX, betas);
     double RSS0 = sum(resid % resid), RSS1 = DBL_MAX;
     double criterion = DBL_MAX;
     for (size_t iteration = 1; iteration <= mMaxIteration && criterion > mCriterionThreshold; iteration++)
@@ -239,21 +312,20 @@ mat GWRMultiscale::backfitting(const mat &x, const vec &y)
         for (uword i = 0; i < nVar  ; i++)
         {
             GWM_LOG_STOP_BREAK(mStatus);
-            vec fi = betas.col(i) % x.col(i);
-            vec yi = resid + fi;
+            mXi = mX.col(i);
+            vec fi = betas.col(i) % mX.col(i);
+            mYi = resid + fi;
             if (mBandwidthInitilize[i] != BandwidthInitilizeType::Specified)
             {
                 GWM_LOG_MGWR_BACKFITTING("#variable-bandwidth-selection " + to_string(i));
                 mBandwidthSizeCriterion = bandwidthSizeCriterionVar(mBandwidthSelectionApproach[i]);
                 mBandwidthSelectionCurrentIndex = i;
-                mYi = yi;
-                mXi = mX.col(i);
                 BandwidthWeight* bwi0 = bandwidth(i);
                 bool adaptive = bwi0->adaptive();
                 BandwidthSelector selector;
                 selector.setBandwidth(bwi0);
-                double maxDist = mMaxDistances[i];
-                selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : maxDist / 5000.0));
+                double maxDist = mMaxDistances[i], minDist = mMinDistances[i];
+                selector.setLower(mGoldenLowerBounds.value_or(adaptive ? mAdaptiveLower : minDist));
                 selector.setUpper(mGoldenUpperBounds.value_or(adaptive ? mCoords.n_rows : maxDist));
                 BandwidthWeight* bwi = selector.optimize(this);
                 double bwi0s = bwi0->bandwidth(), bwi1s = bwi->bandwidth();
@@ -291,17 +363,13 @@ mat GWRMultiscale::backfitting(const mat &x, const vec &y)
             }
             GWM_LOG_STOP_BREAK(mStatus);
 
-            mat S;
-            betas.col(i) = (this->*mFitVar)(x.col(i), yi, i, S);
-            if (mHasHatMatrix)
-            {
-                mat SArrayi = mSArray.slice(i) - mS0;
-                mSArray.slice(i) = S * SArrayi + S;
-                mS0 = mSArray.slice(i) - SArrayi;
-            }
-            resid = y - Fitted(x, betas);
+            vec betai = (this->*mFitVar)(i);
+            // betas.brief_print("betas");
+            // betai.brief_print("betai");
+            betas.col(i) = betai;
+            resid = mY - Fitted(mX, betas);
         }
-        RSS1 = RSS(x, y, betas);
+        RSS1 = RSS(mX, mY, betas);
         criterion = (mCriterionType == BackFittingCriterionType::CVR) ?
                     abs(RSS1 - RSS0) :
                     sqrt(abs(RSS1 - RSS0) / RSS1);
@@ -316,6 +384,65 @@ mat GWRMultiscale::backfitting(const mat &x, const vec &y)
     return betas;
 }
 
+arma::vec gwm::GWRMultiscale::fitVarBase(const size_t var)
+{
+    mat S;
+    vec beta = (this->*mFitVarCore)(mXi, mYi, mSpatialWeights[var], S);
+    if (mHasHatMatrix)
+    {
+        mat SArrayi = mSArray.slice(var) - mS0;
+        mSArray.slice(var) = S * SArrayi + S;
+        mS0 = mSArray.slice(var) - SArrayi;
+    }
+    return beta;
+}
+
+double gwm::GWRMultiscale::bandwidthSizeCriterionVarCVBase(BandwidthWeight* bandwidthWeight)
+{
+    SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance());
+    try
+    {
+        vec betas = (this->*mFitVarCoreCV)(mXi, mYi, sw);
+        vec res = mYi - mXi % betas;
+        double cv = sum(res % res);
+        if (mStatus == Status::Success && isfinite(cv))
+        {
+            GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
+            GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
+            mBandwidthLastCriterion = cv;
+            return cv;
+        }
+        else return DBL_MAX;
+    }
+    catch(const std::exception& e)
+    {
+        return DBL_MAX;
+    }
+}
+
+double gwm::GWRMultiscale::bandwidthSizeCriterionVarAICBase(BandwidthWeight* bandwidthWeight)
+{
+    SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance());
+    try
+    {
+        vec shat;
+        mat betas = (this->*mFitVarCoreSHat)(mXi, mYi, sw, shat);
+        double value = GWRBase::AICc(mXi, mYi, betas, shat);
+        if (mStatus == Status::Success && isfinite(value))
+        {
+            GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
+            GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
+            mBandwidthLastCriterion = value;
+            return value;
+        }
+        else return DBL_MAX;
+    }
+    catch(const std::exception& e)
+    {
+        return DBL_MAX;
+    }
+}
+
 bool GWRMultiscale::isValid()
 {
     if (!(mX.n_cols > 0))
@@ -359,78 +486,22 @@ bool GWRMultiscale::isValid()
     return true;
 }
 
-mat GWRMultiscale::fitAllSerial(const mat& x, const vec& y)
-{
-    uword nDp = mCoords.n_rows, nVar = x.n_cols;
-    mat betas(nVar, nDp, fill::zeros);
-    if (mHasHatMatrix )
-    {
-        mat betasSE(nVar, nDp, fill::zeros);
-        for (uword i = 0; i < nDp ; i++)
-        {
-            GWM_LOG_STOP_BREAK(mStatus);
-            vec w = mInitSpatialWeight.weightVector(i);
-            mat xtw = trans(x.each_col() % w);
-            mat xtwx = xtw * x;
-            mat xtwy = xtw * y;
-            try
-            {
-                mat xtwx_inv = inv_sympd(xtwx);
-                betas.col(i) = xtwx_inv * xtwy;
-                mat ci = xtwx_inv * xtw;
-                betasSE.col(i) = sum(ci % ci, 1);
-                mat si = x.row(i) * ci;
-                mS0.row(i) = si;
-                mC.slice(i) = ci;
-            }
-            catch (const exception& e)
-            {
-                GWM_LOG_ERROR(e.what());
-                throw e;
-            }
-            GWM_LOG_PROGRESS(i + 1, nDp);
-        }
-        mBetasSE = betasSE.t();
-    }
-    else
-    {
-        for (int i = 0; (uword)i < nDp ; i++)
-        {
-            GWM_LOG_STOP_BREAK(mStatus);
-            vec w = mInitSpatialWeight.weightVector(i);
-            mat xtw = trans(x.each_col() % w);
-            mat xtwx = xtw * x;
-            mat xtwy = xtw * y;
-            try
-            {
-                mat xtwx_inv = inv_sympd(xtwx);
-                betas.col(i) = xtwx_inv * xtwy;
-            }
-            catch (const exception& e)
-            {
-                GWM_LOG_ERROR(e.what());
-                throw e;
-            }
-            GWM_LOG_PROGRESS(i + 1, nDp);
-        }
-    }
-    return betas.t();
-}
-
-vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat &S)
+vec GWRMultiscale::fitVarCoreSerial(const vec &x, const vec &y, const SpatialWeight& sw, mat &S)
 {
     uword nDp = mCoords.n_rows;
     mat betas(1, nDp, fill::zeros);
     bool success = true;
     std::exception except;
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     if (mHasHatMatrix)
     {
         mat ci, si;
-        S = mat(mHasHatMatrix ? nDp : 1, nDp, fill::zeros);
-        for (uword i = 0; i < nDp  ; i++)
+        uword rangeSize = workRange.second - workRange.first;
+        S = mat(rangeSize, nDp, fill::zeros);
+        for (uword i = workRange.first; i < workRange.second; i++)
         {
             GWM_LOG_STOP_BREAK(mStatus);
-            vec w = mSpatialWeights[var].weightVector(i);
+            vec w = sw.weightVector(i);
             mat xtw = trans(x % w);
             mat xtwx = xtw * x;
             mat xtwy = xtw * y;
@@ -440,7 +511,7 @@ vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat
                 betas.col(i) = xtwx_inv * xtwy;
                 ci = xtwx_inv * xtw;
                 si = x(i) * ci;
-                S.row(mHasHatMatrix ? i : 0) = si;
+                S.row(i - workRange.first) = si;
             }
             catch (const exception& e)
             {
@@ -453,10 +524,10 @@ vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat
     }
     else
     {
-        for (int i = 0; (uword)i < nDp  ; i++)
+        for (uword i = workRange.first; i < workRange.second; i++)
         {
             GWM_LOG_STOP_BREAK(mStatus);
-            vec w = mSpatialWeights[var].weightVector(i);
+            vec w = sw.weightVector(i);
             mat xtw = trans(x % w);
             mat xtwx = xtw * x;
             mat xtwy = xtw * y;
@@ -481,178 +552,83 @@ vec GWRMultiscale::fitVarSerial(const vec &x, const vec &y, const uword var, mat
     return betas.t();
 }
 
-double GWRMultiscale::bandwidthSizeCriterionAllCVSerial(BandwidthWeight *bandwidthWeight)
+vec GWRMultiscale::fitVarCoreCVSerial(const vec &x, const vec &y, const SpatialWeight& sw)
 {
-    uword nDp = mCoords.n_rows;
-    vec shat(2, fill::zeros);
-    double cv = 0.0;
-    for (uword i = 0; i < nDp; i++)
+    uword nDp = x.n_rows;
+    vec beta(nDp, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (uword i = workRange.first; i < workRange.second; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        vec d = mInitSpatialWeight.distance()->distance(i);
-        vec w = bandwidthWeight->weight(d);
+        vec w = sw.weightVector(i);
         w(i) = 0.0;
-        mat xtw = trans(mX.each_col() % w);
-        mat xtwx = xtw * mX;
-        mat xtwy = xtw * mY;
+        mat xtw = trans(x % w);
+        mat xtwx = xtw * x;
+        mat xtwy = xtw * y;
         try
         {
             mat xtwx_inv = inv_sympd(xtwx);
-            vec beta = xtwx_inv * xtwy;
-            double res = mY(i) - det(mX.row(i) * beta);
-            cv += res * res;
+            beta(i) = as_scalar(xtwx_inv * xtwy);
         }
         catch (const exception& e)
         {
             GWM_LOG_ERROR(e.what());
-            return DBL_MAX;
+            throw e;
         }
     }
-    if (mStatus == Status::Success && isfinite(cv))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
+    return beta;
 }
 
-double GWRMultiscale::bandwidthSizeCriterionAllAICSerial(BandwidthWeight *bandwidthWeight)
+vec GWRMultiscale::fitVarCoreSHatSerial(const vec &x, const vec &y, const SpatialWeight& sw, vec& shat)
 {
-    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
-    mat betas(nVar, nDp, fill::zeros);
-    vec shat(2, fill::zeros);
-    for (uword i = 0; i < nDp ; i++)
+    uword nDp = x.n_rows;
+    vec betas(nDp, fill::zeros);
+    shat = vec(2, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (uword i = workRange.first; i < workRange.second; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        vec d = mInitSpatialWeight.distance()->distance(i);
-        vec w = bandwidthWeight->weight(d);
-        mat xtw = trans(mX.each_col() % w);
-        mat xtwx = xtw * mX;
-        mat xtwy = xtw * mY;
+        vec w = sw.weightVector(i);
+        mat xtw = trans(x % w);
+        mat xtwx = xtw * x;
+        mat xtwy = xtw * y;
         try
         {
             mat xtwx_inv = inv_sympd(xtwx);
-            betas.col(i) = xtwx_inv * xtwy;
+            betas(i) = as_scalar(xtwx_inv * xtwy);
             mat ci = xtwx_inv * xtw;
-            mat si = mX.row(i) * ci;
+            mat si = x(i) * ci;
             shat(0) += si(0, i);
-            shat(1) += det(si * si.t());
+            shat(1) += as_scalar(si * si.t());
         }
         catch (const exception& e)
         {
             GWM_LOG_ERROR(e.what());
-            return DBL_MAX;
+            throw e;
         }
     }
-    double value = GWRMultiscale::AICc(mX, mY, betas.t(), shat);
-    if (mStatus == Status::Success && isfinite(value))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
-        mBandwidthLastCriterion = value;
-        return value;
-    }
-    else return DBL_MAX;
+    return betas;
 }
 
-double GWRMultiscale::bandwidthSizeCriterionVarCVSerial(BandwidthWeight *bandwidthWeight)
-{
-    size_t var = mBandwidthSelectionCurrentIndex;
-    uword nDp = mCoords.n_rows;
-    vec shat(2, fill::zeros);
-    double cv = 0.0;
-    for (uword i = 0; i < nDp; i++)
-    {
-        GWM_LOG_STOP_BREAK(mStatus);
-        vec d = mSpatialWeights[var].distance()->distance(i);
-        vec w = bandwidthWeight->weight(d);
-        w(i) = 0.0;
-        mat xtw = trans(mXi % w);
-        mat xtwx = xtw * mXi;
-        mat xtwy = xtw * mYi;
-        try
-        {
-            mat xtwx_inv = inv_sympd(xtwx);
-            vec beta = xtwx_inv * xtwy;
-            double res = mYi(i) - det(mXi(i) * beta);
-            cv += res * res;
-        }
-        catch (const exception& e)
-        {
-            GWM_LOG_ERROR(e.what());
-            return DBL_MAX;
-        }
-    }
-    if (mStatus == Status::Success && isfinite(cv))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
-}
 
-double GWRMultiscale::bandwidthSizeCriterionVarAICSerial(BandwidthWeight *bandwidthWeight)
+#ifdef ENABLE_OPENMP
+vec GWRMultiscale::fitVarCoreOmp(const vec &x, const vec &y, const SpatialWeight& sw, mat &S)
 {
-    size_t var = mBandwidthSelectionCurrentIndex;
     uword nDp = mCoords.n_rows;
     mat betas(1, nDp, fill::zeros);
-    vec shat(2, fill::zeros);
-    for (uword i = 0; i < nDp ; i++)
-    {
-        GWM_LOG_STOP_BREAK(mStatus);
-        vec d = mSpatialWeights[var].distance()->distance(i);
-        vec w = bandwidthWeight->weight(d);
-        mat xtw = trans(mXi % w);
-        mat xtwx = xtw * mXi;
-        mat xtwy = xtw * mYi;
-        try
-        {
-            mat xtwx_inv = inv_sympd(xtwx);
-            betas.col(i) = xtwx_inv * xtwy;
-            mat ci = xtwx_inv * xtw;
-            mat si = mXi(i) * ci;
-            shat(0) += si(0, i);
-            shat(1) += det(si * si.t());
-        }
-        catch (const exception& e)
-        {
-            GWM_LOG_ERROR(e.what());
-            return DBL_MAX;
-        }
-    }
-    double value = GWRMultiscale::AICc(mXi, mYi, betas.t(), shat);
-    if (mStatus == Status::Success && isfinite(value))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
-        mBandwidthLastCriterion = value;
-        return value;
-    }
-    return isfinite(value) ? value : DBL_MAX;
-}
-
-
-#ifdef ENABLE_OPENMP
-
-mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y)
-{
-    uword nDp = mCoords.n_rows, nVar = x.n_cols;
-    mat betas(nVar, nDp, fill::zeros);
     bool success = true;
     std::exception except;
-    if (mHasHatMatrix )
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    if (mHasHatMatrix)
     {
-        mat betasSE(nVar, nDp, fill::zeros);
+        uword rangeSize = workRange.second - workRange.first;
+        S = mat(rangeSize, nDp, fill::zeros);
 #pragma omp parallel for num_threads(mOmpThreadNum)
-        for (int i = 0; (uword)i < nDp; i++)
+        for (int i = int(workRange.first); (uword)i < workRange.second; i++)
         {
             GWM_LOG_STOP_CONTINUE(mStatus);
-            vec w = mInitSpatialWeight.weightVector(i);
-            mat xtw = trans(x.each_col() % w);
+            vec w = sw.weightVector(i);
+            mat xtw = trans(x % w);
             mat xtwx = xtw * x;
             mat xtwy = xtw * y;
             try
@@ -660,10 +636,8 @@ mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y)
                 mat xtwx_inv = inv_sympd(xtwx);
                 betas.col(i) = xtwx_inv * xtwy;
                 mat ci = xtwx_inv * xtw;
-                betasSE.col(i) = sum(ci % ci, 1);
-                mat si = x.row(i) * ci;
-                mS0.row(i) = si;
-                mC.slice(i) = ci;
+                mat si = x(i) * ci;
+                S.row(i - workRange.first) = si;
             }
             catch (const exception& e)
             {
@@ -673,16 +647,15 @@ mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y)
             }
             GWM_LOG_PROGRESS(i + 1, nDp);
         }
-        mBetasSE = betasSE.t();
     }
     else
     {
 #pragma omp parallel for num_threads(mOmpThreadNum)
-        for (int i = 0; (uword)i < nDp; i++)
+        for (int i = int(workRange.first); (uword)i < workRange.second; i++)
         {
             GWM_LOG_STOP_CONTINUE(mStatus);
-            vec w = mInitSpatialWeight.weightVector(i);
-            mat xtw = trans(x.each_col() % w);
+            vec w = sw.weightVector(i);
+            mat xtw = trans(x % w);
             mat xtwx = xtw * x;
             mat xtwy = xtw * y;
             try
@@ -706,141 +679,64 @@ mat GWRMultiscale::fitAllOmp(const mat &x, const vec &y)
     return betas.t();
 }
 
-vec GWRMultiscale::fitVarOmp(const vec &x, const vec &y, const uword var, mat &S)
+vec GWRMultiscale::fitVarCoreCVOmp(const vec &x, const vec &y, const SpatialWeight& sw)
 {
     uword nDp = mCoords.n_rows;
-    mat betas(1, nDp, fill::zeros);
-    bool success = true;
-    std::exception except;
-    if (mHasHatMatrix)
-    {
-        S = mat(mHasHatMatrix ? nDp : 1, nDp, fill::zeros);
+    vec beta(nDp, fill::zeros);
+    bool flag = true;
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
 #pragma omp parallel for num_threads(mOmpThreadNum)
-        for (int i = 0; (uword)i < nDp; i++)
+    for (int i = workRange.first; (uword)i < workRange.second; i++)
+    {
+        GWM_LOG_STOP_CONTINUE(mStatus);
+        if (flag)
         {
-            GWM_LOG_STOP_CONTINUE(mStatus);
-            vec w = mSpatialWeights[var].weightVector(i);
+            vec w = sw.weightVector(i);
+            w(i) = 0.0;
             mat xtw = trans(x % w);
             mat xtwx = xtw * x;
             mat xtwy = xtw * y;
             try
             {
                 mat xtwx_inv = inv_sympd(xtwx);
-                betas.col(i) = xtwx_inv * xtwy;
-                mat ci = xtwx_inv * xtw;
-                mat si = x(i) * ci;
-                S.row(mHasHatMatrix ? i : 0) = si;
+                beta(i) = as_scalar(xtwx_inv * xtwy);
             }
             catch (const exception& e)
             {
                 GWM_LOG_ERROR(e.what());
-                except = e;
-                success = false;
+                flag = false;
             }
-            GWM_LOG_PROGRESS(i + 1, nDp);
         }
     }
-    else
-    {
+    return beta;
+}
+
+vec GWRMultiscale::fitVarCoreSHatOmp(const vec &x, const vec &y, const SpatialWeight& sw, vec& shat)
+{
+    uword nDp = mCoords.n_rows;
+    vec betas(nDp, fill::zeros);
+    mat shat_all(2, mOmpThreadNum, fill::zeros);
+    bool flag = true;
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
 #pragma omp parallel for num_threads(mOmpThreadNum)
-        for (int i = 0; (uword)i < nDp; i++)
+    for (int i = workRange.first; (uword)i < workRange.second; i++)
+    {
+        GWM_LOG_STOP_CONTINUE(mStatus);
+        if (flag)
         {
-            GWM_LOG_STOP_CONTINUE(mStatus);
-            vec w = mSpatialWeights[var].weightVector(i);
+            int thread = omp_get_thread_num();
+            vec w = sw.weightVector(i);
             mat xtw = trans(x % w);
             mat xtwx = xtw * x;
             mat xtwy = xtw * y;
             try
             {
                 mat xtwx_inv = inv_sympd(xtwx);
-                betas.col(i) = xtwx_inv * xtwy;
-            }
-            catch (const exception& e)
-            {
-                GWM_LOG_ERROR(e.what());
-                except = e;
-                success = false;
-            }
-            GWM_LOG_PROGRESS(i + 1, nDp);
-        }
-    }
-    if (!success)
-    {
-        throw except;
-    }
-    return betas.t();
-}
-
-double GWRMultiscale::bandwidthSizeCriterionAllCVOmp(BandwidthWeight *bandwidthWeight)
-{
-    uword nDp = mCoords.n_rows;
-    vec shat(2, fill::zeros);
-    vec cv_all(mOmpThreadNum, fill::zeros);
-    bool flag = true;
-#pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
-    {
-        GWM_LOG_STOP_CONTINUE(mStatus);
-        if (flag)
-        {
-            int thread = omp_get_thread_num();
-            vec d = mInitSpatialWeight.distance()->distance(i);
-            vec w = bandwidthWeight->weight(d);
-            w(i) = 0.0;
-            mat xtw = trans(mX.each_col() % w);
-            mat xtwx = xtw * mX;
-            mat xtwy = xtw * mY;
-            try
-            {
-                mat xtwx_inv = inv_sympd(xtwx);
-                vec beta = xtwx_inv * xtwy;
-                double res = mY(i) - det(mX.row(i) * beta);
-                cv_all(thread) += res * res;
-            }
-            catch (const exception& e)
-            {
-                GWM_LOG_ERROR(e.what());
-                flag = false;
-            }
-        }
-    }
-    if (mStatus == Status::Success && flag)
-    {
-        double cv = sum(cv_all);
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
-}
-
-double GWRMultiscale::bandwidthSizeCriterionAllAICOmp(BandwidthWeight *bandwidthWeight)
-{
-    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
-    mat betas(nVar, nDp, fill::zeros);
-    mat shat_all(2, mOmpThreadNum, fill::zeros);
-    bool flag = true;
-#pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
-    {
-        GWM_LOG_STOP_CONTINUE(mStatus);
-        if (flag)
-        {
-            int thread = omp_get_thread_num();
-            vec d = mInitSpatialWeight.distance()->distance(i);
-            vec w = bandwidthWeight->weight(d);
-            mat xtw = trans(mX.each_col() % w);
-            mat xtwx = xtw * mX;
-            mat xtwy = xtw * mY;
-            try
-            {
-                mat xtwx_inv = inv_sympd(xtwx);
-                betas.col(i) = xtwx_inv * xtwy;
-                mat ci = xtwx_inv * xtw;
-                mat si = mX.row(i) * ci;
-                shat_all(0, thread) += si(0, i);
-                shat_all(1, thread) += det(si * si.t());
+                betas(i) = as_scalar(xtwx_inv * xtwy);
+                mat ci = xtwx_inv * xtw;
+                mat si = x(i) * ci;
+                shat_all(0, thread) += si(0, i);
+                shat_all(1, thread) += det(si * si.t());
             }
             catch (const exception& e)
             {
@@ -849,217 +745,14 @@ double GWRMultiscale::bandwidthSizeCriterionAllAICOmp(BandwidthWeight *bandwidth
             }
         }
     }
-    if (mStatus == Status::Success && flag)
-    {
-        vec shat = sum(shat_all, 1);
-        double value = GWRMultiscale::AICc(mX, mY, betas.t(), shat);
-        if (isfinite(value))
-        {
-            GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
-            GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
-            mBandwidthLastCriterion = value;
-            return value;
-        }
-        else return DBL_MAX;
-    }
-    else return DBL_MAX;
-}
-
-double GWRMultiscale::bandwidthSizeCriterionVarCVOmp(BandwidthWeight *bandwidthWeight)
-{
-    size_t var = mBandwidthSelectionCurrentIndex;
-    uword nDp = mCoords.n_rows;
-    vec shat(2, fill::zeros);
-    vec cv_all(mOmpThreadNum, fill::zeros);
-    bool flag = true;
-#pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
-    {
-        GWM_LOG_STOP_CONTINUE(mStatus);
-        if (flag)
-        {
-            int thread = omp_get_thread_num();
-            vec d = mSpatialWeights[var].distance()->distance(i);
-            vec w = bandwidthWeight->weight(d);
-            w(i) = 0.0;
-            mat xtw = trans(mXi % w);
-            mat xtwx = xtw * mXi;
-            mat xtwy = xtw * mYi;
-            try
-            {
-                mat xtwx_inv = inv_sympd(xtwx);
-                vec beta = xtwx_inv * xtwy;
-                double res = mYi(i) - det(mXi(i) * beta);
-                cv_all(thread) += res * res;
-            }
-            catch (const exception& e)
-            {
-                GWM_LOG_ERROR(e.what());
-                flag = false;
-            }
-        }
-    }
-    if (mStatus == Status::Success && flag)
-    {
-        double cv = sum(cv_all);
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
-}
-
-double GWRMultiscale::bandwidthSizeCriterionVarAICOmp(BandwidthWeight *bandwidthWeight)
-{
-    size_t var = mBandwidthSelectionCurrentIndex;
-    uword nDp = mCoords.n_rows;
-    mat betas(1, nDp, fill::zeros);
-    mat shat_all(2, mOmpThreadNum, fill::zeros);
-    bool flag = true;
-#pragma omp parallel for num_threads(mOmpThreadNum)
-    for (int i = 0; (uword)i < nDp; i++)
-    {
-        GWM_LOG_STOP_CONTINUE(mStatus);
-        if (flag)
-        {
-            int thread = omp_get_thread_num();
-            vec d = mSpatialWeights[var].distance()->distance(i);
-            vec w = bandwidthWeight->weight(d);
-            mat xtw = trans(mXi % w);
-            mat xtwx = xtw * mXi;
-            mat xtwy = xtw * mYi;
-            try
-            {
-                mat xtwx_inv = inv_sympd(xtwx);
-                betas.col(i) = xtwx_inv * xtwy;
-                mat ci = xtwx_inv * xtw;
-                mat si = mXi(i) * ci;
-                shat_all(0, thread) += si(0, i);
-                shat_all(1, thread) += det(si * si.t());
-            }
-            catch (const exception& e)
-            {
-                GWM_LOG_ERROR(e.what());
-                flag = false;
-            }
-        }
-    }
-    if (flag)
-    {
-        vec shat = sum(shat_all, 1);
-        double value = GWRMultiscale::AICc(mXi, mYi, betas.t(), shat);
-        if (isfinite(value))
-        {
-            GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, value));
-            GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - value)));
-            mBandwidthLastCriterion = value;
-            return value;
-        }
-        else return DBL_MAX;
-    }
-    return DBL_MAX;
+    shat = sum(shat_all, 1);
+    return betas;
 }
 
 #endif
 
 #ifdef ENABLE_CUDA
-
-mat GWRMultiscale::fitAllCuda(const mat& x, const vec& y)
-{
-    uword nDp = mCoords.n_rows, nVar = x.n_cols;
-    mat betas(nVar, nDp, fill::zeros);
-    mat xt = trans(x);
-    cumat u_xt(xt), u_y(y);
-    cumat u_betas(nVar, nDp);
-    cumat u_dists(nDp, 1), u_weights(nDp, 1);
-    custride u_xtw(nVar, nDp, mGroupLength);
-    mat si(nDp, mGroupLength, fill::zeros);
-    cube ci(nVar, nDp, mGroupLength, fill::zeros);
-    cube cct(nVar, nVar, mGroupLength, fill::zeros);
-    int *d_info, *p_info;
-    p_info = new int[mGroupLength];
-    checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
-    if (mHasHatMatrix)
-    {
-        mat betasSE(nVar, nDp);
-        cumat u_betasSE(nVar, nDp);
-        size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
-        for (size_t i = 0; i < groups; i++)
-        {
-            GWM_LOG_STOP_BREAK(mStatus);
-            size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
-            for (size_t j = 0, e = begin + j; j < length; j++, e++)
-            {
-                checkCudaErrors(mInitSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem()));
-                u_xtw.strides(j) = u_xt.diagmul(u_weights);
-            }
-            custride u_xtwx = u_xtw * u_xt.t();
-            custride u_xtwy = u_xtw * u_y;
-            custride u_xtwxI = u_xtwx.inv(d_info);
-            checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost));
-            for (size_t j = 0; j < mGroupLength; j++)
-            {
-                if (p_info[j] != 0)
-                {
-                    std::runtime_error e("Cuda failed to get the inverse of matrix");
-                    GWM_LOG_ERROR(e.what());
-                    throw e;
-                }
-            }
-            u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy;
-            custride u_c = u_xtwxI * u_xtw;
-            custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c;
-            custride u_cct = u_c * u_c.t();
-            u_s.get(si.memptr());
-            mS0.rows(begin, begin + length - 1) = si.head_cols(length).t();
-            u_c.get(ci.memptr());
-            mC.slices(begin, begin + length - 1) = ci.head_slices(length);
-            u_cct.get(cct.memptr());
-            for (size_t j = 0, e = i * mGroupLength + j; j < mGroupLength && e < nDp; j++, e++)
-            {
-                u_s.strides(j).get(si.memptr());
-                betasSE.col(e) = diagvec(cct.slice(j));
-            }
-            GWM_LOG_PROGRESS(begin + length, nDp);
-        }
-        u_betas.get(betas.memptr());
-        mBetasSE = betasSE.t();
-    }
-    else
-    {
-        size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
-        for (size_t i = 0; i < groups; i++)
-        {
-            GWM_LOG_STOP_BREAK(mStatus);
-            size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
-            for (size_t j = 0, e = begin + j; j < length; j++, e++)
-            {
-                checkCudaErrors(mInitSpatialWeight.weightVector(e, u_dists.dmem(), u_weights.dmem()));
-                u_xtw.strides(j) = u_xt.diagmul(u_weights);
-            }
-            custride u_xtwx = u_xtw * u_xt.t();
-            custride u_xtwy = u_xtw * u_y;
-            custride u_xtwxI = u_xtwx.inv(d_info);
-            checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost));
-            for (size_t j = 0; j < mGroupLength; j++)
-            {
-                if (p_info[j] != 0)
-                {
-                    std::runtime_error e("Cuda failed to get the inverse of matrix");
-                    GWM_LOG_ERROR(e.what());
-                    throw e;
-                }
-            }
-            u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy;
-            GWM_LOG_PROGRESS(begin + length, nDp);
-        }
-        u_betas.get(betas.memptr());
-    }
-    return betas.t();
-}
-
-vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat &S)
+vec GWRMultiscale::fitVarCoreCuda(const vec &x, const vec &y, const SpatialWeight& sw, mat &S)
 {
     uword nDp = mCoords.n_rows;
     mat betas(1, nDp, fill::zeros);
@@ -1071,18 +764,20 @@ vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat &
     int *d_info, *p_info;
     p_info = new int[mGroupLength];
     checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
-    S = mat(mHasHatMatrix ? nDp : 1, nDp, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1);
+    S = mat(mHasHatMatrix ? rangeSize : 1, nDp, fill::zeros);
     if (mHasHatMatrix)
     {
         mat sg(nDp, mGroupLength, fill::zeros);
         for (size_t i = 0; i < groups; i++)
         {
             GWM_LOG_STOP_BREAK(mStatus);
-            size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
+            size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
             for (size_t j = 0, e = begin + j; j < length; j++, e++)
             {
-                checkCudaErrors(mSpatialWeights[var].weightVector(e, u_dists.dmem(), u_weights.dmem()));
+                checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem()));
                 u_xtw.strides(j) = u_xt.diagmul(u_weights);
             }
             custride u_xtwx = u_xtw * u_xt.t();
@@ -1111,10 +806,10 @@ vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat &
         for (size_t i = 0; i < groups; i++)
         {
             GWM_LOG_STOP_BREAK(mStatus);
-            size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
+            size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
             for (size_t j = 0, e = begin + j; j < length; j++, e++)
             {
-                checkCudaErrors(mSpatialWeights[var].weightVector(e, u_dists.dmem(), u_weights.dmem()));
+                checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem()));
                 u_xtw.strides(j) = u_xt.diagmul(u_weights);
             }
             custride u_xtwx = u_xtw * u_xt.t();
@@ -1139,27 +834,28 @@ vec GWRMultiscale::fitVarCuda(const vec &x, const vec &y, const uword var, mat &
     return betas.t();
 }
 
-double GWRMultiscale::bandwidthSizeCriterionAllCVCuda(BandwidthWeight* bandwidthWeight)
+vec GWRMultiscale::fitVarCoreCVCuda(const vec &x, const vec &y, const SpatialWeight& sw)
 {
-    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
-    size_t elems = nDp;
-    cumat u_xt(mX.t()), u_y(mY);
+    uword nDp = mCoords.n_rows;
+    constexpr size_t nVar = 1;
+    cumat u_xt(mXi.t()), u_y(mYi);
     cumat u_dists(nDp, 1), u_weights(nDp, 1);
+    cumat u_betas(1, nDp);
     custride u_xtw(nVar, nDp, mGroupLength);
-    vec yhat(mGroupLength), yhat_all(nDp);
     int *d_info, *p_info;
     p_info = new int[mGroupLength];
     checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
     bool success = true;
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1);
     for (size_t i = 0; i < groups && success; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
+        size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
         for (size_t j = 0, e = begin + j; j < length; j++, e++)
         {
-            checkCudaErrors(mInitSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems));
-            checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems));
+            checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem()));
             checkCudaErrors(cudaMemcpy(u_weights.dmem() + e, &cubase::beta0, sizeof(double), cudaMemcpyHostToDevice));
             u_xtw.strides(j) = u_xt.diagmul(u_weights);
         }
@@ -1178,49 +874,41 @@ double GWRMultiscale::bandwidthSizeCriterionAllCVCuda(BandwidthWeight* bandwidth
         }
         if (success)
         {
-            custride u_betas = u_xtwxI * u_xtwy;
-            custride u_yhat = u_xt.as_stride().strides(begin, begin + length).t() * u_betas;
-            u_yhat.get(yhat.memptr());
-            yhat_all.rows(begin, begin + length - 1) = yhat.head_rows(length);
+            u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy;
         }
     }
     checkCudaErrors(cudaFree(d_info));
     delete[] p_info;
-    if (!success) return DBL_MAX;
-    double cv = as_scalar((mY - yhat_all).t() * (mY - yhat_all));
-    if (isfinite(cv))
-    {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
-        mBandwidthLastCriterion = cv;
-        return cv;
-    }
-    else return DBL_MAX;
+    vec betas(nDp, fill::zeros);
+    u_betas.get(betas.memptr());
+    return betas;
 }
 
-double GWRMultiscale::bandwidthSizeCriterionAllAICCuda(BandwidthWeight* bandwidthWeight)
+vec GWRMultiscale::fitVarCoreSHatCuda(const vec &x, const vec &y, const SpatialWeight& sw, vec& shat)
 {
-    uword nDp = mCoords.n_rows, nVar = mX.n_cols;
-    size_t elems = nDp;
-    cumat u_xt(mX.t()), u_y(mY), u_betas(nVar, nDp);
+    uword nDp = mCoords.n_rows;
+    constexpr size_t nVar = 1;
+    cumat u_xt(mXi.t()), u_y(mYi), u_betas(nVar, nDp);
     cumat u_dists(nDp, 1), u_weights(nDp, 1);
     custride u_xtw(nVar, nDp, mGroupLength);
     mat betas(nVar, nDp);
-    vec shat(2), sst(mGroupLength);
+    shat = vec(2);
+    vec sst(mGroupLength);
     mat sg(nDp, mGroupLength);
     int *d_info, *p_info;
     p_info = new int[mGroupLength];
     checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
     bool success = true;
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    uword rangeSize = workRange.second - workRange.first;
+    size_t groups = rangeSize / mGroupLength + (rangeSize % mGroupLength == 0 ? 0 : 1);
     for (size_t i = 0; i < groups && success; i++)
     {
         GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
+        size_t begin = workRange.first + i * mGroupLength, length = (begin + mGroupLength > workRange.second) ? (workRange.second - begin) : mGroupLength;
         for (size_t j = 0, e = begin + j; j < length; j++, e++)
         {
-            checkCudaErrors(mInitSpatialWeight.distance()->distance(e, u_dists.dmem(), &elems));
-            checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems));
+            checkCudaErrors(sw.weightVector(e, u_dists.dmem(), u_weights.dmem()));
             u_xtw.strides(j) = u_xt.diagmul(u_weights);
         }
         custride u_xtwx = u_xtw * u_xt.t();
@@ -1250,71 +938,60 @@ double GWRMultiscale::bandwidthSizeCriterionAllAICCuda(BandwidthWeight* bandwidt
     }
     checkCudaErrors(cudaFree(d_info));
     delete[] p_info;
-    if (!success) return DBL_MAX;
     u_betas.get(betas.memptr());
-    double aic = GWRMultiscale::AICc(mX, mY, betas.t(), shat);
-    if (isfinite(aic))
+    return betas.t();
+}
+#endif // ENABLE_CUDA
+
+#ifdef ENABLE_MPI
+vec GWRMultiscale::fitVarMpi(const size_t var)
+{
+    uword nDp = mXi.n_rows;
+    mat S;
+    vec beta_p = (this->*mFitVarCore)(mXi, mYi, mSpatialWeights[var], S);
+    vec beta(nDp);
+    MPI_Allreduce(beta_p.memptr(), beta.memptr(), nDp, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+    if (mHasHatMatrix)
     {
-        GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, aic));
-        GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - aic)));
-        mBandwidthLastCriterion = aic;
-        return aic;
+        mat SArrayi = mSArray.slice(var) - mS0;
+        mat_mul_mpi(S, SArrayi, mSArray.slice(var), mWorkerId, mWorkerNum, mWorkRangeSize);
+        mSArray.slice(var) += S;
+        mS0 = mSArray.slice(var) - SArrayi;
     }
-    else return DBL_MAX;
+    return beta;
 }
 
-double GWRMultiscale::bandwidthSizeCriterionVarCVCuda(BandwidthWeight* bandwidthWeight)
+double GWRMultiscale::bandwidthSizeCriterionVarCVMpi(BandwidthWeight* bandwidthWeight)
 {
-    size_t var = mBandwidthSelectionCurrentIndex;
-    uword nDp = mCoords.n_rows;
-    size_t elems = nDp;
-    constexpr size_t nVar = 1;
-    cumat u_xt(mXi.t()), u_y(mYi);
-    cumat u_dists(nDp, 1), u_weights(nDp, 1);
-    custride u_xtw(nVar, nDp, mGroupLength);
-    vec yhat(mGroupLength), yhat_all(nDp);
-    int *d_info, *p_info;
-    p_info = new int[mGroupLength];
-    checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
-    bool success = true;
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
-    for (size_t i = 0; i < groups && success; i++)
+    SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance());
+    int status = 1;
+    arma::Col<int> status_all(mWorkerNum);
+    vec betas;
+    try
     {
-        GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
-        for (size_t j = 0, e = begin + j; j < length; j++, e++)
-        {
-            checkCudaErrors(mSpatialWeights[var].distance()->distance(e, u_dists.dmem(), &elems));
-            checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems));
-            checkCudaErrors(cudaMemcpy(u_weights.dmem() + e, &cubase::beta0, sizeof(double), cudaMemcpyHostToDevice));
-            u_xtw.strides(j) = u_xt.diagmul(u_weights);
-        }
-        custride u_xtwx = u_xtw * u_xt.t();
-        custride u_xtwy = u_xtw * u_y;
-        custride u_xtwxI = u_xtwx.inv(d_info);
-        checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost));
-        for (size_t j = 0; j < mGroupLength; j++)
-        {
-            if (p_info[j] != 0)
-            {
-                GWM_LOG_ERROR("Cuda failed to get the inverse of matrix");
-                success = false;
-                break;
-            }
-        }
-        if (success)
-        {
-            custride u_betas = u_xtwxI * u_xtwy;
-            custride u_yhat = u_xt.as_stride().strides(begin, begin + length).t() * u_betas.strides(0, length);
-            u_yhat.get(yhat.memptr());
-            yhat_all.rows(begin, begin + length - 1) = yhat.head_rows(length);
-        }
+        betas = (this->*mFitVarCoreCV)(mXi, mYi, sw);
     }
-    checkCudaErrors(cudaFree(d_info));
-    delete[] p_info;
-    if (!success) return DBL_MAX;
-    double cv = as_scalar((mYi - yhat_all).t() * (mYi - yhat_all));
-    if (isfinite(cv))
+    catch(const std::exception& e)
+    {
+        status = 0;
+    }
+    MPI_Allgather(&status, 1, MPI_INT, status_all.memptr(), 1, MPI_INT, MPI_COMM_WORLD);
+    if (!all(status_all))
+        return DBL_MAX;
+    
+    // If all right, calculate cv;
+    double cv;
+    vec betas_all;
+    GWM_MPI_MASTER_BEGIN
+    betas_all = vec(size(betas));
+    GWM_MPI_MASTER_END
+    MPI_Reduce(betas.memptr(), betas_all.memptr(), betas.n_elem, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    GWM_MPI_MASTER_BEGIN
+    vec residual = mYi - mXi % betas_all;
+    cv = sum(residual % residual);
+    GWM_MPI_MASTER_END
+    MPI_Bcast(&cv, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    if (mStatus == Status::Success && isfinite(cv))
     {
         GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, cv));
         GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - cv)));
@@ -1324,64 +1001,38 @@ double GWRMultiscale::bandwidthSizeCriterionVarCVCuda(BandwidthWeight* bandwidth
     else return DBL_MAX;
 }
 
-double GWRMultiscale::bandwidthSizeCriterionVarAICCuda(BandwidthWeight* bandwidthWeight)
+double GWRMultiscale::bandwidthSizeCriterionVarAICMpi(BandwidthWeight* bandwidthWeight)
 {
-    size_t var = mBandwidthSelectionCurrentIndex;
-    uword nDp = mCoords.n_rows;
-    size_t elems = nDp;
-    constexpr size_t nVar = 1;
-    cumat u_xt(mXi.t()), u_y(mYi), u_betas(nVar, nDp);
-    cumat u_dists(nDp, 1), u_weights(nDp, 1);
-    custride u_xtw(nVar, nDp, mGroupLength);
-    mat betas(nVar, nDp);
-    vec shat(2), sst(mGroupLength);
-    mat sg(nDp, mGroupLength);
-    int *d_info, *p_info;
-    p_info = new int[mGroupLength];
-    checkCudaErrors(cudaMalloc(&d_info, sizeof(int) * mGroupLength));
-    bool success = true;
-    size_t groups = nDp / mGroupLength + (nDp % mGroupLength == 0 ? 0 : 1);
-    for (size_t i = 0; i < groups && success; i++)
+    SpatialWeight sw(bandwidthWeight, mSpatialWeights[mBandwidthSelectionCurrentIndex].distance());
+    int status = 1;
+    arma::Col<int> status_all(mWorkerNum);
+    vec betas, shat;
+    try
     {
-        GWM_LOG_STOP_BREAK(mStatus);
-        size_t begin = i * mGroupLength, length = (begin + mGroupLength > nDp) ? (nDp - begin) : mGroupLength;
-        for (size_t j = 0, e = begin + j; j < length; j++, e++)
-        {
-            checkCudaErrors(mSpatialWeights[var].distance()->distance(e, u_dists.dmem(), &elems));
-            checkCudaErrors(bandwidthWeight->weight(u_dists.dmem(), u_weights.dmem(), elems));
-            u_xtw.strides(j) = u_xt.diagmul(u_weights);
-        }
-        custride u_xtwx = u_xtw * u_xt.t();
-        custride u_xtwy = u_xtw * u_y;
-        custride u_xtwxI = u_xtwx.inv(d_info);
-        checkCudaErrors(cudaMemcpy(p_info, d_info, sizeof(int) * mGroupLength, cudaMemcpyDeviceToHost));
-        for (size_t j = 0; j < mGroupLength; j++)
-        {
-            if (p_info[j] != 0)
-            {
-                GWM_LOG_ERROR("Cuda failed to get the inverse of matrix");
-                success = false;
-                break;
-            }
-        }
-        if (success)
-        {
-            u_betas.as_stride().strides(begin, begin + length) = u_xtwxI * u_xtwy;
-            custride u_c = u_xtwxI * u_xtw;
-            custride u_s = u_xt.as_stride().strides(begin, begin + length).t() * u_c;
-            custride u_sst = u_s * u_s.t();
-            u_s.get(sg.memptr());
-            u_sst.get(sst.memptr());
-            shat(0) += trace(sg.submat(begin, 0, arma::SizeMat(length, length)));
-            shat(1) += sum(sst);
-        }
+        betas = (this->*mFitVarCoreSHat)(mXi, mYi, sw, shat);
     }
-    checkCudaErrors(cudaFree(d_info));
-    delete[] p_info;
-    if (!success) return DBL_MAX;
-    u_betas.get(betas.memptr());
-    double aic = GWRMultiscale::AICc(mXi, mYi, betas.t(), shat);
-    if (isfinite(aic))
+    catch(const std::exception& e)
+    {
+        status = 0;
+    }
+    MPI_Allgather(&status, 1, MPI_INT, status_all.memptr(), 1, MPI_INT, MPI_COMM_WORLD);
+    if (!all(status_all))
+        return DBL_MAX;
+    
+    // If all right, calculate cv;
+    double aic;
+    vec betas_all, shat_all;
+    GWM_MPI_MASTER_BEGIN
+    betas_all = vec(size(betas));
+    shat_all = vec(size(shat));
+    GWM_MPI_MASTER_END
+    MPI_Reduce(betas.memptr(), betas_all.memptr(), betas.n_elem, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(shat.memptr(), shat_all.memptr(), shat.n_elem, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    GWM_MPI_MASTER_BEGIN
+    aic = GWRBasic::AICc(mXi, mYi, betas_all, shat_all);
+    GWM_MPI_MASTER_END
+    MPI_Bcast(&aic, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    if (mStatus == Status::Success && isfinite(aic))
     {
         GWM_LOG_INFO(IBandwidthSelectable::infoBandwidthCriterion(bandwidthWeight, aic));
         GWM_LOG_PROGRESS_PERCENT(exp(- abs(mBandwidthLastCriterion - aic)));
@@ -1390,57 +1041,29 @@ double GWRMultiscale::bandwidthSizeCriterionVarAICCuda(BandwidthWeight* bandwidt
     }
     else return DBL_MAX;
 }
-
-#endif // ENABLE_CUDA
-
-GWRMultiscale::BandwidthSizeCriterionFunction GWRMultiscale::bandwidthSizeCriterionAll(GWRMultiscale::BandwidthSelectionCriterionType type)
-{
-    unordered_map<BandwidthSelectionCriterionType, unordered_map<ParallelType, BandwidthSizeCriterionFunction> > mapper = {
-        std::make_pair<BandwidthSelectionCriterionType, unordered_map<ParallelType, BandwidthSizeCriterionFunction> >(BandwidthSelectionCriterionType::CV, {
-        #ifdef ENABLE_OPENMP
-            std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionAllCVOmp),
-        #endif
-        #ifdef ENABLE_CUDA
-            std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionAllCVCuda),
-        #endif
-            std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionAllCVSerial)
-        }),
-        std::make_pair<BandwidthSelectionCriterionType, unordered_map<ParallelType, BandwidthSizeCriterionFunction> >(BandwidthSelectionCriterionType::AIC, {
-        #ifdef ENABLE_OPENMP
-            std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionAllAICOmp),
-        #endif
-        #ifdef ENABLE_CUDA
-            std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionAllAICCuda),
-        #endif
-            std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionAllAICSerial)
-        })
-    };
-    return mapper[type][mParallelType];
-}
+#endif // ENABLE_MPI
 
 GWRMultiscale::BandwidthSizeCriterionFunction GWRMultiscale::bandwidthSizeCriterionVar(GWRMultiscale::BandwidthSelectionCriterionType type)
 {
-    unordered_map<BandwidthSelectionCriterionType, unordered_map<ParallelType, BandwidthSizeCriterionFunction> > mapper = {
-        std::make_pair<BandwidthSelectionCriterionType, unordered_map<ParallelType, BandwidthSizeCriterionFunction> >(BandwidthSelectionCriterionType::CV, {
-        #ifdef ENABLE_OPENMP
-            std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionVarCVOmp),
-        #endif
-        #ifdef ENABLE_CUDA
-            std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionVarCVCuda),
-        #endif
-            std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionVarCVSerial)
-        }),
-        std::make_pair<BandwidthSelectionCriterionType, unordered_map<ParallelType, BandwidthSizeCriterionFunction> >(BandwidthSelectionCriterionType::AIC, {
-        #ifdef ENABLE_OPENMP
-            std::make_pair(ParallelType::OpenMP, &GWRMultiscale::bandwidthSizeCriterionVarAICOmp),
-        #endif
-        #ifdef ENABLE_CUDA
-            std::make_pair(ParallelType::CUDA, &GWRMultiscale::bandwidthSizeCriterionVarAICCuda),
-        #endif
-            std::make_pair(ParallelType::SerialOnly, &GWRMultiscale::bandwidthSizeCriterionVarAICSerial)
-        })
-    };
-    return mapper[type][mParallelType];
+#ifdef ENABLE_MPI
+    if (mParallelType & ParallelType::MPI)
+    {
+        switch (type)
+        {
+        case BandwidthSelectionCriterionType::AIC:
+            return &GWRMultiscale::bandwidthSizeCriterionVarAICMpi;
+        default:
+            return &GWRMultiscale::bandwidthSizeCriterionVarCVMpi;
+        }
+    }
+#endif // ENABLE_MPI
+    switch (type)
+    {
+    case BandwidthSelectionCriterionType::AIC:
+        return &GWRMultiscale::bandwidthSizeCriterionVarAICBase;
+    default:
+        return &GWRMultiscale::bandwidthSizeCriterionVarCVBase;
+    }
 }
 
 void GWRMultiscale::setParallelType(const ParallelType &type)
@@ -1449,27 +1072,36 @@ void GWRMultiscale::setParallelType(const ParallelType &type)
     {
         mParallelType = type;
         switch (type) {
-        case ParallelType::SerialOnly:
-            mFitAll = &GWRMultiscale::fitAllSerial;
-            mFitVar = &GWRMultiscale::fitVarSerial;
-            break;
 #ifdef ENABLE_OPENMP
         case ParallelType::OpenMP:
-            mFitAll = &GWRMultiscale::fitAllOmp;
-            mFitVar = &GWRMultiscale::fitVarOmp;
+            mFitVarCore = &GWRMultiscale::fitVarCoreOmp;
+            mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVOmp;
+            mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatOmp;
             break;
 #endif
 #ifdef ENABLE_CUDA
         case ParallelType::CUDA:
-            mFitAll = &GWRMultiscale::fitAllCuda;
-            mFitVar = &GWRMultiscale::fitVarCuda;
+            mFitVarCore = &GWRMultiscale::fitVarCoreCuda;
+            mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVCuda;
+            mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatCuda;
             break;
 #endif
         default:
-            mFitAll = &GWRMultiscale::fitAllSerial;
-            mFitVar = &GWRMultiscale::fitVarSerial;
+            mFitVarCore = &GWRMultiscale::fitVarCoreSerial;
+            mFitVarCoreCV = &GWRMultiscale::fitVarCoreCVSerial;
+            mFitVarCoreSHat = &GWRMultiscale::fitVarCoreSHatSerial;
             break;
         }
+#ifdef ENABLE_MPI
+        if (mParallelType & ParallelType::MPI)
+        {
+            mFitVar = &GWRMultiscale::fitVarMpi;
+        }
+        else
+        {
+            mFitVar = &GWRMultiscale::fitVarBase;
+        }
+#endif // ENABLE_MPI
     }
 }
 
diff --git a/src/gwmodelpp/spatialweight/BandwidthWeight.cpp b/src/gwmodelpp/spatialweight/BandwidthWeight.cpp
index 13a62779..d2ba1e4d 100644
--- a/src/gwmodelpp/spatialweight/BandwidthWeight.cpp
+++ b/src/gwmodelpp/spatialweight/BandwidthWeight.cpp
@@ -63,7 +63,7 @@ vec BandwidthWeight::weight(vec dist)
 #ifdef ENABLE_CUDA
 cudaError_t BandwidthWeight::weight(double* d_dists, double* d_weights, size_t elems)
 {
-    if (!mCudaPrepared) throw std::logic_error("Cuda has not been prepared.");
+    if (!mCudaPrepared) throw std::logic_error("[BandwidthWeight] Cuda has not been prepared.");
     return gw_weight_cuda(mBandwidth, (int)mKernel, mAdaptive, d_dists, d_weights, elems, mCudaThreads);
 }
 #endif // ENABLE_CUDA
diff --git a/src/gwmodelpp/spatialweight/CRSDistance.cpp b/src/gwmodelpp/spatialweight/CRSDistance.cpp
index c3b97050..0c00116f 100644
--- a/src/gwmodelpp/spatialweight/CRSDistance.cpp
+++ b/src/gwmodelpp/spatialweight/CRSDistance.cpp
@@ -90,7 +90,8 @@ CRSDistance::CRSDistance(const CRSDistance &distance) : Distance(distance)
         mParameter = make_unique<Parameter>(fp, dp);
 #ifdef ENABLE_CUDA
         mUseCuda = distance.mUseCuda;
-        if (distance.mCudaPrepared)
+        mCudaPrepared = false;
+        if (distance.mUseCuda)
         {
             prepareCuda(distance.mGpuID);
         }
@@ -158,25 +159,25 @@ double CRSDistance::minDistance()
 cudaError_t CRSDistance::prepareCuda(size_t gpuId)
 {
     checkCudaErrors(Distance::prepareCuda(gpuId));
-    checkCudaErrors(cudaMalloc(&mCudaDp, sizeof(double) * mParameter->dataPoints.n_elem));
-    checkCudaErrors(cudaMalloc(&mCudaFp, sizeof(double) * mParameter->focusPoints.n_elem));
-    mat dpt = mParameter->dataPoints.t(), fpt = mParameter->focusPoints.t();
-    checkCudaErrors(cudaMemcpy(mCudaDp, dpt.mem, sizeof(double) * dpt.n_elem, cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(mCudaFp, fpt.mem, sizeof(double) * fpt.n_elem, cudaMemcpyHostToDevice));
-    mCudaPrepared = true;
+    if (!mCudaPrepared)
+    {
+        mCudaDp = std::move(cumat(mParameter->dataPoints.t()));
+        mCudaFp = std::move(cumat(mParameter->focusPoints.t()));
+        mCudaPrepared = true;
+    }
     return cudaSuccess;
 }
 
 cudaError_t CRSDistance::distance(uword focus, double *d_dists, size_t *elems)
 {
     if (mParameter == nullptr) throw std::runtime_error("Parameter is nullptr.");
-    if (!mCudaPrepared) throw std::logic_error("Cuda has not been prepared.");
-    if (mCudaDp == 0 || mCudaFp == 0 || mCudaThreads == 0) throw std::logic_error("Cuda has not been correctly prepared.");
+    if (!mCudaPrepared) throw std::logic_error("[CRSDistance] Cuda has not been prepared.");
+    if (mCudaDp.dmem() == nullptr || mCudaFp.dmem() == nullptr || mCudaThreads == 0) throw std::logic_error("Cuda has not been correctly prepared.");
     if (focus < mParameter->total)
     {
         size_t fbias = focus * mParameter->focusPoints.n_cols;
         *elems = mParameter->total;
-        return mCalculatorCuda(mCudaDp, mCudaFp + fbias, mParameter->total, mCudaThreads, d_dists);
+        return mCalculatorCuda(mCudaDp.dmem(), mCudaFp.dmem() + fbias, mParameter->total, mCudaThreads, d_dists);
     }
     else throw std::runtime_error("Target is out of bounds of data points.");
 }
diff --git a/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu b/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu
index 0769b0c5..b7c06688 100644
--- a/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu
+++ b/src/gwmodelpp/spatialweight/cuda/BandwidthWeightKernel.cu
@@ -56,6 +56,14 @@ __global__ void gw_weight_boxcar_kernel(const double *d_dists, double bw, double
 }
 
 
+__global__ void gw_weight_ones(const double *d_dists, double bw, double* d_weights, int n)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n) return;
+    d_weights[index] = 1;
+}
+
+
 typedef void(*WEIGHT_KERNEL_CUDA)(const double*, double, double*, int);
 
 
@@ -72,11 +80,17 @@ cudaError_t gw_weight_cuda(double bw, int kernel, bool adaptive, double *d_dists
 {
     cudaError_t error;
     const WEIGHT_KERNEL_CUDA *kerf = GWRKernelCuda + kernel;
+    dim3 blockSize(threads), gridSize((ndp + blockSize.x - 1) / blockSize.x);
+    if (bw == DBL_MAX)
+    {
+        gw_weight_ones<< <gridSize, blockSize >> > (d_dists, bw, d_weight, ndp);
+        error = cudaGetLastError();
+        return error;
+    }
     switch (adaptive)
     {
         case true:
         {
-            dim3 blockSize(threads), gridSize((ndp + blockSize.x - 1) / blockSize.x);
             // Backup d_dists, used for sort
             double *d_dists_bak;
             cudaMalloc((void **)&d_dists_bak, sizeof(double) * ndp);
@@ -95,7 +109,6 @@ cudaError_t gw_weight_cuda(double bw, int kernel, bool adaptive, double *d_dists
         }
         default:
         {
-            dim3 blockSize(threads), gridSize((ndp + blockSize.x - 1) / blockSize.x);
             (*kerf) << <gridSize, blockSize >> > (d_dists, bw, d_weight, ndp);
             error = cudaGetLastError();
             break;
diff --git a/src/gwmodelpp/utils/armampi.cpp b/src/gwmodelpp/utils/armampi.cpp
new file mode 100644
index 00000000..024db9b9
--- /dev/null
+++ b/src/gwmodelpp/utils/armampi.cpp
@@ -0,0 +1,25 @@
+#include "gwmodelpp/utils/armampi.h"
+#include <mpi.h>
+#include <memory>
+
+using namespace std;
+using namespace arma;
+
+void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np, const size_t range)
+{
+    arma::uword m = a.n_rows, n = b.n_cols;
+    int k = (int) b.n_rows;
+    arma::Col<int> b_rows(np, arma::fill::zeros);
+    MPI_Allgather(&k, 1, MPI_INT, b_rows.memptr(), 1, MPI_INT, MPI_COMM_WORLD);
+    c = mat(m, n, arma::fill::zeros);
+    mat a_buf;
+    for (int pi = 0; pi < np; pi++)
+    {
+        arma::Col<int> a_counts = b_rows(pi) * b_rows;
+        arma::Col<int> a_disp = arma::cumsum(a_counts) - a_counts;
+        a_buf.resize(b_rows(pi), b_rows(ip));
+        MPI_Scatterv(a.memptr(), a_counts.mem, a_disp.mem, MPI_DOUBLE, a_buf.memptr(), a_buf.n_elem, MPI_DOUBLE, pi, MPI_COMM_WORLD);
+        mat ci = a_buf * b;
+        MPI_Reduce(ci.memptr(), c.memptr(), ci.n_elem, MPI_DOUBLE, MPI_SUM, pi, MPI_COMM_WORLD);
+    }
+}
diff --git a/src/gwmodelpp/utils/cumat.cpp b/src/gwmodelpp/utils/cumat.cpp
index f03bbf18..f3f31d72 100644
--- a/src/gwmodelpp/utils/cumat.cpp
+++ b/src/gwmodelpp/utils/cumat.cpp
@@ -77,3 +77,10 @@ cumat &cumat::operator=(cuop_diagmul &&op)
     op.eval(*this);
     return *this;
 }
+
+void print(cumat &mat)
+{
+    arma::mat am(mat.nrows(), mat.ncols());
+    mat.get(am.memptr());
+    am.brief_print();
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f5fe3bc2..f7cafaad 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -29,9 +29,9 @@ set(SAMPLE_DATA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/data)
 add_definitions(-DSAMPLE_DATA_DIR="${SAMPLE_DATA_DIR}")
 
 set(ADDON_SOURCES
-    data/londonhp100.cpp
-    TerminateCheckTelegram.cpp
-    FileTelegram.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/data/londonhp100.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/TerminateCheckTelegram.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/FileTelegram.cpp
 )
 
 add_definitions(-DSAMPLE_DATA_DIR="${SAMPLE_DATA_DIR}")
@@ -129,3 +129,7 @@ add_test(
     COMMAND $<TARGET_FILE:testCuMat> --success --skip-benchmarks
 )
 endif()
+
+if (ENABLE_MPI)
+add_subdirectory("./mpi")
+endif()
diff --git a/test/include/TerminateCheckTelegram.h b/test/include/TerminateCheckTelegram.h
index edb588f5..3b803eac 100644
--- a/test/include/TerminateCheckTelegram.h
+++ b/test/include/TerminateCheckTelegram.h
@@ -56,5 +56,9 @@ class TerminateCheckTelegram : public Logger
 const std::map<ParallelType, std::string> ParallelTypeDict = {
     std::make_pair(ParallelType::SerialOnly, "SerialOnly"),
     std::make_pair(ParallelType::OpenMP, "OpenMP"),
-    std::make_pair(ParallelType::CUDA, "CUDA")
+    std::make_pair(ParallelType::CUDA, "CUDA"),
+    std::make_pair(ParallelType::MPI, "MPI"),
+    std::make_pair(ParallelType::MPI_Serial, "MPI_Serial"),
+    std::make_pair(ParallelType::MPI_MP, "MPI_MP"),
+    std::make_pair(ParallelType::MPI_CUDA, "MPI_CUDA")
 };
diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt
new file mode 100644
index 00000000..255a667a
--- /dev/null
+++ b/test/mpi/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.17)
+
+if(ENABLE_MPI)
+find_package(MPI REQUIRED)
+include_directories(${MPI_CXX_HEADER_DIR})
+add_link_options(${MPI_CXX_LINK_FLAGS})
+add_compile_options(${MPI_CXX_COMPILE_OPTIONS})
+add_definitions(${MPI_CXX_COMPILE_DEFINITIONS})
+endif(ENABLE_MPI)
+
+add_executable(testGWRBasicMpi testGWRBasicMpi.cpp ${ADDON_SOURCES})
+target_link_libraries(testGWRBasicMpi PRIVATE gwmodel ${ARMADILLO_LIBRARIES} ${MPI_CXX_LIBRARIES} Catch2::Catch2WithMain)
+add_test(
+    NAME testGWRBasicMpi 
+    COMMAND ${MPIEXEC_EXECUTABLE} -np 4 $<TARGET_FILE:testGWRBasicMpi> --success --skip-benchmarks
+)
+
+add_executable(testMpiMatMul testMpiMatMul.cpp)
+target_link_libraries(testMpiMatMul PRIVATE gwmodel ${ARMADILLO_LIBRARIES} Catch2::Catch2WithMain)
+add_test(
+    NAME testMpiMatMul
+    COMMAND ${MPIEXEC_EXECUTABLE} -np 4 $<TARGET_FILE:testMpiMatMul> --success --skip-benchmarks
+)
+
+add_executable(testGWRMultiscaleMpi testGWRMultiscaleMpi.cpp ${ADDON_SOURCES})
+target_link_libraries(testGWRMultiscaleMpi PRIVATE gwmodel ${ARMADILLO_LIBRARIES} ${MPI_CXX_LIBRARIES} Catch2::Catch2WithMain)
+add_test(
+    NAME testGWRMultiscaleMpi 
+    COMMAND ${MPIEXEC_EXECUTABLE} -np 4 $<TARGET_FILE:testGWRMultiscaleMpi> --success --skip-benchmarks
+)
diff --git a/test/mpi/testGWRBasicMpi.cpp b/test/mpi/testGWRBasicMpi.cpp
new file mode 100644
index 00000000..c5b59934
--- /dev/null
+++ b/test/mpi/testGWRBasicMpi.cpp
@@ -0,0 +1,310 @@
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch_all.hpp>
+
+#include <vector>
+#include <string>
+#include <armadillo>
+#include <mpi.h>
+#include "gwmodelpp/GWRBasic.h"
+#include "gwmodelpp/spatialweight/CRSDistance.h"
+#include "gwmodelpp/spatialweight/BandwidthWeight.h"
+#include "gwmodelpp/spatialweight/SpatialWeight.h"
+#include "londonhp100.h"
+#include "TerminateCheckTelegram.h"
+#include "FileTelegram.h"
+
+using namespace std;
+using namespace arma;
+using namespace gwm;
+
+TEST_CASE("BasicGWR: LondonHP")
+{
+    int iProcess, nProcess;
+    MPI_Comm_size(MPI_COMM_WORLD, &nProcess);
+    MPI_Comm_rank(MPI_COMM_WORLD, &iProcess);
+
+    mat londonhp100_coord, londonhp100_data, x;
+    vec y;
+    vector<string> londonhp100_fields;
+
+    if (iProcess == 0)
+    {
+        if (!read_londonhp100(londonhp100_coord, londonhp100_data, londonhp100_fields))
+        {
+            FAIL("Cannot load londonhp100 data.");
+        }
+        y = londonhp100_data.col(0);
+        x = join_rows(ones(londonhp100_coord.n_rows), londonhp100_data.cols(1, 3));
+    }
+
+    const initializer_list<ParallelType> parallel_list = {
+        ParallelType::MPI_Serial
+#ifdef ENABLE_OPENMP
+        , ParallelType::MPI_MP
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        , ParallelType::MPI_CUDA
+#endif // ENABLE_CUDA
+    };
+
+    SECTION("adaptive bandwidth | no bandwidth optimization | no variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        if (iProcess == 0)
+        {
+            algorithm.setCoords(londonhp100_coord);
+            algorithm.setDependentVariable(y);
+            algorithm.setIndependentVariables(x);
+        }
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerNum(nProcess);
+        algorithm.setWorkerId(iProcess);
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::MPI_CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(16);
+        }
+#endif // ENABLE_CUDA
+        REQUIRE_NOTHROW(algorithm.fit());
+        if (iProcess == 0)
+        {
+            REQUIRE(algorithm.hasIntercept() == true);
+            RegressionDiagnostic diagnostic = algorithm.diagnostic();
+            REQUIRE_THAT(diagnostic.AIC, Catch::Matchers::WithinAbs(2436.60445730413, 1e-8));
+            REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2448.27206524754, 1e-8));
+            REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.708010632044736, 1e-8));
+            REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.674975341723766, 1e-8));
+        }
+    }
+    
+    SECTION("fixed bandwidth | no bandwidth optimization | no variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+        
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(5000, false, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        if (iProcess == 0)
+        {
+            algorithm.setCoords(londonhp100_coord);
+            algorithm.setDependentVariable(y);
+            algorithm.setIndependentVariables(x);
+        }
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::MPI_MP)
+        {
+            algorithm.setOmpThreadNum(2);
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::MPI_CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(16);
+        }
+#endif // ENABLE_CUDA
+        REQUIRE_NOTHROW(algorithm.fit());
+        if (iProcess == 0)
+        {
+            RegressionDiagnostic diagnostic = algorithm.diagnostic();
+            REQUIRE_THAT(diagnostic.AIC, Catch::Matchers::WithinAbs(2437.649574267587, 1e-8));
+            REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2447.676281164379, 1e-8));
+            REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.701466295457, 1e-8));
+            REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.673691594464, 1e-8));
+        }
+    }
+
+    SECTION("adaptive bandwidth | bandwidth optimization | no variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        auto criterion = GENERATE(GWRBasic::BandwidthSelectionCriterionType::CV, GWRBasic::BandwidthSelectionCriterionType::AIC);
+        INFO("Parallel:" << ParallelTypeDict.at(parallel) << " ; Criterion:" << criterion);
+        
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(0, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        if (iProcess == 0)
+        {
+            algorithm.setCoords(londonhp100_coord);
+            algorithm.setDependentVariable(y);
+            algorithm.setIndependentVariables(x);
+        }
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setIsAutoselectBandwidth(true);
+        algorithm.setBandwidthSelectionCriterion(criterion);
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::MPI_MP)
+        {
+            algorithm.setOmpThreadNum(2);
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::MPI_CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(16);
+        }
+#endif // ENABLE_CUDA
+        REQUIRE_NOTHROW(algorithm.fit());
+        if (iProcess == 0)
+        {
+            size_t bw = (size_t)algorithm.spatialWeight().weight<BandwidthWeight>()->bandwidth();
+            size_t bw0 = 67;
+            switch (criterion)
+            {
+            case GWRBasic::BandwidthSelectionCriterionType::AIC:
+                bw0 = 31;
+                break;
+            default:
+                break;
+            }
+            REQUIRE(bw == bw0);
+        }
+    }
+    
+    SECTION("adaptive bandwidth | no bandwidth optimization | AIC variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+        
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        if (iProcess == 0)
+        {
+            algorithm.setCoords(londonhp100_coord);
+            algorithm.setDependentVariable(y);
+            algorithm.setIndependentVariables(x);
+        }
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setIsAutoselectIndepVars(true);
+        algorithm.setIndepVarSelectionThreshold(3.0);
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerNum(nProcess);
+        algorithm.setWorkerId(iProcess);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::MPI_MP)
+        {
+            algorithm.setOmpThreadNum(2);
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::MPI_CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(16);
+        }
+#endif // ENABLE_CUDA
+        REQUIRE_NOTHROW(algorithm.fit());
+        if (iProcess == 0)
+        {
+            VariablesCriterionList criterions = algorithm.indepVarsSelectionCriterionList();
+            REQUIRE_THAT(criterions[0].first, Catch::Matchers::Equals(vector<size_t>({ 2 })));
+            REQUIRE_THAT(criterions[0].second, Catch::Matchers::WithinAbs(2551.61359020599, 1e-8));
+            REQUIRE_THAT(criterions[1].first, Catch::Matchers::Equals(vector<size_t>({ 3 })));
+            REQUIRE_THAT(criterions[1].second, Catch::Matchers::WithinAbs(2551.30032201349, 1e-8));
+            REQUIRE_THAT(criterions[2].first, Catch::Matchers::Equals(vector<size_t>({ 1 })));
+            REQUIRE_THAT(criterions[2].second, Catch::Matchers::WithinAbs(2468.93236280013, 1e-8));
+            REQUIRE_THAT(criterions[3].first, Catch::Matchers::Equals(vector<size_t>({ 1, 3 })));
+            REQUIRE_THAT(criterions[3].second, Catch::Matchers::WithinAbs(2452.86447942033, 1e-8));
+            REQUIRE_THAT(criterions[4].first, Catch::Matchers::Equals(vector<size_t>({ 1, 2 })));
+            REQUIRE_THAT(criterions[4].second, Catch::Matchers::WithinAbs(2450.59642666509, 1e-8));
+            REQUIRE_THAT(criterions[5].first, Catch::Matchers::Equals(vector<size_t>({ 1, 2, 3 })));
+            REQUIRE_THAT(criterions[5].second, Catch::Matchers::WithinAbs(2452.80388934625, 1e-8));
+            vector<size_t> selectedVariables = algorithm.selectedVariables();
+            REQUIRE_THAT(selectedVariables, Catch::Matchers::Equals(vector<size_t>({1, 3})));
+        }
+    }
+
+    SECTION("adaptive bandwidth | CV bandwidth optimization | AIC variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+        
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+        
+        GWRBasic algorithm;
+        if (iProcess == 0)
+        {
+            algorithm.setCoords(londonhp100_coord);
+            algorithm.setDependentVariable(y);
+            algorithm.setIndependentVariables(x);
+        }
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setIsAutoselectBandwidth(true);
+        algorithm.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::CV);
+        algorithm.setIsAutoselectIndepVars(true);
+        algorithm.setIndepVarSelectionThreshold(3.0);
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::MPI_MP)
+        {
+            algorithm.setOmpThreadNum(2);
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::MPI_CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(16);
+        }
+#endif // ENABLE_CUDA
+        REQUIRE_NOTHROW(algorithm.fit());
+        if (iProcess == 0)
+        {
+            VariablesCriterionList criterions = algorithm.indepVarsSelectionCriterionList();
+            REQUIRE_THAT(criterions[0].first, Catch::Matchers::Equals(vector<size_t>({ 2 })));
+            REQUIRE_THAT(criterions[0].second, Catch::Matchers::WithinAbs(2551.61359020599, 1e-8));
+            REQUIRE_THAT(criterions[1].first, Catch::Matchers::Equals(vector<size_t>({ 3 })));
+            REQUIRE_THAT(criterions[1].second, Catch::Matchers::WithinAbs(2551.30032201349, 1e-8));
+            REQUIRE_THAT(criterions[2].first, Catch::Matchers::Equals(vector<size_t>({ 1 })));
+            REQUIRE_THAT(criterions[2].second, Catch::Matchers::WithinAbs(2468.93236280013, 1e-8));
+            REQUIRE_THAT(criterions[3].first, Catch::Matchers::Equals(vector<size_t>({ 1, 3 })));
+            REQUIRE_THAT(criterions[3].second, Catch::Matchers::WithinAbs(2452.86447942033, 1e-8));
+            REQUIRE_THAT(criterions[4].first, Catch::Matchers::Equals(vector<size_t>({ 1, 2 })));
+            REQUIRE_THAT(criterions[4].second, Catch::Matchers::WithinAbs(2450.59642666509, 1e-8));
+            REQUIRE_THAT(criterions[5].first, Catch::Matchers::Equals(vector<size_t>({ 1, 2, 3 })));
+            REQUIRE_THAT(criterions[5].second, Catch::Matchers::WithinAbs(2452.80388934625, 1e-8));
+            vector<size_t> selectedVariables = algorithm.selectedVariables();
+            REQUIRE_THAT(selectedVariables, Catch::Matchers::Equals(vector<size_t>({1, 3})));
+            double bw = algorithm.spatialWeight().weight<BandwidthWeight>()->bandwidth();
+            REQUIRE(bw == 31.0);
+            RegressionDiagnostic diagnostic = algorithm.diagnostic();
+            REQUIRE_THAT(diagnostic.AIC, Catch::Matchers::WithinAbs(2435.8161441795, 1e-8));
+            REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2445.49629974057, 1e-8));
+            REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.706143867720706, 1e-8));
+            REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.678982114793865, 1e-8));
+        }
+    }
+
+}
+
+int main(int argc, char *argv[])
+{
+    MPI_Init(&argc, &argv);
+    int result = Catch::Session().run( argc, argv );
+    MPI_Finalize();
+    return result;
+}
diff --git a/test/mpi/testGWRMultiscaleMpi.cpp b/test/mpi/testGWRMultiscaleMpi.cpp
new file mode 100644
index 00000000..dc6c4af2
--- /dev/null
+++ b/test/mpi/testGWRMultiscaleMpi.cpp
@@ -0,0 +1,304 @@
+#define CATCH_CONFIG_MAIN
+
+#include <catch2/catch_all.hpp>
+
+#include <vector>
+#include <string>
+#include <armadillo>
+#include "gwmodelpp/GWRMultiscale.h"
+
+#include "gwmodelpp/spatialweight/CRSDistance.h"
+#include "gwmodelpp/spatialweight/BandwidthWeight.h"
+#include "gwmodelpp/spatialweight/SpatialWeight.h"
+#include "londonhp100.h"
+#include "TerminateCheckTelegram.h"
+#include "FileTelegram.h"
+
+#include <mpi.h>
+
+using namespace std;
+using namespace arma;
+using namespace gwm;
+
+
+TEST_CASE("MGWR: basic flow")
+{
+    int iProcess, nProcess;
+    MPI_Comm_size(MPI_COMM_WORLD, &nProcess);
+    MPI_Comm_rank(MPI_COMM_WORLD, &iProcess);
+
+    mat londonhp100_coord, londonhp100_data;
+    vector<string> londonhp100_fields;
+    if (!read_londonhp100(londonhp100_coord, londonhp100_data, londonhp100_fields))
+    {
+        FAIL("Cannot load londonhp100 data.");
+    }
+    vec y = londonhp100_data.col(0);
+    mat x = join_rows(ones(londonhp100_data.n_rows), londonhp100_data.cols(uvec({1, 3})));
+    
+    uword nVar = 3;
+
+    const initializer_list<ParallelType> parallelTypes = {
+        ParallelType::MPI_Serial,
+#ifdef ENABLE_OPENMP
+        ParallelType::MPI_MP,
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        ParallelType::MPI_CUDA
+#endif // ENABLE_CUDA
+    };
+
+    SECTION("optim bw cv | null init bw | fixed | bisquare kernel | with hatmatrix")
+    {
+        auto parallel = GENERATE_REF(values(parallelTypes));
+        INFO("Parallel type: " << ParallelTypeDict.at(parallel));
+
+        vector<SpatialWeight> spatials;
+        vector<bool> preditorCentered;
+        vector<GWRMultiscale::BandwidthInitilizeType> bandwidthInitialize;
+        vector<GWRMultiscale::BandwidthSelectionCriterionType> bandwidthSelectionApproach;
+        for (size_t i = 0; i < nVar; i++)
+        {
+            CRSDistance distance;
+            BandwidthWeight bandwidth(0, false, BandwidthWeight::Bisquare);
+            spatials.push_back(SpatialWeight(&bandwidth, &distance));
+            preditorCentered.push_back(i != 0);
+            bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Null);
+            bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV);
+        }
+
+        GWRMultiscale algorithm;
+        // algorithm.setTelegram(std::move(make_unique<Logger>()));
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeights(spatials);
+        algorithm.setHasHatMatrix(true);
+        algorithm.setPreditorCentered(preditorCentered);
+        algorithm.setBandwidthInitilize(bandwidthInitialize);
+        algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach);
+        algorithm.setBandwidthSelectThreshold(vector(3, 1e-5));
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+        REQUIRE_NOTHROW(algorithm.fit());
+
+        if (iProcess == 0)
+        {
+            const vector<SpatialWeight>& spatialWeights = algorithm.spatialWeights();
+            REQUIRE_THAT(spatialWeights[0].weight<BandwidthWeight>()->bandwidth(), Catch::Matchers::WithinAbs(4623.78, 0.1));
+            REQUIRE_THAT(spatialWeights[1].weight<BandwidthWeight>()->bandwidth(), Catch::Matchers::WithinAbs(12665.70, 0.1));
+            REQUIRE_THAT(spatialWeights[2].weight<BandwidthWeight>()->bandwidth(), Catch::Matchers::WithinAbs(12665.70, 0.1));
+
+            RegressionDiagnostic diagnostic = algorithm.diagnostic();
+            REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2437.09277417389, 1e-6));
+            REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.744649364494, 1e-6));
+            REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.712344894394, 1e-6));
+
+            REQUIRE(algorithm.hasIntercept() == true);
+        }
+    }
+
+    SECTION("optim bw AIC | null init bw | adaptive | bisquare kernel | with hatmatrix")
+    {
+        auto parallel = GENERATE_REF(values(parallelTypes));
+        INFO("Parallel type: " << ParallelTypeDict.at(parallel));
+
+        vector<SpatialWeight> spatials;
+        vector<bool> preditorCentered;
+        vector<GWRMultiscale::BandwidthInitilizeType> bandwidthInitialize;
+        vector<GWRMultiscale::BandwidthSelectionCriterionType> bandwidthSelectionApproach;
+        for (size_t i = 0; i < nVar; i++)
+        {
+            CRSDistance distance;
+            BandwidthWeight bandwidth(36, true, BandwidthWeight::Bisquare);
+            spatials.push_back(SpatialWeight(&bandwidth, &distance));
+            preditorCentered.push_back(i != 0);
+            bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Initial);
+            bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::AIC);
+        }
+
+        GWRMultiscale algorithm;
+        // algorithm.setTelegram(std::move(make_unique<Logger>()));
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeights(spatials);
+        algorithm.setHasHatMatrix(true);
+        algorithm.setCriterionType(GWRMultiscale::BackFittingCriterionType::dCVR);
+        algorithm.setPreditorCentered(preditorCentered);
+        algorithm.setBandwidthInitilize(bandwidthInitialize);
+        algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach);
+        algorithm.setBandwidthSelectRetryTimes(5);
+        algorithm.setBandwidthSelectThreshold(vector(3, 1e-5));
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+        REQUIRE_NOTHROW(algorithm.fit());
+
+        if (iProcess == 0)
+        {
+            const vector<SpatialWeight>& spatialWeights = algorithm.spatialWeights();
+            REQUIRE_THAT(spatialWeights[0].weight<BandwidthWeight>()->bandwidth(), Catch::Matchers::WithinAbs(45, 0.1));
+            REQUIRE_THAT(spatialWeights[1].weight<BandwidthWeight>()->bandwidth(), Catch::Matchers::WithinAbs(98, 0.1));
+            REQUIRE_THAT(spatialWeights[2].weight<BandwidthWeight>()->bandwidth(), Catch::Matchers::WithinAbs(98, 0.1));
+
+            RegressionDiagnostic diagnostic = algorithm.diagnostic();
+            REQUIRE_THAT(diagnostic.AICc, Catch::Matchers::WithinAbs(2437.935218705351, 1e-6));
+            REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.7486787930045755, 1e-6));
+            REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.7118919517893492, 1e-6));
+        }
+    }
+
+    SECTION("optim bw cv | null init bw | adaptive | bisquare kernel | without hatmatrix")
+    {
+        auto parallel = GENERATE_REF(values(parallelTypes));
+        INFO("Parallel type: " << ParallelTypeDict.at(parallel));
+        
+        vector<SpatialWeight> spatials;
+        vector<bool> preditorCentered;
+        vector<GWRMultiscale::BandwidthInitilizeType> bandwidthInitialize;
+        vector<GWRMultiscale::BandwidthSelectionCriterionType> bandwidthSelectionApproach;
+        for (size_t i = 0; i < nVar; i++)
+        {
+            CRSDistance distance;
+            BandwidthWeight bandwidth(0, true, BandwidthWeight::Bisquare);
+            spatials.push_back(SpatialWeight(&bandwidth, &distance));
+            preditorCentered.push_back(i != 0);
+            bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Null);
+            bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV);
+        }
+
+        GWRMultiscale algorithm;
+        // algorithm.setTelegram(std::move(make_unique<Logger>()));
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeights(spatials);
+        algorithm.setHasHatMatrix(false);
+        algorithm.setPreditorCentered(preditorCentered);
+        algorithm.setBandwidthInitilize(bandwidthInitialize);
+        algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach);
+        algorithm.setBandwidthSelectThreshold(vector(3, 1e-5));
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+        REQUIRE_NOTHROW(algorithm.fit());
+
+        if (iProcess == 0)
+        {
+            const vector<SpatialWeight>& spatialWeights = algorithm.spatialWeights();
+            REQUIRE(spatialWeights[0].weight<BandwidthWeight>()->bandwidth() == 35);
+            REQUIRE(spatialWeights[1].weight<BandwidthWeight>()->bandwidth() == 98);
+            REQUIRE(spatialWeights[2].weight<BandwidthWeight>()->bandwidth() == 98);
+            RegressionDiagnostic diagnostic = algorithm.diagnostic();
+            REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.757377391669, 1e-6));
+            REQUIRE(algorithm.hasIntercept() == true);
+        }
+    }
+
+    SECTION("optim bw cv | null init bw | adaptive | bisquare | CVR | without hatmatrix")
+    {
+        auto parallel = GENERATE_REF(values(parallelTypes));
+        INFO("Parallel type: " << ParallelTypeDict.at(parallel));
+        
+        vector<SpatialWeight> spatials;
+        vector<bool> preditorCentered;
+        vector<GWRMultiscale::BandwidthInitilizeType> bandwidthInitialize;
+        vector<GWRMultiscale::BandwidthSelectionCriterionType> bandwidthSelectionApproach;
+        for (size_t i = 0; i < nVar; i++)
+        {
+            CRSDistance distance;
+            BandwidthWeight bandwidth(36, true, BandwidthWeight::Bisquare);
+            spatials.push_back(SpatialWeight(&bandwidth, &distance));
+            preditorCentered.push_back(i != 0);
+            bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Initial);
+            bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV);
+        }
+
+        GWRMultiscale algorithm;
+        // algorithm.setTelegram(std::move(make_unique<Logger>()));
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeights(spatials);
+        algorithm.setHasHatMatrix(true);
+        algorithm.setCriterionType(GWRMultiscale::BackFittingCriterionType::CVR);
+        algorithm.setPreditorCentered(preditorCentered);
+        algorithm.setBandwidthInitilize(bandwidthInitialize);
+        algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach);
+        algorithm.setBandwidthSelectRetryTimes(5);
+        algorithm.setBandwidthSelectThreshold(vector(3, 1e-5));
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+        REQUIRE_NOTHROW(algorithm.fit());
+
+        if (iProcess == 0)
+        {
+            const vector<SpatialWeight>& spatialWeights = algorithm.spatialWeights();
+            REQUIRE(spatialWeights[0].weight<BandwidthWeight>()->bandwidth() == 35);
+            REQUIRE(spatialWeights[1].weight<BandwidthWeight>()->bandwidth() == 98);
+            REQUIRE(spatialWeights[2].weight<BandwidthWeight>()->bandwidth() == 98);
+
+            RegressionDiagnostic diagnostic = algorithm.diagnostic();
+            REQUIRE_THAT(diagnostic.RSquare, Catch::Matchers::WithinAbs(0.757377391669, 1e-6));
+        }
+    }
+
+    SECTION("optim bw cv | null init bw | adaptive | bisquare kernel | with hatmatrix | with bounds")
+    {
+        auto parallel = GENERATE_REF(values(parallelTypes));
+        INFO("Parallel type: " << ParallelTypeDict.at(parallel));
+
+        vector<SpatialWeight> spatials;
+        vector<bool> preditorCentered;
+        vector<GWRMultiscale::BandwidthInitilizeType> bandwidthInitialize;
+        vector<GWRMultiscale::BandwidthSelectionCriterionType> bandwidthSelectionApproach;
+        for (size_t i = 0; i < nVar; i++)
+        {
+            CRSDistance distance;
+            BandwidthWeight bandwidth(36, true, BandwidthWeight::Bisquare);
+            spatials.push_back(SpatialWeight(&bandwidth, &distance));
+            preditorCentered.push_back(i != 0);
+            bandwidthInitialize.push_back(GWRMultiscale::BandwidthInitilizeType::Null);
+            bandwidthSelectionApproach.push_back(GWRMultiscale::BandwidthSelectionCriterionType::CV);
+        }
+
+        GWRMultiscale algorithm;
+        // algorithm.setTelegram(std::move(make_unique<Logger>()));
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeights(spatials);
+        algorithm.setHasHatMatrix(true);
+        algorithm.setCriterionType(GWRMultiscale::BackFittingCriterionType::CVR);
+        algorithm.setPreditorCentered(preditorCentered);
+        algorithm.setBandwidthInitilize(bandwidthInitialize);
+        algorithm.setBandwidthSelectionApproach(bandwidthSelectionApproach);
+        algorithm.setBandwidthSelectRetryTimes(5);
+        algorithm.setBandwidthSelectThreshold(vector(3, 1e-5));
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+        algorithm.setGoldenLowerBounds(50);
+        algorithm.setGoldenUpperBounds(100);
+        REQUIRE_NOTHROW(algorithm.fit());
+
+        if (iProcess == 0)
+        {
+            const vector<SpatialWeight>& spatialWeights = algorithm.spatialWeights();
+            REQUIRE(spatialWeights[0].weight<BandwidthWeight>()->bandwidth() == 52);
+            REQUIRE(spatialWeights[1].weight<BandwidthWeight>()->bandwidth() == 99);
+            REQUIRE(spatialWeights[2].weight<BandwidthWeight>()->bandwidth() == 99);
+        }
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    MPI_Init(&argc, &argv);
+    int result = Catch::Session().run( argc, argv );
+    MPI_Finalize();
+    return result;
+}
diff --git a/test/mpi/testMpiMatMul.cpp b/test/mpi/testMpiMatMul.cpp
new file mode 100644
index 00000000..dc70b5ca
--- /dev/null
+++ b/test/mpi/testMpiMatMul.cpp
@@ -0,0 +1,68 @@
+#include <catch2/catch_all.hpp>
+#include <armadillo>
+#include <iostream>
+
+#include <mpi.h>
+#include "gwmodelpp/utils/armampi.h"
+
+using namespace std;
+using namespace arma;
+
+TEST_CASE("mat mul mpi")
+{
+    int ip, np;
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+    MPI_Comm_rank(MPI_COMM_WORLD, &ip);
+
+    uword n = 100;
+    uword size = n / np + (n % np == 0 ? 0 : 1);
+    uword from = ip * size, to = min(from + size, n);
+
+    // printf("range from %llu to %llu\n", from, to);
+
+    mat A_all, B_all, C_all;
+    A_all = mat(n, n, arma::fill::randn);
+    B_all = mat(n, n, arma::fill::randn);
+    if (ip == 0)
+    {
+        C_all = A_all * B_all;
+    }
+
+    mat A, B, C;
+    A = A_all.rows(from, to - 1);
+    B = B_all.rows(from, to - 1);
+    // printf("process %d begin mat mul\n", ip);
+    REQUIRE_NOTHROW(mat_mul_mpi(A, B, C, ip, np, size));
+    // printf("process %d end mat mul\n", ip);
+
+    if (ip == 0)
+    {
+        mat C_gather(n, n);
+        C_gather.rows(from, to - 1) = C;
+        uvec received(np, fill::zeros);
+        received(0) = 1;
+        auto bufsize = size * n;
+        double *buf = new double[size * n];
+        while (!all(received == 1))
+        {
+            MPI_Status status;
+            MPI_Recv(buf, bufsize, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status);
+            received(status.MPI_SOURCE) = 1;
+            uword ifrom = status.MPI_SOURCE * size, ito = min(ifrom + size, n), irange = ito - ifrom; 
+            C_gather.rows(ifrom, ito - 1) = mat(buf, irange, n);
+        }
+        REQUIRE(approx_equal(C_gather, C_all, "absdiff", 1e-6));
+    }
+    else
+    {
+        MPI_Send(C.memptr(), C.n_elem, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    MPI_Init(&argc, &argv);
+    int result = Catch::Session().run( argc, argv );
+    MPI_Finalize();
+    return result;
+}
\ No newline at end of file
diff --git a/test/testGWRBasic.cpp b/test/testGWRBasic.cpp
index 8b3bd80d..3abaa815 100644
--- a/test/testGWRBasic.cpp
+++ b/test/testGWRBasic.cpp
@@ -102,8 +102,9 @@ TEST_CASE("BasicGWR: LondonHP")
         REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.673691594464, 1e-8));
     }
 
-    SECTION("adaptive bandwidth | CV bandwidth optimization | no variable optimization") {
+    SECTION("adaptive bandwidth | bandwidth optimization | no variable optimization") {
         auto parallel = GENERATE_REF(values(parallel_list));
+        auto criterion = GENERATE(GWRBasic::BandwidthSelectionCriterionType::CV, GWRBasic::BandwidthSelectionCriterionType::AIC);
         INFO("Parallel:" << ParallelTypeDict.at(parallel));
         
         CRSDistance distance(false);
@@ -116,7 +117,7 @@ TEST_CASE("BasicGWR: LondonHP")
         algorithm.setIndependentVariables(x);
         algorithm.setSpatialWeight(spatial);
         algorithm.setIsAutoselectBandwidth(true);
-        algorithm.setBandwidthSelectionCriterion(GWRBasic::BandwidthSelectionCriterionType::CV);
+        algorithm.setBandwidthSelectionCriterion(criterion);
         algorithm.setParallelType(parallel);
 #ifdef ENABLE_OPENMP
         if (parallel == ParallelType::OpenMP)
@@ -133,7 +134,16 @@ TEST_CASE("BasicGWR: LondonHP")
 #endif // ENABLE_CUDA
         REQUIRE_NOTHROW(algorithm.fit());
         size_t bw = (size_t)algorithm.spatialWeight().weight<BandwidthWeight>()->bandwidth();
-        REQUIRE(bw == 67);
+        size_t bw0 = 67;
+        switch (criterion)
+        {
+        case GWRBasic::BandwidthSelectionCriterionType::AIC:
+            bw0 = 31;
+            break;
+        default:
+            break;
+        }
+        REQUIRE(bw == bw0);
     }
     
     SECTION("adaptive bandwidth | no bandwidth optimization | AIC variable optimization") {
@@ -377,6 +387,7 @@ TEST_CASE("Basic GWR: cancel")
 
 }
 
+#ifdef ENABLE_CUDA
 TEST_CASE("BasicGWR: Benchmark")
 {
     size_t n = 50000, k = 3;
@@ -417,3 +428,4 @@ TEST_CASE("BasicGWR: Benchmark")
         return algorithm.betas();
     };
 }
+#endif // ENABLE_CUDA
diff --git a/test/testGWRMultiscale.cpp b/test/testGWRMultiscale.cpp
index 5d70a997..22fc19c4 100644
--- a/test/testGWRMultiscale.cpp
+++ b/test/testGWRMultiscale.cpp
@@ -35,7 +35,7 @@ TEST_CASE("MGWR: basic flow")
     const initializer_list<ParallelType> parallelTypes = {
         ParallelType::SerialOnly,
 #ifdef ENABLE_OPENMP
-        ParallelType::OpenMP,
+        // ParallelType::OpenMP,
 #endif // ENABLE_OPENMP
 #ifdef ENABLE_CUDA
         ParallelType::CUDA
@@ -87,7 +87,7 @@ TEST_CASE("MGWR: basic flow")
         REQUIRE(algorithm.hasIntercept() == true);
     }
 
-    SECTION("optim bw cv | null init bw | adaptive | bisquare kernel | with hatmatrix")
+    SECTION("optim bw AIC | null init bw | adaptive | bisquare kernel | with hatmatrix")
     {
         auto parallel = GENERATE_REF(values(parallelTypes));
         INFO("Parallel type: " << ParallelTypeDict.at(parallel));