Feature: multiprocess support based on MPI (#88)
* edit: MPI with indep var selection

* fix: compile errors

* edit: further rename functions

* add: fit mpi

* fix: runtime errors

* fix: unused variables

* add: bw sel mpi

* fix: bugs caused in non mpi mode

* fix: indep var sel mpi runtime error

* fix: bw sel runtime errors

* edit: activate all tests

* add: compile condition around mpi code

* edit: mpi with omp

* add: mpi with cuda

* fix: fitMpi S send error

* edit: call GWRBasic in MGWR

* fix: split fit core functions

* add: fit core omp and cuda

* edit: use min distance

* add: cubase static methods to create and destroy handle

* edit: CRSDistance use cumat

* fix: GWR cuda bug

* edit: MGWR move global fit earlier

* add: mpi mat mul

* add: MGWR mpi fit

* fix: mpi mat mul error

* edit: test matrix size

* edit: use scatter instead of broadcast

* fix: mgwr mpi run error

* add: bw criterion cv aic mpi mode

* fix: mgwr mpi omp

* edit: mpi with cuda

* edit: MPI code conditional compile

* edit: mgwr set golden bounds of gwr basic

* edit: pre compute b_rows_i

* fix: setGroupSize use std::size_t

* fix: syntax error

* fix(test): MGWR

* fix: type mismatch on armadillo and RcppArmadillo

* fix: use MY_MPI_UWORD to gather sizes

It is unsigned long when ARMA_32BIT_WORD is defined; otherwise, unsigned long long.

* edit: rename MY_MPI_UWORD

* edit: use int to record process status

* add(workflow): MPI tests

* Revert "add(workflow): MPI tests"

This reverts commit 5bbb210.

---------

Co-authored-by: rd21411 <[email protected]>
HPDell and rd21411 authored Jul 21, 2024
1 parent 2adab8e commit be40076
Showing 24 changed files with 2,067 additions and 1,595 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -6,6 +6,7 @@ set(CMAKE_MODULE_PATH cmake)

option(ENABLE_OpenMP "Determines whether OpenMP support should be built" ON)
option(ENABLE_CUDA "Determines whether CUDA support should be built" OFF)
option(ENABLE_MPI "Determines whether MPI support should be built" OFF)
option(WITH_TESTS "Determines whether to build and run tests" ON)

if(ENABLE_CUDA)
228 changes: 123 additions & 105 deletions include/gwmodelpp/GWRBasic.h

Large diffs are not rendered by default.

314 changes: 56 additions & 258 deletions include/gwmodelpp/GWRMultiscale.h

Large diffs are not rendered by default.

21 changes: 19 additions & 2 deletions include/gwmodelpp/IParallelizable.h
@@ -16,7 +16,11 @@ enum ParallelType
{
SerialOnly = 1 << 0, //!< \~english Use no parallel methods. \~chinese 不并行。
OpenMP = 1 << 1, //!< \~english Use multithread methods. \~chinese 多线程并行。
CUDA = 1 << 2 //!< \~english Use CUDA accelerated methods. \~chinese CUDA加速。
CUDA = 1 << 2, //!< \~english Use CUDA accelerated methods. \~chinese CUDA加速。
MPI = (1 << 3), //!< \~english Use MPI parallel methods. \~chinese MPI 多进程并行。
MPI_Serial = (1 << 3) | (1 << 0), //!< \~english MPI among processes, serial within each process. \~chinese MPI 多进程并行，进程内串行。
MPI_MP = (1 << 3) | (1 << 1), //!< \~english MPI among processes, multithreaded within each process. \~chinese MPI 多进程并行，进程内多线程。
MPI_CUDA = (1 << 3) | (1 << 2) //!< \~english MPI among processes, CUDA accelerated within each process. \~chinese MPI 多进程并行，进程内 CUDA 加速。
};

/**
@@ -137,10 +141,23 @@ struct IParallelCudaEnabled
* 对于大多数 GPU 可选择值 64。
*
*/
virtual void setGroupSize(const size_t size) = 0;
virtual void setGroupSize(const std::size_t size) = 0;

};

struct IParallelMpiEnabled
{
virtual int workerId() = 0; //!< \~english Get the rank of the current process. \~chinese 获取当前进程的编号。
virtual void setWorkerId(int id) = 0; //!< \~english Set the rank of the current process. \~chinese 设置当前进程的编号。
virtual void setWorkerNum(int size) = 0; //!< \~english Set the total number of processes. \~chinese 设置进程总数。
};

#define GWM_MPI_MASTER_BEGIN if (workerId() == 0) {
#define GWM_MPI_MASTER_END }
#define GWM_MPI_WORKER_BEGIN if (workerId() != 0) {
#define GWM_MPI_WORKER_END }
#define GWM_MPI_MASTER_WORKER_SWITCH } else {

}

#endif // IPARALLELIZABLE_H
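
Since the MPI variants above are bit combinations, callers can test for MPI membership with a single bitwise AND instead of enumerating every combined value, and the master/worker macros expand to a plain if/else on workerId(). A minimal sketch of both (the usesMpi helper is illustrative, not part of the header):

#include "gwmodelpp/IParallelizable.h"

// True for MPI, MPI_Serial, MPI_MP and MPI_CUDA, which all carry the (1 << 3) bit.
bool usesMpi(ParallelType type)
{
return (type & ParallelType::MPI) != 0;
}

// Within a method of a class implementing IParallelMpiEnabled,
// the macros bracket rank-0 and worker code paths:
// GWM_MPI_MASTER_BEGIN
//     ... rank 0: scatter tasks, gather partial results ...
// GWM_MPI_MASTER_WORKER_SWITCH
//     ... other ranks: compute the local share ...
// GWM_MPI_WORKER_END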
12 changes: 3 additions & 9 deletions include/gwmodelpp/spatialweight/CRSDistance.h
@@ -4,6 +4,7 @@
#include "Distance.h"

#ifdef ENABLE_CUDA
#include "gwmodelpp/utils/cumat.hpp"
#include "gwmodelpp/spatialweight/cuda/CRSDistanceKernel.h"
#include "gwmodelpp/spatialweight/cuda/ISpatialCudaEnabled.h"
#endif // ENABLE_CUDA
@@ -146,13 +147,6 @@ class CRSDistance : public Distance

virtual ~CRSDistance()
{
#ifdef ENABLE_CUDA
if (mCudaPrepared)
{
cudaFree(mCudaDp);
cudaFree(mCudaFp);
}
#endif
}

virtual Distance * clone() const override
@@ -222,8 +216,8 @@
CalculatorType mCalculator = &EuclideanDistance; //!< \~english Calculator \~chinese 距离计算方法

#ifdef ENABLE_CUDA
double* mCudaDp = 0; //!< \~english Device pointer to data points \~chinese 指向数据点的设备指针
double* mCudaFp = 0; //!< \~english Device pointer to focus points \~chinese 指向关注点的设备指针
cumat mCudaDp; //!< \~english Data points on the device \~chinese 设备上的数据点
cumat mCudaFp; //!< \~english Focus points on the device \~chinese 设备上的关注点
CalculatorCudaType mCalculatorCuda = &eu_dist_cuda; //!< \~english CUDA based Calculator \~chinese 基于 CUDA 的距离计算方法
#endif

4 changes: 2 additions & 2 deletions include/gwmodelpp/spatialweight/SpatialWeight.h
@@ -377,7 +377,7 @@ class SpatialWeight
* @param focus 当前样本的索引值。
* @return vec 当前样本到其他所有样本的空间权重向量。
*/
virtual arma::vec weightVector(arma::uword focus)
virtual arma::vec weightVector(arma::uword focus) const
{
return mWeight->weight(mDistance->distance(focus));
}
@@ -412,7 +412,7 @@
* @param d_weights \~english Device pointer to distances \~chinese 指向输出权重的设备指针
* @return cudaError_t \~english CUDA error or success \~chinese CUDA 错误或成功
*/
virtual cudaError_t weightVector(arma::uword focus, double* d_dists, double* d_weights)
virtual cudaError_t weightVector(arma::uword focus, double* d_dists, double* d_weights) const
{
cudaError_t error;
size_t elems = 0;
10 changes: 10 additions & 0 deletions include/gwmodelpp/utils/armampi.h
@@ -0,0 +1,10 @@
#include "armadillo_config.h"
#include "mpi.h"

#ifdef ARMA_32BIT_WORD
#define GWM_MPI_UWORD MPI_UNSIGNED_LONG
#else // ARMA_32BIT_WORD
#define GWM_MPI_UWORD MPI_UNSIGNED_LONG_LONG
#endif // ARMA_32BIT_WORD

void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np, const size_t range);
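
A hedged usage sketch for mat_mul_mpi: judging from the signature alone, a plausible contract is that each rank passes its local row block of the operands and receives its block of the product, with ip and np taken from MPI_Comm_rank and MPI_Comm_size. The header does not document the meaning of range (assumed below to be the global matrix extent), so treat the shapes as illustrative only.

#include <mpi.h>
#include "gwmodelpp/utils/armampi.h"

int main(int argc, char** argv)
{
MPI_Init(&argc, &argv);
int ip = 0, np = 1;
MPI_Comm_rank(MPI_COMM_WORLD, &ip);
MPI_Comm_size(MPI_COMM_WORLD, &np);
const size_t n = 400; // global (square) extent; the meaning of `range` is an assumption
arma::mat a_local(n / np, n, arma::fill::randn); // this rank's row block of A (assumed layout)
arma::mat b_local(n / np, n, arma::fill::randn); // this rank's row block of B (assumed layout)
arma::mat c_local; // on return: this rank's row block of A*B (assumed)
mat_mul_mpi(a_local, b_local, c_local, ip, np, n);
MPI_Finalize();
return 0;
}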
43 changes: 43 additions & 0 deletions include/gwmodelpp/utils/cumat.hpp
@@ -47,6 +47,21 @@ class cubase
{
public:
static cublasHandle_t handle; //!< Save handle for cublas
static auto create_handle()
{
if (handle == nullptr) return cublasCreate(&handle);
else return CUBLAS_STATUS_SUCCESS;
}
static auto destory_handle()
{
if (handle != nullptr)
{
auto error = cublasDestroy(handle);
handle = nullptr;
return error;
}
else return CUBLAS_STATUS_SUCCESS;
}
constexpr static const double alpha1 = 1.0;
constexpr static const double beta0 = 0.0;
constexpr static const double beta1 = 1.0;
@@ -245,6 +260,32 @@ class cumat : public cubase
*/
const cuop_trans<cumat> t() const;

void resize(size_t rows, size_t cols)
{
if (dMem != nullptr && mIsRelease)
{
cudaFree(dMem);
}
cudaMalloc(&dMem, sizeof(double) * rows * cols);
// Record the new shape so nbytes() and later copies see the right size.
mRows = rows;
mCols = cols;
mIsRelease = true;
}

cumat& operator=(const cumat& right)
{
if (this == &right) return *this;
resize(right.mRows, right.mCols);
cudaMemcpy(dMem, right.dMem, nbytes(), cudaMemcpyDeviceToDevice);
return *this;
}

cumat& operator=(cumat&& right)
{
// Free any device memory this object still owns before taking over.
if (dMem != nullptr && mIsRelease)
{
cudaFree(dMem);
}
mRows = right.mRows;
mCols = right.mCols;
dMem = right.dMem;
mIsRelease = true;
right.mIsRelease = false;
return *this;
}

cumat& operator=(const cuop_trans<cumat>& right);

template<class L, class R>
@@ -282,6 +323,8 @@
size_t mCols = 0;
};

void print(const cumat& mat);

/**
* @brief \~english Strided matrix. \~chinese 条带矩阵。
*
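
The new cubase static methods make the shared cuBLAS handle lifecycle explicit and idempotent: create_handle is a no-op when the handle already exists, and destory_handle (spelled as in the source) nulls the pointer so a later create succeeds. A minimal bracket, with error handling elided; the cumat(rows, cols) constructor shape and the sizes are assumptions for illustration:

cubase::create_handle(); // safe to call repeatedly; creates the handle once
{
cumat x(1000, 30); // device allocations that use the shared handle (assumed ctor)
// ... run CUDA-accelerated fits ...
} // cumat destructor frees its device memory here
cubase::destory_handle(); // idempotent; resets the shared handle to nullptr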
26 changes: 26 additions & 0 deletions src/CMakeLists.txt
@@ -15,6 +15,15 @@ if(OpenMP_FOUND AND OpenMP_C_FOUND AND OpenMP_CXX_FOUND)
endif(OpenMP_FOUND AND OpenMP_C_FOUND AND OpenMP_CXX_FOUND)
endif()

if(ENABLE_MPI)
find_package(MPI REQUIRED)
add_definitions(-DENABLE_MPI)
include_directories(${MPI_CXX_HEADER_DIR})
add_link_options(${MPI_CXX_LINK_FLAGS})
add_compile_options(${MPI_CXX_COMPILE_OPTIONS})
add_definitions(${MPI_CXX_COMPILE_DEFINITIONS})
endif(ENABLE_MPI)

find_package(GSL REQUIRED)
if(GSL_FOUND)
include_directories(${GSL_INCLUDE_DIRS})
@@ -137,6 +146,17 @@ if(ENABLE_CUDA)
list(PREPEND SOURCES_ALL ${SOURCES_CUDA})
endif(ENABLE_CUDA)

if(ENABLE_MPI)
set(HEADERS_MPI
../include/gwmodelpp/utils/armampi.h
)
set(SOURCES_MPI
gwmodelpp/utils/armampi.cpp
)
list(PREPEND HEADERS_ALL ${HEADERS_MPI})
list(PREPEND SOURCES_ALL ${SOURCES_MPI})
endif(ENABLE_MPI)

add_library(gwmodel STATIC ${HEADERS_ALL} ${SOURCES_ALL})
set_property(TARGET gwmodel PROPERTY POSITION_INDEPENDENT_CODE ON)

@@ -194,6 +214,12 @@ if(OpenMP_FOUND)
)
endif(OpenMP_FOUND)

if(ENABLE_MPI AND MPI_FOUND)
target_link_libraries(gwmodel
${MPI_CXX_LIBRARIES}
)
endif()

if(USE_CUDA_SHARED)
set(HEADERS_CUDA_SHARED
../include/gwmodelcuda/StdTelegram.h
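
With these build changes, MPI support is opt-in: configuring with -DENABLE_MPI=ON runs find_package(MPI REQUIRED), defines ENABLE_MPI for the conditional-compilation blocks, compiles the new armampi sources, and links MPI_CXX_LIBRARIES into the static gwmodel target. For example: cmake -S . -B build -DENABLE_MPI=ON, followed by cmake --build build.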
