Forward-merge branch-23.10 to branch-23.12 #5594

Merged 2 commits into branch-23.12 from branch-23.10 on Oct 2, 2023
2 changes: 1 addition & 1 deletion ci/test_wheel.sh
@@ -13,7 +13,7 @@ if [[ "$(arch)" == "aarch64" ]]; then
fi

# Always install latest dask for testing
python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/[email protected]
python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/[email protected]

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/cuml*.whl)[test]
6 changes: 3 additions & 3 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -16,12 +16,12 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dask-core>=2023.7.1
- dask-core==2023.9.2
- dask-cuda==23.10.*
- dask-cudf==23.10.*
- dask-ml
- dask>=2023.7.1
- distributed>=2023.7.1
- dask==2023.9.2
- distributed==2023.9.2
- doxygen=1.9.1
- gcc_linux-64=11.*
- gmock>=1.13.0
6 changes: 3 additions & 3 deletions conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -18,12 +18,12 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dask-core>=2023.7.1
- dask-core==2023.9.2
- dask-cuda==23.10.*
- dask-cudf==23.10.*
- dask-ml
- dask>=2023.7.1
- distributed>=2023.7.1
- dask==2023.9.2
- distributed==2023.9.2
- doxygen=1.9.1
- gcc_linux-64=11.*
- gmock>=1.13.0
6 changes: 3 additions & 3 deletions conda/recipes/cuml/meta.yaml
@@ -76,9 +76,9 @@ requirements:
- cudf ={{ minor_version }}
- cupy >=12.0.0
- dask-cudf ={{ minor_version }}
- dask >=2023.7.1
- dask-core>=2023.7.1
- distributed >=2023.7.1
- dask ==2023.9.2
- dask-core==2023.9.2
- distributed ==2023.9.2
- joblib >=0.11
- libcuml ={{ version }}
- libcumlprims ={{ minor_version }}
12 changes: 12 additions & 0 deletions cpp/include/cuml/linear_model/qn_mg.hpp
@@ -21,12 +21,24 @@

#include <cumlprims/opg/matrix/data.hpp>
#include <cumlprims/opg/matrix/part_descriptor.hpp>
#include <vector>
using namespace MLCommon;

namespace ML {
namespace GLM {
namespace opg {

/**
* @brief Calculate unique class labels across multiple GPUs in a multi-node environment.
* @param[in] handle: the internal cuml handle object
* @param[in] input_desc: PartDescriptor object for the input
* @param[in] labels: labels data
* @returns host vector that stores the distinct labels
*/
std::vector<float> getUniquelabelsMG(const raft::handle_t& handle,
Matrix::PartDescriptor& input_desc,
std::vector<Matrix::Data<float>*>& labels);

/**
* @brief performs MNMG fit operation for the logistic regression using quasi newton methods
* @param[in] handle: the internal cuml handle object
6 changes: 6 additions & 0 deletions cpp/src/glm/qn/mg/qn_mg.cuh
@@ -103,6 +103,12 @@ inline void qn_fit_x_mg(const raft::handle_t& handle,
ML::GLM::opg::qn_fit_mg<T, decltype(loss)>(
handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks);
} break;
case QN_LOSS_SOFTMAX: {
ASSERT(C > 2, "qn_mg.cuh: softmax invalid C");
ML::GLM::detail::Softmax<T> loss(handle, D, C, pams.fit_intercept);
ML::GLM::opg::qn_fit_mg<T, decltype(loss)>(
handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks);
} break;
default: {
ASSERT(false, "qn_mg.cuh: unknown loss function type (id = %d).", pams.loss);
}
66 changes: 55 additions & 11 deletions cpp/src/glm/qn_mg.cu
@@ -21,15 +21,59 @@
#include <cuml/linear_model/qn.h>
#include <cuml/linear_model/qn_mg.hpp>
#include <raft/core/comms.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/error.hpp>
#include <raft/core/handle.hpp>
#include <raft/label/classlabels.cuh>
#include <raft/util/cudart_utils.hpp>
#include <vector>
using namespace MLCommon;

namespace ML {
namespace GLM {
namespace opg {

template <typename T>
std::vector<T> distinct_mg(const raft::handle_t& handle, T* y, size_t n)
{
cudaStream_t stream = handle.get_stream();
raft::comms::comms_t const& comm = raft::resource::get_comms(handle);
int rank = comm.get_rank();
int n_ranks = comm.get_size();

rmm::device_uvector<T> unique_y(0, stream);
raft::label::getUniquelabels(unique_y, y, n, stream);

rmm::device_uvector<size_t> recv_counts(n_ranks, stream);
auto send_count = raft::make_device_scalar<size_t>(handle, unique_y.size());
comm.allgather(send_count.data_handle(), recv_counts.data(), 1, stream);
comm.sync_stream(stream);

std::vector<size_t> recv_counts_host(n_ranks);
raft::copy(recv_counts_host.data(), recv_counts.data(), n_ranks, stream);

std::vector<size_t> displs(n_ranks);
size_t pos = 0;
for (int i = 0; i < n_ranks; ++i) {
displs[i] = pos;
pos += recv_counts_host[i];
}

rmm::device_uvector<T> recv_buff(displs.back() + recv_counts_host.back(), stream);
comm.allgatherv(
unique_y.data(), recv_buff.data(), recv_counts_host.data(), displs.data(), stream);
comm.sync_stream(stream);

rmm::device_uvector<T> global_unique_y(0, stream);
int n_distinct =
raft::label::getUniquelabels(global_unique_y, recv_buff.data(), recv_buff.size(), stream);

std::vector<T> global_unique_y_host(global_unique_y.size());
raft::copy(global_unique_y_host.data(), global_unique_y.data(), global_unique_y.size(), stream);

return global_unique_y_host;
}

template <typename T>
void qnFit_impl(const raft::handle_t& handle,
const qn_params& pams,
@@ -46,17 +90,6 @@ void qnFit_impl(const raft::handle_t& handle,
int rank,
int n_ranks)
{
switch (pams.loss) {
case QN_LOSS_LOGISTIC: {
RAFT_EXPECTS(
C == 2,
"qn_mg.cu: only the LOGISTIC loss is supported currently. The number of classes must be 2");
} break;
default: {
RAFT_EXPECTS(false, "qn_mg.cu: unknown loss function type (id = %d).", pams.loss);
}
}

auto X_simple = SimpleDenseMat<T>(X, N, D, X_col_major ? COL_MAJOR : ROW_MAJOR);

ML::GLM::opg::qn_fit_x_mg(handle,
@@ -113,6 +146,17 @@ void qnFit_impl(raft::handle_t& handle,
input_desc.uniqueRanks().size());
}

std::vector<float> getUniquelabelsMG(const raft::handle_t& handle,
Matrix::PartDescriptor& input_desc,
std::vector<Matrix::Data<float>*>& labels)
{
RAFT_EXPECTS(labels.size() == 1,
"getUniqueLabelsMG currently does not accept more than one data chunk");
Matrix::Data<float>* data_y = labels[0];
int n_rows = input_desc.totalElementsOwnedBy(input_desc.rank);
return distinct_mg<float>(handle, data_y->ptr, n_rows);
}

void qnFit(raft::handle_t& handle,
std::vector<Matrix::Data<float>*>& input_data,
Matrix::PartDescriptor& input_desc,
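
The new distinct_mg helper above derives the global label set in three steps: each rank deduplicates its local labels with raft::label::getUniquelabels, the per-rank unique sets are exchanged with allgatherv (hence the per-rank counts and displacements), and the gathered buffer is deduplicated once more. A toy single-process sketch of that logic in NumPy; the function name and arrays are illustrative only, not part of the PR:

import numpy as np

def distinct_mg_sketch(per_rank_labels):
    """Toy analogue of distinct_mg: per_rank_labels is a list of 1-D arrays,
    one per rank, standing in for each rank's local label partition."""
    # Step 1: each rank computes its local unique labels (getUniquelabels).
    local_unique = [np.unique(y) for y in per_rank_labels]
    # Step 2: allgatherv, so every rank ends up with the concatenation of all
    # local unique sets (sizes differ per rank, hence counts and displacements).
    gathered = np.concatenate(local_unique)
    # Step 3: deduplicate the gathered buffer to get the global label set.
    return np.unique(gathered)

# Example: two ranks holding different (overlapping) label subsets.
print(distinct_mg_sketch([np.array([0.0, 1.0, 1.0]), np.array([1.0, 2.0])]))
# -> [0. 1. 2.]
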
6 changes: 3 additions & 3 deletions dependencies.yaml
@@ -175,10 +175,10 @@ dependencies:
- output_types: [conda, requirements, pyproject]
packages:
- cudf==23.10.*
- dask>=2023.7.1
- dask==2023.9.2
- dask-cuda==23.10.*
- dask-cudf==23.10.*
- distributed>=2023.7.1
- distributed==2023.9.2
- joblib>=0.11
- numba>=0.57
# TODO: Is scipy really a hard dependency, or should
@@ -192,7 +192,7 @@
- cupy>=12.0.0
- output_types: conda
packages:
- dask-core>=2023.7.1
- dask-core==2023.9.2
- output_types: pyproject
packages:
- *treelite_runtime
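
Across the CI script, conda environments, recipe, and dependencies.yaml, dask, dask-core, and distributed move from a >=2023.7.1 floor to an exact ==2023.9.2 pin, keeping the three packages in lockstep for the 23.10 release; the same pin also lands in python/README.md below. A quick sanity check that an installed environment matches the pin (a sketch, not part of this PR):

import dask
import distributed

# The 23.10 packaging pins both packages to exactly 2023.9.2.
expected = "2023.9.2"
assert dask.__version__ == expected, dask.__version__
assert distributed.__version__ == expected, distributed.__version__
print("dask/distributed pinned at", expected)
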
4 changes: 2 additions & 2 deletions python/README.md
@@ -70,8 +70,8 @@ Packages required for multigpu algorithms*:
- ucx-py version matching the cuML version
- dask-cudf version matching the cuML version
- nccl>=2.5
- dask>=2023.7.1
- distributed>=2023.7.1
- dask==2023.9.2
- distributed==2023.9.2

* this can be avoided with `--singlegpu` argument flag.

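
With the pins in place and the multinomial support added in the changes below (qn_mg and logistic_regression_mg.pyx), the dask estimator can be fit on more than two classes. A rough end-to-end usage sketch, assuming two visible GPUs, a local dask-CUDA cluster, and that cupy-backed dask arrays are accepted inputs for the MNMG estimator; which attributes the dask wrapper exposes afterwards (classes_ here) is also an assumption:

from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import cupy as cp
import dask.array as da
from cuml.dask.linear_model import LogisticRegression

cluster = LocalCUDACluster(n_workers=2)  # assumes two visible GPUs
client = Client(cluster)

# Toy three-class problem, partitioned across the two workers.
X = da.from_array(cp.random.rand(600, 4, dtype=cp.float32), chunks=(300, 4))
y = da.from_array(cp.repeat(cp.arange(3, dtype=cp.float32), 200), chunks=(300,))

clf = LogisticRegression()
clf.fit(X, y)        # multinomial (softmax) fit spread across the workers
print(clf.classes_)  # expected: [0. 1. 2.], assuming classes_ is exposed

client.close()
cluster.close()
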
9 changes: 8 additions & 1 deletion python/cuml/dask/linear_model/logistic_regression.py
@@ -174,4 +174,11 @@ def _create_model(sessionId, datatype, **kwargs):
def _func_fit(f, data, n_rows, n_cols, partsToSizes, rank):
inp_X = concatenate([X for X, _ in data])
inp_y = concatenate([y for _, y in data])
return f.fit([(inp_X, inp_y)], n_rows, n_cols, partsToSizes, rank)
n_ranks = max([p[0] for p in partsToSizes]) + 1
aggregated_partsToSizes = [[i, 0] for i in range(n_ranks)]
for p in partsToSizes:
aggregated_partsToSizes[p[0]][1] += p[1]

return f.fit(
[(inp_X, inp_y)], n_rows, n_cols, aggregated_partsToSizes, rank
)
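
The new aggregation in _func_fit collapses the per-partition (rank, size) pairs into a single entry per rank, which matches the fact that each worker's partitions are concatenated into one (X, y) pair before being handed to fit. A standalone sketch of just that bookkeeping step; the helper name is made up:

def aggregate_parts_to_sizes(parts_to_sizes):
    """Collapse [(rank, size), ...] into [[rank, total_size], ...],
    one entry per rank, mirroring the aggregation done in _func_fit."""
    n_ranks = max(rank for rank, _ in parts_to_sizes) + 1
    aggregated = [[rank, 0] for rank in range(n_ranks)]
    for rank, size in parts_to_sizes:
        aggregated[rank][1] += size
    return aggregated

# Example: rank 0 holds two partitions of 100 and 50 rows, rank 1 holds 80.
print(aggregate_parts_to_sizes([(0, 100), (0, 50), (1, 80)]))
# -> [[0, 150], [1, 80]]
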
28 changes: 23 additions & 5 deletions python/cuml/linear_model/logistic_regression_mg.pyx
@@ -79,11 +79,18 @@ cdef extern from "cuml/linear_model/qn_mg.hpp" namespace "ML::GLM::opg" nogil:
float *f,
int *num_iters) except +

cdef vector[float] getUniquelabelsMG(
const handle_t& handle,
PartDescriptor &input_desc,
vector[floatData_t*] labels) except+


class LogisticRegressionMG(MGFitMixin, LogisticRegression):

def __init__(self, **kwargs):
super(LogisticRegressionMG, self).__init__(**kwargs)
if self.penalty != "l2" and self.penalty != "none":
assert False, "Currently only support 'l2' and 'none' penalty"

@property
@cuml.internals.api_base_return_array_skipall
@@ -102,8 +109,8 @@

self.solver_model.coef_ = value

def prepare_for_fit(self, n_classes):
self.solver_model.qnparams = QNParams(
def create_qnparams(self):
return QNParams(
loss=self.loss,
penalty_l1=self.l1_strength,
penalty_l2=self.l2_strength,
@@ -118,8 +125,11 @@
penalty_normalized=self.penalty_normalized
)

def prepare_for_fit(self, n_classes):
self.solver_model.qnparams = self.create_qnparams()

# modified
qnpams = self.qnparams.params
qnpams = self.solver_model.qnparams.params

# modified qnp
solves_classification = qnpams['loss'] in {
@@ -174,8 +184,14 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
cdef float objective32
cdef int num_iters

# TODO: calculate _num_classes at runtime
self._num_classes = 2
cdef vector[float] c_classes_
c_classes_ = getUniquelabelsMG(
handle_[0],
deref(<PartDescriptor*><uintptr_t>input_desc),
deref(<vector[floatData_t*]*><uintptr_t>y))
self.classes_ = np.sort(list(c_classes_)).astype('float32')

self._num_classes = len(self.classes_)
self.loss = "sigmoid" if self._num_classes <= 2 else "softmax"
self.prepare_for_fit(self._num_classes)
cdef uintptr_t mat_coef_ptr = self.coef_.ptr
Expand All @@ -194,6 +210,8 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
self._num_classes,
<float*> &objective32,
<int*> &num_iters)
else:
assert False, "dtypes other than float32 are currently not supported yet. See issue: https://github.com/rapidsai/cuml/issues/5589"

self.solver_model._calc_intercept()

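
Taken together, the Cython fit path now derives classes_ from the labels gathered across ranks by getUniquelabelsMG instead of hard-coding two classes, and chooses the loss from the class count. A rough Python illustration of that selection step, with made-up label values standing in for the gathered vector:

import numpy as np

# Stand-in for the labels returned by getUniquelabelsMG across all ranks.
c_classes = [1.0, 0.0, 2.0]

classes_ = np.sort(np.asarray(c_classes, dtype="float32"))
num_classes = len(classes_)
# Binary problems keep the sigmoid (logistic) loss; three or more classes
# switch to softmax, which maps to QN_LOSS_SOFTMAX on the C++ side.
loss = "sigmoid" if num_classes <= 2 else "softmax"
print(classes_, num_classes, loss)  # [0. 1. 2.] 3 softmax
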