diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4cad8e17..76f5aedc2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,9 @@ option(ENABLE_CUFFT_BACKEND "Enable the cuFFT backend for the DFT interface" OFF
option(ENABLE_ROCFFT_BACKEND "Enable the rocFFT backend for the DFT interface" OFF)
option(ENABLE_PORTFFT_BACKEND "Enable the portFFT DFT backend for the DFT interface. Cannot be used with other DFT backends." OFF)
+# sparse
+option(ENABLE_CUSPARSE_BACKEND "Enable the cuSPARSE backend for the SPARSE_BLAS interface" OFF)
+
set(ONEMKL_SYCL_IMPLEMENTATION "dpc++" CACHE STRING "Name of the SYCL compiler")
set(HIP_TARGETS "" CACHE STRING "Target HIP architectures")
@@ -102,7 +105,8 @@ if(ENABLE_MKLGPU_BACKEND
list(APPEND DOMAINS_LIST "dft")
endif()
if(ENABLE_MKLCPU_BACKEND
- OR ENABLE_MKLGPU_BACKEND)
+ OR ENABLE_MKLGPU_BACKEND
+ OR ENABLE_CUSPARSE_BACKEND)
list(APPEND DOMAINS_LIST "sparse_blas")
endif()
@@ -129,7 +133,7 @@ if(CMAKE_CXX_COMPILER OR NOT ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++")
string(REPLACE "\\" "/" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER})
endif()
else()
- if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND
+ if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_CUSPARSE_BACKEND
OR ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND OR ENABLE_ROCFFT_BACKEND)
set(CMAKE_CXX_COMPILER "clang++")
elseif(ENABLE_MKLGPU_BACKEND)
diff --git a/README.md b/README.md
index 5dc8c9c3b..dc023c67c 100644
--- a/README.md
+++ b/README.md
@@ -18,8 +18,8 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org).
- oneMKL interface |
- oneMKL selector |
+ oneMKL interface |
+ oneMKL selector |
Intel(R) oneAPI Math Kernel Library (oneMKL) |
x86 CPU, Intel GPU |
@@ -28,10 +28,10 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org).
NVIDIA cuBLAS |
NVIDIA GPU |
-
+
NVIDIA cuSOLVER |
NVIDIA GPU |
-
+
NVIDIA cuRAND |
NVIDIA GPU |
@@ -40,6 +40,10 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org).
NVIDIA cuFFT |
NVIDIA GPU |
+
+ NVIDIA cuSPARSE |
+ NVIDIA GPU |
+
NETLIB LAPACK |
x86 CPU |
@@ -329,7 +333,7 @@ Supported compilers include:
Dynamic, Static |
- SPARSE_BLAS |
+ SPARSE_BLAS |
x86 CPU |
Intel(R) oneMKL |
Intel DPC++ |
@@ -341,6 +345,12 @@ Supported compilers include:
Intel DPC++ |
Dynamic, Static |
+
+ NVIDIA GPU |
+ NVIDIA cuSPARSE |
+ Open DPC++ |
+ Dynamic, Static |
+
diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake
index 556211999..8aefc2623 100644
--- a/cmake/FindCompiler.cmake
+++ b/cmake/FindCompiler.cmake
@@ -37,7 +37,7 @@ if(is_dpcpp)
# Check if the Nvidia target is supported. PortFFT uses this for choosing default configuration.
check_cxx_compiler_flag("-fsycl -fsycl-targets=nvptx64-nvidia-cuda" dpcpp_supports_nvptx64)
- if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND)
+ if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND)
list(APPEND UNIX_INTERFACE_COMPILE_OPTIONS
-fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda)
list(APPEND UNIX_INTERFACE_LINK_OPTIONS
@@ -51,7 +51,7 @@ if(is_dpcpp)
-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend
--offload-arch=${HIP_TARGETS})
endif()
- if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_ROCBLAS_BACKEND
+ if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND OR ENABLE_ROCBLAS_BACKEND
OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND)
set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
INTERFACE_COMPILE_OPTIONS "${UNIX_INTERFACE_COMPILE_OPTIONS}"
diff --git a/docs/building_the_project_with_dpcpp.rst b/docs/building_the_project_with_dpcpp.rst
index 6076117f7..efe92f285 100644
--- a/docs/building_the_project_with_dpcpp.rst
+++ b/docs/building_the_project_with_dpcpp.rst
@@ -104,6 +104,9 @@ The most important supported build options are:
* - ENABLE_CURAND_BACKEND
- True, False
- False
+ * - ENABLE_CUSPARSE_BACKEND
+ - True, False
+ - False
* - ENABLE_NETLIB_BACKEND
- True, False
- False
@@ -183,8 +186,8 @@ Building for CUDA
^^^^^^^^^^^^^^^^^
The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``,
-``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``, and
-``ENABLE_CUSOLVER_BACKEND``.
+``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``,
+``ENABLE_CUSOLVER_BACKEND``, and ``ENABLE_CUSPARSE_BACKEND``.
No additional parameters are required for using CUDA libraries. In most cases,
the CUDA libraries should be found automatically by CMake.
@@ -371,6 +374,7 @@ disabled using the Ninja build system:
-DENABLE_CUBLAS_BACKEND=True \
-DENABLE_CUSOLVER_BACKEND=True \
-DENABLE_CURAND_BACKEND=True \
+ -DENABLE_CUSPARSE_BACKEND=True \
-DBUILD_FUNCTIONAL_TESTS=False
``$ONEMKL_DIR`` points at the oneMKL source directory. The x86 CPU (``MKLCPU``)
diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst
index eab5afd56..07d90359a 100644
--- a/docs/domains/sparse_linear_algebra.rst
+++ b/docs/domains/sparse_linear_algebra.rst
@@ -20,21 +20,150 @@ Currently known limitations:
- ``oneapi::mkl::sparse::set_csr_data`` and
``oneapi::mkl::sparse::set_coo_data`` functions cannot be used on a handle
that has already been used for an operation or its optimize function. Doing so
- will throw an ``oneapi::mkl::unimplemented`` exception.
+ will throw a ``oneapi::mkl::unimplemented`` exception.
- Using ``spsv`` with the ``oneapi::mkl::sparse::spsv_alg::no_optimize_alg`` and
a sparse matrix that does not have the
- ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw an
+ ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw a
``oneapi::mkl::unimplemented`` exception.
- Using ``spmm`` on Intel GPU with a sparse matrix that is
``oneapi::mkl::transpose::conjtrans`` and has the
- ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw an
+ ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw a
``oneapi::mkl::unimplemented`` exception.
- Using ``spmv`` with a sparse matrix that is
``oneapi::mkl::transpose::conjtrans`` with a ``type_view``
- ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw an
+ ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw a
``oneapi::mkl::unimplemented`` exception.
- Using ``spsv`` on Intel GPU with a sparse matrix that is
- ``oneapi::mkl::transpose::conjtrans`` and will throw an
+ ``oneapi::mkl::transpose::conjtrans`` and will throw a
``oneapi::mkl::unimplemented`` exception.
- Scalar parameters ``alpha`` and ``beta`` should be host pointers to prevent
synchronizations and copies to the host.
+
+
+cuSPARSE backend
+----------------
+
+Currently known limitations:
+
+- The COO format requires the indices to be sorted by row. See the cuSPARSE
+  documentation of the COO format. Sparse
+ operations using matrices with the COO format without the property
+ ``matrix_property::sorted_by_rows`` or ``matrix_property::sorted`` will throw
+ a ``oneapi::mkl::unimplemented`` exception.
+- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3`` and an ``opA`` other
+  than ``transpose::nontrans``, or an ``opB`` equal to ``transpose::conjtrans``,
+  will throw a ``oneapi::mkl::unimplemented`` exception.
+- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3``,
+ ``opB=transpose::trans`` and real fp64 precision will throw a
+ ``oneapi::mkl::unimplemented`` exception. This configuration can fail as of
+  CUDA 12.6.2, see the related issue.
+- Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will
+ throw a ``oneapi::mkl::unimplemented`` exception.
+- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` may still
+ perform some mandatory preprocessing.
+- oneMKL Interface does not provide a way to use non-default algorithms without
+ calling preprocess functions such as ``cusparseSpMM_preprocess`` or
+ ``cusparseSpMV_preprocess``. Feel free to create an issue if this is needed.
+
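The ``sorted_by_rows`` requirement above is the main behavioral difference from the MKL backends, so a minimal sketch may help. It assumes the run-time dispatching API, a CUDA-backed ``sycl::queue`` named ``queue``, and USM arrays ``ia``, ``ja``, ``va`` holding ``nnz`` row-sorted COO entries of an ``nrows`` x ``ncols`` matrix:

```cpp
// Sketch: set the property immediately after creating the COO handle,
// before any operation or optimize function touches it.
oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr;
oneapi::mkl::sparse::init_coo_matrix(queue, &A_handle, nrows, ncols, nnz,
                                     oneapi::mkl::index_base::zero, ia, ja, va);
oneapi::mkl::sparse::set_matrix_property(
    queue, A_handle, oneapi::mkl::sparse::matrix_property::sorted_by_rows);
```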
+
+Operation algorithms mapping
+----------------------------
+
+The following tables describe how a oneMKL SYCL Interface algorithm maps to the
+backend's algorithms. Refer to the backend's documentation for a more detailed
+explanation of the algorithms.
+
+Backends with no equivalent algorithms will fall back to the backend's default
+behavior.
+
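For instance, the sketch below (placeholder variables from a typical ``spmv`` setup: handles, view, descriptor and scalars already created) requests ``spmv_alg::csr_alg2``; per the tables that follow, cuSPARSE maps it to ``CUSPARSE_SPMV_CSR_ALG2`` while MKLCPU/MKLGPU use their default behavior:

```cpp
// The oneMKL call is identical on every backend; only the `alg` mapping differs.
auto alg = oneapi::mkl::sparse::spmv_alg::csr_alg2;
auto ev_spmv = oneapi::mkl::sparse::spmv(queue, opA, &alpha, A_view, A_handle,
                                         x_handle, &beta, y_handle, alg, descr,
                                         { ev_opt });
```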
+
+spmm
+^^^^
+
+.. list-table::
+ :header-rows: 1
+ :widths: 10 30 45
+
+ * - ``spmm_alg`` value
+ - MKLCPU/MKLGPU
+ - cuSPARSE
+ * - ``default_alg``
+ - none
+ - ``CUSPARSE_SPMM_ALG_DEFAULT``
+ * - ``no_optimize_alg``
+ - none
+ - ``CUSPARSE_SPMM_ALG_DEFAULT``
+ * - ``coo_alg1``
+ - none
+ - ``CUSPARSE_SPMM_COO_ALG1``
+ * - ``coo_alg2``
+ - none
+ - ``CUSPARSE_SPMM_COO_ALG2``
+ * - ``coo_alg3``
+ - none
+ - ``CUSPARSE_SPMM_COO_ALG3``
+ * - ``coo_alg4``
+ - none
+ - ``CUSPARSE_SPMM_COO_ALG4``
+ * - ``csr_alg1``
+ - none
+ - ``CUSPARSE_SPMM_CSR_ALG1``
+ * - ``csr_alg2``
+ - none
+ - ``CUSPARSE_SPMM_CSR_ALG2``
+ * - ``csr_alg3``
+ - none
+ - ``CUSPARSE_SPMM_CSR_ALG3``
+
+
+spmv
+^^^^
+
+.. list-table::
+ :header-rows: 1
+ :widths: 10 30 45
+
+ * - ``spmv_alg`` value
+ - MKLCPU/MKLGPU
+ - cuSPARSE
+ * - ``default_alg``
+ - none
+ - ``CUSPARSE_SPMV_ALG_DEFAULT``
+ * - ``no_optimize_alg``
+ - none
+ - ``CUSPARSE_SPMV_ALG_DEFAULT``
+ * - ``coo_alg1``
+ - none
+ - ``CUSPARSE_SPMV_COO_ALG1``
+ * - ``coo_alg2``
+ - none
+ - ``CUSPARSE_SPMV_COO_ALG2``
+ * - ``csr_alg1``
+ - none
+ - ``CUSPARSE_SPMV_CSR_ALG1``
+ * - ``csr_alg2``
+ - none
+ - ``CUSPARSE_SPMV_CSR_ALG2``
+ * - ``csr_alg3``
+ - none
+ - ``CUSPARSE_SPMV_ALG_DEFAULT``
+
+
+spsv
+^^^^
+
+.. list-table::
+ :header-rows: 1
+ :widths: 10 30 45
+
+ * - ``spsv_alg`` value
+ - MKLCPU/MKLGPU
+ - cuSPARSE
+ * - ``default_alg``
+ - none
+ - ``CUSPARSE_SPSV_ALG_DEFAULT``
+ * - ``no_optimize_alg``
+ - none
+ - ``CUSPARSE_SPSV_ALG_DEFAULT``
diff --git a/examples/README.md b/examples/README.md
index 0dad8772d..45a100131 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -4,7 +4,7 @@ oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the followin
- rng: uniform_usm
- lapack: getrs_usm
- dft: complex_fwd_usm, real_fwd_usm
-- sparse_blas: sparse_gemv_usm
+- sparse_blas: sparse_spmv_usm
Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in `example/<$domain>/run_time_dispatching` and `example/<$domain>/compile_time_dispatching` subfolders, respectively.
@@ -487,111 +487,119 @@ Unsupported Configuration:
Run-time dispatching examples with mklcpu backend
```
$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu"
-$ ./bin/example_sparse_blas_gemv_usm
+$ ./bin/example_sparse_blas_spmv_usm
########################################################################
-# Sparse Matrix-Vector Multiply Example:
-#
+# Sparse Matrix-Vector Multiply Example:
+#
# y = alpha * op(A) * x + beta * y
-#
+#
# where A is a sparse matrix in CSR format, x and y are dense vectors
# and alpha, beta are floating point type precision scalars.
-#
+#
# Using apis:
-# sparse::gemv
-#
+# sparse::spmv
+#
# Using single precision (float) data type
-#
+#
# Device will be selected during runtime.
# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
# available devices
-#
+#
########################################################################
-Running Sparse BLAS GEMV USM example on CPU device.
-Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
+Running Sparse BLAS SPMV USM example on CPU device.
+Device name is: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz
Running with single precision real data type:
- sparse::gemv parameters:
- transA = nontrans
- nrows = 64
- alpha = 1, beta = 0
+ sparse::spmv parameters:
+ transA = nontrans
+ nrows = 64
+ alpha = 1, beta = 0
- sparse::gemv example passed
- Finished
-Sparse BLAS GEMV USM example ran OK.
+ sparse::spmv example passed
+ Finished
+Sparse BLAS SPMV USM example ran OK.
```
Run-time dispatching examples with mklgpu backend
```
$ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu"
-$ ./bin/example_sparse_blas_gemv_usm
+$ ./bin/example_sparse_blas_spmv_usm
########################################################################
-# Sparse Matrix-Vector Multiply Example:
-#
+# Sparse Matrix-Vector Multiply Example:
+#
# y = alpha * op(A) * x + beta * y
-#
+#
# where A is a sparse matrix in CSR format, x and y are dense vectors
# and alpha, beta are floating point type precision scalars.
-#
+#
# Using apis:
-# sparse::gemv
-#
+# sparse::spmv
+#
# Using single precision (float) data type
-#
+#
# Device will be selected during runtime.
# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
# available devices
-#
+#
########################################################################
-Running Sparse BLAS GEMV USM example on GPU device.
+Running Sparse BLAS SPMV USM example on GPU device.
Device name is: Intel(R) HD Graphics 530 [0x1912]
Running with single precision real data type:
- sparse::gemv parameters:
- transA = nontrans
- nrows = 64
- alpha = 1, beta = 0
+ sparse::spmv parameters:
+ transA = nontrans
+ nrows = 64
+ alpha = 1, beta = 0
- sparse::gemv example passed
- Finished
-Sparse BLAS GEMV USM example ran OK.
+ sparse::spmv example passed
+ Finished
+Sparse BLAS SPMV USM example ran OK.
```
-Compile-time dispatching example with mklcpu backend
+Compile-time dispatching example with both mklcpu and cusparse backends
```
-$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu"
-$ ./bin/example_sparse_blas_gemv_usm_mklcpu
+$ ./bin/sparse_blas_spmv_usm_mklcpu_cusparse
########################################################################
-# Sparse Matrix-Vector Multiply Example:
-#
+# Sparse Matrix-Vector Multiply Example:
+#
# y = alpha * op(A) * x + beta * y
-#
-# where A is a sparse matrix in CSR format, x and y are dense vectors
+#
+# where A is a sparse matrix in COO format, x and y are dense vectors
# and alpha, beta are floating point type precision scalars.
-#
+#
# Using apis:
-# sparse::gemv
-#
+# sparse::spmv
+#
# Using single precision (float) data type
-#
-# Running on Intel CPU device
-#
+#
+# Running on both Intel CPU and Nvidia GPU devices
+#
########################################################################
-Running Sparse BLAS GEMV USM example on CPU device.
-Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
+Running Sparse BLAS SPMV USM example on:
+ CPU device: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz
+ GPU device: NVIDIA A100-PCIE-40GB
Running with single precision real data type:
- sparse::gemv parameters:
- transA = nontrans
- nrows = 64
- alpha = 1, beta = 0
+ sparse::spmv parameters:
+ transA = nontrans
+ size = 8
+ alpha = 1, beta = 0
+
+ sparse::spmv example passed
+ Finished
+
+ sparse::spmv parameters:
+ transA = nontrans
+ size = 8
+ alpha = 1, beta = 0
- sparse::gemv example passed
- Finished
-Sparse BLAS GEMV USM example ran OK.
+ sparse::spmv example passed
+ Finished
+Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE.
```
diff --git a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
index 5dbbba8a4..a38f4ebd4 100644
--- a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
+++ b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
@@ -18,27 +18,24 @@
#===============================================================================
#Build object from all sources
-set(SPARSE_BLAS_BACKENDS "")
-
-if(ENABLE_MKLCPU_BACKEND)
- list(APPEND SPARSE_BLAS_BACKENDS "mklcpu")
+set(SPARSE_CT_SOURCES "")
+if(ENABLE_MKLCPU_BACKEND AND ENABLE_CUSPARSE_BACKEND)
+ list(APPEND SPARSE_CT_SOURCES "sparse_blas_spmv_usm_mklcpu_cusparse")
endif()
include(WarningsUtils)
-foreach(backend ${SPARSE_BLAS_BACKENDS})
- set(EXAMPLE_NAME example_sparse_blas_spmv_usm_${backend})
- add_executable(${EXAMPLE_NAME} sparse_blas_spmv_usm_${backend}.cpp)
- target_include_directories(${EXAMPLE_NAME}
+foreach(sparse_ct_source ${SPARSE_CT_SOURCES})
+ add_executable(${sparse_ct_source} ${sparse_ct_source}.cpp)
+ target_include_directories(${sparse_ct_source}
PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
PUBLIC ${PROJECT_SOURCE_DIR}/include
PUBLIC ${CMAKE_BINARY_DIR}/bin
)
- add_dependencies(${EXAMPLE_NAME} onemkl_sparse_blas_${backend})
- target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_${backend})
+ target_link_libraries(${sparse_ct_source} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_mklcpu onemkl_sparse_blas_cusparse)
# Register example as ctest
- add_test(NAME sparse_blas/EXAMPLE/CT/sparse_blas_spmv_usm_${backend} COMMAND ${EXAMPLE_NAME})
-endforeach(backend)
+ add_test(NAME sparse_blas/EXAMPLE/CT/${sparse_ct_source} COMMAND ${sparse_ct_source})
+endforeach(sparse_ct_source)
diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
similarity index 55%
rename from examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp
rename to examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
index 964afb49b..31ce1975c 100644
--- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp
+++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
@@ -22,7 +22,7 @@
* Content:
* This example demonstrates use of DPCPP API oneapi::mkl::sparse::spmv
* using unified shared memory to perform general sparse matrix-vector
-* multiplication on a INTEL CPU SYCL device.
+* multiplication on an Intel CPU SYCL device and an NVIDIA GPU SYCL device.
*
* y = alpha * op(A) * x + beta * y
*
@@ -59,69 +59,54 @@
//
// is performed and finally the results are post processed.
//
-template <typename fp, typename intType>
-int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) {
+template <typename fpType, typename intType, typename selectorType>
+int run_sparse_matrix_vector_multiply_example(selectorType& selector) {
+ auto queue = selector.get_queue();
+
// Matrix data size
- intType size = 4;
- intType nrows = size * size * size;
+ static constexpr intType size = 8;
- // Set scalar fp values
- fp alpha = set_fp_value(fp(1.0));
- fp beta = set_fp_value(fp(0.0));
+ // Set scalar fpType values
+ fpType alpha = set_fp_value(fpType(1.0));
+ fpType beta = set_fp_value(fpType(0.0));
- // Catch asynchronous exceptions
- auto exception_handler = [](sycl::exception_list exceptions) {
- for (std::exception_ptr const& e : exceptions) {
- try {
- std::rethrow_exception(e);
- }
- catch (sycl::exception const& e) {
- std::cout << "Caught asynchronous SYCL "
- "exception during sparse::spmv:\n"
- << e.what() << std::endl;
- }
- }
- };
+ intType nnz = 9;
+ // host_ia must be sorted to maintain the sorted_by_rows property
+ intType host_ia[] = { 0, 0, 1, 3, 4, 4, 4, 7, 7 };
+ intType host_ja[] = { 0, 7, 2, 2, 5, 4, 0, 0, 7 };
+
+ intType* ia = (intType*)sycl::malloc_shared(nnz * sizeof(intType), queue);
+ intType* ja = (intType*)sycl::malloc_shared(nnz * sizeof(intType), queue);
+ fpType* a = (fpType*)sycl::malloc_shared(nnz * sizeof(fpType), queue);
+ fpType* x = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue);
+ fpType* y = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue);
- // create execution queue and buffers of matrix data
- sycl::queue cpu_queue(cpu_dev, exception_handler);
- oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue };
-
- intType *ia, *ja;
- fp *a, *x, *y, *z;
- std::size_t sizea = static_cast<std::size_t>(27 * nrows);
- std::size_t sizeja = static_cast<std::size_t>(27 * nrows);
- std::size_t sizeia = static_cast<std::size_t>(nrows + 1);
- std::size_t sizevec = static_cast<std::size_t>(nrows);
-
- ia = (intType*)sycl::malloc_shared(sizeia * sizeof(intType), cpu_queue);
- ja = (intType*)sycl::malloc_shared(sizeja * sizeof(intType), cpu_queue);
- a = (fp*)sycl::malloc_shared(sizea * sizeof(fp), cpu_queue);
- x = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
- y = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
- z = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
-
- if (!ia || !ja || !a || !x || !y || !z) {
+ if (!ia || !ja || !a || !x || !y) {
throw std::runtime_error("Failed to allocate USM memory");
}
- intType nnz = generate_sparse_matrix<fp, intType>(size, ia, ja, a);
+ // Copy ia and ja
+ queue.memcpy(ia, host_ia, nnz * sizeof(intType)).wait_and_throw();
+ queue.memcpy(ja, host_ja, nnz * sizeof(intType)).wait_and_throw();
+
+ // Init matrix values
+ for (int i = 0; i < nnz; i++) {
+ a[i] = set_fp_value(fpType(i + 1));
+ }
// Init vectors x and y
- for (int i = 0; i < nrows; i++) {
- x[i] = set_fp_value(fp(1.0));
- y[i] = set_fp_value(fp(0.0));
- z[i] = set_fp_value(fp(0.0));
+ for (int i = 0; i < size; i++) {
+ x[i] = set_fp_value(fpType(i + 1));
+ y[i] = set_fp_value(fpType(0.0));
}
std::vector<intType*> int_ptr_vec;
int_ptr_vec.push_back(ia);
int_ptr_vec.push_back(ja);
- std::vector<fp*> fp_ptr_vec;
+ std::vector<fpType*> fp_ptr_vec;
fp_ptr_vec.push_back(a);
fp_ptr_vec.push_back(x);
fp_ptr_vec.push_back(y);
- fp_ptr_vec.push_back(z);
//
// Execute Matrix Multiply
@@ -137,49 +122,52 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) {
? "nontrans"
: (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
<< std::endl;
- std::cout << "\t\t\tnrows = " << nrows << std::endl;
+ std::cout << "\t\t\tsize = " << size << std::endl;
std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
- // Create and initialize handle for a Sparse Matrix in CSR format
+ // Create and initialize handle for a Sparse Matrix in COO format sorted by rows
oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr;
- oneapi::mkl::sparse::init_csr_matrix(cpu_selector, &A_handle, nrows, nrows, nnz,
+ oneapi::mkl::sparse::init_coo_matrix(selector, &A_handle, size, size, nnz,
oneapi::mkl::index_base::zero, ia, ja, a);
+ // cuSPARSE backend requires that the property sorted_by_rows or sorted is set when using matrices in COO format.
+ // Setting these properties is also a best practice to get the best performance.
+ oneapi::mkl::sparse::set_matrix_property(selector, A_handle,
+ oneapi::mkl::sparse::matrix_property::sorted_by_rows);
// Create and initialize dense vector handles
oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr;
oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr;
- oneapi::mkl::sparse::init_dense_vector(cpu_selector, &x_handle, sizevec, x);
- oneapi::mkl::sparse::init_dense_vector(cpu_selector, &y_handle, sizevec, y);
+ oneapi::mkl::sparse::init_dense_vector(selector, &x_handle, size, x);
+ oneapi::mkl::sparse::init_dense_vector(selector, &y_handle, size, y);
// Create operation descriptor
oneapi::mkl::sparse::spmv_descr_t descr = nullptr;
- oneapi::mkl::sparse::init_spmv_descr(cpu_selector, &descr);
+ oneapi::mkl::sparse::init_spmv_descr(selector, &descr);
// Allocate external workspace
std::size_t workspace_size = 0;
- oneapi::mkl::sparse::spmv_buffer_size(cpu_selector, transA, &alpha, A_view, A_handle, x_handle,
+ oneapi::mkl::sparse::spmv_buffer_size(selector, transA, &alpha, A_view, A_handle, x_handle,
&beta, y_handle, alg, descr, workspace_size);
- void* workspace = sycl::malloc_device(workspace_size, cpu_queue);
+ void* workspace = sycl::malloc_device(workspace_size, queue);
// Optimize spmv
auto ev_opt =
- oneapi::mkl::sparse::spmv_optimize(cpu_selector, transA, &alpha, A_view, A_handle, x_handle,
+ oneapi::mkl::sparse::spmv_optimize(selector, transA, &alpha, A_view, A_handle, x_handle,
&beta, y_handle, alg, descr, workspace);
// Run spmv
- auto ev_spmv = oneapi::mkl::sparse::spmv(cpu_selector, transA, &alpha, A_view, A_handle,
- x_handle, &beta, y_handle, alg, descr, { ev_opt });
+ auto ev_spmv = oneapi::mkl::sparse::spmv(selector, transA, &alpha, A_view, A_handle, x_handle,
+ &beta, y_handle, alg, descr, { ev_opt });
// Release handles and descriptor
std::vector<sycl::event> release_events;
release_events.push_back(
- oneapi::mkl::sparse::release_dense_vector(cpu_selector, x_handle, { ev_spmv }));
+ oneapi::mkl::sparse::release_dense_vector(selector, x_handle, { ev_spmv }));
release_events.push_back(
- oneapi::mkl::sparse::release_dense_vector(cpu_selector, y_handle, { ev_spmv }));
+ oneapi::mkl::sparse::release_dense_vector(selector, y_handle, { ev_spmv }));
release_events.push_back(
- oneapi::mkl::sparse::release_sparse_matrix(cpu_selector, A_handle, { ev_spmv }));
- release_events.push_back(
- oneapi::mkl::sparse::release_spmv_descr(cpu_selector, descr, { ev_spmv }));
+ oneapi::mkl::sparse::release_sparse_matrix(selector, A_handle, { ev_spmv }));
+ release_events.push_back(oneapi::mkl::sparse::release_spmv_descr(selector, descr, { ev_spmv }));
for (auto event : release_events) {
event.wait_and_throw();
}
@@ -188,33 +176,26 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) {
// Post Processing
//
- fp* res = y;
- const bool isConj = (transA == oneapi::mkl::transpose::conjtrans);
- for (intType row = 0; row < nrows; row++) {
- z[row] *= beta;
- }
- for (intType row = 0; row < nrows; row++) {
- fp tmp = alpha * x[row];
- for (intType i = ia[row]; i < ia[row + 1]; i++) {
- if constexpr (is_complex<fp>()) {
- z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]);
- }
- else {
- z[ja[i]] += tmp * a[i];
- }
- }
+ // The example assumes the matrix is not transposed and beta=0 for simplicity.
+ // See the tests for more in-depth verification.
+ fpType* res = y;
+ fpType expected_res[size] = {};
+ for (intType i = 0; i < nnz; ++i) {
+ intType row = ia[i];
+ intType col = ja[i];
+ expected_res[row] += alpha * x[col] * a[i];
}
bool good = true;
- for (intType row = 0; row < nrows; row++) {
- good &= check_result(res[row], z[row], nrows, row);
+ for (intType row = 0; row < size; row++) {
+ good &= check_result(res[row], expected_res[row], size, row);
}
std::cout << "\n\t\t sparse::spmv example " << (good ? "passed" : "failed") << "\n\tFinished"
<< std::endl;
- free_vec(fp_ptr_vec, cpu_queue);
- free_vec(int_ptr_vec, cpu_queue);
+ free_vec(fp_ptr_vec, queue);
+ free_vec(int_ptr_vec, queue);
if (!good)
return 1;
@@ -234,7 +215,7 @@ void print_example_banner() {
std::cout << "# " << std::endl;
std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl;
std::cout << "# " << std::endl;
- std::cout << "# where A is a sparse matrix in CSR format, x and y are "
+ std::cout << "# where A is a sparse matrix in COO format, x and y are "
"dense vectors"
<< std::endl;
std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl;
@@ -244,7 +225,7 @@ void print_example_banner() {
std::cout << "# " << std::endl;
std::cout << "# Using single precision (float) data type" << std::endl;
std::cout << "# " << std::endl;
- std::cout << "# Running on Intel CPU device" << std::endl;
+ std::cout << "# Running on both Intel CPU and Nvidia GPU devices" << std::endl;
std::cout << "# " << std::endl;
std::cout << "########################################################################"
<< std::endl;
@@ -257,17 +238,44 @@ void print_example_banner() {
int main(int /*argc*/, char** /*argv*/) {
print_example_banner();
+ auto exception_handler = [](sycl::exception_list exceptions) {
+ for (std::exception_ptr const& e : exceptions) {
+ try {
+ std::rethrow_exception(e);
+ }
+ catch (sycl::exception const& e) {
+ std::cout << "Caught asynchronous SYCL "
+ "exception during sparse::spmv:\n"
+ << e.what() << std::endl;
+ }
+ }
+ };
+
try {
- // TODO: Add cuSPARSE compile-time dispatcher in this example once it is supported.
- sycl::device cpu_dev(sycl::cpu_selector_v);
+ sycl::queue cpu_queue(sycl::cpu_selector_v, exception_handler);
+ sycl::queue gpu_queue(sycl::gpu_selector_v, exception_handler);
+ unsigned int vendor_id = gpu_queue.get_device().get_info<sycl::info::device::vendor_id>();
+ if (vendor_id != NVIDIA_ID) {
+ std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
+ return 1;
+ }
+ oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue };
+ oneapi::mkl::backend_selector<oneapi::mkl::backend::cusparse> gpu_selector{ gpu_queue };
- std::cout << "Running Sparse BLAS SPMV USM example on CPU device." << std::endl;
- std::cout << "Device name is: " << cpu_dev.get_info()
+ std::cout << "Running Sparse BLAS SPMV USM example on:" << std::endl;
+ std::cout << "\tCPU device: " << cpu_queue.get_device().get_info()
+ << std::endl;
+ std::cout << "\tGPU device: " << gpu_queue.get_device().get_info()
<< std::endl;
std::cout << "Running with single precision real data type:" << std::endl;
- run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_dev);
- std::cout << "Sparse BLAS SPMV USM example ran OK." << std::endl;
+ int err = run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_selector);
+ if (err)
+ return err;
+ err = run_sparse_matrix_vector_multiply_example<float, std::int32_t>(gpu_selector);
+ if (err)
+ return err;
+ std::cout << "Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE." << std::endl;
}
catch (sycl::exception const& e) {
std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl;
diff --git a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
index 398f3e0f2..f09daf819 100644
--- a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
+++ b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
@@ -33,6 +33,9 @@ endif()
if(ENABLE_MKLGPU_BACKEND)
list(APPEND DEVICE_FILTERS "level_zero:gpu")
endif()
+if(ENABLE_CUSPARSE_BACKEND)
+ list(APPEND DEVICE_FILTERS "cuda:gpu")
+endif()
message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
diff --git a/include/oneapi/mkl/detail/backends.hpp b/include/oneapi/mkl/detail/backends.hpp
index 32b7c2614..216a6feba 100644
--- a/include/oneapi/mkl/detail/backends.hpp
+++ b/include/oneapi/mkl/detail/backends.hpp
@@ -40,20 +40,31 @@ enum class backend {
cufft,
rocfft,
portfft,
+ cusparse,
unsupported
};
typedef std::map<backend, std::string> backendmap;
-static backendmap backend_map = {
- { backend::mklcpu, "mklcpu" }, { backend::mklgpu, "mklgpu" },
- { backend::cublas, "cublas" }, { backend::cusolver, "cusolver" },
- { backend::curand, "curand" }, { backend::netlib, "netlib" },
- { backend::rocblas, "rocblas" }, { backend::rocrand, "rocrand" },
- { backend::rocsolver, "rocsolver" }, { backend::portblas, "portblas" },
- { backend::cufft, "cufft" }, { backend::rocfft, "rocfft" },
- { backend::portfft, "portfft" }, { backend::unsupported, "unsupported" }
-};
+// clang-format alternates the formatting depending on the parity of the number of backends.
+// It is disabled here to reduce noise.
+// clang-format off
+static backendmap backend_map = { { backend::mklcpu, "mklcpu" },
+ { backend::mklgpu, "mklgpu" },
+ { backend::cublas, "cublas" },
+ { backend::cusolver, "cusolver" },
+ { backend::curand, "curand" },
+ { backend::netlib, "netlib" },
+ { backend::rocblas, "rocblas" },
+ { backend::rocrand, "rocrand" },
+ { backend::rocsolver, "rocsolver" },
+ { backend::portblas, "portblas" },
+ { backend::cufft, "cufft" },
+ { backend::rocfft, "rocfft" },
+ { backend::portfft, "portfft" },
+ { backend::cusparse, "cusparse" },
+ { backend::unsupported, "unsupported" } };
+// clang-format on
} //namespace mkl
} //namespace oneapi
diff --git a/include/oneapi/mkl/detail/backends_table.hpp b/include/oneapi/mkl/detail/backends_table.hpp
index 731781375..9b7c921d6 100644
--- a/include/oneapi/mkl/detail/backends_table.hpp
+++ b/include/oneapi/mkl/detail/backends_table.hpp
@@ -198,6 +198,12 @@ static std::map<domain, std::map<device, std::vector<const char*>>> libraries =
{
#ifdef ONEMKL_ENABLE_MKLGPU_BACKEND
LIB_NAME("sparse_blas_mklgpu")
+#endif
+ } },
+ { device::nvidiagpu,
+ {
+#ifdef ONEMKL_ENABLE_CUSPARSE_BACKEND
+ LIB_NAME("sparse_blas_cusparse")
#endif
} } } },
};
diff --git a/include/oneapi/mkl/sparse_blas.hpp b/include/oneapi/mkl/sparse_blas.hpp
index 004b79727..8fb86f244 100644
--- a/include/oneapi/mkl/sparse_blas.hpp
+++ b/include/oneapi/mkl/sparse_blas.hpp
@@ -34,6 +34,9 @@
#ifdef ONEMKL_ENABLE_MKLGPU_BACKEND
#include "sparse_blas/detail/mklgpu/sparse_blas_ct.hpp"
#endif
+#ifdef ONEMKL_ENABLE_CUSPARSE_BACKEND
+#include "sparse_blas/detail/cusparse/sparse_blas_ct.hpp"
+#endif
#include "sparse_blas/detail/sparse_blas_rt.hpp"
diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
new file mode 100644
index 000000000..c8e816eeb
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
@@ -0,0 +1,33 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_
+
+#include "oneapi/mkl/detail/export.hpp"
+#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
+#include "oneapi/mkl/sparse_blas/types.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx"
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
new file mode 100644
index 000000000..11abb9a6f
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
@@ -0,0 +1,40 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_
+
+#include "oneapi/mkl/detail/backends.hpp"
+#include "oneapi/mkl/detail/backend_selector.hpp"
+
+#include "onemkl_sparse_blas_cusparse.hpp"
+
+namespace oneapi {
+namespace mkl {
+namespace sparse {
+
+#define BACKEND cusparse
+#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx"
+#undef BACKEND
+
+} //namespace sparse
+} //namespace mkl
+} //namespace oneapi
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/types.hpp b/include/oneapi/mkl/sparse_blas/types.hpp
index d619be4b3..1a50d6ef4 100644
--- a/include/oneapi/mkl/sparse_blas/types.hpp
+++ b/include/oneapi/mkl/sparse_blas/types.hpp
@@ -36,6 +36,7 @@ namespace sparse {
enum class matrix_property {
symmetric,
sorted,
+ sorted_by_rows,
};
enum class spmm_alg {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6ff8d5d11..c363d8a8d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -59,6 +59,7 @@ function(generate_header_file)
set(ONEMKL_ENABLE_CUFFT_BACKEND ${ENABLE_CUFFT_BACKEND})
set(ONEMKL_ENABLE_ROCFFT_BACKEND ${ENABLE_ROCFFT_BACKEND})
set(ONEMKL_ENABLE_PORTFFT_BACKEND ${ENABLE_PORTFFT_BACKEND})
+ set(ONEMKL_ENABLE_CUSPARSE_BACKEND ${ENABLE_CUSPARSE_BACKEND})
configure_file(config.hpp.in "${CMAKE_CURRENT_BINARY_DIR}/oneapi/mkl/config.hpp.configured")
file(GENERATE
diff --git a/src/config.hpp.in b/src/config.hpp.in
index de44cb16b..5d8b9a136 100644
--- a/src/config.hpp.in
+++ b/src/config.hpp.in
@@ -24,6 +24,7 @@
#cmakedefine ONEMKL_ENABLE_CUFFT_BACKEND
#cmakedefine ONEMKL_ENABLE_CURAND_BACKEND
#cmakedefine ONEMKL_ENABLE_CUSOLVER_BACKEND
+#cmakedefine ONEMKL_ENABLE_CUSPARSE_BACKEND
#cmakedefine ONEMKL_ENABLE_MKLCPU_BACKEND
#cmakedefine ONEMKL_ENABLE_MKLGPU_BACKEND
#cmakedefine ONEMKL_ENABLE_NETLIB_BACKEND
diff --git a/src/sparse_blas/backends/CMakeLists.txt b/src/sparse_blas/backends/CMakeLists.txt
index 294040808..baae9445d 100644
--- a/src/sparse_blas/backends/CMakeLists.txt
+++ b/src/sparse_blas/backends/CMakeLists.txt
@@ -27,3 +27,7 @@ endif()
if(ENABLE_MKLGPU_BACKEND)
add_subdirectory(mklgpu)
endif()
+
+if(ENABLE_CUSPARSE_BACKEND)
+ add_subdirectory(cusparse)
+endif()
diff --git a/src/sparse_blas/backends/cusparse/CMakeLists.txt b/src/sparse_blas/backends/cusparse/CMakeLists.txt
new file mode 100644
index 000000000..60bbaf35f
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/CMakeLists.txt
@@ -0,0 +1,85 @@
+#===============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+set(LIB_NAME onemkl_sparse_blas_cusparse)
+set(LIB_OBJ ${LIB_NAME}_obj)
+
+include(WarningsUtils)
+
+add_library(${LIB_NAME})
+add_library(${LIB_OBJ} OBJECT
+ cusparse_handles.cpp
+ cusparse_scope_handle.cpp
+ operations/cusparse_spmm.cpp
+ operations/cusparse_spmv.cpp
+ operations/cusparse_spsv.cpp
+ $<$<BOOL:${BUILD_SHARED_LIBS}>: cusparse_wrappers.cpp>
+)
+add_dependencies(onemkl_backend_libs_sparse_blas ${LIB_NAME})
+
+target_include_directories(${LIB_OBJ}
+ PRIVATE ${PROJECT_SOURCE_DIR}/include
+ ${PROJECT_SOURCE_DIR}/src
+ ${CMAKE_BINARY_DIR}/bin
+ ${ONEMKL_GENERATED_INCLUDE_PATH}
+)
+
+target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
+
+if (${CMAKE_VERSION} VERSION_LESS "3.17.0")
+ find_package(CUDA 12.2 REQUIRED)
+ target_include_directories(${LIB_OBJ} PRIVATE ${CUDA_INCLUDE_DIRS})
+ target_link_libraries(${LIB_OBJ} PUBLIC cuda rt ${CUDA_cusparse_LIBRARY})
+else()
+ find_package(CUDAToolkit 12.2 REQUIRED)
+ target_link_libraries(${LIB_OBJ} PRIVATE CUDA::cusparse CUDA::cudart CUDA::cuda_driver)
+endif()
+
+target_link_libraries(${LIB_OBJ}
+ PUBLIC ONEMKL::SYCL::SYCL
+ PRIVATE onemkl_warnings
+)
+
+set_target_properties(${LIB_OBJ} PROPERTIES
+ POSITION_INDEPENDENT_CODE ON
+)
+target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
+
+#Set oneMKL libraries as not transitive for dynamic
+if(BUILD_SHARED_LIBS)
+ set_target_properties(${LIB_NAME} PROPERTIES
+ INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
+ )
+endif()
+
+# Add major version to the library
+set_target_properties(${LIB_NAME} PROPERTIES
+ SOVERSION ${PROJECT_VERSION_MAJOR}
+)
+
+# Add dependencies rpath to the library
+list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
+
+# Add the library to install package
+install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
+install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
+ RUNTIME DESTINATION bin
+ ARCHIVE DESTINATION lib
+ LIBRARY DESTINATION lib
+)
diff --git a/src/sparse_blas/backends/cusparse/cusparse_error.hpp b/src/sparse_blas/backends/cusparse/cusparse_error.hpp
new file mode 100644
index 000000000..738888576
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_error.hpp
@@ -0,0 +1,103 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_
+
+#include <string>
+
+#include <cuda.h>
+#include <cusparse.h>
+
+#include "oneapi/mkl/exceptions.hpp"
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+inline std::string cuda_result_to_str(CUresult result) {
+ switch (result) {
+#define ONEMKL_CUSPARSE_CASE(STATUS) \
+ case STATUS: return #STATUS
+ ONEMKL_CUSPARSE_CASE(CUDA_SUCCESS);
+ ONEMKL_CUSPARSE_CASE(CUDA_ERROR_NOT_PERMITTED);
+ ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_CONTEXT);
+ ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_DEVICE);
+ ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_VALUE);
+ ONEMKL_CUSPARSE_CASE(CUDA_ERROR_OUT_OF_MEMORY);
+ ONEMKL_CUSPARSE_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
+ default: return "";
+ }
+}
+
+#define CUDA_ERROR_FUNC(func, ...) \
+ do { \
+ auto res = func(__VA_ARGS__); \
+ if (res != CUDA_SUCCESS) { \
+ throw oneapi::mkl::exception("sparse_blas", #func, \
+ "cuda error: " + detail::cuda_result_to_str(res)); \
+ } \
+ } while (0)
+
+inline std::string cusparse_status_to_str(cusparseStatus_t status) {
+ switch (status) {
+#define ONEMKL_CUSPARSE_CASE(STATUS) \
+ case STATUS: return #STATUS
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_SUCCESS);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_INITIALIZED);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ALLOC_FAILED);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INVALID_VALUE);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ARCH_MISMATCH);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_EXECUTION_FAILED);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INTERNAL_ERROR);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_SUPPORTED);
+ ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INSUFFICIENT_RESOURCES);
+#undef ONEMKL_CUSPARSE_CASE
+ default: return "";
+ }
+}
+
+inline void check_status(cusparseStatus_t status, const std::string& function,
+ std::string error_str = "") {
+ if (status != CUSPARSE_STATUS_SUCCESS) {
+ if (!error_str.empty()) {
+ error_str += "; ";
+ }
+ error_str += "cuSPARSE status: " + cusparse_status_to_str(status);
+ switch (status) {
+ case CUSPARSE_STATUS_NOT_SUPPORTED:
+ throw oneapi::mkl::unimplemented("sparse_blas", function, error_str);
+ case CUSPARSE_STATUS_NOT_INITIALIZED:
+ throw oneapi::mkl::uninitialized("sparse_blas", function, error_str);
+ case CUSPARSE_STATUS_INVALID_VALUE:
+ case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+ throw oneapi::mkl::invalid_argument("sparse_blas", function, error_str);
+ default: throw oneapi::mkl::exception("sparse_blas", function, error_str);
+ }
+ }
+}
+
+#define CUSPARSE_ERR_FUNC(func, ...) \
+ do { \
+ auto status = func(__VA_ARGS__); \
+ detail::check_status(status, #func); \
+ } while (0)
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
+
+#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_
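A hedged usage sketch for the two macros above, from code inside the ``oneapi::mkl::sparse::cusparse`` namespace (the variables are placeholders): any non-success status is converted to the matching oneMKL exception.

```cpp
// Driver API calls go through CUDA_ERROR_FUNC...
CUcontext ctx;
CUDA_ERROR_FUNC(cuCtxGetCurrent, &ctx);

// ...and cuSPARSE calls go through CUSPARSE_ERR_FUNC, so e.g.
// CUSPARSE_STATUS_NOT_SUPPORTED surfaces as oneapi::mkl::unimplemented.
cusparseHandle_t handle;
CUSPARSE_ERR_FUNC(cusparseCreate, &handle);
CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
```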
diff --git a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
new file mode 100644
index 000000000..179b007f5
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
@@ -0,0 +1,63 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_
+
+/**
+ * @file Similar to blas_handle.hpp
+ * Provides a map from a ur_context_handle_t (or equivalent) to a cusparseHandle_t.
+ * @see cusparse_scope_handle.hpp
+*/
+
+#include <atomic>
+#include <unordered_map>
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+template <typename T>
+struct cusparse_global_handle {
+ using handle_container_t = std::unordered_map<T, std::atomic<cusparseHandle_t>*>;
+ handle_container_t cusparse_global_handle_mapper_{};
+
+ ~cusparse_global_handle() noexcept(false) {
+ for (auto& handle_pair : cusparse_global_handle_mapper_) {
+ if (handle_pair.second != nullptr) {
+ auto handle = handle_pair.second->exchange(nullptr);
+ if (handle != nullptr) {
+ CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+ handle = nullptr;
+ }
+ else {
+ // if the handle is nullptr it means the handle was already
+ // destroyed by the ContextCallback and we're free to delete the
+ // atomic object.
+ delete handle_pair.second;
+ }
+
+ handle_pair.second = nullptr;
+ }
+ }
+ cusparse_global_handle_mapper_.clear();
+ }
+};
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
+
+#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_
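The destructor above relies on ``std::atomic::exchange`` so that the two possible owners of a handle (this destructor and the context destruction callback) destroy it exactly once. A self-contained sketch of that hand-off pattern, with illustrative types rather than the backend's:

```cpp
#include <atomic>

// Both owners may call this; exchange(nullptr) lets exactly one win the cleanup.
template <typename Resource>
void destroy_once(std::atomic<Resource*>& slot, void (*destroy)(Resource*)) {
    Resource* r = slot.exchange(nullptr);
    if (r != nullptr) {
        destroy(r); // the other caller observes nullptr and does nothing
    }
}
```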
diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp
new file mode 100644
index 000000000..ff3d8fcae
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp
@@ -0,0 +1,485 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "cusparse_error.hpp"
+#include "cusparse_helper.hpp"
+#include "cusparse_handles.hpp"
+#include "cusparse_task.hpp"
+#include "sparse_blas/macros.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+/**
+ * In this file CusparseScopedContextHandler is used to ensure that a cusparseHandle_t is created before any other cuSPARSE call, as required by the specification.
+*/
+
+// Dense vector
+template <typename fpType>
+void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size,
+ sycl::buffer<fpType, 1> val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ cusparseDnVecDescr_t cu_dvhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, detail::get_mem(ih, acc),
+ cuda_value_type);
+ *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType>
+void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size,
+ fpType* val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ cusparseDnVecDescr_t cu_dvhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, val, cuda_value_type);
+ *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, std::int64_t size,
+ sycl::buffer<fpType, 1> val) {
+ detail::check_can_reset_value_handle(__func__, dvhandle, true);
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ if (dvhandle->size != size) {
+ CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size,
+ detail::get_mem(ih, acc), cuda_value_type);
+ dvhandle->size = size;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle,
+ detail::get_mem(ih, acc));
+ }
+ dvhandle->set_buffer(val);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_vector_data(sycl::queue&, dense_vector_handle_t dvhandle, std::int64_t size,
+ fpType* val) {
+ detail::check_can_reset_value_handle(__func__, dvhandle, false);
+ if (dvhandle->size != size) {
+ CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, val,
+ cuda_value_type);
+ dvhandle->size = size;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, val);
+ }
+ dvhandle->set_usm_ptr(val);
+}
+
+FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS);
+
+sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle,
+ const std::vector& dependencies) {
+ // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used
+ auto functor = [=](sycl::interop_handle) {
+ CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+ delete dvhandle;
+ };
+ return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dvhandle);
+}
+
+// Dense matrix
+template <typename fpType>
+void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t ld, layout dense_layout,
+ sycl::buffer<fpType, 1> val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ auto cuda_order = detail::get_cuda_order(dense_layout);
+ cusparseDnMatDescr_t cu_dmhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld,
+ detail::get_mem(ih, acc), cuda_value_type, cuda_order);
+ *p_dmhandle =
+ new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType>
+void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType* val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ auto cuda_order = detail::get_cuda_order(dense_layout);
+ cusparseDnMatDescr_t cu_dmhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, val,
+ cuda_value_type, cuda_order);
+ *p_dmhandle =
+ new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle,
+ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+ oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) {
+ detail::check_can_reset_value_handle(__func__, dmhandle, true);
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols ||
+ dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) {
+ CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ auto cuda_order = detail::get_cuda_order(dense_layout);
+ CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows,
+ num_cols, ld, detail::get_mem(ih, acc), cuda_value_type,
+ cuda_order);
+ dmhandle->num_rows = num_rows;
+ dmhandle->num_cols = num_cols;
+ dmhandle->ld = ld;
+ dmhandle->dense_layout = dense_layout;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle,
+ detail::get_mem(ih, acc));
+ }
+ dmhandle->set_buffer(val);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_matrix_data(sycl::queue&, dense_matrix_handle_t dmhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout,
+ fpType* val) {
+ detail::check_can_reset_value_handle(__func__, dmhandle, false);
+ if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld ||
+ dmhandle->dense_layout != dense_layout) {
+ CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ auto cuda_order = detail::get_cuda_order(dense_layout);
+ CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, num_cols, ld,
+ val, cuda_value_type, cuda_order);
+ dmhandle->num_rows = num_rows;
+ dmhandle->num_cols = num_cols;
+ dmhandle->ld = ld;
+ dmhandle->dense_layout = dense_layout;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, val);
+ }
+ dmhandle->set_usm_ptr(val);
+}
+
+FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS);
+
+sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle,
+ const std::vector& dependencies) {
+ // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used
+ auto functor = [=](sycl::interop_handle) {
+ CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+ delete dmhandle;
+ };
+ return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dmhandle);
+}
+
+// COO matrix
+template <typename fpType, typename intType>
+void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind,
+ sycl::buffer<fpType, 1> val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh);
+ auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+ auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ cusparseSpMatDescr_t cu_smhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz,
+ detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+ detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_base,
+ cuda_value_type);
+ *p_smhandle =
+ new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO,
+ num_rows, num_cols, nnz, index);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ intType* row_ind, intType* col_ind, fpType* val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ cusparseSpMatDescr_t cu_smhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, row_ind,
+ col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type);
+ *p_smhandle =
+ new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO,
+ num_rows, num_cols, nnz, index);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind,
+ sycl::buffer<fpType, 1> val) {
+ detail::check_can_reset_sparse_handle(__func__, smhandle, true);
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh);
+ auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+ auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+ smhandle->nnz != nnz || smhandle->index != index) {
+ CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols,
+ nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+ detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_base,
+ cuda_value_type);
+ smhandle->num_rows = num_rows;
+ smhandle->num_cols = num_cols;
+ smhandle->nnz = nnz;
+ smhandle->index = index;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle,
+ detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+ detail::get_mem(ih, val_acc));
+ }
+ smhandle->row_container.set_buffer(row_ind);
+ smhandle->col_container.set_buffer(col_ind);
+ smhandle->value_container.set_buffer(val);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_coo_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ intType* row_ind, intType* col_ind, fpType* val) {
+ detail::check_can_reset_sparse_handle(__func__, smhandle, false);
+ if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz ||
+ smhandle->index != index) {
+ CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, nnz,
+ row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type);
+ smhandle->num_rows = num_rows;
+ smhandle->num_cols = num_cols;
+ smhandle->nnz = nnz;
+ smhandle->index = index;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, row_ind, col_ind, val);
+ }
+ smhandle->row_container.set_usm_ptr(row_ind);
+ smhandle->col_container.set_usm_ptr(col_ind);
+ smhandle->value_container.set_usm_ptr(val);
+}
+
+FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS);
+
+// CSR matrix
+template <typename fpType, typename intType>
+void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind,
+ sycl::buffer<fpType, 1> val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh);
+ auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+ auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ cusparseSpMatDescr_t cu_smhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz,
+ detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+ detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_type,
+ cuda_index_base, cuda_value_type);
+ *p_smhandle =
+ new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR,
+ num_rows, num_cols, nnz, index);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ intType* row_ptr, intType* col_ind, fpType* val) {
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ cusparseSpMatDescr_t cu_smhandle;
+ CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, row_ptr,
+ col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base,
+ cuda_value_type);
+ *p_smhandle =
+ new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR,
+ num_rows, num_cols, nnz, index);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind,
+ sycl::buffer<fpType, 1> val) {
+ detail::check_can_reset_sparse_handle(__func__, smhandle, true);
+ auto event = queue.submit([&](sycl::handler& cgh) {
+ auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh);
+ auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+ auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+ detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+ if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+ smhandle->nnz != nnz || smhandle->index != index) {
+ CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols,
+ nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+ detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_type,
+ cuda_index_base, cuda_value_type);
+ smhandle->num_rows = num_rows;
+ smhandle->num_cols = num_cols;
+ smhandle->nnz = nnz;
+ smhandle->index = index;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle,
+ detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+ detail::get_mem(ih, val_acc));
+ }
+ smhandle->row_container.set_buffer(row_ptr);
+ smhandle->col_container.set_buffer(col_ind);
+ smhandle->value_container.set_buffer(val);
+ });
+ });
+ event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_csr_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+ intType* row_ptr, intType* col_ind, fpType* val) {
+ detail::check_can_reset_sparse_handle(__func__, smhandle, false);
+ if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz ||
+ smhandle->index != index) {
+ CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+ auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+ auto cuda_index_base = detail::get_cuda_index_base(index);
+ auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+ CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, nnz,
+ row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base,
+ cuda_value_type);
+ smhandle->num_rows = num_rows;
+ smhandle->num_cols = num_cols;
+ smhandle->nnz = nnz;
+ smhandle->index = index;
+ }
+ else {
+ CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, row_ptr, col_ind, val);
+ }
+ smhandle->row_container.set_usm_ptr(row_ptr);
+ smhandle->col_container.set_usm_ptr(col_ind);
+ smhandle->value_container.set_usm_ptr(val);
+}
+
+FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS);
+
+sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle,
+ const std::vector<sycl::event>& dependencies) {
+ // Use dispatch_submit to ensure the backend's handle is kept alive as long as the buffers are used
+ auto functor = [=](sycl::interop_handle) {
+ CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+ delete smhandle;
+ };
+ return detail::dispatch_submit(__func__, queue, dependencies, functor, smhandle);
+}
+
+// Matrix property
+bool set_matrix_property(sycl::queue&, matrix_handle_t smhandle, matrix_property property) {
+ // No equivalent in cuSPARSE
+ // Store the matrix property internally for future usages
+ smhandle->set_matrix_property(property);
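+ // Return false to indicate the property could not be set on the cuSPARSE
+ // descriptor itself; it is only checked later when an operation is dispatched.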
+ return false;
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp
new file mode 100644
index 000000000..5e5bdc732
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp
@@ -0,0 +1,95 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
+
+#include <cusparse.h>
+
+#include "sparse_blas/generic_container.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// Complete the definition of incomplete types dense_vector_handle, dense_matrix_handle and matrix_handle.
+
+struct dense_vector_handle : public detail::generic_dense_vector_handle<cusparseDnVecDescr_t> {
+ template <typename T>
+ dense_vector_handle(cusparseDnVecDescr_t cu_descr, T* value_ptr, std::int64_t size)
+ : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_ptr, size) {
+ }
+
+ template <typename T>
+ dense_vector_handle(cusparseDnVecDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+ std::int64_t size)
+ : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_buffer,
+ size) {}
+};
+
+struct dense_matrix_handle : public detail::generic_dense_matrix_handle<cusparseDnMatDescr_t> {
+ template <typename T>
+ dense_matrix_handle(cusparseDnMatDescr_t cu_descr, T* value_ptr, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t ld, layout dense_layout)
+ : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>(
+ cu_descr, value_ptr, num_rows, num_cols, ld, dense_layout) {}
+
+ template <typename T>
+ dense_matrix_handle(cusparseDnMatDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+ layout dense_layout)
+ : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>(
+ cu_descr, value_buffer, num_rows, num_cols, ld, dense_layout) {}
+};
+
+struct matrix_handle : public detail::generic_sparse_handle<cusparseSpMatDescr_t> {
+ template <typename fpType, typename intType>
+ matrix_handle(cusparseSpMatDescr_t cu_descr, intType* row_ptr, intType* col_ptr,
+ fpType* value_ptr, detail::sparse_format format, std::int64_t num_rows,
+ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index)
+ : detail::generic_sparse_handle<cusparseSpMatDescr_t>(
+ cu_descr, row_ptr, col_ptr, value_ptr, format, num_rows, num_cols, nnz, index) {}
+
+ template <typename fpType, typename intType>
+ matrix_handle(cusparseSpMatDescr_t cu_descr, const sycl::buffer<intType, 1> row_buffer,
+ const sycl::buffer<intType, 1> col_buffer,
+ const sycl::buffer<fpType, 1> value_buffer, detail::sparse_format format,
+ std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
+ oneapi::mkl::index_base index)
+ : detail::generic_sparse_handle<cusparseSpMatDescr_t>(cu_descr, row_buffer, col_buffer,
+ value_buffer, format, num_rows,
+ num_cols, nnz, index) {}
+};
+
+namespace detail {
+
+inline void check_valid_matrix_properties(const std::string& function_name,
+ matrix_handle_t sm_handle) {
+ if (sm_handle->format == sparse_format::COO &&
+ !(sm_handle->has_matrix_property(matrix_property::sorted_by_rows) ||
+ sm_handle->has_matrix_property(matrix_property::sorted))) {
+ throw mkl::unimplemented(
+ "sparse_blas", function_name,
+ "The backend does not support unsorted COO format. Use `set_matrix_property` to set the property `matrix_property::sorted_by_rows` or `matrix_property::sorted`");
+ }
+}
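+// A minimal user-side sketch (illustrative only, assuming the public oneapi::mkl::sparse API and
+// hypothetical variable names): set the property right after creating the handle so the check
+// above passes for COO matrices:
+//   init_coo_matrix(queue, &A_handle, num_rows, num_cols, nnz, index_base::zero, row_ind,
+//                   col_ind, val);
+//   set_matrix_property(queue, A_handle, matrix_property::sorted);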
+
+} // namespace detail
+
+} // namespace oneapi::mkl::sparse
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_helper.hpp b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
new file mode 100644
index 000000000..3feb4bcad
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
@@ -0,0 +1,166 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+
+#include <complex>
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include <cusparse.h>
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "sparse_blas/enum_data_types.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+#include "cusparse_error.hpp"
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+using namespace oneapi::mkl::sparse::detail;
+
+template <typename T>
+struct CudaEnumType;
+template <>
+struct CudaEnumType<float> {
+ static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+template <>
+struct CudaEnumType<double> {
+ static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+template <>
+struct CudaEnumType<std::complex<float>> {
+ static constexpr cudaDataType_t value = CUDA_C_32F;
+};
+template <>
+struct CudaEnumType<std::complex<double>> {
+ static constexpr cudaDataType_t value = CUDA_C_64F;
+};
+
+template <typename T>
+struct CudaIndexEnumType;
+template <>
+struct CudaIndexEnumType<std::int32_t> {
+ static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+template <>
+struct CudaIndexEnumType<std::int64_t> {
+ static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+};
+
+template <typename E>
+inline std::string cast_enum_to_str(E e) {
+ return std::to_string(static_cast<int>(e));
+}
+
+inline cudaDataType_t get_cuda_value_type(data_type onemkl_data_type) {
+ switch (onemkl_data_type) {
+ case data_type::real_fp32: return CUDA_R_32F;
+ case data_type::real_fp64: return CUDA_R_64F;
+ case data_type::complex_fp32: return CUDA_C_32F;
+ case data_type::complex_fp64: return CUDA_C_64F;
+ default:
+ throw oneapi::mkl::invalid_argument(
+ "sparse_blas", "get_cuda_value_type",
+ "Invalid data type: " + cast_enum_to_str(onemkl_data_type));
+ }
+}
+
+inline cusparseOrder_t get_cuda_order(layout l) {
+ switch (l) {
+ case layout::row_major: return CUSPARSE_ORDER_ROW;
+ case layout::col_major: return CUSPARSE_ORDER_COL;
+ default:
+ throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_order",
+ "Unknown layout: " + cast_enum_to_str(l));
+ }
+}
+
+inline cusparseIndexBase_t get_cuda_index_base(index_base index) {
+ switch (index) {
+ case index_base::zero: return CUSPARSE_INDEX_BASE_ZERO;
+ case index_base::one: return CUSPARSE_INDEX_BASE_ONE;
+ default:
+ throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_index_base",
+ "Unknown index_base: " + cast_enum_to_str(index));
+ }
+}
+
+/// Return the CUDA transpose operation from a oneMKL type.
+/// Do not conjugate for real types to avoid an invalid argument.
+inline cusparseOperation_t get_cuda_operation(data_type type, transpose op) {
+ switch (op) {
+ case transpose::nontrans: return CUSPARSE_OPERATION_NON_TRANSPOSE;
+ case transpose::trans: return CUSPARSE_OPERATION_TRANSPOSE;
+ case transpose::conjtrans:
+ return (type == data_type::complex_fp32 || type == data_type::complex_fp64)
+ ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
+ : CUSPARSE_OPERATION_TRANSPOSE;
+ default:
+ throw oneapi::mkl::invalid_argument(
+ "sparse_blas", "get_cuda_operation",
+ "Unknown transpose operation: " + cast_enum_to_str(op));
+ }
+}
+
+inline auto get_cuda_uplo(uplo uplo_val) {
+ switch (uplo_val) {
+ case uplo::upper: return CUSPARSE_FILL_MODE_UPPER;
+ case uplo::lower: return CUSPARSE_FILL_MODE_LOWER;
+ default:
+ throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_uplo",
+ "Unknown uplo: " + cast_enum_to_str(uplo_val));
+ }
+}
+
+inline auto get_cuda_diag(diag diag_val) {
+ switch (diag_val) {
+ case diag::nonunit: return CUSPARSE_DIAG_TYPE_NON_UNIT;
+ case diag::unit: return CUSPARSE_DIAG_TYPE_UNIT;
+ default:
+ throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_diag",
+ "Unknown diag: " + cast_enum_to_str(diag_val));
+ }
+}
+
+inline void set_matrix_attributes(const std::string& func_name, cusparseSpMatDescr_t cu_a,
+ oneapi::mkl::sparse::matrix_view A_view) {
+ auto cu_fill_mode = get_cuda_uplo(A_view.uplo_view);
+ auto status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_FILL_MODE, &cu_fill_mode,
+ sizeof(cu_fill_mode));
+ check_status(status, func_name + "/set_uplo");
+
+ auto cu_diag_type = get_cuda_diag(A_view.diag_view);
+ status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_DIAG_TYPE, &cu_diag_type,
+ sizeof(cu_diag_type));
+ check_status(status, func_name + "/set_diag");
+}
+
+/**
+ * cuSPARSE requires the pointer mode to be set for scalar parameters (typically alpha and beta).
+ */
+inline void set_pointer_mode(cusparseHandle_t cu_handle, bool is_ptr_host_accessible) {
+ cusparseSetPointerMode(cu_handle, is_ptr_host_accessible ? CUSPARSE_POINTER_MODE_HOST
+ : CUSPARSE_POINTER_MODE_DEVICE);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
new file mode 100644
index 000000000..4d92daf35
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
@@ -0,0 +1,147 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+/**
+ * @file Similar to cublas_scope_handle.cpp
+*/
+
+#include "cusparse_scope_handle.hpp"
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+/**
+ * Inserts a new element in the map if its key is unique. This new element
+ * is constructed in place using args as the arguments for the construction
+ * of a value_type (which is an object of a pair type). The insertion only
+ * takes place if no other element in the container has a key equivalent to
+ * the one being emplaced (keys in a map container are unique).
+ */
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+thread_local cusparse_global_handle<ur_context_handle_t>
+ CusparseScopedContextHandler::handle_helper = cusparse_global_handle<ur_context_handle_t>{};
+#else
+thread_local cusparse_global_handle<pi_context> CusparseScopedContextHandler::handle_helper =
+ cusparse_global_handle<pi_context>{};
+#endif
+
+CusparseScopedContextHandler::CusparseScopedContextHandler(sycl::queue queue,
+ sycl::interop_handle& ih)
+ : ih(ih),
+ needToRecover_(false) {
+ placedContext_ = new sycl::context(queue.get_context());
+ auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+ CUcontext desired;
+ CUDA_ERROR_FUNC(cuCtxGetCurrent, &original_);
+ CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+ if (original_ != desired) {
+ // Sets the desired context as the active one for the thread
+ CUDA_ERROR_FUNC(cuCtxSetCurrent, desired);
+ // No context is installed and the suggested context is primary
+ // This is the most common case. We can activate the context in the
+ // thread and leave it there until all the PI context referring to the
+ // same underlying CUDA primary context are destroyed. This emulates
+ // the behaviour of the CUDA runtime api, and avoids costly context
+ // switches. No action is required on this side of the if.
+ needToRecover_ = !(original_ == nullptr);
+ }
+}
+
+CusparseScopedContextHandler::~CusparseScopedContextHandler() noexcept(false) {
+ if (needToRecover_) {
+ CUDA_ERROR_FUNC(cuCtxSetCurrent, original_);
+ }
+ delete placedContext_;
+}
+
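+// Destroys the cusparseHandle_t stored in the atomic when the associated SYCL context is
+// released; registered via contextSetExtendedDeleter in get_handle_and_stream below.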
+void ContextCallback(void* userData) {
+ auto* ptr = static_cast<std::atomic<cusparseHandle_t>*>(userData);
+ if (!ptr) {
+ return;
+ }
+ auto handle = ptr->exchange(nullptr);
+ if (handle != nullptr) {
+ CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+ handle = nullptr;
+ }
+ else {
+ // if the handle is nullptr it means the handle was already destroyed by
+ // the cusparse_global_handle destructor and we're free to delete the atomic
+ // object.
+ delete ptr;
+ }
+}
+
+std::pair<cusparseHandle_t, CUstream> CusparseScopedContextHandler::get_handle_and_stream(
+ const sycl::queue& queue) {
+ auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+ CUcontext desired;
+ CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+ auto piPlacedContext_ = reinterpret_cast<ur_context_handle_t>(desired);
+#else
+ auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
+#endif
+ CUstream streamId = get_stream(queue);
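+ // Look up a cached handle for this context and reuse it if it is still alive,
+ // erasing map entries whose handle was already destroyed.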
+ auto it = handle_helper.cusparse_global_handle_mapper_.find(piPlacedContext_);
+ if (it != handle_helper.cusparse_global_handle_mapper_.end()) {
+ if (it->second == nullptr) {
+ handle_helper.cusparse_global_handle_mapper_.erase(it);
+ }
+ else {
+ auto handle = it->second->load();
+ if (handle != nullptr) {
+ cudaStream_t currentStreamId;
+ CUSPARSE_ERR_FUNC(cusparseGetStream, handle, &currentStreamId);
+ if (currentStreamId != streamId) {
+ CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+ }
+ return { handle, streamId };
+ }
+ else {
+ handle_helper.cusparse_global_handle_mapper_.erase(it);
+ }
+ }
+ }
+
+ cusparseHandle_t handle;
+ CUSPARSE_ERR_FUNC(cusparseCreate, &handle);
+ CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+
+ auto insert_iter = handle_helper.cusparse_global_handle_mapper_.insert(
+ std::make_pair(piPlacedContext_, new std::atomic<cusparseHandle_t>(handle)));
+
+ sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
+ insert_iter.first->second);
+
+ return { handle, streamId };
+}
+
+cusparseHandle_t CusparseScopedContextHandler::get_handle(const sycl::queue& queue) {
+ return get_handle_and_stream(queue).first;
+}
+
+CUstream CusparseScopedContextHandler::get_stream(const sycl::queue& queue) {
+ return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
+}
+
+sycl::context CusparseScopedContextHandler::get_context(const sycl::queue& queue) {
+ return queue.get_context();
+}
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
new file mode 100644
index 000000000..7b8313ee6
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
@@ -0,0 +1,88 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+
+/**
+ * @file Similar to cublas_scope_handle.hpp
+*/
+
+#if __has_include(<sycl/detail/pi.hpp>)
+#include <sycl/detail/pi.hpp>
+#else
+#include <CL/sycl/detail/pi.hpp>
+#endif
+
+// After Plugin Interface removal in DPC++ ur.hpp is the new include
+#if __has_include(<sycl/detail/ur.hpp>) && !defined(ONEAPI_ONEMKL_PI_INTERFACE_REMOVED)
+#define ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+#endif
+
+#include <thread>
+
+#include "cusparse_error.hpp"
+#include "cusparse_global_handle.hpp"
+#include "cusparse_helper.hpp"
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+class CusparseScopedContextHandler {
+ CUcontext original_;
+ sycl::context* placedContext_;
+ sycl::interop_handle& ih;
+ bool needToRecover_;
+
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+ static thread_local cusparse_global_handle<ur_context_handle_t> handle_helper;
+#else
+ static thread_local cusparse_global_handle<pi_context> handle_helper;
+#endif
+
+ CUstream get_stream(const sycl::queue& queue);
+ sycl::context get_context(const sycl::queue& queue);
+
+public:
+ CusparseScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih);
+
+ ~CusparseScopedContextHandler() noexcept(false);
+
+ /**
+ * @brief get_handle: creates the handle, implicitly following the advice
+ * given by NVIDIA for creating a cusparse_global_handle (e.g. one cuStream per device
+ * per thread).
+ * @param queue sycl queue.
+ * @return a pair of: a cusparseHandle_t used to call cusparse routines, and the CUDA stream it is bound to
+ */
+ std::pair<cusparseHandle_t, CUstream> get_handle_and_stream(const sycl::queue& queue);
+
+ /// See get_handle_and_stream
+ cusparseHandle_t get_handle(const sycl::queue& queue);
+};
+
+// Get the native pointer from an accessor. This is a different pointer than
+// what can be retrieved with get_multi_ptr.
+template <typename AccT>
+inline void* get_mem(sycl::interop_handle ih, AccT acc) {
+ auto cudaPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
+ return reinterpret_cast<void*>(cudaPtr);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
new file mode 100644
index 000000000..0d86d642d
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
@@ -0,0 +1,431 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
+
+#include "cusparse_handles.hpp"
+#include "cusparse_scope_handle.hpp"
+
+/// This file provides helper functions to submit host_tasks using buffers or USM seamlessly
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
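+// The generic containers store their sycl::buffer type-erased; cast the stored pointer back to
+// the typed buffer to create an accessor of the right element type.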
+template <typename T, typename Container>
+auto get_value_accessor(sycl::handler& cgh, Container container) {
+ auto buffer_ptr =
+ reinterpret_cast<sycl::buffer<T, 1>*>(container->value_container.buffer_ptr.get());
+ return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T, typename... Ts>
+auto get_fp_accessors(sycl::handler& cgh, Ts... containers) {
+ return std::array<sycl::accessor<T, 1>, sizeof...(containers)>{ get_value_accessor<T>(
+ cgh, containers)... };
+}
+
+template <typename T>
+auto get_row_accessor(sycl::handler& cgh, matrix_handle_t smhandle) {
+ auto buffer_ptr =
+ reinterpret_cast<sycl::buffer<T, 1>*>(smhandle->row_container.buffer_ptr.get());
+ return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T>
+auto get_col_accessor(sycl::handler& cgh, matrix_handle_t smhandle) {
+ auto buffer_ptr =
+ reinterpret_cast<sycl::buffer<T, 1>*>(smhandle->col_container.buffer_ptr.get());
+ return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T>
+auto get_int_accessors(sycl::handler& cgh, matrix_handle_t smhandle) {
+ return std::array<sycl::accessor<T, 1>, 2>{ get_row_accessor<T>(cgh, smhandle),
+ get_col_accessor<T>(cgh, smhandle) };
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+ CaptureOnlyAcc... capture_only_accessors) {
+ // Only capture the accessors to ensure the dependencies are properly
+ // handled. The accessors' underlying pointers have already been set to the
+ // native container types in previous functions. This assumes the underlying
+ // pointer of the buffer does not change. This is not guaranteed by the SYCL
+ // specification but should be true for all implementations. This assumption
+ // avoids the overhead of resetting the pointer of all data handles for each
+ // enqueued command.
+ cgh.host_task([functor, queue, capture_only_accessors...](sycl::interop_handle ih) {
+ auto unused = std::make_tuple(capture_only_accessors...);
+ (void)unused;
+ functor(ih);
+ });
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+ sycl::accessor<std::uint8_t, 1> workspace_acc,
+ CaptureOnlyAcc... capture_only_accessors) {
+ // Only capture the accessors to ensure the dependencies are properly
+ // handled. The accessors' underlying pointers have already been set to the
+ // native container types in previous functions. This assumes the underlying
+ // pointer of the buffer does not change. This is not guaranteed by the SYCL
+ // specification but should be true for all implementations. This assumption
+ // avoids the overhead of resetting the pointer of all data handles for each
+ // enqueued command.
+ cgh.host_task(
+ [functor, queue, workspace_acc, capture_only_accessors...](sycl::interop_handle ih) {
+ auto unused = std::make_tuple(capture_only_accessors...);
+ (void)unused;
+ functor(ih, workspace_acc);
+ });
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+ const std::vector<sycl::event>& dependencies,
+ CaptureOnlyAcc... capture_only_accessors) {
+ // Only capture the accessors to ensure the dependencies are properly
+ // handled. The accessors' underlying pointers have already been set to the
+ // native container types in previous functions. This assumes the underlying
+ // pointer of the buffer does not change. This is not guaranteed by the SYCL
+ // specification but should be true for all implementations. This assumption
+ // avoids the overhead of resetting the pointer of all data handles for each
+ // enqueued command.
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+ cgh.ext_codeplay_enqueue_native_command(
+ [functor, queue, dependencies, capture_only_accessors...](sycl::interop_handle ih) {
+ auto unused = std::make_tuple(capture_only_accessors...);
+ (void)unused;
+ // The functor using ext_codeplay_enqueue_native_command needs to
+ // explicitly wait on the events for the SPARSE domain. The
+ // extension ext_codeplay_enqueue_native_command is used to launch
+ // the compute operation which depends on the previous optimize
+ // step. In cuSPARSE the optimize step is synchronous but it is
+ // asynchronous in oneMKL Interface. The optimize step may not use
+ // the CUDA stream which would make it impossible for
+ // ext_codeplay_enqueue_native_command to automatically ensure it
+ // has completed before the compute function starts. These waits are
+ // used to ensure the optimize step has completed before starting
+ // the computation.
+ for (auto event : dependencies) {
+ event.wait();
+ }
+ functor(ih);
+ });
+#else
+ (void)dependencies;
+ submit_host_task(cgh, queue, functor, capture_only_accessors...);
+#endif
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+ const std::vector<sycl::event>& dependencies,
+ sycl::accessor<std::uint8_t, 1> workspace_acc,
+ CaptureOnlyAcc... capture_only_accessors) {
+ // Only capture the accessors to ensure the dependencies are properly
+ // handled. The accessors' underlying pointers have already been set to the
+ // native container types in previous functions. This assumes the underlying
+ // pointer of the buffer does not change. This is not guaranteed by the SYCL
+ // specification but should be true for all implementations. This assumption
+ // avoids the overhead of resetting the pointer of all data handles for each
+ // enqueued command.
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+ cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies, workspace_acc,
+ capture_only_accessors...](sycl::interop_handle ih) {
+ auto unused = std::make_tuple(capture_only_accessors...);
+ (void)unused;
+ // The functor using ext_codeplay_enqueue_native_command needs to
+ // explicitly wait on the events for the SPARSE domain. The
+ // extension ext_codeplay_enqueue_native_command is used to launch
+ // the compute operation which depends on the previous optimize
+ // step. In cuSPARSE the optimize step is synchronous but it is
+ // asynchronous in oneMKL Interface. The optimize step may not use
+ // the CUDA stream which would make it impossible for
+ // ext_codeplay_enqueue_native_command to automatically ensure it
+ // has completed before the compute function starts. These waits are
+ // used to ensure the optimize step has completed before starting
+ // the computation.
+ for (auto event : dependencies) {
+ event.wait();
+ }
+ functor(ih, workspace_acc);
+ });
+#else
+ (void)dependencies;
+ submit_host_task_with_acc(cgh, queue, functor, workspace_acc, capture_only_accessors...);
+#endif
+}
+
+/// Helper submit functions to capture all accessors from the generic containers
+/// \p other_containers and ensure the dependencies of buffers are respected.
+/// The accessors are not directly used as the underlying data pointer has
+/// already been captured in previous functions.
+/// \p workspace_buffer is an optional buffer. Its accessor will be given to the
+/// functor as a last argument if \p UseWorkspace is true.
+/// \p UseWorkspace must be true to use the given \p workspace_buffer.
+/// \p UseEnqueueNativeCommandExt controls whether host_task or the extension
+/// ext_codeplay_enqueue_native_command is used to launch tasks. The
+/// extension should only be used for asynchronous functions using the native
+/// backend's functions. The extension can only be used for in-order queues as
+/// the same cuStream needs to be used for the 3 steps of an operation:
+/// querying the buffer size, optimizing and running the computation.
+/// Otherwise a different cuStream could be used inside the native_command
+/// than the native cuStream used by the extension.
+template <bool UseWorkspace, bool UseEnqueueNativeCommandExt, typename Functor, typename... Ts>
+sycl::event dispatch_submit_impl_fp_int(const std::string& function_name, sycl::queue queue,
+ const std::vector<sycl::event>& dependencies,
+ Functor functor, matrix_handle_t sm_handle,
+ sycl::buffer<std::uint8_t> workspace_buffer,
+ Ts... other_containers) {
+ bool is_in_order_queue = queue.is_in_order();
+ if (sm_handle->all_use_buffer()) {
+ data_type value_type = sm_handle->get_value_type();
+ data_type int_type = sm_handle->get_int_type();
+
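+// Convenience macro (local to this function, undefined below) submitting the command group once
+// the value and index types are known, so typed accessors can be created.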
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE) \
+ return queue.submit([&](sycl::handler& cgh) { \
+ cgh.depends_on(dependencies); \
+ auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, sm_handle, other_containers...); \
+ auto int_accs = get_int_accessors<INT_TYPE>(cgh, sm_handle); \
+ auto workspace_acc = workspace_buffer.get_access<sycl::access::mode::read_write>(cgh); \
+ if constexpr (UseWorkspace) { \
+ if constexpr (UseEnqueueNativeCommandExt) { \
+ if (is_in_order_queue) { \
+ submit_native_command_ext_with_acc(cgh, queue, functor, dependencies, \
+ workspace_acc, fp_accs, int_accs); \
+ } \
+ else { \
+ submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, \
+ int_accs); \
+ } \
+ } \
+ else { \
+ submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, int_accs); \
+ } \
+ } \
+ else { \
+ (void)workspace_buffer; \
+ if constexpr (UseEnqueueNativeCommandExt) { \
+ if (is_in_order_queue) { \
+ submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, \
+ int_accs); \
+ } \
+ else { \
+ submit_host_task(cgh, queue, functor, fp_accs, int_accs); \
+ } \
+ } \
+ else { \
+ submit_host_task(cgh, queue, functor, fp_accs, int_accs); \
+ } \
+ } \
+ })
+#define ONEMKL_CUSPARSE_SUBMIT_INT(FP_TYPE) \
+ if (int_type == data_type::int32) { \
+ ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int32_t); \
+ } \
+ else if (int_type == data_type::int64) { \
+ ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int64_t); \
+ }
+
+ if (value_type == data_type::real_fp32) {
+ ONEMKL_CUSPARSE_SUBMIT_INT(float)
+ }
+ else if (value_type == data_type::real_fp64) {
+ ONEMKL_CUSPARSE_SUBMIT_INT(double)
+ }
+ else if (value_type == data_type::complex_fp32) {
+ ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<float>)
+ }
+ else if (value_type == data_type::complex_fp64) {
+ ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<double>)
+ }
+
+#undef ONEMKL_CUSPARSE_SUBMIT_INT
+#undef ONEMKL_CUSPARSE_SUBMIT
+
+ throw oneapi::mkl::exception("sparse_blas", function_name,
+ "Could not dispatch buffer kernel to a supported type");
+ }
+ else {
+ // USM submit does not need to capture accessors
+ if constexpr (!UseWorkspace) {
+ return queue.submit([&](sycl::handler& cgh) {
+ cgh.depends_on(dependencies);
+ if constexpr (UseEnqueueNativeCommandExt) {
+ if (is_in_order_queue) {
+ submit_native_command_ext(cgh, queue, functor, dependencies);
+ }
+ else {
+ submit_host_task(cgh, queue, functor);
+ }
+ }
+ else {
+ submit_host_task(cgh, queue, functor);
+ }
+ });
+ }
+ else {
+ throw oneapi::mkl::exception("sparse_blas", function_name,
+ "Internal error: Cannot use accessor workspace with USM");
+ }
+ }
+}
+
+/// Similar to dispatch_submit_impl_fp_int but only dispatches the host_task based on the floating point value type.
+template <typename Functor, typename ContainerT>
+sycl::event dispatch_submit_impl_fp(const std::string& function_name, sycl::queue queue,
+ const std::vector<sycl::event>& dependencies, Functor functor,
+ ContainerT container_handle) {
+ if (container_handle->all_use_buffer()) {
+ data_type value_type = container_handle->get_value_type();
+
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE) \
+ return queue.submit([&](sycl::handler& cgh) { \
+ cgh.depends_on(dependencies); \
+ auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, container_handle); \
+ submit_host_task(cgh, queue, functor, fp_accs); \
+ })
+
+ if (value_type == data_type::real_fp32) {
+ ONEMKL_CUSPARSE_SUBMIT(float);
+ }
+ else if (value_type == data_type::real_fp64) {
+ ONEMKL_CUSPARSE_SUBMIT(double);
+ }
+ else if (value_type == data_type::complex_fp32) {
+ ONEMKL_CUSPARSE_SUBMIT(std::complex<float>);
+ }
+ else if (value_type == data_type::complex_fp64) {
+ ONEMKL_CUSPARSE_SUBMIT(std::complex<double>);
+ }
+
+#undef ONEMKL_CUSPARSE_SUBMIT
+
+ throw oneapi::mkl::exception("sparse_blas", function_name,
+ "Could not dispatch buffer kernel to a supported type");
+ }
+ else {
+ return queue.submit([&](sycl::handler& cgh) {
+ cgh.depends_on(dependencies);
+ submit_host_task(cgh, queue, functor);
+ });
+ }
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor,
+ matrix_handle_t sm_handle, sycl::buffer<std::uint8_t> workspace_buffer,
+ Ts... other_containers) {
+ constexpr bool UseWorkspace = true;
+ constexpr bool UseEnqueueNativeCommandExt = false;
+ return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+ function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue,
+ const std::vector<sycl::event>& dependencies, Functor functor,
+ matrix_handle_t sm_handle, Ts... other_containers) {
+ constexpr bool UseWorkspace = false;
+ constexpr bool UseEnqueueNativeCommandExt = false;
+ sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0));
+ return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+ function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor,
+ matrix_handle_t sm_handle, Ts... other_containers) {
+ constexpr bool UseWorkspace = false;
+ constexpr bool UseEnqueueNativeCommandExt = false;
+ sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0));
+ return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+ function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue,
+ Functor functor, matrix_handle_t sm_handle,
+ sycl::buffer<std::uint8_t> workspace_buffer,
+ Ts... other_containers) {
+ constexpr bool UseWorkspace = true;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+ constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+ constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+ return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+ function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue,
+ const std::vector<sycl::event>& dependencies,
+ Functor functor, matrix_handle_t sm_handle,
+ Ts... other_containers) {
+ constexpr bool UseWorkspace = false;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+ constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+ constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+ sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0));
+ return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+ function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue,
+ Functor functor, matrix_handle_t sm_handle,
+ Ts... other_containers) {
+ constexpr bool UseWorkspace = false;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+ constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+ constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+ sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0));
+ return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+ function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...);
+}
+
+// Helper function for functors submitted to host_task or native_command.
+// When the extension is disabled, host_tasks are used and an explicit synchronization is needed
+// to ensure the sycl::event corresponds to the end of the whole functor.
+// When the extension is enabled, host_tasks are still used for out-of-order queues, see the
+// description of dispatch_submit_impl_fp_int.
+inline void synchronize_if_needed(bool is_in_order_queue, CUstream cu_stream) {
+#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+ (void)is_in_order_queue;
+ CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream);
+#else
+ if (!is_in_order_queue) {
+ CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream);
+ }
+#endif
+}
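+// Typical use (sketch): a compute functor calls the cuSPARSE function on the cached CUstream and
+// then synchronize_if_needed(queue.is_in_order(), cu_stream), so the sycl::event returned by the
+// surrounding host_task only completes once the native work has finished.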
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
+
+#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
new file mode 100644
index 000000000..278aec296
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
@@ -0,0 +1,32 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "sparse_blas/function_table.hpp"
+
+#define WRAPPER_VERSION 1
+#define BACKEND cusparse
+
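+// The table is populated by including backend_wrappers.cxx, which expands to one entry per
+// sparse function so the run-time dispatcher can load this backend.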
+extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = {
+ WRAPPER_VERSION,
+#include "sparse_blas/backends/backend_wrappers.cxx"
+};
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
new file mode 100644
index 000000000..5fd24d3f4
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
@@ -0,0 +1,336 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// Complete the definition of the incomplete type
+struct spmm_descr {
+ // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them.
+ // cuSPARSE seems to implicitly require the same CUstream to be used for a whole operation (buffer_size, optimization and computation steps).
+ // This is needed as the default SYCL queue is out-of-order and can use a different CUstream for each host_task or native_command.
+ CUstream cu_stream;
+ cusparseHandle_t cu_handle;
+
+ detail::generic_container workspace;
+ std::size_t temp_buffer_size = 0;
+ bool buffer_size_called = false;
+ bool optimized_called = false;
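+ // Arguments of the last spmm_optimize call, so the compute step can verify it was
+ // optimized with matching arguments.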
+ oneapi::mkl::transpose last_optimized_opA;
+ oneapi::mkl::transpose last_optimized_opB;
+ matrix_view last_optimized_A_view;
+ matrix_handle_t last_optimized_A_handle;
+ dense_matrix_handle_t last_optimized_B_handle;
+ dense_matrix_handle_t last_optimized_C_handle;
+ spmm_alg last_optimized_alg;
+};
+
+} // namespace oneapi::mkl::sparse
+
+namespace oneapi::mkl::sparse::cusparse {
+
+namespace detail {
+
+inline auto get_cuda_spmm_alg(spmm_alg alg) {
+ switch (alg) {
+ case spmm_alg::coo_alg1: return CUSPARSE_SPMM_COO_ALG1;
+ case spmm_alg::coo_alg2: return CUSPARSE_SPMM_COO_ALG2;
+ case spmm_alg::coo_alg3: return CUSPARSE_SPMM_COO_ALG3;
+ case spmm_alg::coo_alg4: return CUSPARSE_SPMM_COO_ALG4;
+ case spmm_alg::csr_alg1: return CUSPARSE_SPMM_CSR_ALG1;
+ case spmm_alg::csr_alg2: return CUSPARSE_SPMM_CSR_ALG2;
+ case spmm_alg::csr_alg3: return CUSPARSE_SPMM_CSR_ALG3;
+ default: return CUSPARSE_SPMM_ALG_DEFAULT;
+ }
+}
+
+void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose opA,
+ oneapi::mkl::transpose opB, matrix_view A_view, matrix_handle_t A_handle,
+ dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle,
+ bool is_alpha_host_accessible, bool is_beta_host_accessible, spmm_alg alg) {
+ check_valid_spmm_common(function_name, A_view, A_handle, B_handle, C_handle,
+ is_alpha_host_accessible, is_beta_host_accessible);
+ check_valid_matrix_properties(function_name, A_handle);
+ if (alg == spmm_alg::csr_alg3 && opA != oneapi::mkl::transpose::nontrans) {
+ throw mkl::unimplemented(
+ "sparse_blas", function_name,
+ "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opA` is not `transpose::nontrans`.");
+ }
+ if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::conjtrans) {
+ throw mkl::unimplemented(
+ "sparse_blas", function_name,
+ "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::conjtrans`.");
+ }
+ if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::trans &&
+ A_handle->get_value_type() == data_type::real_fp64) {
+ // TODO: Remove once the issue is fixed: https://forums.developer.nvidia.com/t/cusparse-spmm-sample-failing-with-misaligned-address/311022
+ throw mkl::unimplemented(
+ "sparse_blas", function_name,
+ "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::trans` and the real fp64 precision is used.");
+ }
+}
+
+inline void common_spmm_optimize(oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+ bool is_alpha_host_accessible, matrix_view A_view,
+ matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+ bool is_beta_host_accessible, dense_matrix_handle_t C_handle,
+ spmm_alg alg, spmm_descr_t spmm_descr) {
+ check_valid_spmm("spmm_optimize", opA, opB, A_view, A_handle, B_handle, C_handle,
+ is_alpha_host_accessible, is_beta_host_accessible, alg);
+ if (!spmm_descr->buffer_size_called) {
+ throw mkl::uninitialized("sparse_blas", "spmm_optimize",
+ "spmm_buffer_size must be called before spmm_optimize.");
+ }
+ spmm_descr->optimized_called = true;
+ spmm_descr->last_optimized_opA = opA;
+ spmm_descr->last_optimized_opB = opB;
+ spmm_descr->last_optimized_A_view = A_view;
+ spmm_descr->last_optimized_A_handle = A_handle;
+ spmm_descr->last_optimized_B_handle = B_handle;
+ spmm_descr->last_optimized_C_handle = C_handle;
+ spmm_descr->last_optimized_alg = alg;
+}
+
+void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA,
+ oneapi::mkl::transpose opB, const void* alpha, matrix_handle_t A_handle,
+ dense_matrix_handle_t B_handle, const void* beta,
+ dense_matrix_handle_t C_handle, spmm_alg alg, void* workspace_ptr,
+ bool is_alpha_host_accessible) {
+ auto cu_a = A_handle->backend_handle;
+ auto cu_b = B_handle->backend_handle;
+ auto cu_c = C_handle->backend_handle;
+ auto type = A_handle->value_container.data_type;
+ auto cu_op_a = get_cuda_operation(type, opA);
+ auto cu_op_b = get_cuda_operation(type, opB);
+ auto cu_type = get_cuda_value_type(type);
+ auto cu_alg = get_cuda_spmm_alg(alg);
+ set_pointer_mode(cu_handle, is_alpha_host_accessible);
+ auto status = cusparseSpMM_preprocess(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
+ cu_c, cu_type, cu_alg, workspace_ptr);
+ check_status(status, "spmm_optimize");
+}
+
+} // namespace detail
+
+void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) {
+ *p_spmm_descr = new spmm_descr();
+}
+
+sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr,
+ const std::vector<sycl::event>& dependencies) {
+ if (!spmm_descr) {
+ return detail::collapse_dependencies(queue, dependencies);
+ }
+
+ auto release_functor = [=]() {
+ spmm_descr->cu_handle = nullptr;
+ spmm_descr->last_optimized_A_handle = nullptr;
+ spmm_descr->last_optimized_B_handle = nullptr;
+ spmm_descr->last_optimized_C_handle = nullptr;
+ delete spmm_descr;
+ };
+
+ // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used
+ // dispatch_submit can only be used if the descriptor's handles are valid
+ if (spmm_descr->last_optimized_A_handle &&
+ spmm_descr->last_optimized_A_handle->all_use_buffer() &&
+ spmm_descr->last_optimized_B_handle && spmm_descr->last_optimized_C_handle &&
+ spmm_descr->workspace.use_buffer()) {
+ auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor<std::uint8_t, 1>) {
+ release_functor();
+ };
+ return detail::dispatch_submit(
+ __func__, queue, dispatch_functor, spmm_descr->last_optimized_A_handle,
+ spmm_descr->workspace.get_buffer(), spmm_descr->last_optimized_B_handle,
+ spmm_descr->last_optimized_C_handle);
+ }
+
+ // Release directly in a host_task if USM is used or if the descriptor is released before spmm_optimize has succeeded
+ sycl::event event = queue.submit([&](sycl::handler& cgh) {
+ cgh.depends_on(dependencies);
+ cgh.host_task(release_functor);
+ });
+ return event;
+}
+
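+// The SpMM operation follows cuSPARSE's three-step pattern (sketch): spmm_buffer_size queries
+// the required workspace size via cusparseSpMM_bufferSize, spmm_optimize preprocesses via
+// cusparseSpMM_preprocess, and the compute step reuses the cusparseHandle_t and CUstream cached
+// in the descriptor so all steps run on the same stream.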
+void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+ const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+ dense_matrix_handle_t B_handle, const void* beta,
+ dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr,
+ std::size_t& temp_buffer_size) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle,
+ is_alpha_host_accessible, is_beta_host_accessible, alg);
+ auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) {
+ detail::CusparseScopedContextHandler sc(queue, ih);
+ auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+ spmm_descr->cu_handle = cu_handle;
+ spmm_descr->cu_stream = cu_stream;
+ auto cu_a = A_handle->backend_handle;
+ auto cu_b = B_handle->backend_handle;
+ auto cu_c = C_handle->backend_handle;
+ auto type = A_handle->value_container.data_type;
+ auto cu_op_a = detail::get_cuda_operation(type, opA);
+ auto cu_op_b = detail::get_cuda_operation(type, opB);
+ auto cu_type = detail::get_cuda_value_type(type);
+ auto cu_alg = detail::get_cuda_spmm_alg(alg);
+ detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+ auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
+ cu_c, cu_type, cu_alg, &temp_buffer_size);
+ detail::check_status(status, __func__);
+ };
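+ // temp_buffer_size is captured by reference and written inside the host task,
+ // so wait for the submitted event before reading the result on the host.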
+ auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, B_handle, C_handle);
+ event.wait_and_throw();
+ spmm_descr->temp_buffer_size = temp_buffer_size;
+ spmm_descr->buffer_size_called = true;
+}
+
+void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+ const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+ dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle,
+ spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer<std::uint8_t, 1> workspace) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ if (!A_handle->all_use_buffer()) {
+ detail::throw_incompatible_container(__func__);
+ }
+ detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle,
+ is_beta_host_accessible, C_handle, alg, spmm_descr);
+ // Copy the buffer to extend its lifetime until the descriptor is freed.
+ spmm_descr->workspace.set_buffer_untyped(workspace);
+ if (alg == spmm_alg::no_optimize_alg || workspace.size() == 0) {
+ // cusparseSpMM_preprocess cannot be called if the workspace is empty
+ return;
+ }
+ auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) {
+ auto cu_handle = spmm_descr->cu_handle;
+ auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+ detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle,
+ alg, workspace_ptr, is_alpha_host_accessible);
+ };
+
+ detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, B_handle, C_handle);
+}
+
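+// Note: the buffer overload above returns void since accessors let the SYCL
+// runtime track dependencies; the USM overload below returns an event instead.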
+sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
+ oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view,
+ matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+ const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg,
+ spmm_descr_t spmm_descr, void* workspace,
+ const std::vector<sycl::event>& dependencies) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ if (A_handle->all_use_buffer()) {
+ detail::throw_incompatible_container(__func__);
+ }
+ detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle,
+ is_beta_host_accessible, C_handle, alg, spmm_descr);
+ spmm_descr->workspace.usm_ptr = workspace;
+ if (alg == spmm_alg::no_optimize_alg || workspace == nullptr) {
+ // cusparseSpMM_preprocess cannot be called if the workspace is empty
+ return detail::collapse_dependencies(queue, dependencies);
+ }
+ auto functor = [=](sycl::interop_handle) {
+ auto cu_handle = spmm_descr->cu_handle;
+ detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle,
+ alg, workspace, is_alpha_host_accessible);
+ };
+
+ return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle,
+ C_handle);
+}
+
+sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+ const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+ dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle,
+ spmm_alg alg, spmm_descr_t spmm_descr,
+ const std::vector<sycl::event>& dependencies) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle,
+ is_alpha_host_accessible, is_beta_host_accessible, alg);
+ if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) {
+ detail::throw_incompatible_container(__func__);
+ }
+
+ if (!spmm_descr->optimized_called) {
+ throw mkl::uninitialized("sparse_blas", __func__,
+ "spmm_optimize must be called before spmm.");
+ }
+ CHECK_DESCR_MATCH(spmm_descr, opA, "spmm_optimize");
+ CHECK_DESCR_MATCH(spmm_descr, opB, "spmm_optimize");
+ CHECK_DESCR_MATCH(spmm_descr, A_view, "spmm_optimize");
+ CHECK_DESCR_MATCH(spmm_descr, A_handle, "spmm_optimize");
+ CHECK_DESCR_MATCH(spmm_descr, B_handle, "spmm_optimize");
+ CHECK_DESCR_MATCH(spmm_descr, C_handle, "spmm_optimize");
+ CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize");
+
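+ // The compute functor reuses the cusparseHandle_t and CUstream cached by
+ // spmm_buffer_size so every step of the operation runs on the same native
+ // stream; for in-order queues the stream is synchronized before returning.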
+ bool is_in_order_queue = queue.is_in_order();
+ auto compute_functor = [=](void* workspace_ptr) {
+ auto cu_handle = spmm_descr->cu_handle;
+ auto cu_a = A_handle->backend_handle;
+ auto cu_b = B_handle->backend_handle;
+ auto cu_c = C_handle->backend_handle;
+ auto type = A_handle->value_container.data_type;
+ auto cu_op_a = detail::get_cuda_operation(type, opA);
+ auto cu_op_b = detail::get_cuda_operation(type, opB);
+ auto cu_type = detail::get_cuda_value_type(type);
+ auto cu_alg = detail::get_cuda_spmm_alg(alg);
+ detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+ auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c,
+ cu_type, cu_alg, workspace_ptr);
+ detail::check_status(status, __func__);
+ detail::synchronize_if_needed(is_in_order_queue, spmm_descr->cu_stream);
+ };
+ if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) {
+ // The accessor can only be created if the buffer size is greater than 0
+ auto functor_buffer = [=](sycl::interop_handle ih,
+ sycl::accessor<std::uint8_t> workspace_acc) {
+ auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+ compute_functor(workspace_ptr);
+ };
+ return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle,
+ spmm_descr->workspace.get_buffer(),
+ B_handle, C_handle);
+ }
+ else {
+ // The same dispatch_submit can be used for USM or buffers if no
+ // workspace accessor is needed, workspace_ptr will be a nullptr in the
+ // latter case.
+ auto workspace_ptr = spmm_descr->workspace.usm_ptr;
+ auto functor_usm = [=](sycl::interop_handle) {
+ compute_functor(workspace_ptr);
+ };
+ return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm,
+ A_handle, B_handle, C_handle);
+ }
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
new file mode 100644
index 000000000..03b848916
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
@@ -0,0 +1,335 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// Complete the definition of the incomplete type
+struct spmv_descr {
+ // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them.
+ // cuSPARSE seems to implicitly require using the same CUstream for a whole operation (buffer_size, optimization and computation steps).
+ // This is needed as the default SYCL queue is out-of-order, which can use a different CUstream for each host_task or native_command.
+ CUstream cu_stream;
+ cusparseHandle_t cu_handle;
+
+ detail::generic_container workspace;
+ std::size_t temp_buffer_size = 0;
+ bool buffer_size_called = false;
+ bool optimized_called = false;
+ oneapi::mkl::transpose last_optimized_opA;
+ matrix_view last_optimized_A_view;
+ matrix_handle_t last_optimized_A_handle;
+ dense_vector_handle_t last_optimized_x_handle;
+ dense_vector_handle_t last_optimized_y_handle;
+ spmv_alg last_optimized_alg;
+};
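+
+// Note: cu_handle and cu_stream are populated by spmv_buffer_size and reused by
+// spmv_optimize and spmv, which is why spmv_buffer_size must be called first
+// (enforced through buffer_size_called).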
+
+} // namespace oneapi::mkl::sparse
+
+namespace oneapi::mkl::sparse::cusparse {
+
+namespace detail {
+
+inline auto get_cuda_spmv_alg(spmv_alg alg) {
+ switch (alg) {
+ case spmv_alg::coo_alg1: return CUSPARSE_SPMV_COO_ALG1;
+ case spmv_alg::coo_alg2: return CUSPARSE_SPMV_COO_ALG2;
+ case spmv_alg::csr_alg1: return CUSPARSE_SPMV_CSR_ALG1;
+ case spmv_alg::csr_alg2: return CUSPARSE_SPMV_CSR_ALG2;
+ default: return CUSPARSE_SPMV_ALG_DEFAULT;
+ }
+}
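+
+// Algorithm values not listed above (e.g. default_alg and no_optimize_alg) fall
+// back to CUSPARSE_SPMV_ALG_DEFAULT through the default case.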
+
+void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose opA,
+ matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+ dense_vector_handle_t y_handle, bool is_alpha_host_accessible,
+ bool is_beta_host_accessible) {
+ check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle,
+ is_alpha_host_accessible, is_beta_host_accessible);
+ check_valid_matrix_properties(function_name, A_handle);
+ if (A_view.type_view != matrix_descr::general) {
+ throw mkl::unimplemented(
+ "sparse_blas", function_name,
+ "The backend does not support spmv with a `type_view` other than `matrix_descr::general`.");
+ }
+}
+
+inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible,
+ matrix_view A_view, matrix_handle_t A_handle,
+ dense_vector_handle_t x_handle, bool is_beta_host_accessible,
+ dense_vector_handle_t y_handle, spmv_alg alg,
+ spmv_descr_t spmv_descr) {
+ check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle,
+ is_alpha_host_accessible, is_beta_host_accessible);
+ if (!spmv_descr->buffer_size_called) {
+ throw mkl::uninitialized("sparse_blas", "spmv_optimize",
+ "spmv_buffer_size must be called before spmv_optimize.");
+ }
+ spmv_descr->optimized_called = true;
+ spmv_descr->last_optimized_opA = opA;
+ spmv_descr->last_optimized_A_view = A_view;
+ spmv_descr->last_optimized_A_handle = A_handle;
+ spmv_descr->last_optimized_x_handle = x_handle;
+ spmv_descr->last_optimized_y_handle = y_handle;
+ spmv_descr->last_optimized_alg = alg;
+}
+
+#if CUSPARSE_VERSION >= 12300
+// cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4)
+void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha,
+ matrix_handle_t A_handle, dense_vector_handle_t x_handle, const void* beta,
+ dense_vector_handle_t y_handle, spmv_alg alg, void* workspace_ptr,
+ bool is_alpha_host_accessible) {
+ auto cu_a = A_handle->backend_handle;
+ auto cu_x = x_handle->backend_handle;
+ auto cu_y = y_handle->backend_handle;
+ auto type = A_handle->value_container.data_type;
+ auto cu_op = get_cuda_operation(type, opA);
+ auto cu_type = get_cuda_value_type(type);
+ auto cu_alg = get_cuda_spmv_alg(alg);
+ set_pointer_mode(cu_handle, is_alpha_host_accessible);
+ auto status = cusparseSpMV_preprocess(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type,
+ cu_alg, workspace_ptr);
+ check_status(status, "spmv_optimize");
+}
+#endif
+
+} // namespace detail
+
+void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) {
+ *p_spmv_descr = new spmv_descr();
+}
+
+sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr,
+ const std::vector<sycl::event>& dependencies) {
+ if (!spmv_descr) {
+ return detail::collapse_dependencies(queue, dependencies);
+ }
+
+ auto release_functor = [=]() {
+ spmv_descr->cu_handle = nullptr;
+ spmv_descr->last_optimized_A_handle = nullptr;
+ spmv_descr->last_optimized_x_handle = nullptr;
+ spmv_descr->last_optimized_y_handle = nullptr;
+ delete spmv_descr;
+ };
+
+ // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used.
+ // dispatch_submit can only be used if the descriptor's handles are valid.
+ if (spmv_descr->last_optimized_A_handle &&
+ spmv_descr->last_optimized_A_handle->all_use_buffer() &&
+ spmv_descr->last_optimized_x_handle && spmv_descr->last_optimized_y_handle &&
+ spmv_descr->workspace.use_buffer()) {
+ auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor<std::uint8_t>) {
+ release_functor();
+ };
+ return detail::dispatch_submit(
+ __func__, queue, dispatch_functor, spmv_descr->last_optimized_A_handle,
+ spmv_descr->workspace.get_buffer(), spmv_descr->last_optimized_x_handle,
+ spmv_descr->last_optimized_y_handle);
+ }
+
+ // This path is used if USM is used or if the descriptor is released before spmv_optimize has succeeded.
+ sycl::event event = queue.submit([&](sycl::handler& cgh) {
+ cgh.depends_on(dependencies);
+ cgh.host_task(release_functor);
+ });
+ return event;
+}
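+
+// The descriptor lifecycle mirrors the SpMM sketch above: spmv_buffer_size, then
+// spmv_optimize, then spmv, with release_spmv_descr keeping the descriptor alive
+// until the last buffers are no longer in use.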
+
+void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+ matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+ const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+ spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle,
+ is_alpha_host_accessible, is_beta_host_accessible);
+
+ auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) {
+ detail::CusparseScopedContextHandler sc(queue, ih);
+ auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+ spmv_descr->cu_handle = cu_handle;
+ spmv_descr->cu_stream = cu_stream;
+ auto cu_a = A_handle->backend_handle;
+ auto cu_x = x_handle->backend_handle;
+ auto cu_y = y_handle->backend_handle;
+ auto type = A_handle->value_container.data_type;
+ auto cu_op = detail::get_cuda_operation(type, opA);
+ auto cu_type = detail::get_cuda_value_type(type);
+ auto cu_alg = detail::get_cuda_spmv_alg(alg);
+ detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+ auto status = cusparseSpMV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y,
+ cu_type, cu_alg, &temp_buffer_size);
+ detail::check_status(status, __func__);
+ };
+ auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+ event.wait_and_throw();
+ spmv_descr->temp_buffer_size = temp_buffer_size;
+ spmv_descr->buffer_size_called = true;
+}
+
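+// Note: when cusparseSpMV_preprocess is unavailable (CUSPARSE_VERSION < 12300),
+// both spmv_optimize overloads below still validate their arguments and cache
+// the workspace, but skip the preprocess call.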
+void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+ matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+ const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+ spmv_descr_t spmv_descr, sycl::buffer<std::uint8_t, 1> workspace) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ if (!A_handle->all_use_buffer()) {
+ detail::throw_incompatible_container(__func__);
+ }
+ detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+ is_beta_host_accessible, y_handle, alg, spmv_descr);
+ // Copy the buffer to extend its lifetime until the descriptor is freed.
+ spmv_descr->workspace.set_buffer_untyped(workspace);
+ if (alg == spmv_alg::no_optimize_alg) {
+ return;
+ }
+
+#if CUSPARSE_VERSION < 12300
+ // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4)
+ return;
+#else
+ if (spmv_descr->temp_buffer_size > 0) {
+ auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) {
+ auto cu_handle = spmv_descr->cu_handle;
+ auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+ detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle,
+ alg, workspace_ptr, is_alpha_host_accessible);
+ };
+
+ // The accessor can only be created if the buffer size is greater than 0
+ detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle);
+ }
+ else {
+ auto functor = [=](sycl::interop_handle) {
+ auto cu_handle = spmv_descr->cu_handle;
+ detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle,
+ alg, nullptr, is_alpha_host_accessible);
+ };
+ detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+ }
+#endif
+}
+
+sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+ matrix_view A_view, matrix_handle_t A_handle,
+ dense_vector_handle_t x_handle, const void* beta,
+ dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr,
+ void* workspace, const std::vector<sycl::event>& dependencies) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ if (A_handle->all_use_buffer()) {
+ detail::throw_incompatible_container(__func__);
+ }
+ detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+ is_beta_host_accessible, y_handle, alg, spmv_descr);
+ spmv_descr->workspace.usm_ptr = workspace;
+ if (alg == spmv_alg::no_optimize_alg) {
+ return detail::collapse_dependencies(queue, dependencies);
+ }
+
+#if CUSPARSE_VERSION < 12300
+ // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4)
+ return detail::collapse_dependencies(queue, dependencies);
+#else
+ auto functor = [=](sycl::interop_handle) {
+ auto cu_handle = spmv_descr->cu_handle;
+ detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg,
+ workspace, is_alpha_host_accessible);
+ };
+ return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle,
+ y_handle);
+#endif
+}
+
+sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+ matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+ const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+ spmv_descr_t spmv_descr, const std::vector<sycl::event>& dependencies) {
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+ bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+ detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle,
+ is_alpha_host_accessible, is_beta_host_accessible);
+ if (A_handle->all_use_buffer() != spmv_descr->workspace.use_buffer()) {
+ detail::throw_incompatible_container(__func__);
+ }
+
+ if (!spmv_descr->optimized_called) {
+ throw mkl::uninitialized("sparse_blas", __func__,
+ "spmv_optimize must be called before spmv.");
+ }
+ CHECK_DESCR_MATCH(spmv_descr, opA, "spmv_optimize");
+ CHECK_DESCR_MATCH(spmv_descr, A_view, "spmv_optimize");
+ CHECK_DESCR_MATCH(spmv_descr, A_handle, "spmv_optimize");
+ CHECK_DESCR_MATCH(spmv_descr, x_handle, "spmv_optimize");
+ CHECK_DESCR_MATCH(spmv_descr, y_handle, "spmv_optimize");
+ CHECK_DESCR_MATCH(spmv_descr, alg, "spmv_optimize");
+
+ bool is_in_order_queue = queue.is_in_order();
+ auto compute_functor = [=](void* workspace_ptr) {
+ auto cu_handle = spmv_descr->cu_handle;
+ auto cu_a = A_handle->backend_handle;
+ auto cu_x = x_handle->backend_handle;
+ auto cu_y = y_handle->backend_handle;
+ auto type = A_handle->value_container.data_type;
+ auto cu_op = detail::get_cuda_operation(type, opA);
+ auto cu_type = detail::get_cuda_value_type(type);
+ auto cu_alg = detail::get_cuda_spmv_alg(alg);
+ detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+ auto status = cusparseSpMV(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, cu_alg,
+ workspace_ptr);
+ detail::check_status(status, __func__);
+ detail::synchronize_if_needed(is_in_order_queue, spmv_descr->cu_stream);
+ };
+ if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) {
+ // The accessor can only be created if the buffer size is greater than 0
+ auto functor_buffer = [=](sycl::interop_handle ih,
+ sycl::accessor<std::uint8_t> workspace_acc) {
+ auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+ compute_functor(workspace_ptr);
+ };
+ return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle,
+ spmv_descr->workspace.get_buffer