From bf14408627953c7dfe7f4d48fab8bebdfaf7f886 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 16:51:44 -0600 Subject: [PATCH] Add sptrsv execution space overloads --- docs/developer/apidocs/sparse.rst | 13 + .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 179 +++++----- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 326 +++++++++++------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 38 +- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 57 +-- .../KokkosSparse_sptrsv_symbolic_spec.hpp | 22 +- sparse/src/KokkosSparse_sptrsv.hpp | 253 ++++++++++++-- 7 files changed, 599 insertions(+), 289 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 415f72eec8..3a55e50c8b 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -94,3 +94,16 @@ par_ilut gmres ----- .. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) + +sptrsv +------ +.. doxygenfunction:: sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7605f03fa2..0b9afa7796 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -22,10 +22,11 @@ namespace KokkosSparse { namespace Impl { -template -void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, +void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -61,6 +62,9 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int64_t nnz = static_cast(entries.extent(0)); size_t pBufferSize; void *rm; @@ -98,13 +102,13 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); // Create dummy dense vector B (RHS) - nnz_scalar_view_t b_dummy("b_dummy", nrows); + nnz_scalar_view_t b_dummy(Kokkos::view_alloc(space, "b_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecBDescr_dummy), static_cast(nrows), b_dummy.data(), cudaValueType)); // Create dummy dense vector X (LHS) - nnz_scalar_view_t x_dummy("x_dummy", nrows); + nnz_scalar_view_t x_dummy(Kokkos::view_alloc(space, "x_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecXDescr_dummy), static_cast(nrows), x_dummy.data(), cudaValueType)); @@ -163,9 +167,12 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + cusparseStatus_t status; status = cusparseCreateCsrsv2Info(&(h->info)); if (CUSPARSE_STATUS_SUCCESS != status) @@ -178,85 +185,86 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); if (std::is_same::value) { cusparseDcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (double*)vals, (int*)rm, (int*)ent, h->info, + (double *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseDcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (double*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same::value) { cusparseScsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (float*)vals, (int*)rm, (int*)ent, h->info, + (float *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseScsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (float *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseZcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; - status = cusparseZcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuDoubleComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + status = + cusparseZcsrsv2_analysis(h->handle, h->transpose, nrows, nnz, + h->descr, (cuDoubleComplex *)vals, (int *)rm, + (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseCcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, - &pBufferSize); + (cuComplex *)vals, (int *)rm, (int *)ent, + h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseCcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; @@ -281,10 +289,11 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, } template < - typename KernelHandle, typename ain_row_index_view_type, - typename ain_nonzero_index_view_type, typename ain_values_scalar_view_type, - typename b_values_scalar_view_type, typename x_values_scalar_view_type> -void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, + typename ExecutionSpace, typename KernelHandle, + typename ain_row_index_view_type, typename ain_nonzero_index_view_type, + typename ain_values_scalar_view_type, typename b_values_scalar_view_type, + typename x_values_scalar_view_type> +void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -323,6 +332,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + const scalar_type alpha = scalar_type(1.0); cudaDataType cudaValueType = cuda_data_type_from(); @@ -354,18 +366,21 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, if (std::is_same::value) { cusparseStatus_t status; - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int nnz = entries.extent_int(0); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr() - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); - const scalar_type* bv = rhs.data(); - scalar_type* xv = lhs.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); + const scalar_type *bv = rhs.data(); + scalar_type *xv = lhs.data(); if (std::is_same::value) { if (h->pBuffer == nullptr) { @@ -373,10 +388,10 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, } const double alpha = double(1); - status = cusparseDcsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (double*)vals, (int*)rm, - (int*)ent, h->info, (double*)bv, - (double*)xv, h->policy, h->pBuffer); + status = cusparseDcsrsv2_solve( + h->handle, h->transpose, nrows, nnz, &alpha, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, (double *)bv, (double *)xv, h->policy, + h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -387,9 +402,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, const float alpha = float(1); status = cusparseScsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, (float*)bv, (float*)xv, - h->policy, h->pBuffer); + &alpha, h->descr, (float *)vals, (int *)rm, + (int *)ent, h->info, (float *)bv, + (float *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -399,8 +414,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseZcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, - (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h->policy, h->pBuffer); + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, + (cuDoubleComplex *)bv, (cuDoubleComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -410,8 +425,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseCcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, (cuComplex*)bv, - (cuComplex*)xv, h->policy, h->pBuffer); + (cuComplex *)vals, (int *)rm, (int *)ent, h->info, (cuComplex *)bv, + (cuComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -539,13 +554,13 @@ void sptrsvcuSPARSE_solve_streams( "CUSPARSE requires local ordinals to be integer.\n"); } else { const scalar_type alpha = scalar_type(1.0); - std::vector sptrsv_handle_v(nstreams); - std::vector h_v(nstreams); - std::vector rm_v(nstreams); - std::vector ent_v(nstreams); - std::vector vals_v(nstreams); - std::vector bv_v(nstreams); - std::vector xv_v(nstreams); + std::vector sptrsv_handle_v(nstreams); + std::vector h_v(nstreams); + std::vector rm_v(nstreams); + std::vector ent_v(nstreams); + std::vector vals_v(nstreams); + std::vector bv_v(nstreams); + std::vector xv_v(nstreams); for (int i = 0; i < nstreams; i++) { sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); @@ -560,8 +575,8 @@ void sptrsvcuSPARSE_solve_streams( } rm_v[i] = !std::is_same::value ? sptrsv_handle_v[i]->get_int_rowmap_ptr() - : reinterpret_cast(row_map_v[i].data()); - ent_v[i] = reinterpret_cast(entries_v[i].data()); + : reinterpret_cast(row_map_v[i].data()); + ent_v[i] = reinterpret_cast(entries_v[i].data()); vals_v[i] = values_v[i].data(); bv_v[i] = rhs_v[i].data(); xv_v[i] = lhs_v[i].data(); @@ -573,42 +588,42 @@ void sptrsvcuSPARSE_solve_streams( if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else { throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index b7e7fa1650..970c15a5af 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2891,16 +2891,15 @@ void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, #endif -template -void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -2972,8 +2971,10 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( @@ -2981,8 +2982,8 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; - int team_size = thandle.get_team_size(); + using team_policy_t = Kokkos::TeamPolicy; + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor; + using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = Kokkos::View; @@ -3070,9 +3079,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; @@ -3109,7 +3121,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, auto Ljj = Kokkos::subview( viewL, range_type(0, nsrow), Kokkos::ALL()); // s-th supernocal column of L - KokkosBlas::gemv("N", one, Ljj, Xj, zero, Y); + KokkosBlas::gemv(space, "N", one, Ljj, Xj, zero, Y); } else { auto Xj = Kokkos::subview( lhs, @@ -3122,7 +3134,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (invert_diagonal) { auto Y = Kokkos::subview( work, range_type(workoffset, workoffset + nscol)); - KokkosBlas::gemv("N", one, Ljj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ljj, Y, zero, Xj); } else { char unit_diag = (unit_diagonal ? 'U' : 'N'); // NOTE: we currently supports only default_layout = @@ -3130,7 +3142,9 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "N", &unit_diag, one, Ljj, Xjj); + KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, + Xjj); + // TODO: space.fence(); Kokkos::fence(); } // update off-diagonal blocks @@ -3146,7 +3160,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, viewL, range_type(nscol, nsrow), Kokkos::ALL()); // off-diagonal blocks of s-th supernodal // column of L - KokkosBlas::gemv("N", one, Lij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Lij, Xj, zero, Z); } } } @@ -3156,9 +3170,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3169,9 +3186,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3191,7 +3211,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3201,36 +3221,45 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update off-diagonals (potentiall combined with solve with // diagonals) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3263,16 +3292,16 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } // end lower_tri_solve -template -void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; - + using memory_space = typename ExecutionSpace::memory_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3289,7 +3318,6 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -3349,14 +3377,16 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedRPSolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; + using team_policy_t = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); @@ -3372,11 +3402,19 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, node_count); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); else - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, team_size), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } // TP2 algorithm has issues with some offset-ordinal combo to be addressed /* @@ -3428,7 +3466,7 @@ tstf); } // end elseif timer.reset(); #endif - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; if (thandle.is_column_major()) { // U stored in CSC if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3441,9 +3479,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; league_rank++) { @@ -3484,7 +3525,7 @@ tstf); } // end elseif workoffset, workoffset + nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } else { // extract part of the solution, corresponding to the diagonal // block @@ -3501,14 +3542,14 @@ tstf); } // end elseif workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv("N", one, Ujj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = // LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "U", "N", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks if (nsrow2 > 0) { @@ -3522,7 +3563,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } } } @@ -3532,9 +3573,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3546,10 +3590,13 @@ tstf); } // end elseif diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - using policy_type = Kokkos::TeamPolicy; - Kokkos::parallel_for("parfor_usolve_tran_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); } else { // U stored in CSR // launching sparse-triangular solve functor UpperTriSupernodalFunctor; - Kokkos::parallel_for("parfor_usolve_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3618,7 +3668,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("T", -one, Uij, Z, one, Xj); + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } // "triangular-solve" to compute Xj @@ -3626,13 +3676,13 @@ tstf); } // end elseif auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { - KokkosBlas::gemv("T", one, Ujj, Xj, zero, Y); + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "T", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } if (invert_diagonal) { @@ -3641,9 +3691,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } } @@ -3664,7 +3717,7 @@ tstf); } // end elseif #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with one, or two, spmv bool transpose_spmv = @@ -3675,28 +3728,34 @@ tstf); } // end elseif if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // zero out lhs corresponding to diagonal blocks in lhs, and copy to // work SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update with off-diagonals (potentiall combined with diagonal // solves) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } else { if (!invert_offdiagonal) { // zero out lhs corresponding to diagonal blocks in lhs, and copy to @@ -3704,17 +3763,20 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); // update with off-diagonals auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); } else { std::cout << " ** invert_offdiag with U in CSR not supported **" << std::endl; @@ -3724,9 +3786,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3749,23 +3814,22 @@ tstf); } // end elseif double sptrsv_time_seconds = sptrsv_timer.seconds(); std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; - std::cout << " + Execution space : " << execution_space::name() + std::cout << " + Execution space : " << ExecutionSpace::name() << std::endl; std::cout << " + Memory space : " << memory_space::name() << std::endl; #endif } // end upper_tri_solve -template -void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs, +template +void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3786,9 +3850,9 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; + using policy_type = Kokkos::TeamPolicy; using large_cutoff_policy_type = - Kokkos::TeamPolicy; + Kokkos::TeamPolicy; /* using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = @@ -3849,14 +3913,17 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_l_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3868,7 +3935,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, if (team_size_singleblock <= 0) { team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -3891,7 +3958,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -3909,11 +3979,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? @@ -3939,16 +4013,19 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } // TODO To use cudagraph here, need to know how many non-unit chains // there are, create a graph for each and launch accordingly size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_u_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_u_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3964,7 +4041,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), // Kokkos::ParallelForTag()); team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -3987,7 +4064,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -4005,11 +4085,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index e36b9df236..6ad321c286 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -96,9 +96,9 @@ template ::value> struct SPTRSV_SOLVE { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x); + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x); static void sptrsv_solve_streams( const std::vector &execspace_v, @@ -117,9 +117,9 @@ template { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x) { + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x) { // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -127,40 +127,44 @@ struct SPTRSV_SOLVEis_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, true); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::lower_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, false); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::upper_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } @@ -188,7 +192,8 @@ struct SPTRSV_SOLVEis_lower_tri()) { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::lower_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } @@ -198,7 +203,8 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::upper_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 3ef3be8780..36ea2d9df8 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -147,9 +147,10 @@ void symbolic_chain_phase(TriSolveHandle& thandle, #endif } // end symbolic_chain_phase -template -void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; Kokkos::Timer timer; @@ -177,10 +178,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -193,11 +194,12 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // wait for deep copy write to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -227,9 +229,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, // Create the chain now if (thandle.algm_requires_symb_chain()) { + // No need to pass in space, chain phase runs on the host symbolic_chain_phase(thandle, nodes_per_level); } - thandle.set_symbolic_complete(); // Output check @@ -257,9 +259,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -279,6 +281,7 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", @@ -568,20 +571,21 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, thandle.set_workspace_size(max_lwork); // workspace offset initialized to be zero integer_view_t work_offset = thandle.get_work_offset(); - Kokkos::deep_copy(work_offset, work_offset_host); + Kokkos::deep_copy(space, work_offset, work_offset_host); // kernel types // > off-diagonal integer_view_t dkernel_type_by_level = thandle.get_kernel_type(); - Kokkos::deep_copy(dkernel_type_by_level, kernel_type_by_level); + Kokkos::deep_copy(space, dkernel_type_by_level, kernel_type_by_level); // > diagonal integer_view_t ddiag_kernel_type_by_level = thandle.get_diag_kernel_type(); - Kokkos::deep_copy(ddiag_kernel_type_by_level, diag_kernel_type_by_level); + Kokkos::deep_copy(space, ddiag_kernel_type_by_level, + diag_kernel_type_by_level); // deep copy to device (of scheduling info) - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); #ifdef TRISOLVE_SYMB_TIMERS std::cout << " + workspace time = " << timer.seconds() << std::endl; @@ -598,9 +602,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif } // end lower_tri_symbolic -template -void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; Kokkos::Timer timer; @@ -626,10 +631,10 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -642,11 +647,12 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // Wait for deep copy writes to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -708,9 +714,9 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -730,6 +736,7 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index 73389d10d0..5b9304356d 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -67,33 +67,37 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosSparse::sptrsv_symbolic -template ::value, bool eti_spec_avail = sptrsv_symbolic_eti_spec_avail< KernelHandle, RowMapType, EntriesType>::value> struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of sptrsv_symbolic // Unification layer -template -struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, +template +struct SPTRSV_SYMBOLIC { + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries) { auto sptrsv_handle = handle->get_sptrsv_handle(); auto nrows = row_map.extent(0) - 1; sptrsv_handle->new_init_handle(nrows); if (sptrsv_handle->is_lower_tri()) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } else { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } } @@ -113,6 +117,7 @@ struct SPTRSV_SYMBOLIC, \ @@ -130,6 +135,7 @@ struct SPTRSV_SYMBOLIC, \ diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 859918c58d..9ab7c9fc6a 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -40,10 +40,23 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value -template -void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries) { +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ +template +void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -94,8 +107,9 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, Entries_Internal entries_i = entries; KokkosSparse::Impl::SPTRSV_SYMBOLIC< - const_handle_type, RowMap_Internal, - Entries_Internal>::sptrsv_symbolic(&tmp_handle, rowmap_i, entries_i); + ExecutionSpace, const_handle_type, RowMap_Internal, + Entries_Internal>::sptrsv_symbolic(space, &tmp_handle, rowmap_i, + entries_i); #ifdef KK_TRISOLVE_TIMERS std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -103,10 +117,46 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ template + typename lno_nnz_view_t_> void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + lno_nnz_view_t_ entries) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries); +} + +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ +template +void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -179,11 +229,12 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, auto nrows = sh->get_nrows(); KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< - sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal>( - sh, nrows, rowmap_i, entries_i, values_i, false); + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, + false); } else { - KokkosSparse::Experimental::sptrsv_symbolic(handle, rowmap, entries); + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } #ifdef KK_TRISOLVE_TIMERS std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -191,12 +242,52 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ template -void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, - XType x) { + typename lno_nnz_view_t_, typename scalar_nnz_view_t_> +void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries, values); +} + +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param space The execution space instance this kernel will be run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, BType b, XType x) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -305,25 +396,65 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, sptrsvHandleType *sh = handle->get_sptrsv_handle(); auto nrows = sh->get_nrows(); - KokkosSparse::Impl::sptrsvcuSPARSE_solve( - sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + KokkosSparse::Impl::sptrsvcuSPARSE_solve< + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>( + space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); } else { KokkosSparse::Impl::SPTRSV_SOLVE< - typename scalar_nnz_view_t_::execution_space, const_handle_type, - RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, - XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, entries_i, + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i); } } // sptrsv_solve -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -// --------------------------------------------------------------------- -template -void sptrsv_solve(KernelHandle *handle, XType x, XType b) { +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, + XType x) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, rowmap, entries, values, b, x); +} + +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) || defined(DOXY) +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, + XType b) { auto crsmat = handle->get_sptrsv_handle()->get_crsmat(); auto values = crsmat.values; auto graph = crsmat.graph; @@ -341,31 +472,79 @@ void sptrsv_solve(KernelHandle *handle, XType x, XType b) { if (handle->is_sptrsv_lower_tri()) { // apply forward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, x, x); + sptrsv_solve(space, handle, row_map, entries, values, x, x); } else { // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, b, b); + sptrsv_solve(space, handle, row_map, entries, values, b, b); // apply backward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); } } -// --------------------------------------------------------------------- +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ template -void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, - XType b) { +void sptrsv_solve(KernelHandle *handle, XType x, XType b) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, + KernelHandle *handleU, XType x, XType b) { // Lower-triangular solve - sptrsv_solve(handleL, x, b); + sptrsv_solve(space, handleL, x, b); // copy the solution to rhs - Kokkos::deep_copy(b, x); + Kokkos::deep_copy(space, b, x); // uper-triangular solve - sptrsv_solve(handleU, x, b); + sptrsv_solve(space, handleU, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, + XType b) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handleL, handleU, x, b); } #endif