Skip to content

Commit

Permalink
Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (Twostage)
Browse files Browse the repository at this point in the history
  • Loading branch information
e10harvey committed Sep 26, 2023
1 parent 043376f commit 82a2050
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 110 deletions.
5 changes: 2 additions & 3 deletions common/src/KokkosKernels_Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,7 @@ void permute_block_vector(typename idx_array_type::value_type num_elements,
// TODO BMK: clean this up by removing 1st argument. It is unused but
// its name gives the impression that only num_elements of the vector are
// zeroed, when really it's always the whole thing.
template <class ExecSpaceIn, typename value_array_type, typename MyExecSpace>
template <class ExecSpaceIn, typename value_array_type>
void zero_vector(ExecSpaceIn &exec_space_in,
typename value_array_type::value_type /* num_elements */,
value_array_type &vector) {
Expand All @@ -908,8 +908,7 @@ void zero_vector(typename value_array_type::value_type /* num_elements */,
using ne_tmp_t = typename value_array_type::value_type;
ne_tmp_t ne_tmp = ne_tmp_t(0);
MyExecSpace my_exec_space;
zero_vector<MyExecSpace, value_array_type, MyExecSpace>(my_exec_space, ne_tmp,
vector);
zero_vector(my_exec_space, ne_tmp, vector);
}

template <typename v1, typename v2, typename v3>
Expand Down
10 changes: 4 additions & 6 deletions sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1547,9 +1547,8 @@ class PointGaussSeidel {
Permuted_Yvector);
}
if (init_zero_x_vector) {
KokkosKernels::Impl::zero_vector<
MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>(
my_exec_space, num_cols * block_size, Permuted_Xvector);
KokkosKernels::Impl::zero_vector(my_exec_space, num_cols * block_size,
Permuted_Xvector);
} else {
KokkosKernels::Impl::permute_block_vector<
x_value_array_type, scalar_persistent_work_view2d_t,
Expand Down Expand Up @@ -1664,9 +1663,8 @@ class PointGaussSeidel {
Permuted_Yvector);
}
if (init_zero_x_vector) {
KokkosKernels::Impl::zero_vector<
MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>(
my_exec_space, num_cols, Permuted_Xvector);
KokkosKernels::Impl::zero_vector(my_exec_space, num_cols,
Permuted_Xvector);
} else {
KokkosKernels::Impl::permute_vector<
x_value_array_type, scalar_persistent_work_view2d_t,
Expand Down
96 changes: 53 additions & 43 deletions sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -861,10 +861,11 @@ class TwostageGaussSeidel {
#endif

//
auto *gsHandle = get_gs_handle();
bool two_stage = gsHandle->isTwoStage();
bool compact_form = gsHandle->isCompactForm();
scalar_t gamma = gsHandle->getInnerDampFactor();
auto *gsHandle = get_gs_handle();
auto my_exec_space = gsHandle->get_execution_space();
bool two_stage = gsHandle->isTwoStage();
bool compact_form = gsHandle->isCompactForm();
scalar_t gamma = gsHandle->getInnerDampFactor();

GSDirection direction = gsHandle->getSweepDirection();
if (apply_forward && apply_backward) {
Expand All @@ -887,7 +888,7 @@ class TwostageGaussSeidel {
auto crsmatUa =
gsHandle->getUa(); // complement of U+D (used only for compact form)

// wratp A into crsmat
// wrap A into crsmat
input_crsmat_t crsmatA("A", num_rows, num_cols, values_view.extent(0),
values_view, rowmap_view, column_view);
#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS
Expand Down Expand Up @@ -920,36 +921,36 @@ class TwostageGaussSeidel {
NumSweeps *= 2;
}
if (init_zero_x_vector) {
KokkosKernels::Impl::zero_vector<x_value_array_type, execution_space>(
nrhs, localX);
KokkosKernels::Impl::zero_vector(my_exec_space, nrhs, localX);
}
for (int sweep = 0; sweep < NumSweeps; ++sweep) {
bool forward_sweep = (direction == GS_FORWARD ||
(direction == GS_SYMMETRIC && sweep % 2 == 0));
// compute residual vector
KokkosBlas::scal(localR, one, localB);
KokkosBlas::scal(my_exec_space, localR, one, localB);
if (sweep > 0 || !init_zero_x_vector) {
if (compact_form) {
if (forward_sweep) {
// R = B - U*x
KokkosSparse::spmv("N", scalar_t(-one), crsmatUa, localX, one,
localR);
KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatUa,
localX, one, localR);
} else {
// R = B - L*x
KokkosSparse::spmv("N", scalar_t(-one), crsmatLa, localX, one,
localR);
KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatLa,
localX, one, localR);
}
if (omega != one) {
// R = B - (U + (1-1/omega)D)*x
scalar_t omega2 = (one / omega - one);
auto localY =
Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL());
KokkosBlas::mult(zero, localZ, one, localDa, localY);
KokkosBlas::axpy(omega2, localZ, localR);
KokkosBlas::mult(my_exec_space, zero, localZ, one, localDa, localY);
KokkosBlas::axpy(my_exec_space, omega2, localZ, localR);
}
} else { // not compact_form
// R = B - A*x
KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR);
KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA,
localX, one, localR);
#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS
{
auto localRj =
Expand Down Expand Up @@ -987,6 +988,8 @@ class TwostageGaussSeidel {
single_vector_view_t Zj(localZj.data(), num_rows);
sptrsv_solve(handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map,
crsmatL.graph.entries, crsmatL.values, Rj, Zj);
execution_space()
.fence(); // TODO: call sptrsv_solve on stream and remove
}
} else {
using namespace KokkosSparse::Experimental;
Expand All @@ -1001,6 +1004,8 @@ class TwostageGaussSeidel {
single_vector_view_t Zj(localZj.data(), num_rows);
sptrsv_solve(handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map,
crsmatU.graph.entries, crsmatU.values, Rj, Zj);
execution_space()
.fence(); // TODO: call sptrsv_solve on stream and remove
}
}

Expand All @@ -1009,29 +1014,33 @@ class TwostageGaussSeidel {
Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL());
if (compact_form) {
// Y = omega * Z
KokkosBlas::scal(localY, one, localZ);
KokkosBlas::scal(my_exec_space, localY, one, localZ);
} else {
// Y = Y + omega * Z
KokkosBlas::axpy(one, localZ, localY);
KokkosBlas::axpy(my_exec_space, one, localZ, localY);
}
} else {
// ====== inner Jacobi-Richardson =====
#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS
// compute initial residual norm
// > compute RHS for the inner loop, R = B - A*x
internal_vector_view_t tempR("tempR", num_rows, 1);
KokkosBlas::scal(tempR, one, localB);
KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, tempR);
internal_vector_view_t tempR(
Kokkos::view_alloc(my_exec_space, std::string("tempR")), num_rows,
1);
KokkosBlas::scal(my_exec_space, tempR, one, localB);
KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX,
one, tempR);
// > initial vector for the inner loop is zero
Kokkos::deep_copy(localZ, zero);
Kokkos::deep_copy(my_exec_space, localZ, zero);
my_exec_space.fence();
using Norm_Functor_t =
TwostageGaussSeidel_functor<row_map_view_t, entries_view_t,
values_view_t>;
using range_policy = Kokkos::RangePolicy<Tag_normR, execution_space>;
{
mag_t normR = zero;
Kokkos::parallel_reduce(
"normR", range_policy(0, num_rows),
"normR", range_policy(my_exec_space, 0, num_rows),
Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view,
values_view, localD, localZ, tempR),
normR);
Expand All @@ -1046,31 +1055,31 @@ class TwostageGaussSeidel {

// row-scale: (D^{-1}*L)*Y = D^{-1}*B
// compute Z := D^{-1}*R
KokkosBlas::mult(zero, localZ, one, localD, localR);
KokkosBlas::mult(my_exec_space, zero, localZ, one, localD, localR);
// apply inner damping factor, if not one
if (gamma != one) {
// Z = gamma * Z
KokkosBlas::scal(localZ, gamma, localZ);
KokkosBlas::scal(my_exec_space, localZ, gamma, localZ);
}
} else {
// copy to localT (workspace used to save D^{-1}*R for JR iteration)
KokkosBlas::mult(zero, localT, one, localD, localR);
KokkosBlas::mult(my_exec_space, zero, localT, one, localD, localR);
// initialize Jacobi-Richardson (using R as workspace for JR
// iteration)
KokkosBlas::scal(localR, one, localT);
KokkosBlas::scal(my_exec_space, localR, one, localT);

// apply inner damping factor, if not one
if (gamma != one) {
// R = gamma * R
KokkosBlas::scal(localR, gamma, localR);
KokkosBlas::scal(my_exec_space, localR, gamma, localR);
}
}
#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS
{
// compute residual norm of the starting vector (D^{-1}R)
mag_t normR = zero;
Kokkos::parallel_reduce(
"normR", range_policy(0, num_rows),
"normR", range_policy(my_exec_space, 0, num_rows),
Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view,
values_view, localD, localT, tempR),
normR);
Expand All @@ -1081,34 +1090,34 @@ class TwostageGaussSeidel {
for (int ii = 0; ii < NumInnerSweeps; ii++) {
// T = D^{-1}*R, and L = D^{-1}*L and U = D^{-1}*U
// copy T into Z
KokkosBlas::scal(localZ, one, localT);
KokkosBlas::scal(my_exec_space, localZ, one, localT);
if (forward_sweep) {
// Z = Z - L*R
KokkosSparse::spmv("N", scalar_t(-omega), crsmatL, localR, one,
localZ);
KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatL,
localR, one, localZ);
} else {
// Z = R - U*T
KokkosSparse::spmv("N", scalar_t(-omega), crsmatU, localR, one,
localZ);
KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatU,
localR, one, localZ);
}
// apply inner damping factor, if not one
if (gamma != one) {
// Z = gamma * Z
KokkosBlas::scal(localZ, gamma, localZ);
KokkosBlas::scal(my_exec_space, localZ, gamma, localZ);
// Z = Z + (one - one/gamma) * R
scalar_t gamma2 = one - gamma;
KokkosBlas::axpy(gamma2, localR, localZ);
KokkosBlas::axpy(my_exec_space, gamma2, localR, localZ);
}
if (ii + 1 < NumInnerSweeps) {
// reinitialize (R to be Z)
KokkosBlas::scal(localR, one, localZ);
KokkosBlas::scal(my_exec_space, localR, one, localZ);
}
#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS
{
// compute residual norm(r - (L+D)*y)
mag_t normR = zero;
Kokkos::parallel_reduce(
"normR", range_policy(0, num_rows),
"normR", range_policy(my_exec_space, 0, num_rows),
Norm_Functor_t(forward_sweep, num_rows, rowmap_view,
column_view, values_view, localD, localZ, tempR),
normR);
Expand All @@ -1123,22 +1132,23 @@ class TwostageGaussSeidel {
Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL());
if (compact_form) {
// Y := omega * z
KokkosBlas::scal(localY, omega, localZ);
KokkosBlas::scal(my_exec_space, localY, omega, localZ);
} else {
// Y := X + omega * Z
KokkosBlas::axpy(omega, localZ, localY);
KokkosBlas::axpy(my_exec_space, omega, localZ, localY);
}
} // end of inner GS sweep
} // end of outer GS sweep
#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS
{
// R = B - A*x
KokkosBlas::scal(localR, one, localB);
KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR);
KokkosBlas::scal(my_exec_space, localR, one, localB);
KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX,
one, localR);
auto localRj = Kokkos::subview(localR, Kokkos::ALL(), range_type(0, 1));
single_vector_view_t Rj(localRj.data(), num_rows);
std::cout << "norm(GS)-" << NumSweeps << " " << KokkosBlas::nrm2(Rj)
<< std::endl;
std::cout << "norm(GS)-" << NumSweeps << " "
<< KokkosBlas::nrm2(my_exec_space, Rj) << std::endl;
}
#endif
}
Expand Down
22 changes: 15 additions & 7 deletions sparse/src/KokkosKernels_Handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -610,14 +610,19 @@ class KokkosKernelsHandle {
* @param num_streams The number of streams to allocate memory for.
* @param gs_algorithm Specifies which algorithm to use:
*
* KokkosSpace::GS_DEFAULT PointGaussSeidel
* KokkosSpace::GS_PERMUTED ??
* KokkosSpace::GS_TEAM ??
* KokkosSpace::GS_CLUSTER ??
* KokkosSpace::GS_TWOSTAGE ??
* KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type.
* KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows.
* KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows.
* KokkosSpace::GS_CLUSTER Uses independent clusters of nodes in the graph. Within a cluster, x is updated sequentially.
* For more information, see: https://arxiv.org/pdf/2204.02934.pdf.
* KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x.
* For more information, see: https://arxiv.org/pdf/2104.01196.pdf.
* @param coloring_algorithm Specifies which coloring algorithm to color the graph with:
*
* KokkosGraph::COLORING_DEFAULT ??
* KokkosGraph::COLORING_DEFAULT Depends on execution space:
* COLORING_SERIAL on Kokkos::Serial;
* COLORING_EB on GPUs;
* COLORING_VBBIT on Kokkos::Sycl or elsewhere.
* KokkosGraph::COLORING_SERIAL Serial Greedy Coloring
* KokkosGraph::COLORING_VB Vertex Based Coloring
* KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array
Expand Down Expand Up @@ -754,7 +759,10 @@ class KokkosKernelsHandle {
* @param hint_verts_per_cluster Hint how many verticies to use per cluster
* @param coloring_algorithm Specifies which coloring algorithm to color the graph with:
*
* KokkosGraph::COLORING_DEFAULT ??
* KokkosGraph::COLORING_DEFAULT Depends on execution space:
* COLORING_SERIAL on Kokkos::Serial;
* COLORING_EB on GPUs;
* COLORING_VBBIT on Kokkos::Sycl or elsewhere.
* KokkosGraph::COLORING_SERIAL Serial Greedy Coloring
* KokkosGraph::COLORING_VB Vertex Based Coloring
* KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array
Expand Down
Loading

0 comments on commit 82a2050

Please sign in to comment.