From 54faa2cb9db40851ee0164fd3472219444036606 Mon Sep 17 00:00:00 2001 From: peekxc Date: Thu, 21 Dec 2023 10:10:31 -0500 Subject: [PATCH] header file cleaning --- include/_lanczos/lanczos.h | 137 ++----------------- include/_orthogonalize/orthogonalize.h | 7 +- include/_random_generator/vector_generator.h | 5 +- include/_trace/hutch.h | 116 +++++++++++++++- src/primate/_lanczos.cpp | 14 +- src/primate/_trace.cpp | 4 +- tests/test_lanczos.py | 8 +- 7 files changed, 150 insertions(+), 141 deletions(-) diff --git a/include/_lanczos/lanczos.h b/include/_lanczos/lanczos.h index 8e7d74e..c6e72e6 100644 --- a/include/_lanczos/lanczos.h +++ b/include/_lanczos/lanczos.h @@ -4,14 +4,14 @@ #include #include // function #include // max -#include "omp_support.h" // conditionally enables openmp pragmas #include "_operators/linear_operator.h" // LinearOperator #include "_orthogonalize/orthogonalize.h" // orth_vector, mod -#include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array +// #include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array + +#include "omp_support.h" // conditionally enables openmp pragmas #include "eigen_core.h" -#include -#include "_trace/hutch.h" -#include +#include +// #include using std::function; @@ -38,16 +38,16 @@ void lanczos_recurrence( const size_t m = A_shape.second; const F residual_tol = std::sqrt(n) * lanczos_rtol; - // Allocation / views - Eigen::Map< DenseMatrix< F > > Q(V, n, ncv); // Lanczos vectors - Eigen::Map< VectorF > v(q, m, 1); // map initial vector (no-op) - Q.col(0) = v.normalized(); // load normalized v0 into Q + // Setup views + Eigen::Map< DenseMatrix< F > > Q(V, n, ncv); // Lanczos vectors + Eigen::Map< VectorF > v(q, m, 1); // map initial vector (no-op) + const auto Q_ref = Eigen::Ref< const DenseMatrix< F > >(Q); // const view + + // Setup for first iteration + std::array< int, 3 > pos = { int(ncv) - 1, 0, 1 }; // Indices for the recurrence + Q.col(pos[0]) = static_cast< VectorF >(VectorF::Zero(n)); // Ensure previous is 0 + Q.col(0) = v.normalized(); // load normalized v0 into Q - // In the following, beta[j] means beta[j-1] in the Demmel text - const auto Q_ref = Eigen::Ref< const DenseMatrix< F > >(Q); - std::array< int, 3 > pos = { static_cast< int >(ncv - 1), 0, 1 }; - Q.col(pos[0]) = static_cast< VectorF >(VectorF::Zero(n)); // Ensure previous is 0 - for (int j = 0; j < k; ++j) { // Apply the three-term recurrence @@ -111,113 +111,6 @@ auto lanczos_quadrature( std::copy(tau.begin(), tau.end(), weights); }; -// Stochastic Lanczos quadrature method -// std::function -template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda > -void slq ( - const Matrix& A, // Any linear operator supporting .matvec() and .shape() methods - const Lambda& f, // Thread-safe function with signature f(int i, F* nodes, F* weights) - const std::function< bool(int) >& stop_check, // Function to check for convergence or early-stopping (takes no arguments) - const int nv, // Number of sample vectors to generate - const Distribution dist, // Isotropic distribution used to generate random vectors - RBG& rng, // Random bit generator - const int lanczos_degree, // Polynomial degree of the Krylov expansion - const F lanczos_rtol, // residual tolerance to consider subspace A-invariant - const int orth, // Number of vectors to re-orthogonalize against <= lanczos_degree - const int ncv, // Number of Lanczos vectors to keep in memory (per-thread) - const int num_threads, // Number of threads used to parallelize the computation - const int seed // Seed for random number generator for determinism -){ - using ArrayF = Eigen::Array< F, Dynamic, 1 >; - using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; - if (ncv < 2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2."); } - if (ncv < orth+2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2+orth."); } - - // Constants - const auto A_shape = A.shape(); - const size_t n = A_shape.first; - const size_t m = A_shape.second; - - // Set the number of threads + initialize multi-threaded RNG - const auto nt = num_threads <= 0 ? omp_get_max_threads() : num_threads; - omp_set_num_threads(nt); - rng.initialize(nt, seed); - - // Using square-root of max possible chunk size for parallel schedules - unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1); - - // Monte-Carlo ensemble sampling - int i; - volatile bool stop_flag = false; // early-stop flag for convergence checking - #pragma omp parallel shared(stop_flag) - { - int tid = omp_get_thread_num(); // thread-id - - // Pre-allocate memory needed for Lanczos iterations - auto q_norm = static_cast< F >(0.0); - auto q = static_cast< ArrayF >(ArrayF::Zero(m)); - auto Q = static_cast< DenseMatrix< F > >(DenseMatrix< F >::Zero(n, ncv)); - auto alpha = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); - auto beta = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); - auto nodes = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); - auto weights = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); - auto solver = EigenSolver(lanczos_degree); - - // Run in parallel - #pragma omp for schedule(dynamic, chunk_size) - for (i = 0; i < nv; ++i){ - if (stop_flag){ continue; } - - // Generate isotropic vector (w/ unit norm) - generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm); - - // Perform a lanczos iteration (populates alpha, beta) - lanczos_recurrence< F >(A, q.data(), lanczos_degree, lanczos_rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); - - // Obtain nodes + weights via quadrature algorithm - lanczos_quadrature< F >(alpha.data(), beta.data(), lanczos_degree, solver, nodes.data(), weights.data()); - - // Run the user-supplied function (parallel section!) - f(i, q.data(), Q.data(), nodes.data(), weights.data()); - - // If supplied, check early-stopping condition - #pragma omp critical - { - stop_flag = stop_check(i); - } - } - } -} - -template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > -void sl_quadrature( - const Matrix& mat, - RBG& rbg, const int nv, const int dist, const int engine_id, const int seed, - const int lanczos_degree, const F lanczos_rtol, const int orth, const int ncv, - const int num_threads, - F* quad_nw -){ - using VectorF = Eigen::Array< F, Dynamic, 1>; - - // Parameterize the quadrature function - Eigen::Map< DenseMatrix< F >> quad_nw_map(quad_nw, lanczos_degree * nv, 2); - const auto quad_f = [lanczos_degree, &quad_nw_map](int i, [[maybe_unused]] F* q, [[maybe_unused]] F* Q, F* nodes, F* weights){ - // printf("iter %0d, tid %0d, q[0] = %.4g, nodes[0] = %.4g\n", i, omp_get_thread_num(), q[0], nodes[0]); - Eigen::Map< VectorF > nodes_v(nodes, lanczos_degree, 1); // no-op - Eigen::Map< VectorF > weights_v(weights, lanczos_degree, 1); // no-op - quad_nw_map.block(i*lanczos_degree, 0, lanczos_degree, 1) = nodes_v; - quad_nw_map.block(i*lanczos_degree, 1, lanczos_degree, 1) = weights_v; - }; - - // Parameterize when to stop (run in critical section) - constexpr auto early_stop = [](int i) -> bool { - return false; - }; - - // Execute the stochastic Lanczos quadrature - slq< F >(mat, quad_f, early_stop, nv, static_cast< Distribution >(dist), rbg, lanczos_degree, lanczos_rtol, orth, ncv, num_threads, seed); -} - template< std::floating_point F, LinearOperator Matrix > struct MatrixFunction { // static const bool owning = false; @@ -275,7 +168,7 @@ struct MatrixFunction { // Lanczos iteration: save v norm const F v_scale = v_map.norm(); VectorF v_copy = v_map; // save copy - std::cout << v_copy << std::endl; + // std::cout << v_copy << std::endl; lanczos_recurrence< F >(op, v_copy.data(), deg, rtol, orth, alpha.data(), beta.data(), Q.data(), deg); // Note: Maps are used here to offset the pointers; they should be no-ops anyways diff --git a/include/_orthogonalize/orthogonalize.h b/include/_orthogonalize/orthogonalize.h index 91e1a2b..f3d9038 100644 --- a/include/_orthogonalize/orthogonalize.h +++ b/include/_orthogonalize/orthogonalize.h @@ -34,9 +34,10 @@ void orth_vector( const int diff = reverse ? -1 : 1; for (int i = mod(start_idx, m), c = 0; c < p; ++c, i = mod(i + diff, m)){ const auto u_norm = U.col(i).squaredNorm(); // norm of u_i - const auto proj_len = std::abs(v.dot(U.col(i))); // < v, u_i > - if (u_norm > tol && proj_len > tol){ // u_norm > tol should protect against nan vectors - v -= (proj_len / u_norm) * U.col(i); + const auto s_proj = v.dot(U.col(i)); // < v, u_i > + // should protect against nan and 0 vectors, even is isnan(u_norm) is true + if (u_norm > tol && std::abs(s_proj) > tol){ + v -= (s_proj / u_norm) * U.col(i); } } } diff --git a/include/_random_generator/vector_generator.h b/include/_random_generator/vector_generator.h index 1cc57b3..7bb2ef2 100644 --- a/include/_random_generator/vector_generator.h +++ b/include/_random_generator/vector_generator.h @@ -6,8 +6,9 @@ #include // isnan #include // bitset #include "omp_support.h" // conditionally enables openmp pragmas -#include "threadedrng64.h" // ThreadSafeRBG -#include "rne_engines.h" // all the engines +#include "random_concepts.h" // ThreadSafeRBG +// #include "threadedrng64.h" +// #include "rne_engines.h" // all the engines using IndexType = unsigned int; using LongIndexType = unsigned long; diff --git a/include/_trace/hutch.h b/include/_trace/hutch.h index 79ced90..a6a8f71 100644 --- a/include/_trace/hutch.h +++ b/include/_trace/hutch.h @@ -6,8 +6,8 @@ #include // max #include // constants, isnan #include "omp_support.h" // conditionally enables openmp pragmas -#include "_operators/linear_operator.h" // LinearOperator -#include "_orthogonalize/orthogonalize.h" // orth_vector, mod +// #include "_operators/linear_operator.h" // LinearOperator +// #include "_orthogonalize/orthogonalize.h" // orth_vector, mod #include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array, Distribution #include "_lanczos/lanczos.h" #include "eigen_core.h" @@ -86,8 +86,8 @@ void monte_carlo_quad( auto q = static_cast< VectorF >(VectorF::Zero(m)); // Parameterize the matrix function - // TODO: this is a somewhat poorly designed way to get around copying matrices / matrix functions, - // as the former is read-only and can be shared amongst threads, but the latter needs thread-specific storage + // TODO: this is not a great way to avoid copying matrices / matrix functions, but is needed as + // the former is read-only and can be shared amongst threads, but the latter needs thread-specific storage const auto M = get_matrix(A); #pragma omp for schedule(dynamic, chunk_size) @@ -203,4 +203,112 @@ auto hutch( // Don't pull down monte_carlo_quad; the early-stop defs need to be local scope for some reason! } + +// Stochastic Lanczos quadrature method +// std::function +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda > +void slq ( + const Matrix& A, // Any linear operator supporting .matvec() and .shape() methods + const Lambda& f, // Thread-safe function with signature f(int i, F* nodes, F* weights) + const std::function< bool(int) >& stop_check, // Function to check for convergence or early-stopping (takes no arguments) + const int nv, // Number of sample vectors to generate + const Distribution dist, // Isotropic distribution used to generate random vectors + RBG& rng, // Random bit generator + const int lanczos_degree, // Polynomial degree of the Krylov expansion + const F lanczos_rtol, // residual tolerance to consider subspace A-invariant + const int orth, // Number of vectors to re-orthogonalize against <= lanczos_degree + const int ncv, // Number of Lanczos vectors to keep in memory (per-thread) + const int num_threads, // Number of threads used to parallelize the computation + const int seed // Seed for random number generator for determinism +){ + using ArrayF = Eigen::Array< F, Dynamic, 1 >; + using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; + if (ncv < 2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2."); } + if (ncv < orth+2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2+orth."); } + + // Constants + const auto A_shape = A.shape(); + const size_t n = A_shape.first; + const size_t m = A_shape.second; + + // Set the number of threads + initialize multi-threaded RNG + const auto nt = num_threads <= 0 ? omp_get_max_threads() : num_threads; + omp_set_num_threads(nt); + rng.initialize(nt, seed); + + // Using square-root of max possible chunk size for parallel schedules + unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1); + + // Monte-Carlo ensemble sampling + int i; + volatile bool stop_flag = false; // early-stop flag for convergence checking + #pragma omp parallel shared(stop_flag) + { + int tid = omp_get_thread_num(); // thread-id + + // Pre-allocate memory needed for Lanczos iterations + auto q_norm = static_cast< F >(0.0); + auto q = static_cast< ArrayF >(ArrayF::Zero(m)); + auto Q = static_cast< DenseMatrix< F > >(DenseMatrix< F >::Zero(n, ncv)); + auto alpha = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); + auto beta = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); + auto nodes = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); + auto weights = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); + auto solver = EigenSolver(lanczos_degree); + + // Run in parallel + #pragma omp for schedule(dynamic, chunk_size) + for (i = 0; i < nv; ++i){ + if (stop_flag){ continue; } + + // Generate isotropic vector (w/ unit norm) + generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm); + + // Perform a lanczos iteration (populates alpha, beta) + lanczos_recurrence< F >(A, q.data(), lanczos_degree, lanczos_rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); + + // Obtain nodes + weights via quadrature algorithm + lanczos_quadrature< F >(alpha.data(), beta.data(), lanczos_degree, solver, nodes.data(), weights.data()); + + // Run the user-supplied function (parallel section!) + f(i, q.data(), Q.data(), nodes.data(), weights.data()); + + // If supplied, check early-stopping condition + #pragma omp critical + { + stop_flag = stop_check(i); + } + } + } +} + +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > +void sl_quadrature( + const Matrix& mat, + RBG& rbg, const int nv, const int dist, const int engine_id, const int seed, + const int lanczos_degree, const F lanczos_rtol, const int orth, const int ncv, + const int num_threads, + F* quad_nw +){ + using VectorF = Eigen::Array< F, Dynamic, 1>; + + // Parameterize the quadrature function + Eigen::Map< DenseMatrix< F >> quad_nw_map(quad_nw, lanczos_degree * nv, 2); + const auto quad_f = [lanczos_degree, &quad_nw_map](int i, [[maybe_unused]] F* q, [[maybe_unused]] F* Q, F* nodes, F* weights){ + // printf("iter %0d, tid %0d, q[0] = %.4g, nodes[0] = %.4g\n", i, omp_get_thread_num(), q[0], nodes[0]); + Eigen::Map< VectorF > nodes_v(nodes, lanczos_degree, 1); // no-op + Eigen::Map< VectorF > weights_v(weights, lanczos_degree, 1); // no-op + quad_nw_map.block(i*lanczos_degree, 0, lanczos_degree, 1) = nodes_v; + quad_nw_map.block(i*lanczos_degree, 1, lanczos_degree, 1) = weights_v; + }; + + // Parameterize when to stop (run in critical section) + constexpr auto early_stop = [](int i) -> bool { + return false; + }; + + // Execute the stochastic Lanczos quadrature + slq< F >(mat, quad_f, early_stop, nv, static_cast< Distribution >(dist), rbg, lanczos_degree, lanczos_rtol, orth, ncv, num_threads, seed); +} + #endif \ No newline at end of file diff --git a/src/primate/_lanczos.cpp b/src/primate/_lanczos.cpp index 4fb2aa6..cb0c9fd 100644 --- a/src/primate/_lanczos.cpp +++ b/src/primate/_lanczos.cpp @@ -1,18 +1,22 @@ +#include // result_of +#include // constants +#include +#include +#include + #include #include #include #include #include + #include "_random_generator/vector_generator.h" +#include "_random_generator/threadedrng64.h" #include "_lanczos/lanczos.h" +#include "_trace/hutch.h" #include "eigen_operators.h" #include "pylinop.h" #include "spectral_functions.h" -#include // result_of -#include // constants -#include -#include -#include namespace py = pybind11; diff --git a/src/primate/_trace.cpp b/src/primate/_trace.cpp index 1761d9e..05d14ac 100644 --- a/src/primate/_trace.cpp +++ b/src/primate/_trace.cpp @@ -1,10 +1,12 @@ +#include + #include "_lanczos/lanczos.h" #include "_trace/hutch.h" +#include "_random_generator/threadedrng64.h" #include "eigen_operators.h" // Eigen wrappers #include "pylinop.h" // py::object LinearOperator wrapper #include "spectral_functions.h" // to parameterize the functions -#include namespace py = pybind11; // NOTE: all matrices should be cast to Fortran ordering for compatibility with Eigen diff --git a/tests/test_lanczos.py b/tests/test_lanczos.py index b139249..2726d6a 100644 --- a/tests/test_lanczos.py +++ b/tests/test_lanczos.py @@ -36,8 +36,8 @@ def test_accuracy(): from primate.diagonalize import lanczos np.random.seed(1234) n = 30 - A = symmetric(n) - alpha, beta = np.zeros(n, dtype=np.float32), np.zeros(n, dtype=np.float32) + A = symmetric(n).astype(np.float64) + alpha, beta = np.zeros(n, dtype=np.float64), np.zeros(n, dtype=np.float64) ## In general not guaranteed, but with full re-orthogonalization it's likely (and deterministic w/ fixed seed) tol = np.zeros(30) @@ -46,8 +46,8 @@ def test_accuracy(): alpha, beta = lanczos(A, v0=v0, rtol=1e-8, orth=n-1) ew_true = np.sort(eigsh(A, k=n-1, return_eigenvectors=False)) ew_test = np.sort(eigh_tridiagonal(alpha, beta, eigvals_only=True)) - tol[i] = np.mean(np.abs(ew_test[1:] - ew_true)) - assert np.all(tol < 1e-5) + tol[i] = np.max(np.abs(ew_test[1:] - ew_true)) + assert np.all(tol < 1e-12) def test_high_degree(): from primate.diagonalize import lanczos