From 54faa2cb9db40851ee0164fd3472219444036606 Mon Sep 17 00:00:00 2001
From: peekxc <matt.piekenbrock@gmail.com>
Date: Thu, 21 Dec 2023 10:10:31 -0500
Subject: [PATCH] header file cleaning

---
 include/_lanczos/lanczos.h                   | 137 ++-----------------
 include/_orthogonalize/orthogonalize.h       |   7 +-
 include/_random_generator/vector_generator.h |   5 +-
 include/_trace/hutch.h                       | 116 +++++++++++++++-
 src/primate/_lanczos.cpp                     |  14 +-
 src/primate/_trace.cpp                       |   4 +-
 tests/test_lanczos.py                        |   8 +-
 7 files changed, 150 insertions(+), 141 deletions(-)
diff --git a/include/_lanczos/lanczos.h b/include/_lanczos/lanczos.h
index 8e7d74e..c6e72e6 100644
--- a/include/_lanczos/lanczos.h
+++ b/include/_lanczos/lanczos.h
@@ -4,14 +4,14 @@
 #include <concepts> 
 #include <functional> // function
 #include <algorithm>  // max
-#include "omp_support.h" // conditionally enables openmp pragmas
 #include "_operators/linear_operator.h" // LinearOperator
 #include "_orthogonalize/orthogonalize.h"   // orth_vector, mod
-#include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array
+// #include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array
+
+#include "omp_support.h" // conditionally enables openmp pragmas
 #include "eigen_core.h"
-#include <Eigen/Eigenvalues> 
-#include "_trace/hutch.h"
-#include <iostream>
+#include <Eigen/Eigenvalues>
+// #include <iostream>
 
 using std::function; 
 
@@ -38,16 +38,16 @@ void lanczos_recurrence(
   const size_t m = A_shape.second;
   const F residual_tol = std::sqrt(n) * lanczos_rtol;
 
-  // Allocation / views
-  Eigen::Map< DenseMatrix< F > > Q(V, n, ncv);  // Lanczos vectors 
-  Eigen::Map< VectorF > v(q, m, 1);          // map initial vector (no-op)
-  Q.col(0) = v.normalized();                    // load normalized v0 into Q  
+  // Setup views
+  Eigen::Map< DenseMatrix< F > > Q(V, n, ncv);                // Lanczos vectors 
+  Eigen::Map< VectorF > v(q, m, 1);                           // map initial vector (no-op)
+  const auto Q_ref = Eigen::Ref< const DenseMatrix< F > >(Q); // const view 
+
+  // Setup for first iteration
+  std::array< int, 3 > pos = { int(ncv) - 1, 0, 1 };          // Indices for the recurrence
+  Q.col(pos[0]) = static_cast< VectorF >(VectorF::Zero(n));   // Ensure previous is 0
+  Q.col(0) = v.normalized();                                  // load normalized v0 into Q  
 
-  // In the following, beta[j] means beta[j-1] in the Demmel text
-  const auto Q_ref = Eigen::Ref< const DenseMatrix< F > >(Q); 
-  std::array< int, 3 > pos = { static_cast< int >(ncv - 1), 0, 1 };
-  Q.col(pos[0]) = static_cast< VectorF >(VectorF::Zero(n)); // Ensure previous is 0
-    
   for (int j = 0; j < k; ++j) {
 
     // Apply the three-term recurrence
@@ -111,113 +111,6 @@ auto lanczos_quadrature(
   std::copy(tau.begin(), tau.end(), weights);
 };
 
-// Stochastic Lanczos quadrature method
-// std::function<F(int,F*,F*)>
-template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda >
-void slq (
-  const Matrix& A,                            // Any linear operator supporting .matvec() and .shape() methods
-  const Lambda& f,                            // Thread-safe function with signature f(int i, F* nodes, F* weights)
-  const std::function< bool(int) >& stop_check, // Function to check for convergence or early-stopping (takes no arguments)
-  const int nv,                               // Number of sample vectors to generate
-  const Distribution dist,                    // Isotropic distribution used to generate random vectors
-  RBG& rng,                                   // Random bit generator
-  const int lanczos_degree,                   // Polynomial degree of the Krylov expansion
-  const F lanczos_rtol,                       // residual tolerance to consider subspace A-invariant
-  const int orth,                             // Number of vectors to re-orthogonalize against <= lanczos_degree
-  const int ncv,                              // Number of Lanczos vectors to keep in memory (per-thread)
-  const int num_threads,                      // Number of threads used to parallelize the computation   
-  const int seed                              // Seed for random number generator for determinism
-){   
-  using ArrayF = Eigen::Array< F, Dynamic, 1 >;
-  using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; 
-  if (ncv < 2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2."); }
-  if (ncv < orth+2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2+orth."); }
-
-  // Constants
-  const auto A_shape = A.shape();
-  const size_t n = A_shape.first;
-  const size_t m = A_shape.second;
-
-  // Set the number of threads + initialize multi-threaded RNG
-  const auto nt = num_threads <= 0 ? omp_get_max_threads() : num_threads;
-  omp_set_num_threads(nt);
-  rng.initialize(nt, seed);
-
-  // Using square-root of max possible chunk size for parallel schedules
-  unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1);
-  
-  // Monte-Carlo ensemble sampling
-  int i;
-  volatile bool stop_flag = false; // early-stop flag for convergence checking
-  #pragma omp parallel shared(stop_flag)
-  {
-    int tid = omp_get_thread_num(); // thread-id 
-
-    // Pre-allocate memory needed for Lanczos iterations
-    auto q_norm = static_cast< F >(0.0);
-    auto q = static_cast< ArrayF >(ArrayF::Zero(m)); 
-    auto Q = static_cast< DenseMatrix< F > >(DenseMatrix< F >::Zero(n, ncv));
-    auto alpha = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1));
-    auto beta = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1));
-    auto nodes = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree));
-    auto weights = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree));
-    auto solver = EigenSolver(lanczos_degree);
-    
-    // Run in parallel 
-    #pragma omp for schedule(dynamic, chunk_size)
-    for (i = 0; i < nv; ++i){
-      if (stop_flag){ continue; }
-
-      // Generate isotropic vector (w/ unit norm)
-      generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm);
-      
-      // Perform a lanczos iteration (populates alpha, beta)
-      lanczos_recurrence< F >(A, q.data(), lanczos_degree, lanczos_rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); 
-
-      // Obtain nodes + weights via quadrature algorithm
-      lanczos_quadrature< F >(alpha.data(), beta.data(), lanczos_degree, solver, nodes.data(), weights.data());
-
-      // Run the user-supplied function (parallel section!)
-      f(i, q.data(), Q.data(), nodes.data(), weights.data());
-      
-      // If supplied, check early-stopping condition
-      #pragma omp critical
-      {
-        stop_flag = stop_check(i);
-      }
-    }
-  }
-}
-
-template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > 
-void sl_quadrature(
-  const Matrix& mat, 
-  RBG& rbg, const int nv, const int dist, const int engine_id, const int seed,
-  const int lanczos_degree, const F lanczos_rtol, const int orth, const int ncv,
-  const int num_threads, 
-  F* quad_nw
-){
-  using VectorF = Eigen::Array< F, Dynamic, 1>;
-
-  // Parameterize the quadrature function
-  Eigen::Map< DenseMatrix< F >> quad_nw_map(quad_nw, lanczos_degree * nv, 2);
-  const auto quad_f = [lanczos_degree, &quad_nw_map](int i, [[maybe_unused]] F* q, [[maybe_unused]] F* Q, F* nodes, F* weights){
-    // printf("iter %0d, tid %0d, q[0] = %.4g, nodes[0] = %.4g\n", i, omp_get_thread_num(), q[0], nodes[0]);
-    Eigen::Map< VectorF > nodes_v(nodes, lanczos_degree, 1);        // no-op
-    Eigen::Map< VectorF > weights_v(weights, lanczos_degree, 1);    // no-op
-    quad_nw_map.block(i*lanczos_degree, 0, lanczos_degree, 1) = nodes_v;
-    quad_nw_map.block(i*lanczos_degree, 1, lanczos_degree, 1) = weights_v;
-  };
-
-  // Parameterize when to stop (run in critical section)
-  constexpr auto early_stop = [](int i) -> bool {
-    return false; 
-  };
-
-  // Execute the stochastic Lanczos quadrature
-  slq< F >(mat, quad_f, early_stop, nv, static_cast< Distribution >(dist), rbg, lanczos_degree, lanczos_rtol, orth, ncv, num_threads, seed);
-}
-
 template< std::floating_point F, LinearOperator Matrix > 
 struct MatrixFunction {
   // static const bool owning = false;
@@ -275,7 +168,7 @@ struct MatrixFunction {
     // Lanczos iteration: save v norm 
     const F v_scale = v_map.norm(); 
     VectorF v_copy = v_map; // save copy
-    std::cout << v_copy << std::endl;
+    // std::cout << v_copy << std::endl;
     lanczos_recurrence< F >(op, v_copy.data(), deg, rtol, orth, alpha.data(), beta.data(), Q.data(), deg); 
     
     // Note: Maps are used here to offset the pointers; they should be no-ops anyways
diff --git a/include/_orthogonalize/orthogonalize.h b/include/_orthogonalize/orthogonalize.h
index 91e1a2b..f3d9038 100644
--- a/include/_orthogonalize/orthogonalize.h
+++ b/include/_orthogonalize/orthogonalize.h
@@ -34,9 +34,10 @@ void orth_vector(
   const int diff = reverse ? -1 : 1; 
   for (int i = mod(start_idx, m), c = 0; c < p; ++c, i = mod(i + diff, m)){
     const auto u_norm = U.col(i).squaredNorm();      // norm of u_i
-    const auto proj_len = std::abs(v.dot(U.col(i))); // < v, u_i > 
-    if (u_norm > tol && proj_len > tol){ // u_norm > tol should protect against nan vectors
-      v -= (proj_len / u_norm) * U.col(i);
+    const auto s_proj = v.dot(U.col(i)); // < v, u_i > 
+    // should protect against nan and 0 vectors, even is isnan(u_norm) is true
+    if (u_norm > tol && std::abs(s_proj) > tol){ 
+      v -= (s_proj / u_norm) * U.col(i);
     }
   }
 }
diff --git a/include/_random_generator/vector_generator.h b/include/_random_generator/vector_generator.h
index 1cc57b3..7bb2ef2 100644
--- a/include/_random_generator/vector_generator.h
+++ b/include/_random_generator/vector_generator.h
@@ -6,8 +6,9 @@
 #include <cmath> // isnan
 #include <bitset>   // bitset
 #include "omp_support.h" // conditionally enables openmp pragmas
-#include "threadedrng64.h"	// ThreadSafeRBG
-#include "rne_engines.h"		// all the engines
+#include "random_concepts.h" // ThreadSafeRBG
+// #include "threadedrng64.h"	
+// #include "rne_engines.h"		// all the engines
 
 using IndexType = unsigned int;
 using LongIndexType = unsigned long;
diff --git a/include/_trace/hutch.h b/include/_trace/hutch.h
index 79ced90..a6a8f71 100644
--- a/include/_trace/hutch.h
+++ b/include/_trace/hutch.h
@@ -6,8 +6,8 @@
 #include <algorithm>  // max
 #include <cmath> // constants, isnan
 #include "omp_support.h" // conditionally enables openmp pragmas
-#include "_operators/linear_operator.h" // LinearOperator
-#include "_orthogonalize/orthogonalize.h"   // orth_vector, mod
+// #include "_operators/linear_operator.h" // LinearOperator
+// #include "_orthogonalize/orthogonalize.h"   // orth_vector, mod
 #include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array, Distribution
 #include "_lanczos/lanczos.h"
 #include "eigen_core.h"
@@ -86,8 +86,8 @@ void monte_carlo_quad(
     auto q = static_cast< VectorF >(VectorF::Zero(m));
     
     // Parameterize the matrix function
-    // TODO: this is a somewhat poorly designed way to get around copying matrices / matrix functions, 
-    // as the former is read-only and can be shared amongst threads, but the latter needs thread-specific storage
+    // TODO: this is not a great way to avoid copying matrices / matrix functions, but is needed as 
+    // the former is read-only and can be shared amongst threads, but the latter needs thread-specific storage
     const auto M = get_matrix(A); 
 
     #pragma omp for schedule(dynamic, chunk_size)
@@ -203,4 +203,112 @@ auto hutch(
   // Don't pull down monte_carlo_quad; the early-stop defs need to be local scope for some reason!
 }
 
+
+// Stochastic Lanczos quadrature method
+// std::function<F(int,F*,F*)>
+template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda >
+void slq (
+  const Matrix& A,                            // Any linear operator supporting .matvec() and .shape() methods
+  const Lambda& f,                            // Thread-safe function with signature f(int i, F* nodes, F* weights)
+  const std::function< bool(int) >& stop_check, // Function to check for convergence or early-stopping (takes no arguments)
+  const int nv,                               // Number of sample vectors to generate
+  const Distribution dist,                    // Isotropic distribution used to generate random vectors
+  RBG& rng,                                   // Random bit generator
+  const int lanczos_degree,                   // Polynomial degree of the Krylov expansion
+  const F lanczos_rtol,                       // residual tolerance to consider subspace A-invariant
+  const int orth,                             // Number of vectors to re-orthogonalize against <= lanczos_degree
+  const int ncv,                              // Number of Lanczos vectors to keep in memory (per-thread)
+  const int num_threads,                      // Number of threads used to parallelize the computation   
+  const int seed                              // Seed for random number generator for determinism
+){   
+  using ArrayF = Eigen::Array< F, Dynamic, 1 >;
+  using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; 
+  if (ncv < 2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2."); }
+  if (ncv < orth+2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2+orth."); }
+
+  // Constants
+  const auto A_shape = A.shape();
+  const size_t n = A_shape.first;
+  const size_t m = A_shape.second;
+
+  // Set the number of threads + initialize multi-threaded RNG
+  const auto nt = num_threads <= 0 ? omp_get_max_threads() : num_threads;
+  omp_set_num_threads(nt);
+  rng.initialize(nt, seed);
+
+  // Using square-root of max possible chunk size for parallel schedules
+  unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1);
+  
+  // Monte-Carlo ensemble sampling
+  int i;
+  volatile bool stop_flag = false; // early-stop flag for convergence checking
+  #pragma omp parallel shared(stop_flag)
+  {
+    int tid = omp_get_thread_num(); // thread-id 
+
+    // Pre-allocate memory needed for Lanczos iterations
+    auto q_norm = static_cast< F >(0.0);
+    auto q = static_cast< ArrayF >(ArrayF::Zero(m)); 
+    auto Q = static_cast< DenseMatrix< F > >(DenseMatrix< F >::Zero(n, ncv));
+    auto alpha = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1));
+    auto beta = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1));
+    auto nodes = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree));
+    auto weights = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree));
+    auto solver = EigenSolver(lanczos_degree);
+    
+    // Run in parallel 
+    #pragma omp for schedule(dynamic, chunk_size)
+    for (i = 0; i < nv; ++i){
+      if (stop_flag){ continue; }
+
+      // Generate isotropic vector (w/ unit norm)
+      generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm);
+      
+      // Perform a lanczos iteration (populates alpha, beta)
+      lanczos_recurrence< F >(A, q.data(), lanczos_degree, lanczos_rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); 
+
+      // Obtain nodes + weights via quadrature algorithm
+      lanczos_quadrature< F >(alpha.data(), beta.data(), lanczos_degree, solver, nodes.data(), weights.data());
+
+      // Run the user-supplied function (parallel section!)
+      f(i, q.data(), Q.data(), nodes.data(), weights.data());
+      
+      // If supplied, check early-stopping condition
+      #pragma omp critical
+      {
+        stop_flag = stop_check(i);
+      }
+    }
+  }
+}
+
+template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > 
+void sl_quadrature(
+  const Matrix& mat, 
+  RBG& rbg, const int nv, const int dist, const int engine_id, const int seed,
+  const int lanczos_degree, const F lanczos_rtol, const int orth, const int ncv,
+  const int num_threads, 
+  F* quad_nw
+){
+  using VectorF = Eigen::Array< F, Dynamic, 1>;
+
+  // Parameterize the quadrature function
+  Eigen::Map< DenseMatrix< F >> quad_nw_map(quad_nw, lanczos_degree * nv, 2);
+  const auto quad_f = [lanczos_degree, &quad_nw_map](int i, [[maybe_unused]] F* q, [[maybe_unused]] F* Q, F* nodes, F* weights){
+    // printf("iter %0d, tid %0d, q[0] = %.4g, nodes[0] = %.4g\n", i, omp_get_thread_num(), q[0], nodes[0]);
+    Eigen::Map< VectorF > nodes_v(nodes, lanczos_degree, 1);        // no-op
+    Eigen::Map< VectorF > weights_v(weights, lanczos_degree, 1);    // no-op
+    quad_nw_map.block(i*lanczos_degree, 0, lanczos_degree, 1) = nodes_v;
+    quad_nw_map.block(i*lanczos_degree, 1, lanczos_degree, 1) = weights_v;
+  };
+
+  // Parameterize when to stop (run in critical section)
+  constexpr auto early_stop = [](int i) -> bool {
+    return false; 
+  };
+
+  // Execute the stochastic Lanczos quadrature
+  slq< F >(mat, quad_f, early_stop, nv, static_cast< Distribution >(dist), rbg, lanczos_degree, lanczos_rtol, orth, ncv, num_threads, seed);
+}
+
 #endif
\ No newline at end of file
diff --git a/src/primate/_lanczos.cpp b/src/primate/_lanczos.cpp
index 4fb2aa6..cb0c9fd 100644
--- a/src/primate/_lanczos.cpp
+++ b/src/primate/_lanczos.cpp
@@ -1,18 +1,22 @@
+#include <type_traits> // result_of
+#include <cmath> // constants
+#include <iostream>
+#include <stdio.h>
+#include <functional>
+
 #include <pybind11/pybind11.h>
 #include <pybind11/eigen.h>
 #include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 #include <pybind11/functional.h>
+
 #include "_random_generator/vector_generator.h"
+#include "_random_generator/threadedrng64.h"
 #include "_lanczos/lanczos.h"
+#include "_trace/hutch.h"
 #include "eigen_operators.h"
 #include "pylinop.h"
 #include "spectral_functions.h"
-#include <type_traits> // result_of
-#include <cmath> // constants
-#include <iostream>
-#include <stdio.h>
-#include <functional>
 
 namespace py = pybind11;
 
diff --git a/src/primate/_trace.cpp b/src/primate/_trace.cpp
index 1761d9e..05d14ac 100644
--- a/src/primate/_trace.cpp
+++ b/src/primate/_trace.cpp
@@ -1,10 +1,12 @@
+#include <pybind11/pybind11.h>
+
 #include "_lanczos/lanczos.h"
 #include "_trace/hutch.h"
+#include "_random_generator/threadedrng64.h"
 #include "eigen_operators.h"    // Eigen wrappers
 #include "pylinop.h"            // py::object LinearOperator wrapper
 #include "spectral_functions.h" // to parameterize the functions
 
-#include <pybind11/pybind11.h>
 namespace py = pybind11;
 
 // NOTE: all matrices should be cast to Fortran ordering for compatibility with Eigen
diff --git a/tests/test_lanczos.py b/tests/test_lanczos.py
index b139249..2726d6a 100644
--- a/tests/test_lanczos.py
+++ b/tests/test_lanczos.py
@@ -36,8 +36,8 @@ def test_accuracy():
   from primate.diagonalize import lanczos
   np.random.seed(1234)
   n = 30
-  A = symmetric(n)
-  alpha, beta = np.zeros(n, dtype=np.float32), np.zeros(n, dtype=np.float32)
+  A = symmetric(n).astype(np.float64)
+  alpha, beta = np.zeros(n, dtype=np.float64), np.zeros(n, dtype=np.float64)
   
   ## In general not guaranteed, but with full re-orthogonalization it's likely (and deterministic w/ fixed seed)
   tol = np.zeros(30)
@@ -46,8 +46,8 @@ def test_accuracy():
     alpha, beta = lanczos(A, v0=v0, rtol=1e-8, orth=n-1)
     ew_true = np.sort(eigsh(A, k=n-1, return_eigenvectors=False))
     ew_test = np.sort(eigh_tridiagonal(alpha, beta, eigvals_only=True))
-    tol[i] = np.mean(np.abs(ew_test[1:] - ew_true))
-  assert np.all(tol < 1e-5)
+    tol[i] = np.max(np.abs(ew_test[1:] - ew_true))
+  assert np.all(tol < 1e-12)
 
 def test_high_degree():
   from primate.diagonalize import lanczos