From 9d63b8a7eb6d762c60402a8c24116698e56c41c8 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Sun, 31 Mar 2024 14:38:50 -0700 Subject: [PATCH 1/4] Framework for compile-time polymorphism. In the exascale_api, allow pixel values to be calculated either on large array (all pixels), or with low-memory on just the whitelist consisting of shoebox pixels. This commit only gives the polymorphism framework; both implementations are currently identical giving the large-array behavior. --- simtbx/__init__.py | 7 +- simtbx/kokkos/SConscript | 4 + simtbx/kokkos/detector.cpp | 163 --------- simtbx/kokkos/detector.h | 199 ++++++++++- simtbx/kokkos/kokkos_ext.cpp | 69 ++-- simtbx/kokkos/simulation.cpp | 512 -------------------------- simtbx/kokkos/simulation.h | 557 ++++++++++++++++++++++++++++- simtbx/kokkos/simulation_kernels.h | 3 + simtbx/tests/tst_memory_policy.py | 4 +- 9 files changed, 781 insertions(+), 737 deletions(-) diff --git a/simtbx/__init__.py b/simtbx/__init__.py index c88f23d368..c1855fd4e9 100644 --- a/simtbx/__init__.py +++ b/simtbx/__init__.py @@ -3,11 +3,14 @@ def get_exascale(interface, context): if context == "kokkos_gpu": - from simtbx.kokkos import gpu_instance, gpu_energy_channels, gpu_detector, exascale_api + from simtbx.kokkos import gpu_instance, gpu_energy_channels, gpu_detector, gpu_detector_small_whitelist + from simtbx.kokkos import exascale_api, exascale_api_small_whitelist elif context == "cuda": from simtbx.gpu import gpu_instance, gpu_energy_channels, gpu_detector, exascale_api else: raise NotImplementedError(context) return dict(gpu_instance = gpu_instance, gpu_energy_channels = gpu_energy_channels, - gpu_detector = gpu_detector, exascale_api = exascale_api)[interface] + gpu_detector = gpu_detector, exascale_api = exascale_api, + gpu_detector_small_whitelist = locals().get("gpu_detector_small_whitelist"), + exascale_api_small_whitelist = locals().get("exascale_api_small_whitelist"))[interface] diff --git a/simtbx/kokkos/SConscript 
b/simtbx/kokkos/SConscript index bc72ee122d..7ff0a63aaf 100644 --- a/simtbx/kokkos/SConscript +++ b/simtbx/kokkos/SConscript @@ -168,6 +168,10 @@ if not env_etc.no_boost_python: env_etc.include_registry.append( env=kokkos_ext_env, paths=env_etc.simtbx_common_includes + [env_etc.python_include]) + if True: # same construct as above, temporarily accommodate the eigen library + env_etc.include_registry.append( + env=kokkos_ext_env, + paths=[env_etc.eigen_include]) kokkos_ext_env.Replace(CXX=os.environ['CXX']) kokkos_ext_env.Replace(SHCXX=os.environ['CXX']) kokkos_ext_env.Replace(SHLINK=os.environ['CXX']) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index 0c310551ca..233b97c7ff 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -92,168 +92,5 @@ namespace simtbx { namespace Kokkos { } } - vector_double_t - kokkos_detector::construct_detail(dxtbx::model::Detector const & arg_detector) { - //1) confirm the size - SCITBX_ASSERT( m_panel_count == arg_detector.size() ); - SCITBX_ASSERT( m_panel_count >= 1 ); - - //2) confirm that array dimensions are similar for each size - for (int ipanel=1; ipanel < arg_detector.size(); ++ipanel){ - SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[1] == m_slow_dim_size ); - SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[0] == m_fast_dim_size ); - } - // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); - // printf(" m_slow_dim_size: %d\n", m_slow_dim_size); - // printf(" m_fast_dim_size: %d\n", m_fast_dim_size); - // printf(" m_panel_count: %d\n", m_panel_count); - - //3) allocate a cuda array with these dimensions - // separate accumulator image outside the usual nanoBragg data structure. - // 1. accumulate contributions from a sequence of source energy channels computed separately - // 2. 
represent multiple panels, all same rectangular shape; slowest dimension = n_panels - vector_double_t view_floatimage( "m_accumulate_floatimage", m_total_pixel_count ); - return view_floatimage; - }; - - kokkos_detector::kokkos_detector(int const& arg_device, - dxtbx::model::Detector const & arg_detector, - dxtbx::model::Beam const& arg_beam): - h_deviceID(arg_device), - metrology(arg_detector, arg_beam), - m_panel_count( arg_detector.size() ), - m_slow_dim_size( arg_detector[0].get_image_size()[1] ), - m_fast_dim_size( arg_detector[0].get_image_size()[0] ), - m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), - m_accumulate_floatimage( construct_detail(arg_detector) ) { } - // Easy mistake: not realizing that the dxtbx detector model stores (fast,slow) sizes - - kokkos_detector::kokkos_detector(int const& arg_device, - const simtbx::nanoBragg::nanoBragg& nB): - h_deviceID(arg_device), - metrology(nB), - m_panel_count(1), - m_slow_dim_size(nB.spixels), - m_fast_dim_size(nB.fpixels), - m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), - m_accumulate_floatimage( vector_double_t( "m_accumulate_floatimage", m_total_pixel_count) ) { } - - void - kokkos_detector::scale_in_place(const double& factor){ - auto local_accumulate_floatimage = m_accumulate_floatimage; - parallel_for("scale_in_place", range_policy(0,m_total_pixel_count), KOKKOS_LAMBDA (const int i) { - local_accumulate_floatimage( i ) = local_accumulate_floatimage( i ) * factor; - }); - } - - void - kokkos_detector::write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { - //only implement the monolithic detector case, one panel - SCITBX_ASSERT(nB.spixels == m_slow_dim_size); - SCITBX_ASSERT(nB.fpixels == m_fast_dim_size); - SCITBX_ASSERT(m_panel_count == 1); - // nB.raw_pixels = af::flex_double(af::flex_grid<>(nB.spixels,nB.fpixels)); - // do not reallocate CPU memory for the data write, as it is not needed - - kokkostbx::transfer_kokkos2flex(nB.raw_pixels, 
m_accumulate_floatimage); - // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); - // deep_copy(host_floatimage, m_accumulate_floatimage); - - // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); - - // double * double_floatimage = nB.raw_pixels.begin(); - // for (int i=0; i(m_panel_count,m_slow_dim_size,m_fast_dim_size), af::init_functor_null()); - kokkostbx::transfer_kokkos2flex(output_array, m_accumulate_floatimage); - - // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); - // deep_copy(host_floatimage, m_accumulate_floatimage); - - // for (int i=0; i active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); - - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - - void - kokkos_detector::each_image_allocate() { - resize(m_rangemap, m_total_pixel_count); - resize(m_omega_reduction, 
m_total_pixel_count); - resize(m_max_I_x_reduction, m_total_pixel_count); - resize(m_max_I_y_reduction, m_total_pixel_count); - - resize(m_maskimage, m_total_pixel_count); - resize(m_floatimage, m_total_pixel_count); - - kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); - kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); - kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); - kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); - kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); - kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); - kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); - fence(); - - // metrology.show(); - - // printf(" rangemap size:%d\n", m_rangemap.span()); - // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); - // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); - // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); - // printf(" maskimage size:%d\n", m_maskimage.span()); - // printf(" floatimage size:%d\n", m_floatimage.span()); - // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); - // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); - // printf(" odet_vector size:%d\n", m_odet_vector.span()); - // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); - // printf(" distance size:%d\n", m_distance.span()); - // printf(" Xbeam size:%d\n", m_Xbeam.span()); - // printf(" Ybeam size:%d\n", m_Ybeam.span()); - - // print_view(m_fdet_vector); - // print_view(m_odet_vector, 1, 3); - - // printf("DONE.\n"); - } - } // Kokkos } // simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index 03e3913e17..c7149eaeb4 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -17,10 +17,11 @@ #include "kokkostbx/kokkos_types.h" #include "kokkostbx/kokkos_vector3.h" #include "kokkostbx/kokkos_matrix3.h" +#include "kokkostbx/kokkos_utils.h" using vec3 = kokkostbx::vector3; 
using mat3 = kokkostbx::matrix3; - +using Kokkos::fence; namespace simtbx { namespace Kokkos { @@ -40,22 +41,29 @@ struct packed_metrology{ af::sharedYbeam; }; -struct kokkos_detector{ +struct large_array_policy { +}; +struct small_whitelist_policy { +}; + +template +struct kokkos_detector +{ inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");}; - kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB); - kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &); - vector_double_t construct_detail(dxtbx::model::Detector const &); + //kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB); + //kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &); + //vector_double_t construct_detail(dxtbx::model::Detector const &); inline void show_summary(){ std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl; metrology.show(); } - void each_image_allocate(); - void scale_in_place(const double&); - void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); - af::flex_double get_raw_pixels(); - void set_active_pixels_on_GPU(af::shared); - af::shared get_whitelist_raw_pixels(af::shared); + //void each_image_allocate(); + //void scale_in_place(const double&); + //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); + //af::flex_double get_raw_pixels(); + //void set_active_pixels_on_GPU(af::shared); + //af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -99,8 +107,175 @@ struct kokkos_detector{ af::shared active_pixel_list; std::size_t m_active_pixel_size; vector_size_t m_active_pixel_list = vector_size_t("m_active_pixel_list", 0); + + inline + kokkos_detector(int const& arg_device, + const simtbx::nanoBragg::nanoBragg& nB): + h_deviceID(arg_device), + metrology(nB), + m_panel_count(1), + m_slow_dim_size(nB.spixels), + m_fast_dim_size(nB.fpixels), + 
m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), + m_accumulate_floatimage( vector_double_t( "m_accumulate_floatimage", m_total_pixel_count) ) { } + + inline + kokkos_detector(int const& arg_device, + dxtbx::model::Detector const & arg_detector, + dxtbx::model::Beam const& arg_beam): + h_deviceID(arg_device), + metrology(arg_detector, arg_beam), + m_panel_count( arg_detector.size() ), + m_slow_dim_size( arg_detector[0].get_image_size()[1] ), + m_fast_dim_size( arg_detector[0].get_image_size()[0] ), + m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), + m_accumulate_floatimage( construct_detail(arg_detector) ) { } + // Easy mistake: not realizing that the dxtbx detector model stores (fast,slow) sizes + + inline + vector_double_t + construct_detail(dxtbx::model::Detector const & arg_detector) { + //1) confirm the size + SCITBX_ASSERT( m_panel_count == arg_detector.size() ); + SCITBX_ASSERT( m_panel_count >= 1 ); + + //2) confirm that array dimensions are similar for each size + for (int ipanel=1; ipanel < arg_detector.size(); ++ipanel){ + SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[1] == m_slow_dim_size ); + SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[0] == m_fast_dim_size ); + } + // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); + // printf(" m_slow_dim_size: %d\n", m_slow_dim_size); + // printf(" m_fast_dim_size: %d\n", m_fast_dim_size); + // printf(" m_panel_count: %d\n", m_panel_count); + + //3) allocate a cuda array with these dimensions + // separate accumulator image outside the usual nanoBragg data structure. + // 1. accumulate contributions from a sequence of source energy channels computed separately + // 2. 
represent multiple panels, all same rectangular shape; slowest dimension = n_panels + vector_double_t view_floatimage( "m_accumulate_floatimage", m_total_pixel_count ); + return view_floatimage; + }; + + inline void + each_image_allocate() { + resize(m_rangemap, m_total_pixel_count); + resize(m_omega_reduction, m_total_pixel_count); + resize(m_max_I_x_reduction, m_total_pixel_count); + resize(m_max_I_y_reduction, m_total_pixel_count); + + resize(m_maskimage, m_total_pixel_count); + resize(m_floatimage, m_total_pixel_count); + + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + + // metrology.show(); + + // printf(" rangemap size:%d\n", m_rangemap.span()); + // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); + // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); + // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); + // printf(" maskimage size:%d\n", m_maskimage.span()); + // printf(" floatimage size:%d\n", m_floatimage.span()); + // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); + // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); + // printf(" odet_vector size:%d\n", m_odet_vector.span()); + // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); + // printf(" distance size:%d\n", m_distance.span()); + // printf(" Xbeam size:%d\n", m_Xbeam.span()); + // printf(" Ybeam size:%d\n", m_Ybeam.span()); + + // print_view(m_fdet_vector); + // print_view(m_odet_vector, 1, 3); + + // printf("DONE.\n"); + } + + inline void + scale_in_place(const double& factor){ + auto 
local_accumulate_floatimage = m_accumulate_floatimage; + parallel_for("scale_in_place", range_policy(0,m_total_pixel_count), KOKKOS_LAMBDA (const int i) { + local_accumulate_floatimage( i ) = local_accumulate_floatimage( i ) * factor; + }); + } + + inline void + write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { + //only implement the monolithic detector case, one panel + SCITBX_ASSERT(nB.spixels == m_slow_dim_size); + SCITBX_ASSERT(nB.fpixels == m_fast_dim_size); + SCITBX_ASSERT(m_panel_count == 1); + // nB.raw_pixels = af::flex_double(af::flex_grid<>(nB.spixels,nB.fpixels)); + // do not reallocate CPU memory for the data write, as it is not needed + + kokkostbx::transfer_kokkos2flex(nB.raw_pixels, m_accumulate_floatimage); + // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); + // deep_copy(host_floatimage, m_accumulate_floatimage); + + // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); + + // double * double_floatimage = nB.raw_pixels.begin(); + // for (int i=0; i(m_panel_count,m_slow_dim_size,m_fast_dim_size), af::init_functor_null()); + kokkostbx::transfer_kokkos2flex(output_array, m_accumulate_floatimage); + + // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); + // deep_copy(host_floatimage, m_accumulate_floatimage); + + // for (int i=0; i active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } + + inline af::shared + get_whitelist_raw_pixels(af::shared selection) { + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = selection.size(); + vector_cudareal_t 
active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + auto temp = m_accumulate_floatimage; + + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } }; + + } // Kokkos } // simtbx #endif // SIMTBX_KOKKOS_DETECTOR_H - diff --git a/simtbx/kokkos/kokkos_ext.cpp b/simtbx/kokkos/kokkos_ext.cpp index f630f9bd3e..3ef9f4e2ce 100644 --- a/simtbx/kokkos/kokkos_ext.cpp +++ b/simtbx/kokkos/kokkos_ext.cpp @@ -66,13 +66,14 @@ namespace simtbx { namespace Kokkos { } }; + template struct detector_wrapper { static void - wrap() + wrap(std::string pyname) { using namespace boost::python; - class_("gpu_detector",init<>() ) + class_ >(pyname.c_str(),init<>() ) .def(init( ( arg("deviceId")=-1,arg("nanoBragg")), "Single panel constructor with data taken from nanoBragg instance\n" @@ -83,21 +84,21 @@ namespace simtbx { namespace Kokkos { "Multipanel constructor with data taken from dxtbx objects\n" "deviceId is completely optional and ignored for Kokkos, rather\n" "the device is set by the gpu_instance class.")) - .def("show_summary",&simtbx::Kokkos::kokkos_detector::show_summary) + .def("show_summary",&simtbx::Kokkos::kokkos_detector::show_summary) .def("each_image_allocate", - &simtbx::Kokkos::kokkos_detector::each_image_allocate, + &simtbx::Kokkos::kokkos_detector::each_image_allocate, "Allocate large pixel arrays") - .def("scale_in_place", &simtbx::Kokkos::kokkos_detector::scale_in_place, + .def("scale_in_place", &simtbx::Kokkos::kokkos_detector::scale_in_place, "Multiply by a scale factor on the GPU") - 
.def("write_raw_pixels",&simtbx::Kokkos::kokkos_detector::write_raw_pixels, + .def("write_raw_pixels",&simtbx::Kokkos::kokkos_detector::write_raw_pixels, "Update raw_pixels on host with array from GPU") - .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector::get_raw_pixels, + .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector::get_raw_pixels, "return multipanel detector raw pixels as a flex array") .def("get_whitelist_raw_pixels", - (af::shared (simtbx::Kokkos::kokkos_detector::*)(af::shared)) - &simtbx::Kokkos::kokkos_detector::get_whitelist_raw_pixels, + (af::shared (simtbx::Kokkos::kokkos_detector::*)(af::shared)) + &simtbx::Kokkos::kokkos_detector::get_whitelist_raw_pixels, "return only those raw pixels requested by the whitelist selection, as a 1D flex array") - .def("each_image_free", &simtbx::Kokkos::kokkos_detector::each_image_free) + .def("each_image_free", &simtbx::Kokkos::kokkos_detector::each_image_free) ; } }; @@ -127,59 +128,70 @@ namespace simtbx { namespace Kokkos { } }; + template struct simulation_wrapper { static void - wrap() + wrap(std::string pyname) { using namespace boost::python; typedef return_value_policy rbv; typedef default_call_policies dcp; using namespace simtbx::Kokkos; - class_("exascale_api",no_init ) + class_ >(pyname.c_str(),no_init ) .def(init( ( arg("nanoBragg")))) - .def("allocate",&simtbx::Kokkos::exascale_api::allocate, + .def("allocate",&simtbx::Kokkos::exascale_api::allocate, "Allocate and transfer input data on the GPU") .def("add_energy_channel_from_gpu_amplitudes", - &simtbx::Kokkos::exascale_api::add_energy_channel_from_gpu_amplitudes, + &simtbx::Kokkos::exascale_api::add_energy_channel_from_gpu_amplitudes, (arg_("channel_number"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("weight")=1.0), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spot contributions to the detector's accumulator array") .def("add_energy_channel_mask_allpanel", - static_cast) > - 
(&exascale_api::add_energy_channel_mask_allpanel), + static_cast::*)(int const&,kokkos_energy_channels&,kokkos_detector&, af::shared) > + (&exascale_api::add_energy_channel_mask_allpanel), (arg_("channel_number"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("pixel_active_mask_bools")), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spots on mask==True pixels\n" "The pixel_active_mask_bools is a large array with one bool per detector pixel") .def("add_energy_channel_mask_allpanel", - static_cast const) > - (&exascale_api::add_energy_channel_mask_allpanel), + static_cast::*)(int const&,kokkos_energy_channels&,kokkos_detector&, af::shared const) > + (&exascale_api::add_energy_channel_mask_allpanel), (arg_("channel_number"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("pixel_active_list_ints")), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spots on mask==True pixels\n" "The pixel_active_list_ints is a small array with integer-offset addresses for each pixel-of-interest") .def("add_energy_multichannel_mask_allpanel", - static_cast const,kokkos_energy_channels&,kokkos_detector&, af::shared const, + static_cast::*)(af::shared const,kokkos_energy_channels&,kokkos_detector&, af::shared const, af::shared const) > - (&exascale_api::add_energy_multichannel_mask_allpanel), + (&exascale_api::add_energy_multichannel_mask_allpanel), (arg_("ichannels"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("pixel_active_list_ints"), arg_("weights")), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spots on mask==True pixels\n" "The pixel_active_list_ints is a small array with integer-offset addresses for each pixel-of-interest" "ichannels: for each nanoBragg source, the value instructs the simulation which channel in gpu_structure_factors" "to use for structure factor lookup. If -1, skip this source wavelength." 
) - .def("add_background", &simtbx::Kokkos::exascale_api::add_background, + .def("add_background", &simtbx::Kokkos::exascale_api::add_background, (arg_("detector"), arg_("override_source")=-1), "Add a background field directly on the GPU") - .def("add_noise", &simtbx::Kokkos::exascale_api::add_noise, + .def("add_noise", &simtbx::Kokkos::exascale_api::add_noise, (arg_("detector")), "Modify pixels with noise on CPU. Unusual pattern, returns pixels directly instead of saving persistent") - .def("show",&simtbx::Kokkos::exascale_api::show) + .def("show",&simtbx::Kokkos::exascale_api::show) .add_property("diffuse", - make_getter(&simtbx::Kokkos::exascale_api::diffuse,rbv()), - make_setter(&simtbx::Kokkos::exascale_api::diffuse,dcp()), + make_getter(&simtbx::Kokkos::exascale_api::diffuse,rbv()), + make_setter(&simtbx::Kokkos::exascale_api::diffuse,dcp()), "the diffuse parameters for the simulation.") ; + } + }; + struct diffuse_wrapper + { + static void + wrap() + { + using namespace boost::python; + typedef return_value_policy rbv; + typedef default_call_policies dcp; class_("diffuse_api",no_init ) .def(init<>()) .add_property("enable", @@ -220,7 +232,10 @@ namespace simtbx { namespace Kokkos { BOOST_PYTHON_MODULE(simtbx_kokkos_ext) { simtbx::Kokkos::kokkos_instance_wrapper::wrap(); simtbx::Kokkos::structure_factor_wrapper::wrap(); - simtbx::Kokkos::detector_wrapper::wrap(); - simtbx::Kokkos::simulation_wrapper::wrap(); + simtbx::Kokkos::detector_wrapper::wrap("gpu_detector"); + simtbx::Kokkos::detector_wrapper::wrap("gpu_detector_small_whitelist"); + simtbx::Kokkos::simulation_wrapper::wrap("exascale_api"); + simtbx::Kokkos::simulation_wrapper::wrap("exascale_api_small_whitelist"); + simtbx::Kokkos::diffuse_wrapper::wrap(); } } // namespace simtbx diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index 5644654192..b4fd96861b 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -24,518 +24,6 @@ namespace Kokkos { return 
ret; }*/ - // extract subview from [start_index * length; (start_index + 1) * length) - template - view_1d_t extract_subview(view_1d_t A, int start_index, int length) { - return ::Kokkos::subview(A, ::Kokkos::pair(start_index * length, (start_index + 1) * length )); - } - - // make a unit vector pointing in same direction and report magnitude (both args can be same vector) - double cpu_unitize(const double * vector, double * new_unit_vector) { - - double v1 = vector[1]; - double v2 = vector[2]; - double v3 = vector[3]; - - double mag = sqrt(v1 * v1 + v2 * v2 + v3 * v3); - - if (mag != 0.0) { - // normalize it - new_unit_vector[0] = mag; - new_unit_vector[1] = v1 / mag; - new_unit_vector[2] = v2 / mag; - new_unit_vector[3] = v3 / mag; - } else { - // can't normalize, report zero vector - new_unit_vector[0] = 0.0; - new_unit_vector[1] = 0.0; - new_unit_vector[2] = 0.0; - new_unit_vector[3] = 0.0; - } - return mag; - } - - void - exascale_api::show(){ - SCITBX_EXAMINE(SIM.roi_xmin); - SCITBX_EXAMINE(SIM.roi_xmax); - SCITBX_EXAMINE(SIM.roi_ymin); - SCITBX_EXAMINE(SIM.roi_ymax); - SCITBX_EXAMINE(SIM.oversample); - SCITBX_EXAMINE(SIM.point_pixel); - SCITBX_EXAMINE(SIM.pixel_size); - SCITBX_EXAMINE(m_subpixel_size); - SCITBX_EXAMINE(m_steps); - SCITBX_EXAMINE(SIM.detector_thickstep); - SCITBX_EXAMINE(SIM.detector_thicksteps); - SCITBX_EXAMINE(SIM.detector_thick); - SCITBX_EXAMINE(SIM.detector_attnlen); - SCITBX_EXAMINE(SIM.curved_detector); - SCITBX_EXAMINE(SIM.distance); - SCITBX_EXAMINE(SIM.close_distance); - SCITBX_EXAMINE(SIM.dmin); - SCITBX_EXAMINE(SIM.phi0); - SCITBX_EXAMINE(SIM.phistep); - SCITBX_EXAMINE(SIM.phisteps); - SCITBX_EXAMINE(SIM.sources); - SCITBX_EXAMINE(SIM.mosaic_spread); - SCITBX_EXAMINE(SIM.mosaic_domains); - SCITBX_EXAMINE(SIM.Na); - SCITBX_EXAMINE(SIM.Nb); - SCITBX_EXAMINE(SIM.Nc); - SCITBX_EXAMINE(SIM.fluence); - SCITBX_EXAMINE(SIM.spot_scale); - SCITBX_EXAMINE(SIM.integral_form); - SCITBX_EXAMINE(SIM.default_F); - 
SCITBX_EXAMINE(SIM.interpolate); - SCITBX_EXAMINE(SIM.nopolar); - SCITBX_EXAMINE(SIM.polarization); - SCITBX_EXAMINE(SIM.fudge); - } - - void - exascale_api::add_energy_channel_from_gpu_amplitudes( - int const& ichannel, - simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - double const& weight - ){ - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - int source_count = SIM.sources; - af::shared weighted_sources_I = af::shared(source_count); - double* wptr = weighted_sources_I.begin(); - for (std::size_t iwt = 0; iwt < source_count; iwt++){wptr[iwt] = weight*(SIM.source_I[iwt]);} - kokkostbx::transfer_double2kokkos(m_source_I, wptr, source_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here(?): take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; - - std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; - - // the for loop around panels. Offsets given. 
- for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++){ - // loop thru panels and increment the array ptrs - kokkosSpotsKernel( - kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.roi_xmin, SIM.roi_xmax, - SIM.roi_ymin, SIM.roi_ymax, SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, SIM.detector_thickstep, - SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - extract_subview(kdt.m_sdet_vector, panel_id, 1), - extract_subview(kdt.m_fdet_vector, panel_id, 1), - extract_subview(kdt.m_odet_vector, panel_id, 1), - extract_subview(kdt.m_pix0_vector, panel_id, 1), - SIM.curved_detector, kdt.metrology.dists[panel_id], kdt.metrology.dists[panel_id], - m_beam_vector, - SIM.dmin, SIM.phisteps, SIM.sources, - m_source_X, m_source_Y, m_source_Z, - m_source_I, m_source_lambda, - SIM.xtal_shape, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - // &(kdt.m_maskimage[panel_size * panel_id]), - nullptr, - // return arrays: - extract_subview(kdt.m_floatimage, panel_id, panel_size), - extract_subview(kdt.m_omega_reduction, panel_id, panel_size), - extract_subview(kdt.m_max_I_x_reduction, panel_id, panel_size), - extract_subview(kdt.m_max_I_y_reduction, panel_id, panel_size), - extract_subview(kdt.m_rangemap, panel_id, panel_size)); - //cudaSafeCall(cudaPeekAtLastError()); - ::Kokkos::fence(); - } - //cudaSafeCall(cudaDeviceSynchronize()); - - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - } - - void - exascale_api::add_energy_channel_mask_allpanel( - int const& ichannel, - 
simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared all_panel_mask - ){ - // here or there, need to convert the all_panel_mask (3D map) into a 1D list of accepted pixels - // coordinates for the active pixel list are absolute offsets into the detector array - af::shared active_pixel_list; - const bool* jptr = all_panel_mask.begin(); - for (int j=0; j < all_panel_mask.size(); ++j){ - if (jptr[j]) { - active_pixel_list.push_back(j); - } - } - add_energy_channel_mask_allpanel(ichannel, kec, kdt, active_pixel_list); - } - - void - exascale_api::add_energy_channel_mask_allpanel( - int const& ichannel, - simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared const active_pixel_list - ){ - kdt.set_active_pixels_on_GPU(active_pixel_list); - - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - int source_count = SIM.sources; - kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, source_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; - - // cudaDeviceProp deviceProps = { 0 }; - // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); - // int smCount = deviceProps.multiProcessorCount; - // dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); - // dim3 
numBlocks(smCount * 8, 1); - - // for call for all panels at the same time - - // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) - diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; - - debranch_maskall_Kernel( - kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), - SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, - SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - kdt.m_sdet_vector, - kdt.m_fdet_vector, - kdt.m_odet_vector, - kdt.m_pix0_vector, - kdt.m_distance, - m_beam_vector, - SIM.dmin, SIM.phisteps, SIM.sources, - m_source_X, m_source_Y, - m_source_Z, - m_source_I, m_source_lambda, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - kdt.m_active_pixel_list, - diffuse, - // return arrays: - kdt.m_floatimage, - kdt.m_omega_reduction, - kdt.m_max_I_x_reduction, - kdt.m_max_I_y_reduction, - kdt.m_rangemap); - - // cudaSafeCall(cudaPeekAtLastError()); - // cudaSafeCall(cudaDeviceSynchronize()); - - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - } - - void - exascale_api::add_energy_multichannel_mask_allpanel( - af::shared const ichannels, - simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared const active_pixel_list, - af::shared const weight - ){ - kdt.set_active_pixels_on_GPU(active_pixel_list); - - // transfer source_I, source_lambda - // the int arguments 
are for sizes of the arrays - - SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs - the simulation where to look for structure factors. If -1, skip this source wavelength. */ - SCITBX_ASSERT(SIM.sources == weight.size()); - - for (int ictr = 0; ictr < SIM.sources; ++ictr){ - if (ichannels[ictr] < 0) continue; // the ichannel array - //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], - // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], - // SIM.source_I[ictr], SIM.source_lambda[ictr]); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; - - vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); - vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); - vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); - vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); - vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); - double weighted_I = SIM.source_I[ictr] * weight[ictr]; - kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); - kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); - - // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) - diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << 
SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; - - debranch_maskall_Kernel( - kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), - SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, - SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - kdt.m_sdet_vector, - kdt.m_fdet_vector, - kdt.m_odet_vector, - kdt.m_pix0_vector, - kdt.m_distance, - m_beam_vector, - SIM.dmin, SIM.phisteps, 1, - c_source_X, c_source_Y, - c_source_Z, - c_source_I, c_source_lambda, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - kdt.m_active_pixel_list, - diffuse, - // return arrays: - kdt.m_floatimage, - kdt.m_omega_reduction, - kdt.m_max_I_x_reduction, - kdt.m_max_I_y_reduction, - kdt.m_rangemap); - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - }// loop over channels - } - - void - exascale_api::add_background(simtbx::Kokkos::kokkos_detector & kdt, int const& override_source) { - // cudaSafeCall(cudaSetDevice(SIM.device_Id)); - - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - int sources_count = SIM.sources; - kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); - - vector_cudareal_t 
stol_of("stol_of", SIM.stols); - transfer_X2kokkos(stol_of, SIM.stol_of, SIM.stols); - // CUDAREAL * cu_stol_of; - // cudaSafeCall(cudaMalloc((void ** )&cu_stol_of, sizeof(*cu_stol_of) * SIM.stols)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_stol_of, SIM.stol_of, SIM.stols)); - - vector_cudareal_t Fbg_of("Fbg_of", SIM.stols); - transfer_X2kokkos(Fbg_of, SIM.Fbg_of, SIM.stols); - // CUDAREAL * cu_Fbg_of; - // cudaSafeCall(cudaMalloc((void ** )&cu_Fbg_of, sizeof(*cu_Fbg_of) * SIM.stols)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_Fbg_of, SIM.Fbg_of, SIM.stols)); - - // cudaDeviceProp deviceProps = { 0 }; - // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); - // int smCount = deviceProps.multiProcessorCount; - // dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); - // dim3 numBlocks(smCount * 8, 1); - - // initialize the device memory within a kernel. - // modify the arguments to initialize multipanel detector. - ::Kokkos::parallel_for("kokkosSpotsInit", kdt.m_panel_count * kdt.m_slow_dim_size * kdt.m_fast_dim_size, KOKKOS_LAMBDA (const int& j) { - kdt.m_floatimage(j) = 0; - kdt.m_omega_reduction(j) = 0; - kdt.m_max_I_x_reduction(j) = 0; - kdt.m_max_I_y_reduction(j) = 0; - kdt.m_rangemap(j) = false; - }); - // nanoBraggSpotsInitCUDAKernel<<>>( - // kdt.m_panel_count * kdt.m_slow_dim_size, kdt.m_fast_dim_size, - // kdt.cu_floatimage, kdt.cu_omega_reduction, - // kdt.cu_max_I_x_reduction, kdt.cu_max_I_y_reduction, - // kdt.cu_rangemap); - // cudaSafeCall(cudaPeekAtLastError()); - // cudaSafeCall(cudaDeviceSynchronize()); - - std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; - - // the for loop around panels. Offsets given. 
- for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { - add_background_kokkos_kernel(SIM.sources, - SIM.oversample, override_source, - SIM.pixel_size, kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.detector_thicksteps, - SIM.detector_thickstep, SIM.detector_attnlen, - extract_subview(kdt.m_sdet_vector, panel_id, 1), - extract_subview(kdt.m_fdet_vector, panel_id, 1), - extract_subview(kdt.m_odet_vector, panel_id, 1), - extract_subview(kdt.m_pix0_vector, panel_id, 1), - kdt.metrology.dists[panel_id], SIM.point_pixel, SIM.detector_thick, - m_source_X, m_source_Y, m_source_Z, - m_source_lambda, m_source_I, - SIM.stols, stol_of, Fbg_of, - SIM.nopolar, SIM.polarization, m_polar_vector, - simtbx::nanoBragg::r_e_sqr, SIM.fluence, SIM.amorphous_molecules, - // returns: - extract_subview(kdt.m_floatimage, panel_id, panel_size)); - - // cudaSafeCall(cudaPeekAtLastError()); - } - // cudaSafeCall(cudaDeviceSynchronize()); - ::Kokkos::fence(); - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - - // cudaSafeCall(cudaFree(cu_stol_of)); - // cudaSafeCall(cudaFree(cu_Fbg_of)); - } - - af::flex_double - exascale_api::add_noise(simtbx::Kokkos::kokkos_detector & kdt) { - // put raw pixels into a temporary hold - af::flex_double tmp_hold_pixels = kdt.get_raw_pixels(); - std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; - af::flex_double panel_pixels = af::flex_double(af::flex_grid<>(kdt.m_slow_dim_size,kdt.m_fast_dim_size), af::init_functor_null()); - for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { - double* dst_beg = panel_pixels.begin(); - double* dst_end = panel_pixels.end(); - double* dstptr = dst_beg; - for (double *srcptr = &(tmp_hold_pixels[panel_id*panel_size]); dstptr != dst_end; ){ - *dstptr++ = *srcptr++; - } // get the pixels from one panel on GPU memory into panel_pixels for add_noise - SIM.add_noise(panel_pixels); - dstptr = dst_beg; - for (double *srcptr = 
&(tmp_hold_pixels[panel_id*panel_size]); dstptr != dst_end; ){ - *srcptr++ = *dstptr++; - } // return the pixels from SIM.raw_pixels to temporary array - } - ::Kokkos::fence(); - return tmp_hold_pixels; - } - - void - exascale_api::allocate() { - //cudaSafeCall(cudaSetDevice(SIM.device_Id)); - - // water_size not defined in class, CLI argument, defaults to 0 - double water_size = 0.0; - // missing constants - double water_F = 2.57; - double water_MW = 18.0; - - // make sure we are normalizing with the right number of sub-steps - int nb_steps = SIM.phisteps * SIM.mosaic_domains * SIM.oversample * SIM.oversample; - double nb_subpixel_size = SIM.pixel_size / SIM.oversample; - - //create transfer arguments to device space - m_subpixel_size = nb_subpixel_size; //check for conflict? - m_steps = nb_steps; //check for conflict? - - // presumably thickness and attenuation can be migrated to the gpu detector class XXX FIXME - //cu_detector_thick = SIM.detector_thick; - //cu_detector_mu = SIM.detector_attnlen; // synonyms - //cu_distance = SIM.distance; // distance and close distance, detector properties? 
XXX FIXME - //cu_close_distance = SIM.close_distance; - - m_water_size = water_size; - m_water_F = water_F; - m_water_MW = water_MW; - - //const int vector_length = 4; - - kokkostbx::transfer_double2kokkos(m_beam_vector, SIM.beam_vector, m_vector_length); - kokkostbx::transfer_double2kokkos(m_spindle_vector, SIM.spindle_vector, m_vector_length); - kokkostbx::transfer_double2kokkos(m_a0, SIM.a0, m_vector_length); - kokkostbx::transfer_double2kokkos(m_b0, SIM.b0, m_vector_length); - kokkostbx::transfer_double2kokkos(m_c0, SIM.c0, m_vector_length); - - // cudaSafeCall(cudaMalloc((void ** )&cu_beam_vector, sizeof(*cu_beam_vector) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_beam_vector, SIM.beam_vector, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_spindle_vector, sizeof(*cu_spindle_vector) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_spindle_vector, SIM.spindle_vector, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_a0, sizeof(*cu_a0) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_a0, SIM.a0, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_b0, sizeof(*cu_b0) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_b0, SIM.b0, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_c0, sizeof(*cu_c0) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_c0, SIM.c0, vector_length)); - - // Unitize polar vector before sending it to the GPU. - // Optimization do it only once here rather than multiple time per pixel in the GPU. 
- double polar_vector_unitized[4]; - cpu_unitize(SIM.polar_vector, polar_vector_unitized); - kokkostbx::transfer_double2kokkos(m_polar_vector, polar_vector_unitized, m_vector_length); - - int sources_count = SIM.sources; - kokkostbx::transfer_double2kokkos(m_source_X, SIM.source_X, sources_count); - kokkostbx::transfer_double2kokkos(m_source_Y, SIM.source_Y, sources_count); - kokkostbx::transfer_double2kokkos(m_source_Z, SIM.source_Z, sources_count); - kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); - - int mosaic_domains_count = SIM.mosaic_domains; - kokkostbx::transfer_double2kokkos(m_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9); - - // cudaSafeCall(cudaMalloc((void ** )&cu_polar_vector, sizeof(*cu_polar_vector) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_polar_vector, polar_vector_unitized, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_X, sizeof(*cu_source_X) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_X, SIM.source_X, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_Y, sizeof(*cu_source_Y) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Y, SIM.source_Y, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_Z, sizeof(*cu_source_Z) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Z, SIM.source_Z, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_I, sizeof(*cu_source_I) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_I, SIM.source_I, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_lambda, sizeof(*cu_source_lambda) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_lambda, SIM.source_lambda, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_mosaic_umats, 
sizeof(*cu_mosaic_umats) * mosaic_domains_count * 9)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9)); - }; - /* exascale_api::~exascale_api(){ cudaSafeCall(cudaSetDevice(SIM.device_Id)); diff --git a/simtbx/kokkos/simulation.h b/simtbx/kokkos/simulation.h index ca2d763ad7..9b9d6a6552 100644 --- a/simtbx/kokkos/simulation.h +++ b/simtbx/kokkos/simulation.h @@ -15,9 +15,6 @@ using mat3 = kokkostbx::matrix3; using crystal_orientation_t = Kokkos::View; // [phisteps, domains, 3] namespace simtbx { namespace Kokkos { - -namespace af = scitbx::af; - struct diffuse_api { inline diffuse_api() {}; inline void show() {}; @@ -30,37 +27,75 @@ struct diffuse_api { mat3 rotate_principal_axes; vec3 a0, b0, c0; // cell basis vectors }; +}} + +#include "simtbx/kokkos/simulation_kernels.h" + +namespace simtbx { namespace Kokkos { + +namespace af = scitbx::af; + // extract subview from [start_index * length; (start_index + 1) * length) + template + view_1d_t extract_subview(view_1d_t A, int start_index, int length) { + return ::Kokkos::subview(A, ::Kokkos::pair(start_index * length, (start_index + 1) * length )); + } + + // make a unit vector pointing in same direction and report magnitude (both args can be same vector) + double cpu_unitize(const double * vector, double * new_unit_vector) { + + double v1 = vector[1]; + double v2 = vector[2]; + double v3 = vector[3]; + double mag = sqrt(v1 * v1 + v2 * v2 + v3 * v3); + + if (mag != 0.0) { + // normalize it + new_unit_vector[0] = mag; + new_unit_vector[1] = v1 / mag; + new_unit_vector[2] = v2 / mag; + new_unit_vector[3] = v3 / mag; + } else { + // can't normalize, report zero vector + new_unit_vector[0] = 0.0; + new_unit_vector[1] = 0.0; + new_unit_vector[2] = 0.0; + new_unit_vector[3] = 0.0; + } + return mag; + } + +template struct exascale_api { inline exascale_api(const simtbx::nanoBragg::nanoBragg& nB) : SIM(nB) { } - void show(); - void 
add_energy_channel_from_gpu_amplitudes(int const&, + //void show(); + /*void add_energy_channel_from_gpu_amplitudes(int const&, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, double const& - ); - void add_energy_channel_mask_allpanel(int const&, + ); */ + /*void add_energy_channel_mask_allpanel(int const&, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, af::shared - ); - void add_energy_channel_mask_allpanel(int const&, + ); */ + /*void add_energy_channel_mask_allpanel(int const&, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, af::shared const - ); - void add_energy_multichannel_mask_allpanel(af::shared const, + ); */ + /*void add_energy_multichannel_mask_allpanel(af::shared const, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, af::shared const, af::shared const - ); + ); */ - void add_background(simtbx::Kokkos::kokkos_detector &, int const&); - af::flex_double add_noise(simtbx::Kokkos::kokkos_detector &); - void allocate(); + //void add_background(simtbx::Kokkos::kokkos_detector &, int const&); + //af::flex_double add_noise(simtbx::Kokkos::kokkos_detector &); + //void allocate(); //~exascale_api(); const simtbx::nanoBragg::nanoBragg& SIM; @@ -86,6 +121,490 @@ struct exascale_api { CUDAREAL m_water_F = 0; CUDAREAL m_water_MW = 0; diffuse_api diffuse; + + inline void + show(){ + SCITBX_EXAMINE(SIM.roi_xmin); + SCITBX_EXAMINE(SIM.roi_xmax); + SCITBX_EXAMINE(SIM.roi_ymin); + SCITBX_EXAMINE(SIM.roi_ymax); + SCITBX_EXAMINE(SIM.oversample); + SCITBX_EXAMINE(SIM.point_pixel); + SCITBX_EXAMINE(SIM.pixel_size); + SCITBX_EXAMINE(m_subpixel_size); + SCITBX_EXAMINE(m_steps); + SCITBX_EXAMINE(SIM.detector_thickstep); + SCITBX_EXAMINE(SIM.detector_thicksteps); + SCITBX_EXAMINE(SIM.detector_thick); + 
SCITBX_EXAMINE(SIM.detector_attnlen); + SCITBX_EXAMINE(SIM.curved_detector); + SCITBX_EXAMINE(SIM.distance); + SCITBX_EXAMINE(SIM.close_distance); + SCITBX_EXAMINE(SIM.dmin); + SCITBX_EXAMINE(SIM.phi0); + SCITBX_EXAMINE(SIM.phistep); + SCITBX_EXAMINE(SIM.phisteps); + SCITBX_EXAMINE(SIM.sources); + SCITBX_EXAMINE(SIM.mosaic_spread); + SCITBX_EXAMINE(SIM.mosaic_domains); + SCITBX_EXAMINE(SIM.Na); + SCITBX_EXAMINE(SIM.Nb); + SCITBX_EXAMINE(SIM.Nc); + SCITBX_EXAMINE(SIM.fluence); + SCITBX_EXAMINE(SIM.spot_scale); + SCITBX_EXAMINE(SIM.integral_form); + SCITBX_EXAMINE(SIM.default_F); + SCITBX_EXAMINE(SIM.interpolate); + SCITBX_EXAMINE(SIM.nopolar); + SCITBX_EXAMINE(SIM.polarization); + SCITBX_EXAMINE(SIM.fudge); + } + + inline void + add_energy_channel_from_gpu_amplitudes( + int const& ichannel, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + double const& weight + ){ + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + int source_count = SIM.sources; + af::shared weighted_sources_I = af::shared(source_count); + double* wptr = weighted_sources_I.begin(); + for (std::size_t iwt = 0; iwt < source_count; iwt++){wptr[iwt] = weight*(SIM.source_I[iwt]);} + kokkostbx::transfer_double2kokkos(m_source_I, wptr, source_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here(?): take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; + + std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; + + // the for loop around panels. Offsets given. 
+ for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++){ + // loop thru panels and increment the array ptrs + kokkosSpotsKernel( + kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.roi_xmin, SIM.roi_xmax, + SIM.roi_ymin, SIM.roi_ymax, SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, SIM.detector_thickstep, + SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + extract_subview(kdt.m_sdet_vector, panel_id, 1), + extract_subview(kdt.m_fdet_vector, panel_id, 1), + extract_subview(kdt.m_odet_vector, panel_id, 1), + extract_subview(kdt.m_pix0_vector, panel_id, 1), + SIM.curved_detector, kdt.metrology.dists[panel_id], kdt.metrology.dists[panel_id], + m_beam_vector, + SIM.dmin, SIM.phisteps, SIM.sources, + m_source_X, m_source_Y, m_source_Z, + m_source_I, m_source_lambda, + SIM.xtal_shape, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + // &(kdt.m_maskimage[panel_size * panel_id]), + nullptr, + // return arrays: + extract_subview(kdt.m_floatimage, panel_id, panel_size), + extract_subview(kdt.m_omega_reduction, panel_id, panel_size), + extract_subview(kdt.m_max_I_x_reduction, panel_id, panel_size), + extract_subview(kdt.m_max_I_y_reduction, panel_id, panel_size), + extract_subview(kdt.m_rangemap, panel_id, panel_size)); + //cudaSafeCall(cudaPeekAtLastError()); + ::Kokkos::fence(); + } + //cudaSafeCall(cudaDeviceSynchronize()); + + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + } + + inline void + add_energy_channel_mask_allpanel( + int const& ichannel, + 
simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared all_panel_mask + ){ + // here or there, need to convert the all_panel_mask (3D map) into a 1D list of accepted pixels + // coordinates for the active pixel list are absolute offsets into the detector array + af::shared active_pixel_list; + const bool* jptr = all_panel_mask.begin(); + for (int j=0; j < all_panel_mask.size(); ++j){ + if (jptr[j]) { + active_pixel_list.push_back(j); + } + } + add_energy_channel_mask_allpanel(ichannel, kec, kdt, active_pixel_list); + } + + inline void + add_energy_channel_mask_allpanel( + int const& ichannel, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list + ){ + //SCITBX_CHECK_POINT; //tested by tst_shoeboxes.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + int source_count = SIM.sources; + kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, source_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; + + // cudaDeviceProp deviceProps = { 0 }; + // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); + // int smCount = deviceProps.multiProcessorCount; + // dim3 
threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); + // dim3 numBlocks(smCount * 8, 1); + + // for call for all panels at the same time + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, SIM.sources, + m_source_X, m_source_Y, + m_source_Z, + m_source_I, m_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + + // cudaSafeCall(cudaPeekAtLastError()); + // cudaSafeCall(cudaDeviceSynchronize()); + + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + } + + inline void + add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_unified.py + 
kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. */ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters 
(convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + + inline void + add_background(simtbx::Kokkos::kokkos_detector & kdt, int const& override_source) { + // cudaSafeCall(cudaSetDevice(SIM.device_Id)); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + int sources_count = SIM.sources; + kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); + // 
cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); + + vector_cudareal_t stol_of("stol_of", SIM.stols); + transfer_X2kokkos(stol_of, SIM.stol_of, SIM.stols); + // CUDAREAL * cu_stol_of; + // cudaSafeCall(cudaMalloc((void ** )&cu_stol_of, sizeof(*cu_stol_of) * SIM.stols)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_stol_of, SIM.stol_of, SIM.stols)); + + vector_cudareal_t Fbg_of("Fbg_of", SIM.stols); + transfer_X2kokkos(Fbg_of, SIM.Fbg_of, SIM.stols); + // CUDAREAL * cu_Fbg_of; + // cudaSafeCall(cudaMalloc((void ** )&cu_Fbg_of, sizeof(*cu_Fbg_of) * SIM.stols)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_Fbg_of, SIM.Fbg_of, SIM.stols)); + + // cudaDeviceProp deviceProps = { 0 }; + // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); + // int smCount = deviceProps.multiProcessorCount; + // dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); + // dim3 numBlocks(smCount * 8, 1); + + // initialize the device memory within a kernel. + // modify the arguments to initialize multipanel detector. + ::Kokkos::parallel_for("kokkosSpotsInit", kdt.m_panel_count * kdt.m_slow_dim_size * kdt.m_fast_dim_size, KOKKOS_LAMBDA (const int& j) { + kdt.m_floatimage(j) = 0; + kdt.m_omega_reduction(j) = 0; + kdt.m_max_I_x_reduction(j) = 0; + kdt.m_max_I_y_reduction(j) = 0; + kdt.m_rangemap(j) = false; + }); + // nanoBraggSpotsInitCUDAKernel<<>>( + // kdt.m_panel_count * kdt.m_slow_dim_size, kdt.m_fast_dim_size, + // kdt.cu_floatimage, kdt.cu_omega_reduction, + // kdt.cu_max_I_x_reduction, kdt.cu_max_I_y_reduction, + // kdt.cu_rangemap); + // cudaSafeCall(cudaPeekAtLastError()); + // cudaSafeCall(cudaDeviceSynchronize()); + + std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; + + // the for loop around panels. Offsets given. 
+ for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { + add_background_kokkos_kernel(SIM.sources, + SIM.oversample, override_source, + SIM.pixel_size, kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.detector_thicksteps, + SIM.detector_thickstep, SIM.detector_attnlen, + extract_subview(kdt.m_sdet_vector, panel_id, 1), + extract_subview(kdt.m_fdet_vector, panel_id, 1), + extract_subview(kdt.m_odet_vector, panel_id, 1), + extract_subview(kdt.m_pix0_vector, panel_id, 1), + kdt.metrology.dists[panel_id], SIM.point_pixel, SIM.detector_thick, + m_source_X, m_source_Y, m_source_Z, + m_source_lambda, m_source_I, + SIM.stols, stol_of, Fbg_of, + SIM.nopolar, SIM.polarization, m_polar_vector, + simtbx::nanoBragg::r_e_sqr, SIM.fluence, SIM.amorphous_molecules, + // returns: + extract_subview(kdt.m_floatimage, panel_id, panel_size)); + + // cudaSafeCall(cudaPeekAtLastError()); + } + // cudaSafeCall(cudaDeviceSynchronize()); + ::Kokkos::fence(); + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + + // cudaSafeCall(cudaFree(cu_stol_of)); + // cudaSafeCall(cudaFree(cu_Fbg_of)); + } + + inline af::flex_double + add_noise(simtbx::Kokkos::kokkos_detector & kdt) { + // put raw pixels into a temporary hold + af::flex_double tmp_hold_pixels = kdt.get_raw_pixels(); + std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; + af::flex_double panel_pixels = af::flex_double(af::flex_grid<>(kdt.m_slow_dim_size,kdt.m_fast_dim_size), af::init_functor_null()); + for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { + double* dst_beg = panel_pixels.begin(); + double* dst_end = panel_pixels.end(); + double* dstptr = dst_beg; + for (double *srcptr = &(tmp_hold_pixels[panel_id*panel_size]); dstptr != dst_end; ){ + *dstptr++ = *srcptr++; + } // get the pixels from one panel on GPU memory into panel_pixels for add_noise + SIM.add_noise(panel_pixels); + dstptr = dst_beg; + for (double *srcptr = &(tmp_hold_pixels[panel_id*panel_size]); 
dstptr != dst_end; ){ + *srcptr++ = *dstptr++; + } // return the pixels from SIM.raw_pixels to temporary array + } + ::Kokkos::fence(); + return tmp_hold_pixels; + } + + inline void + allocate() { + //cudaSafeCall(cudaSetDevice(SIM.device_Id)); + + // water_size not defined in class, CLI argument, defaults to 0 + double water_size = 0.0; + // missing constants + double water_F = 2.57; + double water_MW = 18.0; + + // make sure we are normalizing with the right number of sub-steps + int nb_steps = SIM.phisteps * SIM.mosaic_domains * SIM.oversample * SIM.oversample; + double nb_subpixel_size = SIM.pixel_size / SIM.oversample; + + //create transfer arguments to device space + m_subpixel_size = nb_subpixel_size; //check for conflict? + m_steps = nb_steps; //check for conflict? + + // presumably thickness and attenuation can be migrated to the gpu detector class XXX FIXME + //cu_detector_thick = SIM.detector_thick; + //cu_detector_mu = SIM.detector_attnlen; // synonyms + //cu_distance = SIM.distance; // distance and close distance, detector properties? 
XXX FIXME + //cu_close_distance = SIM.close_distance; + + m_water_size = water_size; + m_water_F = water_F; + m_water_MW = water_MW; + + //const int vector_length = 4; + + kokkostbx::transfer_double2kokkos(m_beam_vector, SIM.beam_vector, m_vector_length); + kokkostbx::transfer_double2kokkos(m_spindle_vector, SIM.spindle_vector, m_vector_length); + kokkostbx::transfer_double2kokkos(m_a0, SIM.a0, m_vector_length); + kokkostbx::transfer_double2kokkos(m_b0, SIM.b0, m_vector_length); + kokkostbx::transfer_double2kokkos(m_c0, SIM.c0, m_vector_length); + + // cudaSafeCall(cudaMalloc((void ** )&cu_beam_vector, sizeof(*cu_beam_vector) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_beam_vector, SIM.beam_vector, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_spindle_vector, sizeof(*cu_spindle_vector) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_spindle_vector, SIM.spindle_vector, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_a0, sizeof(*cu_a0) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_a0, SIM.a0, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_b0, sizeof(*cu_b0) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_b0, SIM.b0, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_c0, sizeof(*cu_c0) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_c0, SIM.c0, vector_length)); + + // Unitize polar vector before sending it to the GPU. + // Optimization do it only once here rather than multiple time per pixel in the GPU. 
+ double polar_vector_unitized[4]; + cpu_unitize(SIM.polar_vector, polar_vector_unitized); + kokkostbx::transfer_double2kokkos(m_polar_vector, polar_vector_unitized, m_vector_length); + + int sources_count = SIM.sources; + kokkostbx::transfer_double2kokkos(m_source_X, SIM.source_X, sources_count); + kokkostbx::transfer_double2kokkos(m_source_Y, SIM.source_Y, sources_count); + kokkostbx::transfer_double2kokkos(m_source_Z, SIM.source_Z, sources_count); + kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); + + int mosaic_domains_count = SIM.mosaic_domains; + kokkostbx::transfer_double2kokkos(m_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9); + + // cudaSafeCall(cudaMalloc((void ** )&cu_polar_vector, sizeof(*cu_polar_vector) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_polar_vector, polar_vector_unitized, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_X, sizeof(*cu_source_X) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_X, SIM.source_X, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_Y, sizeof(*cu_source_Y) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Y, SIM.source_Y, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_Z, sizeof(*cu_source_Z) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Z, SIM.source_Z, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_I, sizeof(*cu_source_I) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_I, SIM.source_I, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_lambda, sizeof(*cu_source_lambda) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_lambda, SIM.source_lambda, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_mosaic_umats, 
sizeof(*cu_mosaic_umats) * mosaic_domains_count * 9)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9)); + }; + }; } // Kokkos } // simtbx diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index 0a91f33e59..7c1fa9eb21 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -1,3 +1,5 @@ +#ifndef SIMTBX_KOKKOS_SIMULATION_KERNELS_H +#define SIMTBX_KOKKOS_SIMULATION_KERNELS_H #include #include #include @@ -792,3 +794,4 @@ void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int ove floatimage(pixIdx) += Ibg*r_e_sqr*fluence*amorphous_molecules/steps; }); // end of pixIdx loop } +#endif // SIMTBX_KOKKOS_SIMULATION_KERNELS_H diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py index 95f8755643..dbb94cee5e 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -211,9 +211,9 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg self.SIM.xtal_shape = shapetype.Gauss self.SIM.interpolate = 0 # allocate GPU arrays - self.gpu_simulation = get_exascale("exascale_api",params.context)(nanoBragg = self.SIM) + self.gpu_simulation = get_exascale("exascale_api_small_whitelist",params.context)(nanoBragg = self.SIM) self.gpu_simulation.allocate() - self.gpu_detector = get_exascale("gpu_detector",params.context)( + self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)( deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM) self.gpu_detector.each_image_allocate() # self.gpu_detector.show_summary() From 79de5b7cda1af7acd4221e0130f7dc444038add5 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Tue, 9 Apr 2024 15:59:59 -0700 Subject: [PATCH 2/4] Debug case for non-working test. The script tests/tst_memory_policy.py fails with a cuda illegal access. 
The intention is to get help from NESAP to get a functional test. --- simtbx/kokkos/detector.cpp | 135 +++++++++++++ simtbx/kokkos/detector.h | 82 +------- simtbx/kokkos/simulation.cpp | 173 +++++++++++++++++ simtbx/kokkos/simulation.h | 89 +-------- simtbx/kokkos/simulation_kernels.h | 300 +++++++++++++++++++++++++++++ simtbx/tests/tst_memory_policy.py | 56 +++++- 6 files changed, 664 insertions(+), 171 deletions(-) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index 233b97c7ff..c4ef795d72 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -92,5 +92,140 @@ namespace simtbx { namespace Kokkos { } } + template<> + void kokkos_detector::hello(){ + SCITBX_EXAMINE("small small small"); + } + template<> + void kokkos_detector::hello(){ + SCITBX_EXAMINE("large large large"); + } + + template<> void + kokkos_detector::each_image_allocate() { + resize(m_rangemap, m_total_pixel_count); + resize(m_omega_reduction, m_total_pixel_count); + resize(m_max_I_x_reduction, m_total_pixel_count); + resize(m_max_I_y_reduction, m_total_pixel_count); + resize(m_floatimage, m_total_pixel_count); + + resize(m_maskimage, m_total_pixel_count); + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + + // metrology.show(); + + // printf(" rangemap size:%d\n", m_rangemap.span()); + // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); + // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); + // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); + // printf(" maskimage size:%d\n", 
m_maskimage.span()); + // printf(" floatimage size:%d\n", m_floatimage.span()); + // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); + // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); + // printf(" odet_vector size:%d\n", m_odet_vector.span()); + // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); + // printf(" distance size:%d\n", m_distance.span()); + // printf(" Xbeam size:%d\n", m_Xbeam.span()); + // printf(" Ybeam size:%d\n", m_Ybeam.span()); + + // print_view(m_fdet_vector); + // print_view(m_odet_vector, 1, 3); + + // printf("DONE.\n"); + } + template<> void + kokkos_detector::each_image_allocate() { + resize(m_maskimage, m_total_pixel_count); + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + } + + template<> + void + kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } + + template<> + void + kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + resize(m_rangemap, m_active_pixel_size); + resize(m_omega_reduction, m_active_pixel_size); + resize(m_max_I_x_reduction, m_active_pixel_size); + resize(m_max_I_y_reduction, m_active_pixel_size); + resize(m_floatimage, m_active_pixel_size); + 
resize(m_accumulate_floatimage, m_active_pixel_size); + fence(); + } + + template<> af::shared + kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { + hello(); + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = selection.size(); + vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + auto temp = m_accumulate_floatimage; + + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + template<> af::shared + kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { + SCITBX_CHECK_POINT; + hello(); + //return the data array for the multipanel detector case, but only for whitelist pixels + + std::size_t output_pixel_size = selection.size(); + //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + //auto temp = m_accumulate_floatimage; + + //parallel_for("get_active_pixel_selection2", + // range_policy(0, output_pixel_size), + // KOKKOS_LAMBDA (const int i) { + // active_pixel_results( i ) = temp( i ); + //}); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + SCITBX_CHECK_POINT; + kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results); + SCITBX_CHECK_POINT; + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + } // Kokkos } // 
simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index c7149eaeb4..74d286ffbe 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -41,10 +41,8 @@ struct packed_metrology{ af::sharedYbeam; }; -struct large_array_policy { -}; -struct small_whitelist_policy { -}; +struct large_array_policy {}; +struct small_whitelist_policy {}; template struct kokkos_detector @@ -63,7 +61,7 @@ struct kokkos_detector //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); //af::flex_double get_raw_pixels(); //void set_active_pixels_on_GPU(af::shared); - //af::shared get_whitelist_raw_pixels(af::shared); + af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -157,46 +155,7 @@ struct kokkos_detector return view_floatimage; }; - inline void - each_image_allocate() { - resize(m_rangemap, m_total_pixel_count); - resize(m_omega_reduction, m_total_pixel_count); - resize(m_max_I_x_reduction, m_total_pixel_count); - resize(m_max_I_y_reduction, m_total_pixel_count); - - resize(m_maskimage, m_total_pixel_count); - resize(m_floatimage, m_total_pixel_count); - - kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); - kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); - kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); - kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); - kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); - kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); - kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); - fence(); - - // metrology.show(); - - // printf(" rangemap size:%d\n", m_rangemap.span()); - // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); - // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); - // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); - // printf(" maskimage size:%d\n", m_maskimage.span()); - // printf(" 
floatimage size:%d\n", m_floatimage.span()); - // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); - // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); - // printf(" odet_vector size:%d\n", m_odet_vector.span()); - // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); - // printf(" distance size:%d\n", m_distance.span()); - // printf(" Xbeam size:%d\n", m_Xbeam.span()); - // printf(" Ybeam size:%d\n", m_Ybeam.span()); - - // print_view(m_fdet_vector); - // print_view(m_odet_vector, 1, 3); - - // printf("DONE.\n"); - } + void each_image_allocate(); inline void scale_in_place(const double& factor){ @@ -206,6 +165,8 @@ struct kokkos_detector }); } + void set_active_pixels_on_GPU(af::shared active_pixel_list_value); + inline void write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { //only implement the monolithic detector case, one panel @@ -242,37 +203,8 @@ struct kokkos_detector return output_array; } - inline void - set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - inline af::shared - get_whitelist_raw_pixels(af::shared selection) { - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + void hello(); - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared 
output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } }; diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index b4fd96861b..cf2d79dd4e 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -4,6 +4,7 @@ #include "simtbx/kokkos/simulation_kernels.h" #include "kokkostbx/kokkos_utils.h" #include "scitbx/array_family/flex_types.h" +#include "simtbx/kokkos/detector.h" #define THREADS_PER_BLOCK_X 128 #define THREADS_PER_BLOCK_Y 1 @@ -42,5 +43,177 @@ namespace Kokkos { } */ + template<> void + exascale_api::add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_unified.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. 
*/ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, 
m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + + template<> void + exascale_api::add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_memory_policy.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. 
*/ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_low_memory_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + 
SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + } // Kokkos } // simtbx diff --git a/simtbx/kokkos/simulation.h b/simtbx/kokkos/simulation.h index 9b9d6a6552..9b69b4ce95 100644 --- a/simtbx/kokkos/simulation.h +++ b/simtbx/kokkos/simulation.h @@ -86,12 +86,12 @@ struct exascale_api { simtbx::Kokkos::kokkos_detector &, af::shared const ); */ - /*void add_energy_multichannel_mask_allpanel(af::shared const, + void add_energy_multichannel_mask_allpanel(af::shared const, simtbx::Kokkos::kokkos_energy_channels &, simtbx::Kokkos::kokkos_detector &, af::shared const, af::shared const - ); */ + ); //void add_background(simtbx::Kokkos::kokkos_detector &, int const&); //af::flex_double add_noise(simtbx::Kokkos::kokkos_detector &); @@ -330,91 +330,6 @@ struct exascale_api { add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); } - inline void - add_energy_multichannel_mask_allpanel( - af::shared const ichannels, - 
simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared const active_pixel_list, - af::shared const weight - ){ - //SCITBX_CHECK_POINT; // tested by tst_unified.py - kdt.set_active_pixels_on_GPU(active_pixel_list); - - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - - SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs - the simulation where to look for structure factors. If -1, skip this source wavelength. */ - SCITBX_ASSERT(SIM.sources == weight.size()); - - for (int ictr = 0; ictr < SIM.sources; ++ictr){ - if (ichannels[ictr] < 0) continue; // the ichannel array - //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], - // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], - // SIM.source_I[ictr], SIM.source_lambda[ictr]); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; - - vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); - vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); - vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); - vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); - vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); - double weighted_I = SIM.source_I[ictr] * weight[ictr]; - kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); - 
kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); - kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); - - // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) - diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; - - debranch_maskall_Kernel( - kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), - SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, - SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - kdt.m_sdet_vector, - kdt.m_fdet_vector, - kdt.m_odet_vector, - kdt.m_pix0_vector, - kdt.m_distance, - m_beam_vector, - SIM.dmin, SIM.phisteps, 1, - c_source_X, c_source_Y, - c_source_Z, - c_source_I, c_source_lambda, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - kdt.m_active_pixel_list, - diffuse, - // return arrays: - kdt.m_floatimage, - kdt.m_omega_reduction, - kdt.m_max_I_x_reduction, - kdt.m_max_I_y_reduction, - kdt.m_rangemap); - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - }// loop over channels - } - inline void add_background(simtbx::Kokkos::kokkos_detector & kdt, int const& override_source) { // cudaSafeCall(cudaSetDevice(SIM.device_Id)); diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index 7c1fa9eb21..dd78e00242 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -643,6 +643,306 @@ 
void debranch_maskall_Kernel(int npanels, int spixels, int fpixels, int total_pi }); } +void debranch_maskall_low_memory_Kernel(int npanels, int spixels, int fpixels, int total_pixels, + int oversample, int point_pixel, + CUDAREAL pixel_size, CUDAREAL subpixel_size, int steps, + CUDAREAL detector_thickstep, int detector_thicksteps, CUDAREAL detector_thick, CUDAREAL detector_mu, + const view_1d_t sdet_vector, const view_1d_t fdet_vector, + const view_1d_t odet_vector, const view_1d_t pix0_vector, + const vector_cudareal_t close_distance, + const vector_cudareal_t beam_vector, + CUDAREAL dmin, int phisteps, int sources, + const vector_cudareal_t source_X, const vector_cudareal_t source_Y, + const vector_cudareal_t source_Z, + const vector_cudareal_t source_I, const vector_cudareal_t source_lambda, + int mosaic_domains, crystal_orientation_t crystal_orientation, + CUDAREAL Na, CUDAREAL Nb, CUDAREAL Nc, CUDAREAL V_cell, + CUDAREAL water_size, CUDAREAL water_F, CUDAREAL water_MW, CUDAREAL r_e_sqr, + CUDAREAL fluence, CUDAREAL Avogadro, CUDAREAL spot_scale, int integral_form, + CUDAREAL default_F, + const vector_cudareal_t Fhkl, + const hklParams FhklParams, + int nopolar, + const vector_cudareal_t polar_vector, CUDAREAL polarization, CUDAREAL fudge, + const vector_size_t pixel_lookup, + simtbx::Kokkos::diffuse_api diffuse, + vector_float_t floatimage /*out*/, + vector_float_t omega_reduction /*out*/, vector_float_t max_I_x_reduction /*out*/, + vector_float_t max_I_y_reduction /*out*/, vector_bool_t rangemap) { + + + const int s_h_min = FhklParams.h_min; + const int s_k_min = FhklParams.k_min; + const int s_l_min = FhklParams.l_min; + const int s_h_range = FhklParams.h_range; + const int s_k_range = FhklParams.k_range; + const int s_l_range = FhklParams.l_range; + const int s_h_max = s_h_min + s_h_range - 1; + const int s_k_max = s_k_min + s_k_range - 1; + const int s_l_max = s_l_min + s_l_range - 1; + + // set up diffuse scattering if needed + vector_mat3_t laue_mats = 
vector_mat3_t("laue_mats",24); + vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace",3); + vector_vec3_t dG_dgam = vector_vec3_t("dG_dgam",3); + int num_laue_mats = 0; + int dhh = 0, dkk = 0, dll = 0; + KOKKOS_MAT3 rotate_principal_axes = diffuse.rotate_principal_axes; // (1,0,0,0,1,0,0,0,1); + int laue_group_num = diffuse.laue_group_num; + int stencil_size = diffuse.stencil_size; + KOKKOS_MAT3 anisoG = diffuse.anisoG; // (300.,0,0,0,100.,0,0,0,300.); + KOKKOS_MAT3 anisoU = diffuse.anisoU; // (0.48,0,0,0,0.16,0,0,0,0.16); + KOKKOS_MAT3 Bmat_realspace(1.,0,0,0,1.,0,0,0,1.); // Placeholder + KOKKOS_MAT3 anisoG_local; + CUDAREAL anisoG_determ = 0; + KOKKOS_MAT3 anisoU_local; + bool use_diffuse=diffuse.enable; + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + vector_mat3_t diffuse_scale_mat3 = vector_mat3_t("diffuse_scale_mat3",1); + + if (use_diffuse){ + anisoG_local = anisoG; + anisoU_local = anisoU; + + if (laue_group_num < 1 || laue_group_num >14 ){ + throw std::string("Laue group number not in range 1-14"); + } + + Kokkos::parallel_reduce("prepare diffuse mats", 1, KOKKOS_LAMBDA (const int& i, int& num_laue_mats_temp){ + num_laue_mats_temp = gen_laue_mats(laue_group_num, laue_mats, rotate_principal_axes); + // KOKKOS_MAT3 rotate_principal_axes; + // rotate_principal_axes << 0.70710678, -0.70710678, 0., 0.70710678, 0.70710678, 0., 0., 0., 1.; + + KOKKOS_MAT3 Amatrix(diffuse.a0[0],diffuse.a0[1],diffuse.a0[2],diffuse.b0[0],diffuse.b0[1],diffuse.b0[2], + diffuse.c0[0],diffuse.c0[1],diffuse.c0[2]); + KOKKOS_MAT3 Ainv = Amatrix.inverse()*1.e-10; + CUDAREAL reciprocal_space_volume = 8*M_PI*M_PI*M_PI*Ainv.determinant(); + CUDAREAL _tmpfac = M_PI * 0.63 / fudge; + CUDAREAL diffuse_scale = reciprocal_space_volume * sqrt(_tmpfac*_tmpfac*_tmpfac); + + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + diffuse_scale_mat3(0)(0,0) = diffuse_scale; + + for ( int iL = 0; iL < num_laue_mats_temp; iL++ ){ + laue_mats(iL) 
= Ainv * laue_mats(iL); + } + const KOKKOS_MAT3 Ginv = anisoG_local.inverse(); + const KOKKOS_MAT3 dG = Bmat_realspace * Ginv; + for (int i_gam=0; i_gam<3; i_gam++){ + dG_dgam(i_gam)[i_gam] = 1; + KOKKOS_MAT3 temp_dgam; + temp_dgam(i_gam, 0) = dG_dgam(i_gam)[0]; + temp_dgam(i_gam, 1) = dG_dgam(i_gam)[1]; + temp_dgam(i_gam, 2) = dG_dgam(i_gam)[2]; + dG_trace(i_gam) = (Ginv*temp_dgam).trace(); + } + }, num_laue_mats); + anisoG_determ = anisoG_local.determinant(); + dhh = dkk = dll = stencil_size; // Limits of stencil for diffuse calc + } + KOKKOS_VEC3 dHH(dhh,dkk,dll); + // end of diffuse setup + +// Implementation notes. This kernel is aggressively debranched, therefore the assumptions are: +// 1) mosaicity non-zero positive +// 2) xtal shape is "Gauss" i.e. 3D spheroid. +// 3) No bounds check for access to the structure factor array. +// 4) No check for Flatt=0. + const CUDAREAL dmin_r = (dmin > 0.0) ? 1/dmin : 0.0; + + // add background from something amorphous, precalculate scaling + const CUDAREAL F_bg = water_F; + const CUDAREAL I_bg = F_bg * F_bg * r_e_sqr * fluence * water_size * water_size * water_size * 1e6 * Avogadro / water_MW; + const CUDAREAL I_factor = r_e_sqr * spot_scale * fluence / steps; + + Kokkos::parallel_for("debranch_maskall", total_pixels, KOKKOS_LAMBDA(const int& pixIdx) { + + vec3 polar_vector_tmp {polar_vector(1), polar_vector(2), polar_vector(3)}; + + // position in pixel array + const int j = pixel_lookup(pixIdx);//pixIdx: index into pixel subset; j: index into the data. 
+ const int i_panel = j / (fpixels*spixels); // the panel number + const int j_panel = j % (fpixels*spixels); // the pixel number within the panel + const int fpixel = j_panel % fpixels; + const int spixel = j_panel / fpixels; + const int outIdx = pixIdx; + + // reset photon count for this pixel + CUDAREAL I = I_bg; + CUDAREAL omega_sub_reduction = 0.0; + CUDAREAL max_I_x_sub_reduction = 0.0; + CUDAREAL max_I_y_sub_reduction = 0.0; + CUDAREAL polar = 0.0; + if (nopolar) { + polar = 1.0; + } + + // add this now to avoid problems with skipping later + // move this to the bottom to avoid accessing global device memory. floatimage[j] = I_bg; + // loop over sub-pixels + int subS, subF; + for (subS = 0; subS < oversample; ++subS) { // Y voxel + for (subF = 0; subF < oversample; ++subF) { // X voxel + // absolute mm position on detector (relative to its origin) + CUDAREAL Fdet = subpixel_size * (fpixel * oversample + subF) + subpixel_size / 2.0; // X voxel + CUDAREAL Sdet = subpixel_size * (spixel * oversample + subS) + subpixel_size / 2.0; // Y voxel + // Fdet = pixel_size*fpixel; + // Sdet = pixel_size*spixel; + + max_I_x_sub_reduction = Fdet; + max_I_y_sub_reduction = Sdet; + + int thick_tic; + for (thick_tic = 0; thick_tic < detector_thicksteps; ++thick_tic) { + // assume "distance" is to the front of the detector sensor layer + CUDAREAL Odet = thick_tic * detector_thickstep; // Z Orthagonal voxel. 
+ + // construct detector subpixel position in 3D space + // pixel_X = distance; + // pixel_Y = Sdet-Ybeam; + // pixel_Z = Fdet-Xbeam; + vec3 pixel_pos; + pixel_pos += Fdet * fdet_vector(i_panel); + pixel_pos += Sdet * sdet_vector(i_panel); + pixel_pos += Odet * odet_vector(i_panel); + pixel_pos += pix0_vector(i_panel); + + // construct the diffracted-beam unit vector to this sub-pixel + CUDAREAL airpath_r = 1 / pixel_pos.length(); + vec3 diffracted = pixel_pos.get_unit_vector(); + + // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta) + CUDAREAL omega_pixel = pixel_size * pixel_size * airpath_r * airpath_r * close_distance(i_panel) * airpath_r; + // option to turn off obliquity effect, inverse-square-law only + if (point_pixel) { + omega_pixel = airpath_r * airpath_r; + } + + // now calculate detector thickness effects + CUDAREAL capture_fraction = 1.0; + if (detector_thick > 0.0 && detector_mu> 0.0) { + // inverse of effective thickness increase + CUDAREAL parallax = odet_vector(i_panel).dot(diffracted); + capture_fraction = exp(-thick_tic * detector_thickstep / detector_mu / parallax) + - exp(-(thick_tic + 1) * detector_thickstep / detector_mu / parallax); + } + + // loop over sources now + int source; + for (source = 0; source < sources; ++source) { + + // retrieve stuff from cache + vec3 incident = {-source_X(source), -source_Y(source), -source_Z(source)}; + CUDAREAL lambda = source_lambda(source); + CUDAREAL source_fraction = source_I(source); + + // construct the incident beam unit vector while recovering source distance + // TODO[Giles]: Optimization! We can unitize the source vectors before passing them in. 
+ incident.normalize(); + + // construct the scattering vector for this pixel + vec3 scattering = (diffracted - incident) / lambda; + CUDAREAL stol = 0.5 * scattering.length(); + + // rough cut to speed things up when we aren't using whole detector + if (dmin > 0.0 && stol > 0.0) { + // use reciprocal of (dmin > 0.5 / stol) + if (dmin_r <= 2 * stol) { + continue; + } + } + + // polarization factor + if (!nopolar) { + // need to compute polarization factor + polar = polarization_factor(polarization, incident, diffracted, polar_vector_tmp); + } else { + polar = 1.0; + } + + // sweep over phi angles + for (int phi_tic = 0; phi_tic < phisteps; ++phi_tic) { + // enumerate mosaic domains + for (int mos_tic = 0; mos_tic < mosaic_domains; ++mos_tic) { + // apply mosaic rotation after phi rotation + auto a = crystal_orientation(phi_tic, mos_tic, 0); + auto b = crystal_orientation(phi_tic, mos_tic, 1); + auto c = crystal_orientation(phi_tic, mos_tic, 2); + + // construct fractional Miller indicies + CUDAREAL h = a.dot(scattering); + CUDAREAL k = b.dot(scattering); + CUDAREAL l = c.dot(scattering); + + // round off to nearest whole index + int h0 = ceil(h - 0.5); + int k0 = ceil(k - 0.5); + int l0 = ceil(l - 0.5); + + // structure factor of the lattice (paralelpiped crystal) + // F_latt = sin(M_PI*s_Na*h)*sin(M_PI*s_Nb*k)*sin(M_PI*s_Nc*l)/sin(M_PI*h)/sin(M_PI*k)/sin(M_PI*l); + + CUDAREAL I_latt = 1.0; // Shape transform for the crystal. + CUDAREAL hrad_sqr = 0.0; + + // handy radius in reciprocal space, squared + hrad_sqr = (h - h0) * (h - h0) * Na * Na + (k - k0) * (k - k0) * Nb * Nb + (l - l0) * (l - l0) * Nc * Nc; + // fudge the radius so that volume and FWHM are similar to square_xtal spots + double my_arg = hrad_sqr / 0.63 * fudge; + { + CUDAREAL F_latt = Na * Nb * Nc * exp(-(my_arg)); + I_latt = F_latt * F_latt; + } + + // new code for diffuse. 
+ if (use_diffuse) { + KOKKOS_VEC3 H_vec(h,k,l); + KOKKOS_VEC3 H0(h0,k0,l0); + CUDAREAL step_diffuse_param[6]; + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + calc_diffuse_at_hkl(H_vec,H0,dHH,s_h_min,s_k_min,s_l_min,s_h_max,s_k_max, + s_l_max,s_h_range,s_k_range,s_l_range,diffuse_scale_mat3(0),Fhkl,num_laue_mats,laue_mats, + anisoG_local,dG_trace,anisoG_determ,anisoU_local,dG_dgam,false,&I_latt,step_diffuse_param); + } + // end s_use_diffuse outer + + // structure factor of the unit cell + CUDAREAL F_cell = default_F; + //F_cell = quickFcell_ldg(s_hkls, s_h_max, s_h_min, s_k_max, s_k_min, s_l_max, s_l_min, h0, k0, l0, s_h_range, s_k_range, s_l_range, default_F, Fhkl); + if ( + h0 < s_h_min || + k0 < s_k_min || + l0 < s_l_min || + h0 > s_h_max || + k0 > s_k_max || + l0 > s_l_max + ) { + F_cell = 0.; + } else { + const int hkl_index = (h0-s_h_min)*s_k_range*s_l_range + (k0-s_k_min)*s_l_range + (l0-s_l_min); + F_cell = Fhkl[hkl_index]; + } + + // now we have the structure factor for this pixel + + // convert amplitudes into intensity (photons per steradian) + I += F_cell * F_cell * I_latt * source_fraction * capture_fraction * omega_pixel; + omega_sub_reduction += omega_pixel; + } // end of mosaic loop + } // end of phi loop + } // end of source loop + } // end of detector thickness loop + } // end of sub-pixel y loop + } // end of sub-pixel x loop + const double photons = I_bg + I_factor * polar * I; + floatimage( outIdx ) = photons; + //omega_reduction( outIdx ) = omega_sub_reduction; // shared contention + //max_I_x_reduction( outIdx ) = max_I_x_sub_reduction; + //max_I_y_reduction( outIdx ) = max_I_y_sub_reduction; + //rangemap( outIdx ) = true; + }); +} + // __global__ void nanoBraggSpotsInitCUDAKernel(int spixels, int fpixesl, float * floatimage, float * omega_reduction, // float * max_I_x_reduction, // float * max_I_y_reduction, bool * rangemap); diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py 
index dbb94cee5e..177d6e5a97 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -233,10 +233,12 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg per_image_scale_factor = self.domains_per_crystal # 1.0 self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU self.reset_pythony_beams(self.SIM) + print("AAA") self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels) + print("BBB") -def get_whitelist_from_refls(prefix,SIM): - image_size = len(SIM.raw_pixels) +def get_whitelist_from_refls(prefix,SIM=None): + #image_size = len(SIM.raw_pixels) from dials.array_family import flex from dxtbx.model.experiment_list import ExperimentListFactory experiments = ExperimentListFactory.from_json_file("%s_imported.expt"%prefix, check_format = False) @@ -353,8 +355,9 @@ def run_all(params): free_gpu_before = get_gpu_memory()[0] del SWCs free_gpu_after = get_gpu_memory()[0] - print((free_gpu_after - free_gpu_before)/NTRIALS,"free") - assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB + old_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(old_memory_use,"free") + assert old_memory_use >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size #accomplish the same with compile-time polymorphism @@ -378,16 +381,51 @@ def run_all(params): free_gpu_before = get_gpu_memory()[0] del SWCs free_gpu_after = get_gpu_memory()[0] - print((free_gpu_after - free_gpu_before)/NTRIALS,"free") - assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB + new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(new_memory_use,"free") + assert old_memory_use > 4.*new_memory_use # new policy cuts down at least 
4-fold on memory, actual value ** - return +def run_subset_for_NESAP_debug(params): + # make the dxtbx objects + BEAM = basic_beam() + DETECTOR = basic_detector() + CRYSTAL = basic_crystal() + SF_model = amplitudes(CRYSTAL) + # Famp = SF_model.Famp # simple uniform amplitudes + SF_model.random_structure(CRYSTAL) + SF_model.ersatz_correct_to_P1() + whitelist_pixels = flex.size_t((11929, + 351293,351294,351295,352828,352829,352830,352831,354364,354365, + 354366,354367,355900,355901,355902,355903,357436,357437,357438, + 357439,352383,352384,352385,353919,353920,353921,355455,355456, + 355457,356991,356992,356993,354120,354121,354122,355656,355657)) + + NTRIALS=5 + #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size + #accomplish the same with compile-time polymorphism + #have side-by-side test in same python script + # Reproduce whitelist sims with small-memory mechanism + SWCs=[] + for x in range(NTRIALS): + print("Whitelist-only iteration with small memory",x) + SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) + SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) + #produce an output image file for intermediate debugging + working_raw_pixels = flex.double(image_size) # blank array + working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values) + working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus())) + + free_gpu_before = get_gpu_memory()[0] + del SWCs + free_gpu_after = get_gpu_memory()[0] + new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(new_memory_use,"free") if __name__=="__main__": params,options = parse_input() # Initialize based on GPU context gpu_instance_type = get_exascale("gpu_instance", params.context) gpu_instance = gpu_instance_type(deviceId = 0) - - run_all(params) + #run_all(params) + run_subset_for_NESAP_debug(params) 
print("OK") From 4cf3974b201c78ddc5cc31e231a78f00f597a96d Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Sat, 20 Apr 2024 18:56:46 -0700 Subject: [PATCH 3/4] Minimal changes for working tst_memory_policy.py Memory savings achieved through code specialization, for the case where pixel values are simulated on a small whitelist. Specializations are not yet optimal, as there is still a lot of code duplication. Changes give ~4.5x reduction in memory footprint, but no success yet in resizing the array m_accumulate_floatimage. Attempts so far lead to a CUDA memory allocation error. --- simtbx/kokkos/detector.cpp | 97 +++++------------------------- simtbx/kokkos/detector.h | 46 +++++++++++--- simtbx/kokkos/kokkos_ext.cpp | 2 +- simtbx/kokkos/simulation.cpp | 3 +- simtbx/kokkos/simulation_kernels.h | 8 +++ simtbx/tests/tst_memory_policy.py | 28 +++------ 6 files changed, 71 insertions(+), 113 deletions(-) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index c4ef795d72..238ca56881 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -93,16 +93,16 @@ namespace simtbx { namespace Kokkos { } template<> - void kokkos_detector::hello(){ - SCITBX_EXAMINE("small small small"); + std::string kokkos_detector::hello(){ + return("small small small"); } template<> - void kokkos_detector::hello(){ - SCITBX_EXAMINE("large large large"); + std::string kokkos_detector::hello(){ + return("large large large"); } template<> void - kokkos_detector::each_image_allocate() { + kokkos_detector::each_image_allocate(const std::size_t& n_pixels) { resize(m_rangemap, m_total_pixel_count); resize(m_omega_reduction, m_total_pixel_count); resize(m_max_I_x_reduction, m_total_pixel_count); @@ -140,9 +140,17 @@ namespace simtbx { namespace Kokkos { // printf("DONE.\n"); } + template<> void - kokkos_detector::each_image_allocate() { - resize(m_maskimage, m_total_pixel_count); + kokkos_detector::each_image_allocate(const std::size_t& n_pixels) {
SCITBX_ASSERT(n_pixels > 0); + resize(m_rangemap, n_pixels); + resize(m_omega_reduction, n_pixels); + resize(m_max_I_x_reduction, n_pixels); + resize(m_max_I_y_reduction, n_pixels); + resize(m_floatimage, n_pixels); + + resize(m_maskimage, n_pixels); kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); @@ -152,80 +160,5 @@ namespace simtbx { namespace Kokkos { kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); fence(); } - - template<> - void - kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - template<> - void - kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - resize(m_rangemap, m_active_pixel_size); - resize(m_omega_reduction, m_active_pixel_size); - resize(m_max_I_x_reduction, m_active_pixel_size); - resize(m_max_I_y_reduction, m_active_pixel_size); - resize(m_floatimage, m_active_pixel_size); - resize(m_accumulate_floatimage, m_active_pixel_size); - fence(); - } - - template<> af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - hello(); - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); - - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - 
- auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - template<> af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - SCITBX_CHECK_POINT; - hello(); - //return the data array for the multipanel detector case, but only for whitelist pixels - - std::size_t output_pixel_size = selection.size(); - //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - //auto temp = m_accumulate_floatimage; - - //parallel_for("get_active_pixel_selection2", - // range_policy(0, output_pixel_size), - // KOKKOS_LAMBDA (const int i) { - // active_pixel_results( i ) = temp( i ); - //}); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - SCITBX_CHECK_POINT; - kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results); - SCITBX_CHECK_POINT; - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - } // Kokkos } // simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index 74d286ffbe..dc1f276a2d 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -23,6 +23,7 @@ using vec3 = kokkostbx::vector3; using mat3 = kokkostbx::matrix3; using Kokkos::fence; + namespace simtbx { namespace Kokkos { namespace af = scitbx::af; @@ -45,8 +46,7 @@ struct large_array_policy {}; struct small_whitelist_policy {}; template -struct kokkos_detector -{ +struct kokkos_detector{ inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");}; //kokkos_detector(int const&, 
const simtbx::nanoBragg::nanoBragg& nB); //kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &); @@ -56,12 +56,12 @@ struct kokkos_detector std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl; metrology.show(); } - //void each_image_allocate(); + void each_image_allocate(const std::size_t&); //void scale_in_place(const double&); //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); //af::flex_double get_raw_pixels(); //void set_active_pixels_on_GPU(af::shared); - af::shared get_whitelist_raw_pixels(af::shared); + //af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -155,8 +155,6 @@ struct kokkos_detector return view_floatimage; }; - void each_image_allocate(); - inline void scale_in_place(const double& factor){ auto local_accumulate_floatimage = m_accumulate_floatimage; @@ -165,8 +163,6 @@ struct kokkos_detector }); } - void set_active_pixels_on_GPU(af::shared active_pixel_list_value); - inline void write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { //only implement the monolithic detector case, one panel @@ -203,11 +199,41 @@ struct kokkos_detector return output_array; } - void hello(); + inline void + set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } -}; + inline af::shared + get_whitelist_raw_pixels(af::shared selection) { + printf("algorithm: %20s selection size %10d\n",hello().c_str(), selection.size()); + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = 
selection.size(); + vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + auto temp = m_accumulate_floatimage; + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + + std::string hello(); +}; } // Kokkos } // simtbx #endif // SIMTBX_KOKKOS_DETECTOR_H diff --git a/simtbx/kokkos/kokkos_ext.cpp b/simtbx/kokkos/kokkos_ext.cpp index 3ef9f4e2ce..ca6d2dc096 100644 --- a/simtbx/kokkos/kokkos_ext.cpp +++ b/simtbx/kokkos/kokkos_ext.cpp @@ -87,6 +87,7 @@ namespace simtbx { namespace Kokkos { .def("show_summary",&simtbx::Kokkos::kokkos_detector::show_summary) .def("each_image_allocate", &simtbx::Kokkos::kokkos_detector::each_image_allocate, + ( arg_("n_pixels")=0 ), "Allocate large pixel arrays") .def("scale_in_place", &simtbx::Kokkos::kokkos_detector::scale_in_place, "Multiply by a scale factor on the GPU") @@ -95,7 +96,6 @@ namespace simtbx { namespace Kokkos { .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector::get_raw_pixels, "return multipanel detector raw pixels as a flex array") .def("get_whitelist_raw_pixels", - (af::shared (simtbx::Kokkos::kokkos_detector::*)(af::shared)) &simtbx::Kokkos::kokkos_detector::get_whitelist_raw_pixels, "return only those raw pixels requested by the whitelist selection, as a 1D flex array") .def("each_image_free", &simtbx::Kokkos::kokkos_detector::each_image_free) diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index cf2d79dd4e..65d3979e86 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -211,7 +211,8 @@ namespace Kokkos { //don't want to free 
the kec data when the nanoBragg goes out of scope, so switch the pointer // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + //for the small_whitelist specialization, have a special version of add_array() that specifies size + add_array_limit(kdt.m_accumulate_floatimage, kdt.m_floatimage, kdt.m_floatimage.span()); }// loop over channels } diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index dd78e00242..580a184b04 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -955,6 +955,14 @@ void add_array( view_1d_t lhs, const view_1d_t rhs ) { }); } +template +void add_array_limit( view_1d_t lhs, const view_1d_t rhs, const std::size_t& limit ) { + Kokkos::parallel_for("add_arrays", limit, KOKKOS_LAMBDA(const int& i) { + lhs( i ) = lhs( i ) + (T)rhs( i ); + rhs( i ) = 0; + }); +} + void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int override_source, CUDAREAL pixel_size, int spixels, int fpixels, int detector_thicksteps, CUDAREAL detector_thickstep, CUDAREAL detector_attnlen, diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py index 177d6e5a97..fe1629314c 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -215,7 +215,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg self.gpu_simulation.allocate() self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)( deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM) - self.gpu_detector.each_image_allocate() + + self.gpu_detector.each_image_allocate(n_pixels = whitelist_pixels.size() ) # self.gpu_detector.show_summary() assert sources @@ -233,9 +234,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg per_image_scale_factor = self.domains_per_crystal # 1.0 self.gpu_detector.scale_in_place(per_image_scale_factor) # 
apply scale directly on GPU self.reset_pythony_beams(self.SIM) - print("AAA") - self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels) - print("BBB") + whitelist_idx = flex.size_t(range(whitelist_pixels.size())) + self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_idx) def get_whitelist_from_refls(prefix,SIM=None): #image_size = len(SIM.raw_pixels) @@ -347,7 +347,7 @@ def run_all(params): # Now reproduce whitelist sims showing accumulation of large persistent memory SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration",x) + print("\nWhitelist-only iteration",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) @@ -365,7 +365,7 @@ def run_all(params): # Reproduce whitelist sims with small-memory mechanism SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration with small memory",x) + print("\nWhitelist-only iteration with small memory",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) #produce an output image file for intermediate debugging @@ -407,25 +407,15 @@ def run_subset_for_NESAP_debug(params): # Reproduce whitelist sims with small-memory mechanism SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration with small memory",x) + print("\n Whitelist-only iteration with small memory",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) - #produce an output image file for intermediate debugging - working_raw_pixels = flex.double(image_size) # blank array - 
working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values) - working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus())) - - free_gpu_before = get_gpu_memory()[0] - del SWCs - free_gpu_after = get_gpu_memory()[0] - new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS - print(new_memory_use,"free") if __name__=="__main__": params,options = parse_input() # Initialize based on GPU context gpu_instance_type = get_exascale("gpu_instance", params.context) gpu_instance = gpu_instance_type(deviceId = 0) - #run_all(params) - run_subset_for_NESAP_debug(params) + run_all(params) + #run_subset_for_NESAP_debug(params) print("OK") From fa0295b3344da50d1575a10f39e67d7a6a9a6645 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Wed, 22 May 2024 10:07:31 -0700 Subject: [PATCH 4/4] Remove debugging output to conserve stdout size. --- simtbx/kokkos/detector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index dc1f276a2d..85529fdc5c 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -208,7 +208,7 @@ struct kokkos_detector{ inline af::shared get_whitelist_raw_pixels(af::shared selection) { - printf("algorithm: %20s selection size %10d\n",hello().c_str(), selection.size()); + //printf("algorithm: %20s selection size %10d\n",hello().c_str(), selection.size()); //return the data array for the multipanel detector case, but only for whitelist pixels vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);