From e94da3ab29cd1d3956896b1c4e9851c13259f832 Mon Sep 17 00:00:00 2001
From: Nicholas K Sauter <nksauter@lbl.gov>
Date: Tue, 9 Apr 2024 15:59:59 -0700
Subject: [PATCH] Debug case for non-working test.

The script tests/tst_memory_policy.py fails with a cuda illegal access.
The intention is to get help from NESAP to get a functional test.
---
 simtbx/kokkos/detector.cpp         | 135 +++++++++++++
 simtbx/kokkos/detector.h           |  82 +-------
 simtbx/kokkos/simulation.cpp       | 173 +++++++++++++++++
 simtbx/kokkos/simulation.h         |  89 +--------
 simtbx/kokkos/simulation_kernels.h | 300 +++++++++++++++++++++++++++++
 simtbx/tests/tst_memory_policy.py  |  56 +++++-
 6 files changed, 664 insertions(+), 171 deletions(-)

diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp
index 233b97c7ff6..c4ef795d725 100644
--- a/simtbx/kokkos/detector.cpp
+++ b/simtbx/kokkos/detector.cpp
@@ -92,5 +92,140 @@ namespace simtbx { namespace Kokkos {
     }
   }
 
+  template<>
+  void kokkos_detector<small_whitelist_policy>::hello(){
+    SCITBX_EXAMINE("small small small");
+  }
+  template<>
+  void kokkos_detector<large_array_policy>::hello(){
+    SCITBX_EXAMINE("large large large");
+  }
+
+  template<> void
+  kokkos_detector<large_array_policy>::each_image_allocate() {
+    resize(m_rangemap, m_total_pixel_count);
+    resize(m_omega_reduction, m_total_pixel_count);
+    resize(m_max_I_x_reduction, m_total_pixel_count);
+    resize(m_max_I_y_reduction, m_total_pixel_count);
+    resize(m_floatimage, m_total_pixel_count);
+
+    resize(m_maskimage, m_total_pixel_count);
+    kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet);
+    kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet);
+    kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet);
+    kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0);
+    kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists);
+    kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam);
+    kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam);
+    fence();
+
+    // metrology.show();
+
+    // printf(" rangemap size:%d\n", m_rangemap.span());
+    // printf(" omega_reduction size:%d\n", m_omega_reduction.span());
+    // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span());
+    // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span());
+    // printf(" maskimage size:%d\n", m_maskimage.span());
+    // printf(" floatimage size:%d\n", m_floatimage.span());
+    // printf(" sdet_vector size:%d\n", m_sdet_vector.span());
+    // printf(" fdet_vector size:%d\n", m_fdet_vector.span());
+    // printf(" odet_vector size:%d\n", m_odet_vector.span());
+    // printf(" pix0_vector size:%d\n", m_pix0_vector.span());
+    // printf(" distance size:%d\n", m_distance.span());
+    // printf(" Xbeam size:%d\n", m_Xbeam.span());
+    // printf(" Ybeam size:%d\n", m_Ybeam.span());
+
+    // print_view(m_fdet_vector);
+    // print_view(m_odet_vector, 1, 3);
+
+    // printf("DONE.\n");
+  }
+  template<> void
+  kokkos_detector<small_whitelist_policy>::each_image_allocate() {
+    resize(m_maskimage, m_total_pixel_count);
+    kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet);
+    kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet);
+    kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet);
+    kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0);
+    kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists);
+    kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam);
+    kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam);
+    fence();
+  }
+
+  template<>
+  void
+  kokkos_detector<large_array_policy>::set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
+    m_active_pixel_size = active_pixel_list_value.size();
+    kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
+    active_pixel_list = active_pixel_list_value;
+  }
+
+  template<>
+  void
+  kokkos_detector<small_whitelist_policy>::set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
+    m_active_pixel_size = active_pixel_list_value.size();
+    kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
+    active_pixel_list = active_pixel_list_value;
+    resize(m_rangemap, m_active_pixel_size);
+    resize(m_omega_reduction, m_active_pixel_size);
+    resize(m_max_I_x_reduction, m_active_pixel_size);
+    resize(m_max_I_y_reduction, m_active_pixel_size);
+    resize(m_floatimage, m_active_pixel_size);
+    resize(m_accumulate_floatimage, m_active_pixel_size);
+    fence();
+  }
+
+  template<> af::shared<double>
+  kokkos_detector<large_array_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
+    hello();
+    //return the data array for the multipanel detector case, but only for whitelist pixels
+    vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size());
+    kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);
+
+    size_t output_pixel_size = selection.size();
+    vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
+
+    auto temp = m_accumulate_floatimage;
+
+    parallel_for("get_active_pixel_selection",
+                  range_policy(0, output_pixel_size),
+                  KOKKOS_LAMBDA (const int i) {
+      size_t index = active_pixel_selection( i );
+      active_pixel_results( i ) = temp( index );
+    });
+
+    af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
+    kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results);
+
+    SCITBX_ASSERT(output_array.size() == output_pixel_size);
+    return output_array;
+  }
+  template<> af::shared<double>
+  kokkos_detector<small_whitelist_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
+    SCITBX_CHECK_POINT;
+    hello();
+    //return the data array for the multipanel detector case, but only for whitelist pixels
+
+    std::size_t output_pixel_size = selection.size();
+    //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
+
+    //auto temp = m_accumulate_floatimage;
+
+    //parallel_for("get_active_pixel_selection2",
+    //              range_policy(0, output_pixel_size),
+    //              KOKKOS_LAMBDA (const int i) {
+    //  active_pixel_results( i ) = temp( i );
+    //});
+
+    af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
+    SCITBX_CHECK_POINT;
+    kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results);
+    SCITBX_CHECK_POINT;
+
+    SCITBX_ASSERT(output_array.size() == output_pixel_size);
+    return output_array;
+  }
+
 } // Kokkos
 } // simtbx
diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h
index c7149eaeb40..74d286ffbed 100644
--- a/simtbx/kokkos/detector.h
+++ b/simtbx/kokkos/detector.h
@@ -41,10 +41,8 @@ struct packed_metrology{
   af::shared<double>Ybeam;
 };
 
-struct large_array_policy {
-};
-struct small_whitelist_policy {
-};
+struct large_array_policy {};
+struct small_whitelist_policy {};
 
 template <typename MemoryPolicy>
 struct kokkos_detector
@@ -63,7 +61,7 @@ struct kokkos_detector
   //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&);
   //af::flex_double get_raw_pixels();
   //void set_active_pixels_on_GPU(af::shared<std::size_t>);
-  //af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
+  af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
   inline void each_image_free(){} //no op in Kokkos
   int h_deviceID;
 
@@ -157,46 +155,7 @@ struct kokkos_detector
     return view_floatimage;
   };
 
-  inline void
-  each_image_allocate() {
-    resize(m_rangemap, m_total_pixel_count);
-    resize(m_omega_reduction, m_total_pixel_count);
-    resize(m_max_I_x_reduction, m_total_pixel_count);
-    resize(m_max_I_y_reduction, m_total_pixel_count);
-
-    resize(m_maskimage, m_total_pixel_count);
-    resize(m_floatimage, m_total_pixel_count);
-
-    kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet);
-    kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet);
-    kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet);
-    kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0);
-    kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists);
-    kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam);
-    kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam);
-    fence();
-
-    // metrology.show();
-
-    // printf(" rangemap size:%d\n", m_rangemap.span());
-    // printf(" omega_reduction size:%d\n", m_omega_reduction.span());
-    // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span());
-    // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span());
-    // printf(" maskimage size:%d\n", m_maskimage.span());
-    // printf(" floatimage size:%d\n", m_floatimage.span());
-    // printf(" sdet_vector size:%d\n", m_sdet_vector.span());
-    // printf(" fdet_vector size:%d\n", m_fdet_vector.span());
-    // printf(" odet_vector size:%d\n", m_odet_vector.span());
-    // printf(" pix0_vector size:%d\n", m_pix0_vector.span());
-    // printf(" distance size:%d\n", m_distance.span());
-    // printf(" Xbeam size:%d\n", m_Xbeam.span());
-    // printf(" Ybeam size:%d\n", m_Ybeam.span());
-
-    // print_view(m_fdet_vector);
-    // print_view(m_odet_vector, 1, 3);
-
-    // printf("DONE.\n");
-  }
+  void each_image_allocate();
 
   inline void
   scale_in_place(const double& factor){
@@ -206,6 +165,8 @@ struct kokkos_detector
     });
   }
 
+  void set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value);
+
   inline void
   write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) {
     //only implement the monolithic detector case, one panel
@@ -242,37 +203,8 @@ struct kokkos_detector
     return output_array;
   }
 
-  inline void
-  set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
-    m_active_pixel_size = active_pixel_list_value.size();
-    kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
-    active_pixel_list = active_pixel_list_value;
-  }
-
-  inline af::shared<double>
-  get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
-    //return the data array for the multipanel detector case, but only for whitelist pixels
-    vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size());
-    kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);
+  void hello();
 
-    size_t output_pixel_size = selection.size();
-    vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
-
-    auto temp = m_accumulate_floatimage;
-
-    parallel_for("get_active_pixel_selection",
-                  range_policy(0, output_pixel_size),
-                  KOKKOS_LAMBDA (const int i) {
-      size_t index = active_pixel_selection( i );
-      active_pixel_results( i ) = temp( index );
-    });
-
-    af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
-    kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results);
-
-    SCITBX_ASSERT(output_array.size() == output_pixel_size);
-    return output_array;
-  }
 };
 
 
diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp
index b4fd96861b1..cf2d79dd4e1 100644
--- a/simtbx/kokkos/simulation.cpp
+++ b/simtbx/kokkos/simulation.cpp
@@ -4,6 +4,7 @@
 #include "simtbx/kokkos/simulation_kernels.h"
 #include "kokkostbx/kokkos_utils.h"
 #include "scitbx/array_family/flex_types.h"
+#include "simtbx/kokkos/detector.h"
 
 #define THREADS_PER_BLOCK_X 128
 #define THREADS_PER_BLOCK_Y 1
@@ -42,5 +43,177 @@ namespace Kokkos {
   }
 */
 
+  template<> void
+  exascale_api<large_array_policy>::add_energy_multichannel_mask_allpanel(
+    af::shared<int> const ichannels,
+    simtbx::Kokkos::kokkos_energy_channels & kec,
+    simtbx::Kokkos::kokkos_detector<large_array_policy> & kdt,
+    af::shared<std::size_t> const active_pixel_list,
+    af::shared<double> const weight
+  ){
+    //SCITBX_CHECK_POINT; // tested by tst_unified.py
+    kdt.set_active_pixels_on_GPU(active_pixel_list);
+
+    // transfer source_I, source_lambda
+    // the int arguments are for sizes of the arrays
+
+    SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs
+    the simulation where to look for structure factors.  If -1, skip this source wavelength. */
+    SCITBX_ASSERT(SIM.sources == weight.size());
+
+    for (int ictr = 0; ictr < SIM.sources; ++ictr){
+      if (ichannels[ictr] < 0) continue; // the ichannel array
+      //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr],
+      //  SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr],
+      //  SIM.source_I[ictr], SIM.source_lambda[ictr]);
+
+      ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3);
+      calc_CrystalOrientations(
+      SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread,
+      SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation);
+
+      // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration:
+      vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]];
+
+      vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0);
+      vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0);
+      vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0);
+      vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0);
+      vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0);
+      double weighted_I = SIM.source_I[ictr] * weight[ictr];
+      kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1);
+      kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1);
+      kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1);
+      kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1);
+      kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1);
+
+      // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3)
+      diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3];
+
+      debranch_maskall_Kernel(
+      kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(),
+      SIM.oversample, SIM.point_pixel,
+      SIM.pixel_size, m_subpixel_size, m_steps,
+      SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen,
+      kdt.m_sdet_vector,
+      kdt.m_fdet_vector,
+      kdt.m_odet_vector,
+      kdt.m_pix0_vector,
+      kdt.m_distance,
+      m_beam_vector,
+      SIM.dmin, SIM.phisteps, 1,
+      c_source_X, c_source_Y,
+      c_source_Z,
+      c_source_I, c_source_lambda,
+      SIM.mosaic_domains, m_crystal_orientation,
+      SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell,
+      m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr,
+      SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form,
+      SIM.default_F,
+      current_channel_Fhkl,
+      kec.m_FhklParams,
+      SIM.nopolar,
+      m_polar_vector, SIM.polarization, SIM.fudge,
+      kdt.m_active_pixel_list,
+      diffuse,
+      // return arrays:
+      kdt.m_floatimage,
+      kdt.m_omega_reduction,
+      kdt.m_max_I_x_reduction,
+      kdt.m_max_I_y_reduction,
+      kdt.m_rangemap);
+    //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer
+    // cu_current_channel_Fhkl = NULL;
+
+      add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage);
+    }// loop over channels
+  }
+
+  template<> void
+  exascale_api<small_whitelist_policy>::add_energy_multichannel_mask_allpanel(
+    af::shared<int> const ichannels,
+    simtbx::Kokkos::kokkos_energy_channels & kec,
+    simtbx::Kokkos::kokkos_detector<small_whitelist_policy> & kdt,
+    af::shared<std::size_t> const active_pixel_list,
+    af::shared<double> const weight
+  ){
+    //SCITBX_CHECK_POINT; // tested by tst_memory_policy.py
+    kdt.set_active_pixels_on_GPU(active_pixel_list);
+
+    // transfer source_I, source_lambda
+    // the int arguments are for sizes of the arrays
+
+    SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs
+    the simulation where to look for structure factors.  If -1, skip this source wavelength. */
+    SCITBX_ASSERT(SIM.sources == weight.size());
+
+    for (int ictr = 0; ictr < SIM.sources; ++ictr){
+      if (ichannels[ictr] < 0) continue; // the ichannel array
+      //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr],
+      //  SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr],
+      //  SIM.source_I[ictr], SIM.source_lambda[ictr]);
+
+      ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3);
+      calc_CrystalOrientations(
+      SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread,
+      SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation);
+
+      // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration:
+      vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]];
+
+      vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0);
+      vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0);
+      vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0);
+      vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0);
+      vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0);
+      double weighted_I = SIM.source_I[ictr] * weight[ictr];
+      kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1);
+      kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1);
+      kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1);
+      kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1);
+      kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1);
+
+      // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3)
+      diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3];
+
+      debranch_maskall_low_memory_Kernel(
+      kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(),
+      SIM.oversample, SIM.point_pixel,
+      SIM.pixel_size, m_subpixel_size, m_steps,
+      SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen,
+      kdt.m_sdet_vector,
+      kdt.m_fdet_vector,
+      kdt.m_odet_vector,
+      kdt.m_pix0_vector,
+      kdt.m_distance,
+      m_beam_vector,
+      SIM.dmin, SIM.phisteps, 1,
+      c_source_X, c_source_Y,
+      c_source_Z,
+      c_source_I, c_source_lambda,
+      SIM.mosaic_domains, m_crystal_orientation,
+      SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell,
+      m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr,
+      SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form,
+      SIM.default_F,
+      current_channel_Fhkl,
+      kec.m_FhklParams,
+      SIM.nopolar,
+      m_polar_vector, SIM.polarization, SIM.fudge,
+      kdt.m_active_pixel_list,
+      diffuse,
+      // return arrays:
+      kdt.m_floatimage,
+      kdt.m_omega_reduction,
+      kdt.m_max_I_x_reduction,
+      kdt.m_max_I_y_reduction,
+      kdt.m_rangemap);
+    //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer
+    // cu_current_channel_Fhkl = NULL;
+
+      add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage);
+    }// loop over channels
+  }
+
 } // Kokkos
 } // simtbx
diff --git a/simtbx/kokkos/simulation.h b/simtbx/kokkos/simulation.h
index 9b9d6a65522..9b69b4ce954 100644
--- a/simtbx/kokkos/simulation.h
+++ b/simtbx/kokkos/simulation.h
@@ -86,12 +86,12 @@ struct exascale_api {
     simtbx::Kokkos::kokkos_detector<MemoryPolicy> &,
     af::shared<std::size_t> const
   ); */
-  /*void add_energy_multichannel_mask_allpanel(af::shared<int> const,
+  void add_energy_multichannel_mask_allpanel(af::shared<int> const,
     simtbx::Kokkos::kokkos_energy_channels &,
     simtbx::Kokkos::kokkos_detector<MemoryPolicy> &,
     af::shared<std::size_t> const,
     af::shared<double> const
-  ); */
+  );
 
   //void add_background(simtbx::Kokkos::kokkos_detector<MemoryPolicy> &, int const&);
   //af::flex_double add_noise(simtbx::Kokkos::kokkos_detector<MemoryPolicy> &);
@@ -330,91 +330,6 @@ struct exascale_api {
     add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage);
   }
 
-  inline void
-  add_energy_multichannel_mask_allpanel(
-    af::shared<int> const ichannels,
-    simtbx::Kokkos::kokkos_energy_channels & kec,
-    simtbx::Kokkos::kokkos_detector<MemoryPolicy> & kdt,
-    af::shared<std::size_t> const active_pixel_list,
-    af::shared<double> const weight
-  ){
-    //SCITBX_CHECK_POINT; // tested by tst_unified.py
-    kdt.set_active_pixels_on_GPU(active_pixel_list);
-
-    // transfer source_I, source_lambda
-    // the int arguments are for sizes of the arrays
-
-    SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs
-    the simulation where to look for structure factors.  If -1, skip this source wavelength. */
-    SCITBX_ASSERT(SIM.sources == weight.size());
-
-    for (int ictr = 0; ictr < SIM.sources; ++ictr){
-      if (ichannels[ictr] < 0) continue; // the ichannel array
-      //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr],
-      //  SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr],
-      //  SIM.source_I[ictr], SIM.source_lambda[ictr]);
-
-      ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3);
-      calc_CrystalOrientations(
-      SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread,
-      SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation);
-
-      // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration:
-      vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]];
-
-      vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0);
-      vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0);
-      vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0);
-      vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0);
-      vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0);
-      double weighted_I = SIM.source_I[ictr] * weight[ictr];
-      kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1);
-      kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1);
-      kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1);
-      kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1);
-      kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1);
-
-      // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3)
-      diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3];
-
-      debranch_maskall_Kernel(
-      kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(),
-      SIM.oversample, SIM.point_pixel,
-      SIM.pixel_size, m_subpixel_size, m_steps,
-      SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen,
-      kdt.m_sdet_vector,
-      kdt.m_fdet_vector,
-      kdt.m_odet_vector,
-      kdt.m_pix0_vector,
-      kdt.m_distance,
-      m_beam_vector,
-      SIM.dmin, SIM.phisteps, 1,
-      c_source_X, c_source_Y,
-      c_source_Z,
-      c_source_I, c_source_lambda,
-      SIM.mosaic_domains, m_crystal_orientation,
-      SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell,
-      m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr,
-      SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form,
-      SIM.default_F,
-      current_channel_Fhkl,
-      kec.m_FhklParams,
-      SIM.nopolar,
-      m_polar_vector, SIM.polarization, SIM.fudge,
-      kdt.m_active_pixel_list,
-      diffuse,
-      // return arrays:
-      kdt.m_floatimage,
-      kdt.m_omega_reduction,
-      kdt.m_max_I_x_reduction,
-      kdt.m_max_I_y_reduction,
-      kdt.m_rangemap);
-    //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer
-    // cu_current_channel_Fhkl = NULL;
-      add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage);
-    }// loop over channels
-  }
-
   inline void
   add_background(simtbx::Kokkos::kokkos_detector<MemoryPolicy> & kdt, int const& override_source) {
         // cudaSafeCall(cudaSetDevice(SIM.device_Id));
diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h
index 7c1fa9eb21b..dd78e002421 100644
--- a/simtbx/kokkos/simulation_kernels.h
+++ b/simtbx/kokkos/simulation_kernels.h
@@ -643,6 +643,306 @@ void debranch_maskall_Kernel(int npanels, int spixels, int fpixels, int total_pi
         });
 }
 
+void debranch_maskall_low_memory_Kernel(int npanels, int spixels, int fpixels, int total_pixels,
+    int oversample, int point_pixel,
+    CUDAREAL pixel_size, CUDAREAL subpixel_size, int steps,
+    CUDAREAL detector_thickstep, int detector_thicksteps, CUDAREAL detector_thick, CUDAREAL detector_mu,
+    const view_1d_t<vec3> sdet_vector, const view_1d_t<vec3> fdet_vector,
+    const view_1d_t<vec3> odet_vector, const view_1d_t<vec3> pix0_vector,
+    const vector_cudareal_t close_distance,
+    const vector_cudareal_t beam_vector,
+    CUDAREAL dmin, int phisteps, int sources,
+    const vector_cudareal_t source_X, const vector_cudareal_t source_Y,
+    const vector_cudareal_t source_Z,
+    const vector_cudareal_t source_I, const vector_cudareal_t source_lambda,
+    int mosaic_domains, crystal_orientation_t crystal_orientation,
+    CUDAREAL Na, CUDAREAL Nb, CUDAREAL Nc, CUDAREAL V_cell,
+    CUDAREAL water_size, CUDAREAL water_F, CUDAREAL water_MW, CUDAREAL r_e_sqr,
+    CUDAREAL fluence, CUDAREAL Avogadro, CUDAREAL spot_scale, int integral_form,
+    CUDAREAL default_F,
+    const vector_cudareal_t Fhkl,
+    const hklParams FhklParams,
+    int nopolar,
+    const vector_cudareal_t polar_vector, CUDAREAL polarization, CUDAREAL fudge,
+    const vector_size_t pixel_lookup,
+    simtbx::Kokkos::diffuse_api diffuse,
+    vector_float_t floatimage /*out*/,
+    vector_float_t omega_reduction /*out*/, vector_float_t max_I_x_reduction /*out*/,
+    vector_float_t max_I_y_reduction /*out*/, vector_bool_t rangemap) {
+
+
+        const int s_h_min = FhklParams.h_min;
+        const int s_k_min = FhklParams.k_min;
+        const int s_l_min = FhklParams.l_min;
+        const int s_h_range = FhklParams.h_range;
+        const int s_k_range = FhklParams.k_range;
+        const int s_l_range = FhklParams.l_range;
+        const int s_h_max = s_h_min + s_h_range - 1;
+        const int s_k_max = s_k_min + s_k_range - 1;
+        const int s_l_max = s_l_min + s_l_range - 1;
+
+        // set up diffuse scattering if needed
+        vector_mat3_t laue_mats = vector_mat3_t("laue_mats",24);
+        vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace",3);
+        vector_vec3_t dG_dgam = vector_vec3_t("dG_dgam",3);
+        int num_laue_mats = 0;
+        int dhh = 0, dkk = 0, dll = 0;
+        KOKKOS_MAT3 rotate_principal_axes = diffuse.rotate_principal_axes; // (1,0,0,0,1,0,0,0,1);
+        int laue_group_num = diffuse.laue_group_num;
+        int stencil_size = diffuse.stencil_size;
+        KOKKOS_MAT3 anisoG = diffuse.anisoG; // (300.,0,0,0,100.,0,0,0,300.);
+        KOKKOS_MAT3 anisoU = diffuse.anisoU; // (0.48,0,0,0,0.16,0,0,0,0.16);
+        KOKKOS_MAT3 Bmat_realspace(1.,0,0,0,1.,0,0,0,1.); // Placeholder
+        KOKKOS_MAT3 anisoG_local;
+        CUDAREAL anisoG_determ = 0;
+        KOKKOS_MAT3 anisoU_local;
+        bool use_diffuse=diffuse.enable;
+        // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3
+        vector_mat3_t diffuse_scale_mat3 = vector_mat3_t("diffuse_scale_mat3",1);
+
+        if (use_diffuse){
+          anisoG_local = anisoG;
+          anisoU_local = anisoU;
+
+          if (laue_group_num < 1 || laue_group_num >14 ){
+            throw std::string("Laue group number not in range 1-14");
+          }
+
+          Kokkos::parallel_reduce("prepare diffuse mats", 1, KOKKOS_LAMBDA (const int& i, int& num_laue_mats_temp){
+            num_laue_mats_temp = gen_laue_mats(laue_group_num, laue_mats, rotate_principal_axes);
+            // KOKKOS_MAT3 rotate_principal_axes;
+            // rotate_principal_axes << 0.70710678, -0.70710678, 0., 0.70710678, 0.70710678, 0., 0., 0., 1.;
+
+            KOKKOS_MAT3 Amatrix(diffuse.a0[0],diffuse.a0[1],diffuse.a0[2],diffuse.b0[0],diffuse.b0[1],diffuse.b0[2],
+                                diffuse.c0[0],diffuse.c0[1],diffuse.c0[2]);
+            KOKKOS_MAT3 Ainv = Amatrix.inverse()*1.e-10;
+            CUDAREAL reciprocal_space_volume = 8*M_PI*M_PI*M_PI*Ainv.determinant();
+            CUDAREAL _tmpfac = M_PI * 0.63 / fudge;
+            CUDAREAL diffuse_scale = reciprocal_space_volume * sqrt(_tmpfac*_tmpfac*_tmpfac);
+
+            // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3
+            diffuse_scale_mat3(0)(0,0) = diffuse_scale;
+
+            for ( int iL = 0; iL < num_laue_mats_temp; iL++ ){
+              laue_mats(iL) = Ainv * laue_mats(iL);
+            }
+            const KOKKOS_MAT3 Ginv = anisoG_local.inverse();
+            const KOKKOS_MAT3 dG = Bmat_realspace * Ginv;
+            for (int i_gam=0; i_gam<3; i_gam++){
+              dG_dgam(i_gam)[i_gam] = 1;
+              KOKKOS_MAT3 temp_dgam;
+              temp_dgam(i_gam, 0) = dG_dgam(i_gam)[0];
+              temp_dgam(i_gam, 1) = dG_dgam(i_gam)[1];
+              temp_dgam(i_gam, 2) = dG_dgam(i_gam)[2];
+              dG_trace(i_gam) = (Ginv*temp_dgam).trace();
+            }
+          }, num_laue_mats);
+          anisoG_determ = anisoG_local.determinant();
+          dhh = dkk = dll = stencil_size; // Limits of stencil for diffuse calc
+        }
+        KOKKOS_VEC3 dHH(dhh,dkk,dll);
+        // end of diffuse setup
+
+// Implementation notes.  This kernel is aggressively debranched, therefore the assumptions are:
+// 1) mosaicity non-zero positive
+// 2) xtal shape is "Gauss" i.e. 3D spheroid.
+// 3) No bounds check for access to the structure factor array.
+// 4) No check for Flatt=0.
+        const CUDAREAL dmin_r = (dmin > 0.0) ? 1/dmin : 0.0;
+
+        // add background from something amorphous, precalculate scaling
+        const CUDAREAL F_bg = water_F;
+        const CUDAREAL I_bg = F_bg * F_bg * r_e_sqr * fluence * water_size * water_size * water_size * 1e6 * Avogadro / water_MW;
+        const CUDAREAL I_factor = r_e_sqr * spot_scale * fluence / steps;
+
+        Kokkos::parallel_for("debranch_maskall", total_pixels, KOKKOS_LAMBDA(const int& pixIdx) {
+
+                vec3 polar_vector_tmp {polar_vector(1), polar_vector(2), polar_vector(3)};
+
+                // position in pixel array
+                const int j = pixel_lookup(pixIdx);//pixIdx: index into pixel subset; j: index into the data.
+                const int i_panel = j / (fpixels*spixels); // the panel number
+                const int j_panel = j % (fpixels*spixels); // the pixel number within the panel
+                const int fpixel = j_panel % fpixels;
+                const int spixel = j_panel / fpixels;
+                const int outIdx = pixIdx;
+
+                // reset photon count for this pixel
+                CUDAREAL I = I_bg;
+                CUDAREAL omega_sub_reduction = 0.0;
+                CUDAREAL max_I_x_sub_reduction = 0.0;
+                CUDAREAL max_I_y_sub_reduction = 0.0;
+                CUDAREAL polar = 0.0;
+                if (nopolar) {
+                        polar = 1.0;
+                }
+
+                // add this now to avoid problems with skipping later
+                // move this to the bottom to avoid accessing global device memory. floatimage[j] = I_bg;
+                // loop over sub-pixels
+                int subS, subF;
+                for (subS = 0; subS < oversample; ++subS) { // Y voxel
+                        for (subF = 0; subF < oversample; ++subF) { // X voxel
+                                // absolute mm position on detector (relative to its origin)
+                                CUDAREAL Fdet = subpixel_size * (fpixel * oversample + subF) + subpixel_size / 2.0; // X voxel
+                                CUDAREAL Sdet = subpixel_size * (spixel * oversample + subS) + subpixel_size / 2.0; // Y voxel
+                                // Fdet = pixel_size*fpixel;
+                                // Sdet = pixel_size*spixel;
+
+                                max_I_x_sub_reduction = Fdet;
+                                max_I_y_sub_reduction = Sdet;
+
+                                int thick_tic;
+                                for (thick_tic = 0; thick_tic < detector_thicksteps; ++thick_tic) {
+                                        // assume "distance" is to the front of the detector sensor layer
+                                        CUDAREAL Odet = thick_tic * detector_thickstep; // Z Orthagonal voxel.
+
+                                        // construct detector subpixel position in 3D space
+                                        //                      pixel_X = distance;
+                                        //                      pixel_Y = Sdet-Ybeam;
+                                        //                      pixel_Z = Fdet-Xbeam;
+                                        vec3 pixel_pos;
+                                        pixel_pos += Fdet * fdet_vector(i_panel);
+                                        pixel_pos += Sdet * sdet_vector(i_panel);
+                                        pixel_pos += Odet * odet_vector(i_panel);
+                                        pixel_pos += pix0_vector(i_panel);
+
+                                        // construct the diffracted-beam unit vector to this sub-pixel
+                                        CUDAREAL airpath_r = 1 / pixel_pos.length();
+                                        vec3 diffracted = pixel_pos.get_unit_vector();
+
+                                        // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta)
+                                        CUDAREAL omega_pixel = pixel_size * pixel_size * airpath_r * airpath_r * close_distance(i_panel) * airpath_r;
+                                        // option to turn off obliquity effect, inverse-square-law only
+                                        if (point_pixel) {
+                                                omega_pixel = airpath_r * airpath_r;
+                                        }
+
+                                        // now calculate detector thickness effects
+                                        CUDAREAL capture_fraction = 1.0;
+                                        if (detector_thick > 0.0 && detector_mu> 0.0) {
+                                                // inverse of effective thickness increase
+                                                CUDAREAL parallax = odet_vector(i_panel).dot(diffracted);
+                                                capture_fraction = exp(-thick_tic * detector_thickstep / detector_mu / parallax)
+                                                                - exp(-(thick_tic + 1) * detector_thickstep / detector_mu / parallax);
+                                        }
+
+                                        // loop over sources now
+                                        int source;
+                                        for (source = 0; source < sources; ++source) {
+
+                                                // retrieve stuff from cache
+                                                vec3 incident = {-source_X(source), -source_Y(source), -source_Z(source)};
+                                                CUDAREAL lambda = source_lambda(source);
+                                                CUDAREAL source_fraction = source_I(source);
+
+                                                // construct the incident beam unit vector while recovering source distance
+                                                // TODO[Giles]: Optimization! We can unitize the source vectors before passing them in.
+                                                incident.normalize();
+
+                                                // construct the scattering vector for this pixel
+                                                vec3 scattering = (diffracted - incident) / lambda;
+                                                CUDAREAL stol = 0.5 * scattering.length();
+
+                                                // rough cut to speed things up when we aren't using whole detector
+                                                if (dmin > 0.0 && stol > 0.0) {
+                                                        // use reciprocal of (dmin > 0.5 / stol)
+                                                        if (dmin_r <= 2 * stol) {
+                                                                continue;
+                                                        }
+                                                }
+
+                                                // polarization factor
+                                                if (!nopolar) {
+                                                        // need to compute polarization factor
+                                                        polar = polarization_factor(polarization, incident, diffracted, polar_vector_tmp);
+                                                } else {
+                                                        polar = 1.0;
+                                                }
+
+                                                // sweep over phi angles
+                                                for (int phi_tic = 0; phi_tic < phisteps; ++phi_tic) {
+                                                        // enumerate mosaic domains
+                                                        for (int mos_tic = 0; mos_tic < mosaic_domains; ++mos_tic) {
+                                                                // apply mosaic rotation after phi rotation
+                                                                auto a = crystal_orientation(phi_tic, mos_tic, 0);
+                                                                auto b = crystal_orientation(phi_tic, mos_tic, 1);
+                                                                auto c = crystal_orientation(phi_tic, mos_tic, 2);
+
+                                                                // construct fractional Miller indicies
+                                                                CUDAREAL h = a.dot(scattering);
+                                                                CUDAREAL k = b.dot(scattering);
+                                                                CUDAREAL l = c.dot(scattering);
+
+                                                                // round off to nearest whole index
+                                                                int h0 = ceil(h - 0.5);
+                                                                int k0 = ceil(k - 0.5);
+                                                                int l0 = ceil(l - 0.5);
+
+                                                                // structure factor of the lattice (paralelpiped crystal)
+                                                                // F_latt = sin(M_PI*s_Na*h)*sin(M_PI*s_Nb*k)*sin(M_PI*s_Nc*l)/sin(M_PI*h)/sin(M_PI*k)/sin(M_PI*l);
+
+                                                                CUDAREAL I_latt = 1.0; // Shape transform for the crystal.
+                                                                CUDAREAL hrad_sqr = 0.0;
+
+                                                                // handy radius in reciprocal space, squared
+                                                                hrad_sqr = (h - h0) * (h - h0) * Na * Na + (k - k0) * (k - k0) * Nb * Nb + (l - l0) * (l - l0) * Nc * Nc;
+                                                                // fudge the radius so that volume and FWHM are similar to square_xtal spots
+                                                                double my_arg = hrad_sqr / 0.63 * fudge;
+                                                                {
+                                                                CUDAREAL F_latt = Na * Nb * Nc * exp(-(my_arg));
+                                                                I_latt = F_latt * F_latt;
+                                                                }
+
+                                                                // new code for diffuse.
+                                                                if (use_diffuse) {
+                                                                  KOKKOS_VEC3 H_vec(h,k,l);
+                                                                  KOKKOS_VEC3 H0(h0,k0,l0);
+                                                                  CUDAREAL step_diffuse_param[6];
+                                                                  // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3
+                                                                  calc_diffuse_at_hkl(H_vec,H0,dHH,s_h_min,s_k_min,s_l_min,s_h_max,s_k_max,
+                                                                    s_l_max,s_h_range,s_k_range,s_l_range,diffuse_scale_mat3(0),Fhkl,num_laue_mats,laue_mats,
+                                                                    anisoG_local,dG_trace,anisoG_determ,anisoU_local,dG_dgam,false,&I_latt,step_diffuse_param);
+                                                                }
+                                                                // end s_use_diffuse outer
+
+                                                                // structure factor of the unit cell
+                                                                CUDAREAL F_cell = default_F;
+                                                                //F_cell = quickFcell_ldg(s_hkls, s_h_max, s_h_min, s_k_max, s_k_min, s_l_max, s_l_min, h0, k0, l0, s_h_range, s_k_range, s_l_range, default_F, Fhkl);
+                                                                if (
+                                                                        h0 < s_h_min ||
+                                                                        k0 < s_k_min ||
+                                                                        l0 < s_l_min ||
+                                                                        h0 > s_h_max ||
+                                                                        k0 > s_k_max ||
+                                                                        l0 > s_l_max
+                                                                        ) {
+                                                                        F_cell = 0.;
+                                                                } else {
+                                                                        const int hkl_index = (h0-s_h_min)*s_k_range*s_l_range + (k0-s_k_min)*s_l_range + (l0-s_l_min);
+                                                                        F_cell = Fhkl[hkl_index];
+                                                                }
+
+                                                                // now we have the structure factor for this pixel
+
+                                                                // convert amplitudes into intensity (photons per steradian)
+                                                                I += F_cell * F_cell * I_latt * source_fraction * capture_fraction * omega_pixel;
+                                                                omega_sub_reduction += omega_pixel;
+                                                        } // end of mosaic loop
+                                                } // end of phi loop
+                                        } // end of source loop
+                                } // end of detector thickness loop
+                        } // end of sub-pixel y loop
+                } // end of sub-pixel x loop
+                const double photons = I_bg + I_factor * polar * I;
+                floatimage( outIdx ) = photons;
+                //omega_reduction( outIdx ) = omega_sub_reduction; // shared contention
+                //max_I_x_reduction( outIdx ) = max_I_x_sub_reduction;
+                //max_I_y_reduction( outIdx ) = max_I_y_sub_reduction;
+                //rangemap( outIdx ) = true;
+        });
+}
+
 // __global__ void nanoBraggSpotsInitCUDAKernel(int spixels, int fpixesl, float * floatimage, float * omega_reduction,
 //                 float * max_I_x_reduction,
 //                 float * max_I_y_reduction, bool * rangemap);
diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py
index dbb94cee5ed..177d6e5a97e 100644
--- a/simtbx/tests/tst_memory_policy.py
+++ b/simtbx/tests/tst_memory_policy.py
@@ -233,10 +233,12 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg
     per_image_scale_factor = self.domains_per_crystal # 1.0
     self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU
     self.reset_pythony_beams(self.SIM)
+    print("AAA")
     self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels)
+    print("BBB")
 
-def get_whitelist_from_refls(prefix,SIM):
-    image_size = len(SIM.raw_pixels)
+def get_whitelist_from_refls(prefix,SIM=None):
+    #image_size = len(SIM.raw_pixels)
     from dials.array_family import flex
     from dxtbx.model.experiment_list import ExperimentListFactory
     experiments = ExperimentListFactory.from_json_file("%s_imported.expt"%prefix, check_format = False)
@@ -353,8 +355,9 @@ def run_all(params):
   free_gpu_before = get_gpu_memory()[0]
   del SWCs
   free_gpu_after = get_gpu_memory()[0]
-  print((free_gpu_after - free_gpu_before)/NTRIALS,"free")
-  assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB
+  old_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS
+  print(old_memory_use,"free")
+  assert old_memory_use >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB
 
   #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size
   #accomplish the same with compile-time polymorphism
@@ -378,16 +381,51 @@ def run_all(params):
   free_gpu_before = get_gpu_memory()[0]
   del SWCs
   free_gpu_after = get_gpu_memory()[0]
-  print((free_gpu_after - free_gpu_before)/NTRIALS,"free")
-  assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB
+  new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS
+  print(new_memory_use,"free")
+  assert old_memory_use > 4.*new_memory_use # new policy cuts down at least 4-fold on memory, actual value **
 
-  return
+def run_subset_for_NESAP_debug(params):
+  # make the dxtbx objects
+  BEAM = basic_beam()
+  DETECTOR = basic_detector()
+  CRYSTAL = basic_crystal()
+  SF_model = amplitudes(CRYSTAL)
+  # Famp = SF_model.Famp # simple uniform amplitudes
+  SF_model.random_structure(CRYSTAL)
+  SF_model.ersatz_correct_to_P1()
+  whitelist_pixels = flex.size_t((11929,
+ 351293,351294,351295,352828,352829,352830,352831,354364,354365,
+ 354366,354367,355900,355901,355902,355903,357436,357437,357438,
+ 357439,352383,352384,352385,353919,353920,353921,355455,355456,
+ 355457,356991,356992,356993,354120,354121,354122,355656,355657))
+
+  NTRIALS=5
+  #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size
+  #accomplish the same with compile-time polymorphism
+  #have side-by-side test in same python script
+  # Reproduce whitelist sims with small-memory mechanism
+  SWCs=[]
+  for x in range(NTRIALS):
+    print("Whitelist-only iteration with small memory",x)
+    SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
+    SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
+  #produce an output image file for intermediate debugging
+  working_raw_pixels = flex.double(image_size) # blank array
+  working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values)
+  working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus()))
+
+  free_gpu_before = get_gpu_memory()[0]
+  del SWCs
+  free_gpu_after = get_gpu_memory()[0]
+  new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS
+  print(new_memory_use,"free")
 
 if __name__=="__main__":
   params,options = parse_input()
   # Initialize based on GPU context
   gpu_instance_type = get_exascale("gpu_instance", params.context)
   gpu_instance = gpu_instance_type(deviceId = 0)
-
-  run_all(params)
+  #run_all(params)
+  run_subset_for_NESAP_debug(params)
 print("OK")