Minimal changes for working tst_memory_policy.py

Memory savings are achieved through code specialization, for the case where pixel values are simulated only on a small whitelist. The specializations are not yet optimal, as there is still a lot of code duplication. The changes give a ~4.5x reduction in memory footprint, but resizing the array m_accumulate_floatimage has not yet succeeded: attempts so far lead to a CUDA memory allocation error.
cctbx · Jun 13, 2024 · 144537d · 144537d
1 parent e94da3a
commit 144537d
Show file tree

Hide file tree

Showing 6 changed files with 71 additions and 113 deletions.
diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp
@@ -93,16 +93,16 @@ namespace simtbx { namespace Kokkos {
   }
 
   template<>
-  void kokkos_detector<small_whitelist_policy>::hello(){
-    SCITBX_EXAMINE("small small small");
+  std::string kokkos_detector<small_whitelist_policy>::hello(){
+    return("small small small");
   }
   template<>
-  void kokkos_detector<large_array_policy>::hello(){
-    SCITBX_EXAMINE("large large large");
+  std::string kokkos_detector<large_array_policy>::hello(){
+    return("large large large");
   }
 
   template<> void
-  kokkos_detector<large_array_policy>::each_image_allocate() {
+  kokkos_detector<large_array_policy>::each_image_allocate(const std::size_t& n_pixels) {
     resize(m_rangemap, m_total_pixel_count);
     resize(m_omega_reduction, m_total_pixel_count);
     resize(m_max_I_x_reduction, m_total_pixel_count);
@@ -140,9 +140,17 @@ namespace simtbx { namespace Kokkos {
 
     // printf("DONE.\n");
   }
+
   template<> void
-  kokkos_detector<small_whitelist_policy>::each_image_allocate() {
-    resize(m_maskimage, m_total_pixel_count);
+  kokkos_detector<small_whitelist_policy>::each_image_allocate(const std::size_t& n_pixels) {
+    SCITBX_ASSERT(n_pixels > 0);
+    resize(m_rangemap, n_pixels);
+    resize(m_omega_reduction, n_pixels);
+    resize(m_max_I_x_reduction, n_pixels);
+    resize(m_max_I_y_reduction, n_pixels);
+    resize(m_floatimage, n_pixels);
+
+    resize(m_maskimage, n_pixels);
     kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet);
     kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet);
     kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet);
@@ -152,80 +160,5 @@ namespace simtbx { namespace Kokkos {
     kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam);
     fence();
   }
-
-  template<>
-  void
-  kokkos_detector<large_array_policy>::set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
-    m_active_pixel_size = active_pixel_list_value.size();
-    kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
-    active_pixel_list = active_pixel_list_value;
-  }
-
-  template<>
-  void
-  kokkos_detector<small_whitelist_policy>::set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
-    m_active_pixel_size = active_pixel_list_value.size();
-    kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
-    active_pixel_list = active_pixel_list_value;
-    resize(m_rangemap, m_active_pixel_size);
-    resize(m_omega_reduction, m_active_pixel_size);
-    resize(m_max_I_x_reduction, m_active_pixel_size);
-    resize(m_max_I_y_reduction, m_active_pixel_size);
-    resize(m_floatimage, m_active_pixel_size);
-    resize(m_accumulate_floatimage, m_active_pixel_size);
-    fence();
-  }
-
-  template<> af::shared<double>
-  kokkos_detector<large_array_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
-    hello();
-    //return the data array for the multipanel detector case, but only for whitelist pixels
-    vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size());
-    kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);
-
-    size_t output_pixel_size = selection.size();
-    vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
-
-    auto temp = m_accumulate_floatimage;
-
-    parallel_for("get_active_pixel_selection",
-                  range_policy(0, output_pixel_size),
-                  KOKKOS_LAMBDA (const int i) {
-      size_t index = active_pixel_selection( i );
-      active_pixel_results( i ) = temp( index );
-    });
-
-    af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
-    kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results);
-
-    SCITBX_ASSERT(output_array.size() == output_pixel_size);
-    return output_array;
-  }
-  template<> af::shared<double>
-  kokkos_detector<small_whitelist_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
-    SCITBX_CHECK_POINT;
-    hello();
-    //return the data array for the multipanel detector case, but only for whitelist pixels
-
-    std::size_t output_pixel_size = selection.size();
-    //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
-
-    //auto temp = m_accumulate_floatimage;
-
-    //parallel_for("get_active_pixel_selection2",
-    //              range_policy(0, output_pixel_size),
-    //              KOKKOS_LAMBDA (const int i) {
-    //  active_pixel_results( i ) = temp( i );
-    //});
-
-    af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
-    SCITBX_CHECK_POINT;
-    kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results);
-    SCITBX_CHECK_POINT;
-
-    SCITBX_ASSERT(output_array.size() == output_pixel_size);
-    return output_array;
-  }
-
 } // Kokkos
 } // simtbx
diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h
@@ -23,6 +23,7 @@ using vec3 = kokkostbx::vector3<CUDAREAL>;
 using mat3 = kokkostbx::matrix3<CUDAREAL>;
 using Kokkos::fence;
 
+
 namespace simtbx { namespace Kokkos {
 
 namespace af = scitbx::af;
@@ -45,8 +46,7 @@ struct large_array_policy {};
 struct small_whitelist_policy {};
 
 template <typename MemoryPolicy>
-struct kokkos_detector
-{
+struct kokkos_detector{
   inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");};
   //kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB);
   //kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &);
@@ -56,12 +56,12 @@ struct kokkos_detector
     std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl;
     metrology.show();
   }
-  //void each_image_allocate();
+  void each_image_allocate(const std::size_t&);
   //void scale_in_place(const double&);
   //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&);
   //af::flex_double get_raw_pixels();
   //void set_active_pixels_on_GPU(af::shared<std::size_t>);
-  af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
+  //af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
   inline void each_image_free(){} //no op in Kokkos
   int h_deviceID;
 
@@ -155,8 +155,6 @@ struct kokkos_detector
     return view_floatimage;
   };
 
-  void each_image_allocate();
-
   inline void
   scale_in_place(const double& factor){
     auto local_accumulate_floatimage = m_accumulate_floatimage;
@@ -165,8 +163,6 @@ struct kokkos_detector
     });
   }
 
-  void set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value);
-
   inline void
   write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) {
     //only implement the monolithic detector case, one panel
@@ -203,11 +199,41 @@ struct kokkos_detector
     return output_array;
   }
 
-  void hello();
+  inline void
+  set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
+    m_active_pixel_size = active_pixel_list_value.size();
+    kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
+    active_pixel_list = active_pixel_list_value;
+  }
 
-};
+  inline af::shared<double>
+  get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
+    printf("algorithm: %20s selection size %10zu\n",hello().c_str(), static_cast<std::size_t>(selection.size())); // %zu: the size is unsigned, %d was a format mismatch
+    //return the data array for the multipanel detector case, but only for whitelist pixels: output(i) = m_accumulate_floatimage(selection[i]), in selection order
+    vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size());
+    kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);
+
+    size_t output_pixel_size = selection.size();
+    vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
 
+    auto temp = m_accumulate_floatimage;
 
+    parallel_for("get_active_pixel_selection",
+                  range_policy(0, output_pixel_size),
+                  KOKKOS_LAMBDA (const int i) {
+      size_t index = active_pixel_selection( i );
+      active_pixel_results( i ) = temp( index );
+    });
+
+    af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
+    kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results);
+
+    SCITBX_ASSERT(output_array.size() == output_pixel_size);
+    return output_array;
+  }
+
+  std::string hello();
+};
 } // Kokkos
 } // simtbx
 #endif // SIMTBX_KOKKOS_DETECTOR_H
diff --git a/simtbx/kokkos/kokkos_ext.cpp b/simtbx/kokkos/kokkos_ext.cpp
@@ -87,6 +87,7 @@ namespace simtbx { namespace Kokkos {
         .def("show_summary",&simtbx::Kokkos::kokkos_detector<memory_t>::show_summary)
         .def("each_image_allocate",
               &simtbx::Kokkos::kokkos_detector<memory_t>::each_image_allocate,
+              ( arg_("n_pixels")=0 ),
              "Allocate large pixel arrays")
         .def("scale_in_place", &simtbx::Kokkos::kokkos_detector<memory_t>::scale_in_place,
              "Multiply by a scale factor on the GPU")
@@ -95,7 +96,6 @@ namespace simtbx { namespace Kokkos {
         .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector<memory_t>::get_raw_pixels,
              "return multipanel detector raw pixels as a flex array")
         .def("get_whitelist_raw_pixels",
-             (af::shared<double> (simtbx::Kokkos::kokkos_detector<memory_t>::*)(af::shared<std::size_t>))
              &simtbx::Kokkos::kokkos_detector<memory_t>::get_whitelist_raw_pixels,
             "return only those raw pixels requested by the whitelist selection, as a 1D flex array")
         .def("each_image_free", &simtbx::Kokkos::kokkos_detector<memory_t>::each_image_free)

diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp
@@ -211,7 +211,8 @@ namespace Kokkos {
     //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer
     // cu_current_channel_Fhkl = NULL;
 
-      add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage);
+      //for the small_whitelist specialization, have a special version of add_array() that specifies size
+      add_array_limit(kdt.m_accumulate_floatimage, kdt.m_floatimage, kdt.m_floatimage.span());
     }// loop over channels
   }
 

diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h
@@ -955,6 +955,14 @@ void add_array( view_1d_t<T> lhs, const view_1d_t<U> rhs ) {
   });
 }
 
+template <typename T, typename U>
+void add_array_limit( view_1d_t<T> lhs, const view_1d_t<U> rhs, const std::size_t& limit ) {
+  Kokkos::parallel_for("add_arrays", limit, KOKKOS_LAMBDA(const int& i) {
+    lhs( i ) = lhs( i ) + (T)rhs( i );
+    rhs( i ) = 0;
+  });
+}
+
 void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int override_source,
     CUDAREAL pixel_size, int spixels, int fpixels, int detector_thicksteps,
     CUDAREAL detector_thickstep, CUDAREAL detector_attnlen,

diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py
@@ -215,7 +215,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg
     self.gpu_simulation.allocate()
     self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)(
                  deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM)
-    self.gpu_detector.each_image_allocate()
+
+    self.gpu_detector.each_image_allocate(n_pixels = whitelist_pixels.size() )
     # self.gpu_detector.show_summary()
 
     assert sources
@@ -233,9 +234,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg
     per_image_scale_factor = self.domains_per_crystal # 1.0
     self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU
     self.reset_pythony_beams(self.SIM)
-    print("AAA")
-    self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels)
-    print("BBB")
+    whitelist_idx = flex.size_t(range(whitelist_pixels.size()))
+    self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_idx)
 
 def get_whitelist_from_refls(prefix,SIM=None):
     #image_size = len(SIM.raw_pixels)
@@ -347,7 +347,7 @@ def run_all(params):
   # Now reproduce whitelist sims showing accumulation of large persistent memory
   SWCs=[]
   for x in range(NTRIALS):
-    print("Whitelist-only iteration",x)
+    print("\nWhitelist-only iteration",x)
     SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
     SWCs[-1].specialized_api_for_whitelist(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
 
@@ -365,7 +365,7 @@ def run_all(params):
   # Reproduce whitelist sims with small-memory mechanism
   SWCs=[]
   for x in range(NTRIALS):
-    print("Whitelist-only iteration with small memory",x)
+    print("\nWhitelist-only iteration with small memory",x)
     SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
     SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
   #produce an output image file for intermediate debugging
@@ -407,25 +407,15 @@ def run_subset_for_NESAP_debug(params):
   # Reproduce whitelist sims with small-memory mechanism
   SWCs=[]
   for x in range(NTRIALS):
-    print("Whitelist-only iteration with small memory",x)
+    print("\n Whitelist-only iteration with small memory",x)
     SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
     SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
-  #produce an output image file for intermediate debugging
-  working_raw_pixels = flex.double(image_size) # blank array
-  working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values)
-  working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus()))
-
-  free_gpu_before = get_gpu_memory()[0]
-  del SWCs
-  free_gpu_after = get_gpu_memory()[0]
-  new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS
-  print(new_memory_use,"free")
 
 if __name__=="__main__":
   params,options = parse_input()
   # Initialize based on GPU context
   gpu_instance_type = get_exascale("gpu_instance", params.context)
   gpu_instance = gpu_instance_type(deviceId = 0)
-  #run_all(params)
-  run_subset_for_NESAP_debug(params)
+  run_all(params)
+  #run_subset_for_NESAP_debug(params)
 print("OK")