From 144537dbb2817a105cf66e116203df82f29649e8 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Sat, 20 Apr 2024 18:56:46 -0700 Subject: [PATCH] Minimal changes for working tst_memory_policy.py Memory savings achieved through code specialization, for the case where pixel values are simulated on a small whitelist. Specializations are not yet optimal, as there is still a lot of code duplication. Changes give ~4.5x reduction in memory footprint, but no success yet in resizing the array m_accumulate_floatimage. Attempts so far lead to cuda memory allocation error. --- simtbx/kokkos/detector.cpp | 97 +++++------------------------- simtbx/kokkos/detector.h | 46 +++++++++++--- simtbx/kokkos/kokkos_ext.cpp | 2 +- simtbx/kokkos/simulation.cpp | 3 +- simtbx/kokkos/simulation_kernels.h | 8 +++ simtbx/tests/tst_memory_policy.py | 28 +++------ 6 files changed, 71 insertions(+), 113 deletions(-) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index c4ef795d725..238ca568819 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -93,16 +93,16 @@ namespace simtbx { namespace Kokkos { } template<> - void kokkos_detector::hello(){ - SCITBX_EXAMINE("small small small"); + std::string kokkos_detector::hello(){ + return("small small small"); } template<> - void kokkos_detector::hello(){ - SCITBX_EXAMINE("large large large"); + std::string kokkos_detector::hello(){ + return("large large large"); } template<> void - kokkos_detector::each_image_allocate() { + kokkos_detector::each_image_allocate(const std::size_t& n_pixels) { resize(m_rangemap, m_total_pixel_count); resize(m_omega_reduction, m_total_pixel_count); resize(m_max_I_x_reduction, m_total_pixel_count); @@ -140,9 +140,17 @@ namespace simtbx { namespace Kokkos { // printf("DONE.\n"); } + template<> void - kokkos_detector::each_image_allocate() { - resize(m_maskimage, m_total_pixel_count); + kokkos_detector::each_image_allocate(const std::size_t& n_pixels) { + SCITBX_ASSERT(n_pixels > 0); + resize(m_rangemap, n_pixels); + resize(m_omega_reduction, n_pixels); + resize(m_max_I_x_reduction, n_pixels); + resize(m_max_I_y_reduction, n_pixels); + resize(m_floatimage, n_pixels); + + resize(m_maskimage, n_pixels); kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); @@ -152,80 +160,5 @@ namespace simtbx { namespace Kokkos { kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); fence(); } - - template<> - void - kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - template<> - void - kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - resize(m_rangemap, m_active_pixel_size); - resize(m_omega_reduction, m_active_pixel_size); - resize(m_max_I_x_reduction, m_active_pixel_size); - resize(m_max_I_y_reduction, m_active_pixel_size); - resize(m_floatimage, m_active_pixel_size); - resize(m_accumulate_floatimage, m_active_pixel_size); - fence(); - } - - template<> af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - hello(); - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); - - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - template<> af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - SCITBX_CHECK_POINT; - hello(); - //return the data array for the multipanel detector case, but only for whitelist pixels - - std::size_t output_pixel_size = selection.size(); - //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - //auto temp = m_accumulate_floatimage; - - //parallel_for("get_active_pixel_selection2", - // range_policy(0, output_pixel_size), - // KOKKOS_LAMBDA (const int i) { - // active_pixel_results( i ) = temp( i ); - //}); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - SCITBX_CHECK_POINT; - kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results); - SCITBX_CHECK_POINT; - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - } // Kokkos } // simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index 74d286ffbed..dc1f276a2df 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -23,6 +23,7 @@ using vec3 = kokkostbx::vector3; using mat3 = kokkostbx::matrix3; using Kokkos::fence; + namespace simtbx { namespace Kokkos { namespace af = scitbx::af; @@ -45,8 +46,7 @@ struct large_array_policy {}; struct small_whitelist_policy {}; template -struct kokkos_detector -{ +struct kokkos_detector{ inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");}; //kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB); //kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &); @@ -56,12 +56,12 @@ struct kokkos_detector std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl; metrology.show(); } - //void each_image_allocate(); + void each_image_allocate(const std::size_t&); //void scale_in_place(const double&); //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); //af::flex_double get_raw_pixels(); //void set_active_pixels_on_GPU(af::shared); - af::shared get_whitelist_raw_pixels(af::shared); + //af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -155,8 +155,6 @@ struct kokkos_detector return view_floatimage; }; - void each_image_allocate(); - inline void scale_in_place(const double& factor){ auto local_accumulate_floatimage = m_accumulate_floatimage; @@ -165,8 +163,6 @@ struct kokkos_detector }); } - void set_active_pixels_on_GPU(af::shared active_pixel_list_value); - inline void write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { //only implement the monolithic detector case, one panel @@ -203,11 +199,41 @@ struct kokkos_detector return output_array; } - void hello(); + inline void + set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } -}; + inline af::shared + get_whitelist_raw_pixels(af::shared selection) { + printf("algorithm: %20s selection size %10d\n",hello().c_str(), selection.size()); + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = selection.size(); + vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + auto temp = m_accumulate_floatimage; + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + + std::string hello(); +}; } // Kokkos } // simtbx #endif // SIMTBX_KOKKOS_DETECTOR_H diff --git a/simtbx/kokkos/kokkos_ext.cpp b/simtbx/kokkos/kokkos_ext.cpp index 3ef9f4e2ce5..ca6d2dc0967 100644 --- a/simtbx/kokkos/kokkos_ext.cpp +++ b/simtbx/kokkos/kokkos_ext.cpp @@ -87,6 +87,7 @@ namespace simtbx { namespace Kokkos { .def("show_summary",&simtbx::Kokkos::kokkos_detector::show_summary) .def("each_image_allocate", &simtbx::Kokkos::kokkos_detector::each_image_allocate, + ( arg_("n_pixels")=0 ), "Allocate large pixel arrays") .def("scale_in_place", &simtbx::Kokkos::kokkos_detector::scale_in_place, "Multiply by a scale factor on the GPU") @@ -95,7 +96,6 @@ namespace simtbx { namespace Kokkos { .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector::get_raw_pixels, "return multipanel detector raw pixels as a flex array") .def("get_whitelist_raw_pixels", - (af::shared (simtbx::Kokkos::kokkos_detector::*)(af::shared)) &simtbx::Kokkos::kokkos_detector::get_whitelist_raw_pixels, "return only those raw pixels requested by the whitelist selection, as a 1D flex array") .def("each_image_free", &simtbx::Kokkos::kokkos_detector::each_image_free) diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index cf2d79dd4e1..65d3979e868 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -211,7 +211,8 @@ namespace Kokkos { //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + //for the small_whitelist specialization, have a special version of add_array() that specifies size + add_array_limit(kdt.m_accumulate_floatimage, kdt.m_floatimage, kdt.m_floatimage.span()); }// loop over channels } diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index dd78e002421..580a184b04f 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -955,6 +955,14 @@ void add_array( view_1d_t lhs, const view_1d_t rhs ) { }); } +template +void add_array_limit( view_1d_t lhs, const view_1d_t rhs, const std::size_t& limit ) { + Kokkos::parallel_for("add_arrays", limit, KOKKOS_LAMBDA(const int& i) { + lhs( i ) = lhs( i ) + (T)rhs( i ); + rhs( i ) = 0; + }); +} + void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int override_source, CUDAREAL pixel_size, int spixels, int fpixels, int detector_thicksteps, CUDAREAL detector_thickstep, CUDAREAL detector_attnlen, diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py index 177d6e5a97e..fe1629314cf 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -215,7 +215,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg self.gpu_simulation.allocate() self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)( deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM) - self.gpu_detector.each_image_allocate() + + self.gpu_detector.each_image_allocate(n_pixels = whitelist_pixels.size() ) # self.gpu_detector.show_summary() assert sources @@ -233,9 +234,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg per_image_scale_factor = self.domains_per_crystal # 1.0 self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU self.reset_pythony_beams(self.SIM) - print("AAA") - self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels) - print("BBB") + whitelist_idx = flex.size_t(range(whitelist_pixels.size())) + self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_idx) def get_whitelist_from_refls(prefix,SIM=None): #image_size = len(SIM.raw_pixels) @@ -347,7 +347,7 @@ def run_all(params): # Now reproduce whitelist sims showing accumulation of large persistent memory SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration",x) + print("\nWhitelist-only iteration",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) @@ -365,7 +365,7 @@ def run_all(params): # Reproduce whitelist sims with small-memory mechanism SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration with small memory",x) + print("\nWhitelist-only iteration with small memory",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) #produce an output image file for intermediate debugging @@ -407,25 +407,15 @@ def run_subset_for_NESAP_debug(params): # Reproduce whitelist sims with small-memory mechanism SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration with small memory",x) + print("\n Whitelist-only iteration with small memory",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) - #produce an output image file for intermediate debugging - working_raw_pixels = flex.double(image_size) # blank array - working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values) - working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus())) - - free_gpu_before = get_gpu_memory()[0] - del SWCs - free_gpu_after = get_gpu_memory()[0] - new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS - print(new_memory_use,"free") if __name__=="__main__": params,options = parse_input() # Initialize based on GPU context gpu_instance_type = get_exascale("gpu_instance", params.context) gpu_instance = gpu_instance_type(deviceId = 0) - #run_all(params) - run_subset_for_NESAP_debug(params) + run_all(params) + #run_subset_for_NESAP_debug(params) print("OK")