Skip to content

Commit

Permalink
Minimal changes for working tst_memory_policy.py
Browse files Browse the repository at this point in the history
Memory savings are achieved through code specialization, for the case where
pixel values are simulated on a small whitelist. The specializations are not
yet optimal, as there is still a lot of code duplication.

The changes give a ~4.5x reduction in memory footprint, but there is no success
yet in resizing the array m_accumulate_floatimage. Attempts so far lead to a
CUDA memory allocation error.
  • Loading branch information
nksauter committed Jun 13, 2024
1 parent e94da3a commit 144537d
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 113 deletions.
97 changes: 15 additions & 82 deletions simtbx/kokkos/detector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,16 +93,16 @@ namespace simtbx { namespace Kokkos {
}

template<>
void kokkos_detector<small_whitelist_policy>::hello(){
SCITBX_EXAMINE("small small small");
std::string kokkos_detector<small_whitelist_policy>::hello(){
return("small small small");
}
template<>
void kokkos_detector<large_array_policy>::hello(){
SCITBX_EXAMINE("large large large");
std::string kokkos_detector<large_array_policy>::hello(){
return("large large large");
}

template<> void
kokkos_detector<large_array_policy>::each_image_allocate() {
kokkos_detector<large_array_policy>::each_image_allocate(const std::size_t& n_pixels) {
resize(m_rangemap, m_total_pixel_count);
resize(m_omega_reduction, m_total_pixel_count);
resize(m_max_I_x_reduction, m_total_pixel_count);
Expand Down Expand Up @@ -140,9 +140,17 @@ namespace simtbx { namespace Kokkos {

// printf("DONE.\n");
}

template<> void
kokkos_detector<small_whitelist_policy>::each_image_allocate() {
resize(m_maskimage, m_total_pixel_count);
kokkos_detector<small_whitelist_policy>::each_image_allocate(const std::size_t& n_pixels) {
SCITBX_ASSERT(n_pixels > 0);
resize(m_rangemap, n_pixels);
resize(m_omega_reduction, n_pixels);
resize(m_max_I_x_reduction, n_pixels);
resize(m_max_I_y_reduction, n_pixels);
resize(m_floatimage, n_pixels);

resize(m_maskimage, n_pixels);
kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet);
kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet);
kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet);
Expand All @@ -152,80 +160,5 @@ namespace simtbx { namespace Kokkos {
kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam);
fence();
}

// Explicit specialization for large_array_policy.
// Stores the caller's list of active pixel indices in this detector object.
//   active_pixel_list_value: the pixel indices to mark as active.
template<>
void kokkos_detector<large_array_policy>::set_active_pixels_on_GPU(
    af::shared<std::size_t> active_pixel_list_value)
{
  // Number of active pixels is simply the length of the supplied list.
  this->m_active_pixel_size = active_pixel_list_value.size();

  // Copy the indices into the Kokkos view m_active_pixel_list.
  kokkostbx::transfer_shared2kokkos(this->m_active_pixel_list, active_pixel_list_value);

  // Also keep the af::shared list itself in the active_pixel_list member.
  this->active_pixel_list = active_pixel_list_value;
}

// Explicit specialization for small_whitelist_policy.
// Stores the caller's list of active pixel indices and then resizes the
// per-pixel work arrays to the number of active pixels.
//   active_pixel_list_value: the pixel indices to mark as active.
template<>
void kokkos_detector<small_whitelist_policy>::set_active_pixels_on_GPU(
    af::shared<std::size_t> active_pixel_list_value)
{
  // Bookkeeping: list length, Kokkos copy of the list, and the af::shared list.
  this->m_active_pixel_size = active_pixel_list_value.size();
  kokkostbx::transfer_shared2kokkos(this->m_active_pixel_list, active_pixel_list_value);
  this->active_pixel_list = active_pixel_list_value;

  // Every per-pixel array gets one slot per active pixel.
  resize(m_rangemap,              m_active_pixel_size);
  resize(m_omega_reduction,       m_active_pixel_size);
  resize(m_max_I_x_reduction,     m_active_pixel_size);
  resize(m_max_I_y_reduction,     m_active_pixel_size);
  resize(m_floatimage,            m_active_pixel_size);
  resize(m_accumulate_floatimage, m_active_pixel_size);

  // Wait for outstanding Kokkos work before returning to the caller.
  fence();
}

// Explicit specialization for large_array_policy.
// Returns the data array for the multipanel detector case, restricted to the
// whitelist: element i of the result is m_accumulate_floatimage(selection[i]).
//   selection: indices into m_accumulate_floatimage to extract.
// Returns an af::shared<double> with selection.size() elements.
template<>
af::shared<double>
kokkos_detector<large_array_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection)
{
  hello();

  const size_t output_pixel_size = selection.size();

  // Kokkos copy of the requested indices.
  vector_size_t device_selection("active_pixel_selection", selection.size());
  kokkostbx::transfer_shared2kokkos(device_selection, selection);

  // Destination view for the gathered values.
  vector_cudareal_t device_results("active_pixel_results", output_pixel_size);

  // Local handle so the lambda captures the view rather than `this`.
  auto accumulated = m_accumulate_floatimage;

  // Gather: one iteration per requested pixel.
  parallel_for(
    "get_active_pixel_selection",
    range_policy(0, output_pixel_size),
    KOKKOS_LAMBDA (const int i) {
      const size_t source_index = device_selection( i );
      device_results( i ) = accumulated( source_index );
    });

  // Copy the gathered values into an af::shared array for the caller.
  af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
  kokkostbx::transfer_kokkos2shared(output_array, device_results);

  SCITBX_ASSERT(output_array.size() == output_pixel_size);
  return output_array;
}
// Explicit specialization for small_whitelist_policy.
// Returns the data array for the multipanel detector case, but only for
// whitelist pixels, by transferring m_accumulate_floatimage into an
// af::shared<double> allocated with selection.size() elements.
// NOTE(review): only selection.size() is read here; the index values stored in
// `selection` are never used -- confirm that callers expect this.
template<>
af::shared<double>
kokkos_detector<small_whitelist_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection)
{
  SCITBX_CHECK_POINT;
  hello();

  const std::size_t output_pixel_size = selection.size();

  // Disabled earlier draft, kept for reference: copy m_accumulate_floatimage
  // element-by-element into a temporary view "active_pixel_results" with a
  // parallel_for labelled "get_active_pixel_selection2", then transfer that
  // temporary. The accumulator is now transferred directly instead.

  af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
  SCITBX_CHECK_POINT;
  kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);
  SCITBX_CHECK_POINT;

  SCITBX_ASSERT(output_array.size() == output_pixel_size);
  return output_array;
}

} // Kokkos
} // simtbx
46 changes: 36 additions & 10 deletions simtbx/kokkos/detector.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ using vec3 = kokkostbx::vector3<CUDAREAL>;
using mat3 = kokkostbx::matrix3<CUDAREAL>;
using Kokkos::fence;


namespace simtbx { namespace Kokkos {

namespace af = scitbx::af;
Expand All @@ -45,8 +46,7 @@ struct large_array_policy {};
struct small_whitelist_policy {};

template <typename MemoryPolicy>
struct kokkos_detector
{
struct kokkos_detector{
inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");};
//kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB);
//kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &);
Expand All @@ -56,12 +56,12 @@ struct kokkos_detector
std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl;
metrology.show();
}
//void each_image_allocate();
void each_image_allocate(const std::size_t&);
//void scale_in_place(const double&);
//void write_raw_pixels(simtbx::nanoBragg::nanoBragg&);
//af::flex_double get_raw_pixels();
//void set_active_pixels_on_GPU(af::shared<std::size_t>);
af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
//af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
inline void each_image_free(){} //no op in Kokkos
int h_deviceID;

Expand Down Expand Up @@ -155,8 +155,6 @@ struct kokkos_detector
return view_floatimage;
};

void each_image_allocate();

inline void
scale_in_place(const double& factor){
auto local_accumulate_floatimage = m_accumulate_floatimage;
Expand All @@ -165,8 +163,6 @@ struct kokkos_detector
});
}

void set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value);

inline void
write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) {
//only implement the monolithic detector case, one panel
Expand Down Expand Up @@ -203,11 +199,41 @@ struct kokkos_detector
return output_array;
}

void hello();
// Stores the caller's list of active pixel indices in this detector object.
//   active_pixel_list_value: the pixel indices to mark as active.
inline void set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value)
{
  // Number of active pixels is the length of the supplied list.
  this->m_active_pixel_size = active_pixel_list_value.size();

  // Copy the indices into the Kokkos view m_active_pixel_list.
  kokkostbx::transfer_shared2kokkos(this->m_active_pixel_list, active_pixel_list_value);

  // Also keep the af::shared list itself in the active_pixel_list member.
  this->active_pixel_list = active_pixel_list_value;
}

};
// Returns the data array for the multipanel detector case, restricted to the
// whitelist: element i of the result is m_accumulate_floatimage(selection[i]).
// Also prints one diagnostic line with the policy name reported by hello() and
// the selection size.
//   selection: indices into m_accumulate_floatimage to extract.
// Returns an af::shared<double> with selection.size() elements.
inline af::shared<double>
get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
  // selection.size() is a std::size_t, so it must be printed with %zu;
  // the previous %d conversion was undefined behaviour for this argument type.
  printf("algorithm: %20s selection size %10zu\n", hello().c_str(), selection.size());

  // Kokkos copy of the requested indices.
  vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size());
  kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);

  size_t output_pixel_size = selection.size();
  vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);

  // Local handle so the lambda captures the view rather than `this`.
  auto temp = m_accumulate_floatimage;

  // Gather: one iteration per requested pixel.
  parallel_for("get_active_pixel_selection",
    range_policy(0, output_pixel_size),
    KOKKOS_LAMBDA (const int i) {
      size_t index = active_pixel_selection( i );
      active_pixel_results( i ) = temp( index );
    });

  // Copy the gathered values into an af::shared array for the caller.
  af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
  kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results);

  SCITBX_ASSERT(output_array.size() == output_pixel_size);
  return output_array;
}

std::string hello();
};
} // Kokkos
} // simtbx
#endif // SIMTBX_KOKKOS_DETECTOR_H
2 changes: 1 addition & 1 deletion simtbx/kokkos/kokkos_ext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ namespace simtbx { namespace Kokkos {
.def("show_summary",&simtbx::Kokkos::kokkos_detector<memory_t>::show_summary)
.def("each_image_allocate",
&simtbx::Kokkos::kokkos_detector<memory_t>::each_image_allocate,
( arg_("n_pixels")=0 ),
"Allocate large pixel arrays")
.def("scale_in_place", &simtbx::Kokkos::kokkos_detector<memory_t>::scale_in_place,
"Multiply by a scale factor on the GPU")
Expand All @@ -95,7 +96,6 @@ namespace simtbx { namespace Kokkos {
.def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector<memory_t>::get_raw_pixels,
"return multipanel detector raw pixels as a flex array")
.def("get_whitelist_raw_pixels",
(af::shared<double> (simtbx::Kokkos::kokkos_detector<memory_t>::*)(af::shared<std::size_t>))
&simtbx::Kokkos::kokkos_detector<memory_t>::get_whitelist_raw_pixels,
"return only those raw pixels requested by the whitelist selection, as a 1D flex array")
.def("each_image_free", &simtbx::Kokkos::kokkos_detector<memory_t>::each_image_free)
Expand Down
3 changes: 2 additions & 1 deletion simtbx/kokkos/simulation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ namespace Kokkos {
//don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer
// cu_current_channel_Fhkl = NULL;

add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage);
//for the small_whitelist specialization, have a special version of add_array() that specifies size
add_array_limit(kdt.m_accumulate_floatimage, kdt.m_floatimage, kdt.m_floatimage.span());
}// loop over channels
}

Expand Down
8 changes: 8 additions & 0 deletions simtbx/kokkos/simulation_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,14 @@ void add_array( view_1d_t<T> lhs, const view_1d_t<U> rhs ) {
});
}

// Element-wise accumulate over the first `limit` entries only:
// lhs(i) receives lhs(i) + rhs(i) converted to T, and rhs(i) is then reset to 0.
//   lhs:   view that accumulates the sum.
//   rhs:   view that is added in and then zeroed.
//   limit: number of leading elements to process.
// NOTE(review): no check is made that `limit` fits within either view --
// callers are presumably responsible for that; confirm.
template <typename T, typename U>
void add_array_limit(view_1d_t<T> lhs, const view_1d_t<U> rhs, const std::size_t& limit)
{
  Kokkos::parallel_for(
    "add_arrays",
    limit,
    KOKKOS_LAMBDA(const int& i) {
      lhs(i) += static_cast<T>(rhs(i));
      rhs(i) = 0;
    });
}

void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int override_source,
CUDAREAL pixel_size, int spixels, int fpixels, int detector_thicksteps,
CUDAREAL detector_thickstep, CUDAREAL detector_attnlen,
Expand Down
28 changes: 9 additions & 19 deletions simtbx/tests/tst_memory_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg
self.gpu_simulation.allocate()
self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)(
deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM)
self.gpu_detector.each_image_allocate()

self.gpu_detector.each_image_allocate(n_pixels = whitelist_pixels.size() )
# self.gpu_detector.show_summary()

assert sources
Expand All @@ -233,9 +234,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg
per_image_scale_factor = self.domains_per_crystal # 1.0
self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU
self.reset_pythony_beams(self.SIM)
print("AAA")
self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels)
print("BBB")
whitelist_idx = flex.size_t(range(whitelist_pixels.size()))
self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_idx)

def get_whitelist_from_refls(prefix,SIM=None):
#image_size = len(SIM.raw_pixels)
Expand Down Expand Up @@ -347,7 +347,7 @@ def run_all(params):
# Now reproduce whitelist sims showing accumulation of large persistent memory
SWCs=[]
for x in range(NTRIALS):
print("Whitelist-only iteration",x)
print("\nWhitelist-only iteration",x)
SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
SWCs[-1].specialized_api_for_whitelist(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)

Expand All @@ -365,7 +365,7 @@ def run_all(params):
# Reproduce whitelist sims with small-memory mechanism
SWCs=[]
for x in range(NTRIALS):
print("Whitelist-only iteration with small memory",x)
print("\nWhitelist-only iteration with small memory",x)
SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
#produce an output image file for intermediate debugging
Expand Down Expand Up @@ -407,25 +407,15 @@ def run_subset_for_NESAP_debug(params):
# Reproduce whitelist sims with small-memory mechanism
SWCs=[]
for x in range(NTRIALS):
print("Whitelist-only iteration with small memory",x)
print("\n Whitelist-only iteration with small memory",x)
SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
#produce an output image file for intermediate debugging
working_raw_pixels = flex.double(image_size) # blank array
working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values)
working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus()))

free_gpu_before = get_gpu_memory()[0]
del SWCs
free_gpu_after = get_gpu_memory()[0]
new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS
print(new_memory_use,"free")

if __name__=="__main__":
params,options = parse_input()
# Initialize based on GPU context
gpu_instance_type = get_exascale("gpu_instance", params.context)
gpu_instance = gpu_instance_type(deviceId = 0)
#run_all(params)
run_subset_for_NESAP_debug(params)
run_all(params)
#run_subset_for_NESAP_debug(params)
print("OK")

0 comments on commit 144537d

Please sign in to comment.