From 9d63b8a7eb6d762c60402a8c24116698e56c41c8 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Sun, 31 Mar 2024 14:38:50 -0700 Subject: [PATCH 1/4] Framework for compile-time polymorphism. In the exascale_api, allow pixel values to be calculated either on large array (all pixels), or with low-memory on just the whitelist consisting of shoebox pixels. This commit only gives the polymorphism framework; both implementations are currently identical giving the large-array behavior. --- simtbx/__init__.py | 7 +- simtbx/kokkos/SConscript | 4 + simtbx/kokkos/detector.cpp | 163 --------- simtbx/kokkos/detector.h | 199 ++++++++++- simtbx/kokkos/kokkos_ext.cpp | 69 ++-- simtbx/kokkos/simulation.cpp | 512 -------------------------- simtbx/kokkos/simulation.h | 557 ++++++++++++++++++++++++++++- simtbx/kokkos/simulation_kernels.h | 3 + simtbx/tests/tst_memory_policy.py | 4 +- 9 files changed, 781 insertions(+), 737 deletions(-) diff --git a/simtbx/__init__.py b/simtbx/__init__.py index c88f23d368..c1855fd4e9 100644 --- a/simtbx/__init__.py +++ b/simtbx/__init__.py @@ -3,11 +3,14 @@ def get_exascale(interface, context): if context == "kokkos_gpu": - from simtbx.kokkos import gpu_instance, gpu_energy_channels, gpu_detector, exascale_api + from simtbx.kokkos import gpu_instance, gpu_energy_channels, gpu_detector, gpu_detector_small_whitelist + from simtbx.kokkos import exascale_api, exascale_api_small_whitelist elif context == "cuda": from simtbx.gpu import gpu_instance, gpu_energy_channels, gpu_detector, exascale_api else: raise NotImplementedError(context) return dict(gpu_instance = gpu_instance, gpu_energy_channels = gpu_energy_channels, - gpu_detector = gpu_detector, exascale_api = exascale_api)[interface] + gpu_detector = gpu_detector, exascale_api = exascale_api, + gpu_detector_small_whitelist = locals().get("gpu_detector_small_whitelist"), + exascale_api_small_whitelist = locals().get("exascale_api_small_whitelist"))[interface] diff --git a/simtbx/kokkos/SConscript 
b/simtbx/kokkos/SConscript index bc72ee122d..7ff0a63aaf 100644 --- a/simtbx/kokkos/SConscript +++ b/simtbx/kokkos/SConscript @@ -168,6 +168,10 @@ if not env_etc.no_boost_python: env_etc.include_registry.append( env=kokkos_ext_env, paths=env_etc.simtbx_common_includes + [env_etc.python_include]) + if True: # same construct as above, temporarily accommodate the eigen library + env_etc.include_registry.append( + env=kokkos_ext_env, + paths=[env_etc.eigen_include]) kokkos_ext_env.Replace(CXX=os.environ['CXX']) kokkos_ext_env.Replace(SHCXX=os.environ['CXX']) kokkos_ext_env.Replace(SHLINK=os.environ['CXX']) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index 0c310551ca..233b97c7ff 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -92,168 +92,5 @@ namespace simtbx { namespace Kokkos { } } - vector_double_t - kokkos_detector::construct_detail(dxtbx::model::Detector const & arg_detector) { - //1) confirm the size - SCITBX_ASSERT( m_panel_count == arg_detector.size() ); - SCITBX_ASSERT( m_panel_count >= 1 ); - - //2) confirm that array dimensions are similar for each size - for (int ipanel=1; ipanel < arg_detector.size(); ++ipanel){ - SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[1] == m_slow_dim_size ); - SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[0] == m_fast_dim_size ); - } - // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); - // printf(" m_slow_dim_size: %d\n", m_slow_dim_size); - // printf(" m_fast_dim_size: %d\n", m_fast_dim_size); - // printf(" m_panel_count: %d\n", m_panel_count); - - //3) allocate a cuda array with these dimensions - // separate accumulator image outside the usual nanoBragg data structure. - // 1. accumulate contributions from a sequence of source energy channels computed separately - // 2. 
represent multiple panels, all same rectangular shape; slowest dimension = n_panels - vector_double_t view_floatimage( "m_accumulate_floatimage", m_total_pixel_count ); - return view_floatimage; - }; - - kokkos_detector::kokkos_detector(int const& arg_device, - dxtbx::model::Detector const & arg_detector, - dxtbx::model::Beam const& arg_beam): - h_deviceID(arg_device), - metrology(arg_detector, arg_beam), - m_panel_count( arg_detector.size() ), - m_slow_dim_size( arg_detector[0].get_image_size()[1] ), - m_fast_dim_size( arg_detector[0].get_image_size()[0] ), - m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), - m_accumulate_floatimage( construct_detail(arg_detector) ) { } - // Easy mistake: not realizing that the dxtbx detector model stores (fast,slow) sizes - - kokkos_detector::kokkos_detector(int const& arg_device, - const simtbx::nanoBragg::nanoBragg& nB): - h_deviceID(arg_device), - metrology(nB), - m_panel_count(1), - m_slow_dim_size(nB.spixels), - m_fast_dim_size(nB.fpixels), - m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), - m_accumulate_floatimage( vector_double_t( "m_accumulate_floatimage", m_total_pixel_count) ) { } - - void - kokkos_detector::scale_in_place(const double& factor){ - auto local_accumulate_floatimage = m_accumulate_floatimage; - parallel_for("scale_in_place", range_policy(0,m_total_pixel_count), KOKKOS_LAMBDA (const int i) { - local_accumulate_floatimage( i ) = local_accumulate_floatimage( i ) * factor; - }); - } - - void - kokkos_detector::write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { - //only implement the monolithic detector case, one panel - SCITBX_ASSERT(nB.spixels == m_slow_dim_size); - SCITBX_ASSERT(nB.fpixels == m_fast_dim_size); - SCITBX_ASSERT(m_panel_count == 1); - // nB.raw_pixels = af::flex_double(af::flex_grid<>(nB.spixels,nB.fpixels)); - // do not reallocate CPU memory for the data write, as it is not needed - - kokkostbx::transfer_kokkos2flex(nB.raw_pixels, 
m_accumulate_floatimage); - // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); - // deep_copy(host_floatimage, m_accumulate_floatimage); - - // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); - - // double * double_floatimage = nB.raw_pixels.begin(); - // for (int i=0; i(m_panel_count,m_slow_dim_size,m_fast_dim_size), af::init_functor_null()); - kokkostbx::transfer_kokkos2flex(output_array, m_accumulate_floatimage); - - // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); - // deep_copy(host_floatimage, m_accumulate_floatimage); - - // for (int i=0; i active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); - - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - - void - kokkos_detector::each_image_allocate() { - resize(m_rangemap, m_total_pixel_count); - resize(m_omega_reduction, 
m_total_pixel_count); - resize(m_max_I_x_reduction, m_total_pixel_count); - resize(m_max_I_y_reduction, m_total_pixel_count); - - resize(m_maskimage, m_total_pixel_count); - resize(m_floatimage, m_total_pixel_count); - - kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); - kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); - kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); - kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); - kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); - kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); - kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); - fence(); - - // metrology.show(); - - // printf(" rangemap size:%d\n", m_rangemap.span()); - // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); - // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); - // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); - // printf(" maskimage size:%d\n", m_maskimage.span()); - // printf(" floatimage size:%d\n", m_floatimage.span()); - // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); - // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); - // printf(" odet_vector size:%d\n", m_odet_vector.span()); - // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); - // printf(" distance size:%d\n", m_distance.span()); - // printf(" Xbeam size:%d\n", m_Xbeam.span()); - // printf(" Ybeam size:%d\n", m_Ybeam.span()); - - // print_view(m_fdet_vector); - // print_view(m_odet_vector, 1, 3); - - // printf("DONE.\n"); - } - } // Kokkos } // simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index 03e3913e17..c7149eaeb4 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -17,10 +17,11 @@ #include "kokkostbx/kokkos_types.h" #include "kokkostbx/kokkos_vector3.h" #include "kokkostbx/kokkos_matrix3.h" +#include "kokkostbx/kokkos_utils.h" using vec3 = kokkostbx::vector3; 
using mat3 = kokkostbx::matrix3; - +using Kokkos::fence; namespace simtbx { namespace Kokkos { @@ -40,22 +41,29 @@ struct packed_metrology{ af::sharedYbeam; }; -struct kokkos_detector{ +struct large_array_policy { +}; +struct small_whitelist_policy { +}; + +template +struct kokkos_detector +{ inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");}; - kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB); - kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &); - vector_double_t construct_detail(dxtbx::model::Detector const &); + //kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB); + //kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &); + //vector_double_t construct_detail(dxtbx::model::Detector const &); inline void show_summary(){ std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl; metrology.show(); } - void each_image_allocate(); - void scale_in_place(const double&); - void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); - af::flex_double get_raw_pixels(); - void set_active_pixels_on_GPU(af::shared); - af::shared get_whitelist_raw_pixels(af::shared); + //void each_image_allocate(); + //void scale_in_place(const double&); + //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); + //af::flex_double get_raw_pixels(); + //void set_active_pixels_on_GPU(af::shared); + //af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -99,8 +107,175 @@ struct kokkos_detector{ af::shared active_pixel_list; std::size_t m_active_pixel_size; vector_size_t m_active_pixel_list = vector_size_t("m_active_pixel_list", 0); + + inline + kokkos_detector(int const& arg_device, + const simtbx::nanoBragg::nanoBragg& nB): + h_deviceID(arg_device), + metrology(nB), + m_panel_count(1), + m_slow_dim_size(nB.spixels), + m_fast_dim_size(nB.fpixels), + 
m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), + m_accumulate_floatimage( vector_double_t( "m_accumulate_floatimage", m_total_pixel_count) ) { } + + inline + kokkos_detector(int const& arg_device, + dxtbx::model::Detector const & arg_detector, + dxtbx::model::Beam const& arg_beam): + h_deviceID(arg_device), + metrology(arg_detector, arg_beam), + m_panel_count( arg_detector.size() ), + m_slow_dim_size( arg_detector[0].get_image_size()[1] ), + m_fast_dim_size( arg_detector[0].get_image_size()[0] ), + m_total_pixel_count( m_panel_count * m_slow_dim_size * m_fast_dim_size ), + m_accumulate_floatimage( construct_detail(arg_detector) ) { } + // Easy mistake: not realizing that the dxtbx detector model stores (fast,slow) sizes + + inline + vector_double_t + construct_detail(dxtbx::model::Detector const & arg_detector) { + //1) confirm the size + SCITBX_ASSERT( m_panel_count == arg_detector.size() ); + SCITBX_ASSERT( m_panel_count >= 1 ); + + //2) confirm that array dimensions are similar for each size + for (int ipanel=1; ipanel < arg_detector.size(); ++ipanel){ + SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[1] == m_slow_dim_size ); + SCITBX_ASSERT( arg_detector[ipanel].get_image_size()[0] == m_fast_dim_size ); + } + // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); + // printf(" m_slow_dim_size: %d\n", m_slow_dim_size); + // printf(" m_fast_dim_size: %d\n", m_fast_dim_size); + // printf(" m_panel_count: %d\n", m_panel_count); + + //3) allocate a cuda array with these dimensions + // separate accumulator image outside the usual nanoBragg data structure. + // 1. accumulate contributions from a sequence of source energy channels computed separately + // 2. 
represent multiple panels, all same rectangular shape; slowest dimension = n_panels + vector_double_t view_floatimage( "m_accumulate_floatimage", m_total_pixel_count ); + return view_floatimage; + }; + + inline void + each_image_allocate() { + resize(m_rangemap, m_total_pixel_count); + resize(m_omega_reduction, m_total_pixel_count); + resize(m_max_I_x_reduction, m_total_pixel_count); + resize(m_max_I_y_reduction, m_total_pixel_count); + + resize(m_maskimage, m_total_pixel_count); + resize(m_floatimage, m_total_pixel_count); + + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + + // metrology.show(); + + // printf(" rangemap size:%d\n", m_rangemap.span()); + // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); + // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); + // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); + // printf(" maskimage size:%d\n", m_maskimage.span()); + // printf(" floatimage size:%d\n", m_floatimage.span()); + // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); + // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); + // printf(" odet_vector size:%d\n", m_odet_vector.span()); + // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); + // printf(" distance size:%d\n", m_distance.span()); + // printf(" Xbeam size:%d\n", m_Xbeam.span()); + // printf(" Ybeam size:%d\n", m_Ybeam.span()); + + // print_view(m_fdet_vector); + // print_view(m_odet_vector, 1, 3); + + // printf("DONE.\n"); + } + + inline void + scale_in_place(const double& factor){ + auto 
local_accumulate_floatimage = m_accumulate_floatimage; + parallel_for("scale_in_place", range_policy(0,m_total_pixel_count), KOKKOS_LAMBDA (const int i) { + local_accumulate_floatimage( i ) = local_accumulate_floatimage( i ) * factor; + }); + } + + inline void + write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { + //only implement the monolithic detector case, one panel + SCITBX_ASSERT(nB.spixels == m_slow_dim_size); + SCITBX_ASSERT(nB.fpixels == m_fast_dim_size); + SCITBX_ASSERT(m_panel_count == 1); + // nB.raw_pixels = af::flex_double(af::flex_grid<>(nB.spixels,nB.fpixels)); + // do not reallocate CPU memory for the data write, as it is not needed + + kokkostbx::transfer_kokkos2flex(nB.raw_pixels, m_accumulate_floatimage); + // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); + // deep_copy(host_floatimage, m_accumulate_floatimage); + + // printf(" m_total_pixel_count: %d\n", m_total_pixel_count); + + // double * double_floatimage = nB.raw_pixels.begin(); + // for (int i=0; i(m_panel_count,m_slow_dim_size,m_fast_dim_size), af::init_functor_null()); + kokkostbx::transfer_kokkos2flex(output_array, m_accumulate_floatimage); + + // vector_double_t::HostMirror host_floatimage = create_mirror_view(m_accumulate_floatimage); + // deep_copy(host_floatimage, m_accumulate_floatimage); + + // for (int i=0; i active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } + + inline af::shared + get_whitelist_raw_pixels(af::shared selection) { + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = selection.size(); + vector_cudareal_t 
active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + auto temp = m_accumulate_floatimage; + + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } }; + + } // Kokkos } // simtbx #endif // SIMTBX_KOKKOS_DETECTOR_H - diff --git a/simtbx/kokkos/kokkos_ext.cpp b/simtbx/kokkos/kokkos_ext.cpp index f630f9bd3e..3ef9f4e2ce 100644 --- a/simtbx/kokkos/kokkos_ext.cpp +++ b/simtbx/kokkos/kokkos_ext.cpp @@ -66,13 +66,14 @@ namespace simtbx { namespace Kokkos { } }; + template struct detector_wrapper { static void - wrap() + wrap(std::string pyname) { using namespace boost::python; - class_("gpu_detector",init<>() ) + class_ >(pyname.c_str(),init<>() ) .def(init( ( arg("deviceId")=-1,arg("nanoBragg")), "Single panel constructor with data taken from nanoBragg instance\n" @@ -83,21 +84,21 @@ namespace simtbx { namespace Kokkos { "Multipanel constructor with data taken from dxtbx objects\n" "deviceId is completely optional and ignored for Kokkos, rather\n" "the device is set by the gpu_instance class.")) - .def("show_summary",&simtbx::Kokkos::kokkos_detector::show_summary) + .def("show_summary",&simtbx::Kokkos::kokkos_detector::show_summary) .def("each_image_allocate", - &simtbx::Kokkos::kokkos_detector::each_image_allocate, + &simtbx::Kokkos::kokkos_detector::each_image_allocate, "Allocate large pixel arrays") - .def("scale_in_place", &simtbx::Kokkos::kokkos_detector::scale_in_place, + .def("scale_in_place", &simtbx::Kokkos::kokkos_detector::scale_in_place, "Multiply by a scale factor on the GPU") - 
.def("write_raw_pixels",&simtbx::Kokkos::kokkos_detector::write_raw_pixels, + .def("write_raw_pixels",&simtbx::Kokkos::kokkos_detector::write_raw_pixels, "Update raw_pixels on host with array from GPU") - .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector::get_raw_pixels, + .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector::get_raw_pixels, "return multipanel detector raw pixels as a flex array") .def("get_whitelist_raw_pixels", - (af::shared (simtbx::Kokkos::kokkos_detector::*)(af::shared)) - &simtbx::Kokkos::kokkos_detector::get_whitelist_raw_pixels, + (af::shared (simtbx::Kokkos::kokkos_detector::*)(af::shared)) + &simtbx::Kokkos::kokkos_detector::get_whitelist_raw_pixels, "return only those raw pixels requested by the whitelist selection, as a 1D flex array") - .def("each_image_free", &simtbx::Kokkos::kokkos_detector::each_image_free) + .def("each_image_free", &simtbx::Kokkos::kokkos_detector::each_image_free) ; } }; @@ -127,59 +128,70 @@ namespace simtbx { namespace Kokkos { } }; + template struct simulation_wrapper { static void - wrap() + wrap(std::string pyname) { using namespace boost::python; typedef return_value_policy rbv; typedef default_call_policies dcp; using namespace simtbx::Kokkos; - class_("exascale_api",no_init ) + class_ >(pyname.c_str(),no_init ) .def(init( ( arg("nanoBragg")))) - .def("allocate",&simtbx::Kokkos::exascale_api::allocate, + .def("allocate",&simtbx::Kokkos::exascale_api::allocate, "Allocate and transfer input data on the GPU") .def("add_energy_channel_from_gpu_amplitudes", - &simtbx::Kokkos::exascale_api::add_energy_channel_from_gpu_amplitudes, + &simtbx::Kokkos::exascale_api::add_energy_channel_from_gpu_amplitudes, (arg_("channel_number"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("weight")=1.0), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spot contributions to the detector's accumulator array") .def("add_energy_channel_mask_allpanel", - static_cast) > - 
(&exascale_api::add_energy_channel_mask_allpanel), + static_cast::*)(int const&,kokkos_energy_channels&,kokkos_detector&, af::shared) > + (&exascale_api::add_energy_channel_mask_allpanel), (arg_("channel_number"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("pixel_active_mask_bools")), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spots on mask==True pixels\n" "The pixel_active_mask_bools is a large array with one bool per detector pixel") .def("add_energy_channel_mask_allpanel", - static_cast const) > - (&exascale_api::add_energy_channel_mask_allpanel), + static_cast::*)(int const&,kokkos_energy_channels&,kokkos_detector&, af::shared const) > + (&exascale_api::add_energy_channel_mask_allpanel), (arg_("channel_number"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("pixel_active_list_ints")), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spots on mask==True pixels\n" "The pixel_active_list_ints is a small array with integer-offset addresses for each pixel-of-interest") .def("add_energy_multichannel_mask_allpanel", - static_cast const,kokkos_energy_channels&,kokkos_detector&, af::shared const, + static_cast::*)(af::shared const,kokkos_energy_channels&,kokkos_detector&, af::shared const, af::shared const) > - (&exascale_api::add_energy_multichannel_mask_allpanel), + (&exascale_api::add_energy_multichannel_mask_allpanel), (arg_("ichannels"), arg_("gpu_amplitudes"), arg_("gpu_detector"), arg_("pixel_active_list_ints"), arg_("weights")), "Point to Fhkl at a new energy channel on the GPU, and accumulate Bragg spots on mask==True pixels\n" "The pixel_active_list_ints is a small array with integer-offset addresses for each pixel-of-interest" "ichannels: for each nanoBragg source, the value instructs the simulation which channel in gpu_structure_factors" "to use for structure factor lookup. If -1, skip this source wavelength." 
) - .def("add_background", &simtbx::Kokkos::exascale_api::add_background, + .def("add_background", &simtbx::Kokkos::exascale_api::add_background, (arg_("detector"), arg_("override_source")=-1), "Add a background field directly on the GPU") - .def("add_noise", &simtbx::Kokkos::exascale_api::add_noise, + .def("add_noise", &simtbx::Kokkos::exascale_api::add_noise, (arg_("detector")), "Modify pixels with noise on CPU. Unusual pattern, returns pixels directly instead of saving persistent") - .def("show",&simtbx::Kokkos::exascale_api::show) + .def("show",&simtbx::Kokkos::exascale_api::show) .add_property("diffuse", - make_getter(&simtbx::Kokkos::exascale_api::diffuse,rbv()), - make_setter(&simtbx::Kokkos::exascale_api::diffuse,dcp()), + make_getter(&simtbx::Kokkos::exascale_api::diffuse,rbv()), + make_setter(&simtbx::Kokkos::exascale_api::diffuse,dcp()), "the diffuse parameters for the simulation.") ; + } + }; + struct diffuse_wrapper + { + static void + wrap() + { + using namespace boost::python; + typedef return_value_policy rbv; + typedef default_call_policies dcp; class_("diffuse_api",no_init ) .def(init<>()) .add_property("enable", @@ -220,7 +232,10 @@ namespace simtbx { namespace Kokkos { BOOST_PYTHON_MODULE(simtbx_kokkos_ext) { simtbx::Kokkos::kokkos_instance_wrapper::wrap(); simtbx::Kokkos::structure_factor_wrapper::wrap(); - simtbx::Kokkos::detector_wrapper::wrap(); - simtbx::Kokkos::simulation_wrapper::wrap(); + simtbx::Kokkos::detector_wrapper::wrap("gpu_detector"); + simtbx::Kokkos::detector_wrapper::wrap("gpu_detector_small_whitelist"); + simtbx::Kokkos::simulation_wrapper::wrap("exascale_api"); + simtbx::Kokkos::simulation_wrapper::wrap("exascale_api_small_whitelist"); + simtbx::Kokkos::diffuse_wrapper::wrap(); } } // namespace simtbx diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index 5644654192..b4fd96861b 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -24,518 +24,6 @@ namespace Kokkos { return 
ret; }*/ - // extract subview from [start_index * length; (start_index + 1) * length) - template - view_1d_t extract_subview(view_1d_t A, int start_index, int length) { - return ::Kokkos::subview(A, ::Kokkos::pair(start_index * length, (start_index + 1) * length )); - } - - // make a unit vector pointing in same direction and report magnitude (both args can be same vector) - double cpu_unitize(const double * vector, double * new_unit_vector) { - - double v1 = vector[1]; - double v2 = vector[2]; - double v3 = vector[3]; - - double mag = sqrt(v1 * v1 + v2 * v2 + v3 * v3); - - if (mag != 0.0) { - // normalize it - new_unit_vector[0] = mag; - new_unit_vector[1] = v1 / mag; - new_unit_vector[2] = v2 / mag; - new_unit_vector[3] = v3 / mag; - } else { - // can't normalize, report zero vector - new_unit_vector[0] = 0.0; - new_unit_vector[1] = 0.0; - new_unit_vector[2] = 0.0; - new_unit_vector[3] = 0.0; - } - return mag; - } - - void - exascale_api::show(){ - SCITBX_EXAMINE(SIM.roi_xmin); - SCITBX_EXAMINE(SIM.roi_xmax); - SCITBX_EXAMINE(SIM.roi_ymin); - SCITBX_EXAMINE(SIM.roi_ymax); - SCITBX_EXAMINE(SIM.oversample); - SCITBX_EXAMINE(SIM.point_pixel); - SCITBX_EXAMINE(SIM.pixel_size); - SCITBX_EXAMINE(m_subpixel_size); - SCITBX_EXAMINE(m_steps); - SCITBX_EXAMINE(SIM.detector_thickstep); - SCITBX_EXAMINE(SIM.detector_thicksteps); - SCITBX_EXAMINE(SIM.detector_thick); - SCITBX_EXAMINE(SIM.detector_attnlen); - SCITBX_EXAMINE(SIM.curved_detector); - SCITBX_EXAMINE(SIM.distance); - SCITBX_EXAMINE(SIM.close_distance); - SCITBX_EXAMINE(SIM.dmin); - SCITBX_EXAMINE(SIM.phi0); - SCITBX_EXAMINE(SIM.phistep); - SCITBX_EXAMINE(SIM.phisteps); - SCITBX_EXAMINE(SIM.sources); - SCITBX_EXAMINE(SIM.mosaic_spread); - SCITBX_EXAMINE(SIM.mosaic_domains); - SCITBX_EXAMINE(SIM.Na); - SCITBX_EXAMINE(SIM.Nb); - SCITBX_EXAMINE(SIM.Nc); - SCITBX_EXAMINE(SIM.fluence); - SCITBX_EXAMINE(SIM.spot_scale); - SCITBX_EXAMINE(SIM.integral_form); - SCITBX_EXAMINE(SIM.default_F); - 
SCITBX_EXAMINE(SIM.interpolate); - SCITBX_EXAMINE(SIM.nopolar); - SCITBX_EXAMINE(SIM.polarization); - SCITBX_EXAMINE(SIM.fudge); - } - - void - exascale_api::add_energy_channel_from_gpu_amplitudes( - int const& ichannel, - simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - double const& weight - ){ - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - int source_count = SIM.sources; - af::shared weighted_sources_I = af::shared(source_count); - double* wptr = weighted_sources_I.begin(); - for (std::size_t iwt = 0; iwt < source_count; iwt++){wptr[iwt] = weight*(SIM.source_I[iwt]);} - kokkostbx::transfer_double2kokkos(m_source_I, wptr, source_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here(?): take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; - - std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; - - // the for loop around panels. Offsets given. 
- for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++){ - // loop thru panels and increment the array ptrs - kokkosSpotsKernel( - kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.roi_xmin, SIM.roi_xmax, - SIM.roi_ymin, SIM.roi_ymax, SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, SIM.detector_thickstep, - SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - extract_subview(kdt.m_sdet_vector, panel_id, 1), - extract_subview(kdt.m_fdet_vector, panel_id, 1), - extract_subview(kdt.m_odet_vector, panel_id, 1), - extract_subview(kdt.m_pix0_vector, panel_id, 1), - SIM.curved_detector, kdt.metrology.dists[panel_id], kdt.metrology.dists[panel_id], - m_beam_vector, - SIM.dmin, SIM.phisteps, SIM.sources, - m_source_X, m_source_Y, m_source_Z, - m_source_I, m_source_lambda, - SIM.xtal_shape, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - // &(kdt.m_maskimage[panel_size * panel_id]), - nullptr, - // return arrays: - extract_subview(kdt.m_floatimage, panel_id, panel_size), - extract_subview(kdt.m_omega_reduction, panel_id, panel_size), - extract_subview(kdt.m_max_I_x_reduction, panel_id, panel_size), - extract_subview(kdt.m_max_I_y_reduction, panel_id, panel_size), - extract_subview(kdt.m_rangemap, panel_id, panel_size)); - //cudaSafeCall(cudaPeekAtLastError()); - ::Kokkos::fence(); - } - //cudaSafeCall(cudaDeviceSynchronize()); - - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - } - - void - exascale_api::add_energy_channel_mask_allpanel( - int const& ichannel, - 
simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared all_panel_mask - ){ - // here or there, need to convert the all_panel_mask (3D map) into a 1D list of accepted pixels - // coordinates for the active pixel list are absolute offsets into the detector array - af::shared active_pixel_list; - const bool* jptr = all_panel_mask.begin(); - for (int j=0; j < all_panel_mask.size(); ++j){ - if (jptr[j]) { - active_pixel_list.push_back(j); - } - } - add_energy_channel_mask_allpanel(ichannel, kec, kdt, active_pixel_list); - } - - void - exascale_api::add_energy_channel_mask_allpanel( - int const& ichannel, - simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared const active_pixel_list - ){ - kdt.set_active_pixels_on_GPU(active_pixel_list); - - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - int source_count = SIM.sources; - kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, source_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; - - // cudaDeviceProp deviceProps = { 0 }; - // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); - // int smCount = deviceProps.multiProcessorCount; - // dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); - // dim3 
numBlocks(smCount * 8, 1); - - // for call for all panels at the same time - - // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) - diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; - - debranch_maskall_Kernel( - kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), - SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, - SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - kdt.m_sdet_vector, - kdt.m_fdet_vector, - kdt.m_odet_vector, - kdt.m_pix0_vector, - kdt.m_distance, - m_beam_vector, - SIM.dmin, SIM.phisteps, SIM.sources, - m_source_X, m_source_Y, - m_source_Z, - m_source_I, m_source_lambda, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - kdt.m_active_pixel_list, - diffuse, - // return arrays: - kdt.m_floatimage, - kdt.m_omega_reduction, - kdt.m_max_I_x_reduction, - kdt.m_max_I_y_reduction, - kdt.m_rangemap); - - // cudaSafeCall(cudaPeekAtLastError()); - // cudaSafeCall(cudaDeviceSynchronize()); - - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - } - - void - exascale_api::add_energy_multichannel_mask_allpanel( - af::shared const ichannels, - simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared const active_pixel_list, - af::shared const weight - ){ - kdt.set_active_pixels_on_GPU(active_pixel_list); - - // transfer source_I, source_lambda - // the int arguments 
are for sizes of the arrays - - SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs - the simulation where to look for structure factors. If -1, skip this source wavelength. */ - SCITBX_ASSERT(SIM.sources == weight.size()); - - for (int ictr = 0; ictr < SIM.sources; ++ictr){ - if (ichannels[ictr] < 0) continue; // the ichannel array - //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], - // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], - // SIM.source_I[ictr], SIM.source_lambda[ictr]); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; - - vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); - vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); - vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); - vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); - vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); - double weighted_I = SIM.source_I[ictr] * weight[ictr]; - kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); - kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); - - // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) - diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << 
SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; - - debranch_maskall_Kernel( - kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), - SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, - SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - kdt.m_sdet_vector, - kdt.m_fdet_vector, - kdt.m_odet_vector, - kdt.m_pix0_vector, - kdt.m_distance, - m_beam_vector, - SIM.dmin, SIM.phisteps, 1, - c_source_X, c_source_Y, - c_source_Z, - c_source_I, c_source_lambda, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - kdt.m_active_pixel_list, - diffuse, - // return arrays: - kdt.m_floatimage, - kdt.m_omega_reduction, - kdt.m_max_I_x_reduction, - kdt.m_max_I_y_reduction, - kdt.m_rangemap); - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - }// loop over channels - } - - void - exascale_api::add_background(simtbx::Kokkos::kokkos_detector & kdt, int const& override_source) { - // cudaSafeCall(cudaSetDevice(SIM.device_Id)); - - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - int sources_count = SIM.sources; - kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); - - vector_cudareal_t 
stol_of("stol_of", SIM.stols); - transfer_X2kokkos(stol_of, SIM.stol_of, SIM.stols); - // CUDAREAL * cu_stol_of; - // cudaSafeCall(cudaMalloc((void ** )&cu_stol_of, sizeof(*cu_stol_of) * SIM.stols)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_stol_of, SIM.stol_of, SIM.stols)); - - vector_cudareal_t Fbg_of("Fbg_of", SIM.stols); - transfer_X2kokkos(Fbg_of, SIM.Fbg_of, SIM.stols); - // CUDAREAL * cu_Fbg_of; - // cudaSafeCall(cudaMalloc((void ** )&cu_Fbg_of, sizeof(*cu_Fbg_of) * SIM.stols)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_Fbg_of, SIM.Fbg_of, SIM.stols)); - - // cudaDeviceProp deviceProps = { 0 }; - // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); - // int smCount = deviceProps.multiProcessorCount; - // dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); - // dim3 numBlocks(smCount * 8, 1); - - // initialize the device memory within a kernel. - // modify the arguments to initialize multipanel detector. - ::Kokkos::parallel_for("kokkosSpotsInit", kdt.m_panel_count * kdt.m_slow_dim_size * kdt.m_fast_dim_size, KOKKOS_LAMBDA (const int& j) { - kdt.m_floatimage(j) = 0; - kdt.m_omega_reduction(j) = 0; - kdt.m_max_I_x_reduction(j) = 0; - kdt.m_max_I_y_reduction(j) = 0; - kdt.m_rangemap(j) = false; - }); - // nanoBraggSpotsInitCUDAKernel<<>>( - // kdt.m_panel_count * kdt.m_slow_dim_size, kdt.m_fast_dim_size, - // kdt.cu_floatimage, kdt.cu_omega_reduction, - // kdt.cu_max_I_x_reduction, kdt.cu_max_I_y_reduction, - // kdt.cu_rangemap); - // cudaSafeCall(cudaPeekAtLastError()); - // cudaSafeCall(cudaDeviceSynchronize()); - - std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; - - // the for loop around panels. Offsets given. 
- for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { - add_background_kokkos_kernel(SIM.sources, - SIM.oversample, override_source, - SIM.pixel_size, kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.detector_thicksteps, - SIM.detector_thickstep, SIM.detector_attnlen, - extract_subview(kdt.m_sdet_vector, panel_id, 1), - extract_subview(kdt.m_fdet_vector, panel_id, 1), - extract_subview(kdt.m_odet_vector, panel_id, 1), - extract_subview(kdt.m_pix0_vector, panel_id, 1), - kdt.metrology.dists[panel_id], SIM.point_pixel, SIM.detector_thick, - m_source_X, m_source_Y, m_source_Z, - m_source_lambda, m_source_I, - SIM.stols, stol_of, Fbg_of, - SIM.nopolar, SIM.polarization, m_polar_vector, - simtbx::nanoBragg::r_e_sqr, SIM.fluence, SIM.amorphous_molecules, - // returns: - extract_subview(kdt.m_floatimage, panel_id, panel_size)); - - // cudaSafeCall(cudaPeekAtLastError()); - } - // cudaSafeCall(cudaDeviceSynchronize()); - ::Kokkos::fence(); - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - - // cudaSafeCall(cudaFree(cu_stol_of)); - // cudaSafeCall(cudaFree(cu_Fbg_of)); - } - - af::flex_double - exascale_api::add_noise(simtbx::Kokkos::kokkos_detector & kdt) { - // put raw pixels into a temporary hold - af::flex_double tmp_hold_pixels = kdt.get_raw_pixels(); - std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; - af::flex_double panel_pixels = af::flex_double(af::flex_grid<>(kdt.m_slow_dim_size,kdt.m_fast_dim_size), af::init_functor_null()); - for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { - double* dst_beg = panel_pixels.begin(); - double* dst_end = panel_pixels.end(); - double* dstptr = dst_beg; - for (double *srcptr = &(tmp_hold_pixels[panel_id*panel_size]); dstptr != dst_end; ){ - *dstptr++ = *srcptr++; - } // get the pixels from one panel on GPU memory into panel_pixels for add_noise - SIM.add_noise(panel_pixels); - dstptr = dst_beg; - for (double *srcptr = 
&(tmp_hold_pixels[panel_id*panel_size]); dstptr != dst_end; ){ - *srcptr++ = *dstptr++; - } // return the pixels from SIM.raw_pixels to temporary array - } - ::Kokkos::fence(); - return tmp_hold_pixels; - } - - void - exascale_api::allocate() { - //cudaSafeCall(cudaSetDevice(SIM.device_Id)); - - // water_size not defined in class, CLI argument, defaults to 0 - double water_size = 0.0; - // missing constants - double water_F = 2.57; - double water_MW = 18.0; - - // make sure we are normalizing with the right number of sub-steps - int nb_steps = SIM.phisteps * SIM.mosaic_domains * SIM.oversample * SIM.oversample; - double nb_subpixel_size = SIM.pixel_size / SIM.oversample; - - //create transfer arguments to device space - m_subpixel_size = nb_subpixel_size; //check for conflict? - m_steps = nb_steps; //check for conflict? - - // presumably thickness and attenuation can be migrated to the gpu detector class XXX FIXME - //cu_detector_thick = SIM.detector_thick; - //cu_detector_mu = SIM.detector_attnlen; // synonyms - //cu_distance = SIM.distance; // distance and close distance, detector properties? 
XXX FIXME - //cu_close_distance = SIM.close_distance; - - m_water_size = water_size; - m_water_F = water_F; - m_water_MW = water_MW; - - //const int vector_length = 4; - - kokkostbx::transfer_double2kokkos(m_beam_vector, SIM.beam_vector, m_vector_length); - kokkostbx::transfer_double2kokkos(m_spindle_vector, SIM.spindle_vector, m_vector_length); - kokkostbx::transfer_double2kokkos(m_a0, SIM.a0, m_vector_length); - kokkostbx::transfer_double2kokkos(m_b0, SIM.b0, m_vector_length); - kokkostbx::transfer_double2kokkos(m_c0, SIM.c0, m_vector_length); - - // cudaSafeCall(cudaMalloc((void ** )&cu_beam_vector, sizeof(*cu_beam_vector) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_beam_vector, SIM.beam_vector, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_spindle_vector, sizeof(*cu_spindle_vector) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_spindle_vector, SIM.spindle_vector, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_a0, sizeof(*cu_a0) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_a0, SIM.a0, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_b0, sizeof(*cu_b0) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_b0, SIM.b0, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_c0, sizeof(*cu_c0) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_c0, SIM.c0, vector_length)); - - // Unitize polar vector before sending it to the GPU. - // Optimization do it only once here rather than multiple time per pixel in the GPU. 
- double polar_vector_unitized[4]; - cpu_unitize(SIM.polar_vector, polar_vector_unitized); - kokkostbx::transfer_double2kokkos(m_polar_vector, polar_vector_unitized, m_vector_length); - - int sources_count = SIM.sources; - kokkostbx::transfer_double2kokkos(m_source_X, SIM.source_X, sources_count); - kokkostbx::transfer_double2kokkos(m_source_Y, SIM.source_Y, sources_count); - kokkostbx::transfer_double2kokkos(m_source_Z, SIM.source_Z, sources_count); - kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); - kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); - - int mosaic_domains_count = SIM.mosaic_domains; - kokkostbx::transfer_double2kokkos(m_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9); - - // cudaSafeCall(cudaMalloc((void ** )&cu_polar_vector, sizeof(*cu_polar_vector) * vector_length)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_polar_vector, polar_vector_unitized, vector_length)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_X, sizeof(*cu_source_X) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_X, SIM.source_X, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_Y, sizeof(*cu_source_Y) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Y, SIM.source_Y, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_Z, sizeof(*cu_source_Z) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Z, SIM.source_Z, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_I, sizeof(*cu_source_I) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_I, SIM.source_I, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_source_lambda, sizeof(*cu_source_lambda) * sources_count)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_lambda, SIM.source_lambda, sources_count)); - - // cudaSafeCall(cudaMalloc((void ** )&cu_mosaic_umats, 
sizeof(*cu_mosaic_umats) * mosaic_domains_count * 9)); - // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9)); - }; - /* exascale_api::~exascale_api(){ cudaSafeCall(cudaSetDevice(SIM.device_Id)); diff --git a/simtbx/kokkos/simulation.h b/simtbx/kokkos/simulation.h index ca2d763ad7..9b9d6a6552 100644 --- a/simtbx/kokkos/simulation.h +++ b/simtbx/kokkos/simulation.h @@ -15,9 +15,6 @@ using mat3 = kokkostbx::matrix3; using crystal_orientation_t = Kokkos::View; // [phisteps, domains, 3] namespace simtbx { namespace Kokkos { - -namespace af = scitbx::af; - struct diffuse_api { inline diffuse_api() {}; inline void show() {}; @@ -30,37 +27,75 @@ struct diffuse_api { mat3 rotate_principal_axes; vec3 a0, b0, c0; // cell basis vectors }; +}} + +#include "simtbx/kokkos/simulation_kernels.h" + +namespace simtbx { namespace Kokkos { + +namespace af = scitbx::af; + // extract subview from [start_index * length; (start_index + 1) * length) + template + view_1d_t extract_subview(view_1d_t A, int start_index, int length) { + return ::Kokkos::subview(A, ::Kokkos::pair(start_index * length, (start_index + 1) * length )); + } + + // make a unit vector pointing in same direction and report magnitude (both args can be same vector) + double cpu_unitize(const double * vector, double * new_unit_vector) { + + double v1 = vector[1]; + double v2 = vector[2]; + double v3 = vector[3]; + double mag = sqrt(v1 * v1 + v2 * v2 + v3 * v3); + + if (mag != 0.0) { + // normalize it + new_unit_vector[0] = mag; + new_unit_vector[1] = v1 / mag; + new_unit_vector[2] = v2 / mag; + new_unit_vector[3] = v3 / mag; + } else { + // can't normalize, report zero vector + new_unit_vector[0] = 0.0; + new_unit_vector[1] = 0.0; + new_unit_vector[2] = 0.0; + new_unit_vector[3] = 0.0; + } + return mag; + } + +template struct exascale_api { inline exascale_api(const simtbx::nanoBragg::nanoBragg& nB) : SIM(nB) { } - void show(); - void 
add_energy_channel_from_gpu_amplitudes(int const&, + //void show(); + /*void add_energy_channel_from_gpu_amplitudes(int const&, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, double const& - ); - void add_energy_channel_mask_allpanel(int const&, + ); */ + /*void add_energy_channel_mask_allpanel(int const&, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, af::shared - ); - void add_energy_channel_mask_allpanel(int const&, + ); */ + /*void add_energy_channel_mask_allpanel(int const&, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, af::shared const - ); - void add_energy_multichannel_mask_allpanel(af::shared const, + ); */ + /*void add_energy_multichannel_mask_allpanel(af::shared const, simtbx::Kokkos::kokkos_energy_channels &, - simtbx::Kokkos::kokkos_detector &, + simtbx::Kokkos::kokkos_detector &, af::shared const, af::shared const - ); + ); */ - void add_background(simtbx::Kokkos::kokkos_detector &, int const&); - af::flex_double add_noise(simtbx::Kokkos::kokkos_detector &); - void allocate(); + //void add_background(simtbx::Kokkos::kokkos_detector &, int const&); + //af::flex_double add_noise(simtbx::Kokkos::kokkos_detector &); + //void allocate(); //~exascale_api(); const simtbx::nanoBragg::nanoBragg& SIM; @@ -86,6 +121,490 @@ struct exascale_api { CUDAREAL m_water_F = 0; CUDAREAL m_water_MW = 0; diffuse_api diffuse; + + inline void + show(){ + SCITBX_EXAMINE(SIM.roi_xmin); + SCITBX_EXAMINE(SIM.roi_xmax); + SCITBX_EXAMINE(SIM.roi_ymin); + SCITBX_EXAMINE(SIM.roi_ymax); + SCITBX_EXAMINE(SIM.oversample); + SCITBX_EXAMINE(SIM.point_pixel); + SCITBX_EXAMINE(SIM.pixel_size); + SCITBX_EXAMINE(m_subpixel_size); + SCITBX_EXAMINE(m_steps); + SCITBX_EXAMINE(SIM.detector_thickstep); + SCITBX_EXAMINE(SIM.detector_thicksteps); + SCITBX_EXAMINE(SIM.detector_thick); + 
SCITBX_EXAMINE(SIM.detector_attnlen); + SCITBX_EXAMINE(SIM.curved_detector); + SCITBX_EXAMINE(SIM.distance); + SCITBX_EXAMINE(SIM.close_distance); + SCITBX_EXAMINE(SIM.dmin); + SCITBX_EXAMINE(SIM.phi0); + SCITBX_EXAMINE(SIM.phistep); + SCITBX_EXAMINE(SIM.phisteps); + SCITBX_EXAMINE(SIM.sources); + SCITBX_EXAMINE(SIM.mosaic_spread); + SCITBX_EXAMINE(SIM.mosaic_domains); + SCITBX_EXAMINE(SIM.Na); + SCITBX_EXAMINE(SIM.Nb); + SCITBX_EXAMINE(SIM.Nc); + SCITBX_EXAMINE(SIM.fluence); + SCITBX_EXAMINE(SIM.spot_scale); + SCITBX_EXAMINE(SIM.integral_form); + SCITBX_EXAMINE(SIM.default_F); + SCITBX_EXAMINE(SIM.interpolate); + SCITBX_EXAMINE(SIM.nopolar); + SCITBX_EXAMINE(SIM.polarization); + SCITBX_EXAMINE(SIM.fudge); + } + + inline void + add_energy_channel_from_gpu_amplitudes( + int const& ichannel, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + double const& weight + ){ + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + int source_count = SIM.sources; + af::shared weighted_sources_I = af::shared(source_count); + double* wptr = weighted_sources_I.begin(); + for (std::size_t iwt = 0; iwt < source_count; iwt++){wptr[iwt] = weight*(SIM.source_I[iwt]);} + kokkostbx::transfer_double2kokkos(m_source_I, wptr, source_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here(?): take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; + + std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; + + // the for loop around panels. Offsets given. 
+ for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++){ + // loop thru panels and increment the array ptrs + kokkosSpotsKernel( + kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.roi_xmin, SIM.roi_xmax, + SIM.roi_ymin, SIM.roi_ymax, SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, SIM.detector_thickstep, + SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + extract_subview(kdt.m_sdet_vector, panel_id, 1), + extract_subview(kdt.m_fdet_vector, panel_id, 1), + extract_subview(kdt.m_odet_vector, panel_id, 1), + extract_subview(kdt.m_pix0_vector, panel_id, 1), + SIM.curved_detector, kdt.metrology.dists[panel_id], kdt.metrology.dists[panel_id], + m_beam_vector, + SIM.dmin, SIM.phisteps, SIM.sources, + m_source_X, m_source_Y, m_source_Z, + m_source_I, m_source_lambda, + SIM.xtal_shape, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + // &(kdt.m_maskimage[panel_size * panel_id]), + nullptr, + // return arrays: + extract_subview(kdt.m_floatimage, panel_id, panel_size), + extract_subview(kdt.m_omega_reduction, panel_id, panel_size), + extract_subview(kdt.m_max_I_x_reduction, panel_id, panel_size), + extract_subview(kdt.m_max_I_y_reduction, panel_id, panel_size), + extract_subview(kdt.m_rangemap, panel_id, panel_size)); + //cudaSafeCall(cudaPeekAtLastError()); + ::Kokkos::fence(); + } + //cudaSafeCall(cudaDeviceSynchronize()); + + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + } + + inline void + add_energy_channel_mask_allpanel( + int const& ichannel, + 
simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared all_panel_mask + ){ + // here or there, need to convert the all_panel_mask (3D map) into a 1D list of accepted pixels + // coordinates for the active pixel list are absolute offsets into the detector array + af::shared active_pixel_list; + const bool* jptr = all_panel_mask.begin(); + for (int j=0; j < all_panel_mask.size(); ++j){ + if (jptr[j]) { + active_pixel_list.push_back(j); + } + } + add_energy_channel_mask_allpanel(ichannel, kec, kdt, active_pixel_list); + } + + inline void + add_energy_channel_mask_allpanel( + int const& ichannel, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list + ){ + //SCITBX_CHECK_POINT; //tested by tst_shoeboxes.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + int source_count = SIM.sources; + kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, source_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel]; + + // cudaDeviceProp deviceProps = { 0 }; + // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); + // int smCount = deviceProps.multiProcessorCount; + // dim3 
threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); + // dim3 numBlocks(smCount * 8, 1); + + // for call for all panels at the same time + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, SIM.sources, + m_source_X, m_source_Y, + m_source_Z, + m_source_I, m_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + + // cudaSafeCall(cudaPeekAtLastError()); + // cudaSafeCall(cudaDeviceSynchronize()); + + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + } + + inline void + add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_unified.py + 
kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. */ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters 
(convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + + inline void + add_background(simtbx::Kokkos::kokkos_detector & kdt, int const& override_source) { + // cudaSafeCall(cudaSetDevice(SIM.device_Id)); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + int sources_count = SIM.sources; + kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_I, SIM.source_I, SIM.sources)); + // 
cudaSafeCall(cudaMemcpyVectorDoubleToDevice(m_source_lambda, SIM.source_lambda, SIM.sources)); + + vector_cudareal_t stol_of("stol_of", SIM.stols); + transfer_X2kokkos(stol_of, SIM.stol_of, SIM.stols); + // CUDAREAL * cu_stol_of; + // cudaSafeCall(cudaMalloc((void ** )&cu_stol_of, sizeof(*cu_stol_of) * SIM.stols)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_stol_of, SIM.stol_of, SIM.stols)); + + vector_cudareal_t Fbg_of("Fbg_of", SIM.stols); + transfer_X2kokkos(Fbg_of, SIM.Fbg_of, SIM.stols); + // CUDAREAL * cu_Fbg_of; + // cudaSafeCall(cudaMalloc((void ** )&cu_Fbg_of, sizeof(*cu_Fbg_of) * SIM.stols)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_Fbg_of, SIM.Fbg_of, SIM.stols)); + + // cudaDeviceProp deviceProps = { 0 }; + // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); + // int smCount = deviceProps.multiProcessorCount; + // dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); + // dim3 numBlocks(smCount * 8, 1); + + // initialize the device memory within a kernel. + // modify the arguments to initialize multipanel detector. + ::Kokkos::parallel_for("kokkosSpotsInit", kdt.m_panel_count * kdt.m_slow_dim_size * kdt.m_fast_dim_size, KOKKOS_LAMBDA (const int& j) { + kdt.m_floatimage(j) = 0; + kdt.m_omega_reduction(j) = 0; + kdt.m_max_I_x_reduction(j) = 0; + kdt.m_max_I_y_reduction(j) = 0; + kdt.m_rangemap(j) = false; + }); + // nanoBraggSpotsInitCUDAKernel<<>>( + // kdt.m_panel_count * kdt.m_slow_dim_size, kdt.m_fast_dim_size, + // kdt.cu_floatimage, kdt.cu_omega_reduction, + // kdt.cu_max_I_x_reduction, kdt.cu_max_I_y_reduction, + // kdt.cu_rangemap); + // cudaSafeCall(cudaPeekAtLastError()); + // cudaSafeCall(cudaDeviceSynchronize()); + + std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; + + // the for loop around panels. Offsets given. 
+ for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { + add_background_kokkos_kernel(SIM.sources, + SIM.oversample, override_source, + SIM.pixel_size, kdt.m_slow_dim_size, kdt.m_fast_dim_size, SIM.detector_thicksteps, + SIM.detector_thickstep, SIM.detector_attnlen, + extract_subview(kdt.m_sdet_vector, panel_id, 1), + extract_subview(kdt.m_fdet_vector, panel_id, 1), + extract_subview(kdt.m_odet_vector, panel_id, 1), + extract_subview(kdt.m_pix0_vector, panel_id, 1), + kdt.metrology.dists[panel_id], SIM.point_pixel, SIM.detector_thick, + m_source_X, m_source_Y, m_source_Z, + m_source_lambda, m_source_I, + SIM.stols, stol_of, Fbg_of, + SIM.nopolar, SIM.polarization, m_polar_vector, + simtbx::nanoBragg::r_e_sqr, SIM.fluence, SIM.amorphous_molecules, + // returns: + extract_subview(kdt.m_floatimage, panel_id, panel_size)); + + // cudaSafeCall(cudaPeekAtLastError()); + } + // cudaSafeCall(cudaDeviceSynchronize()); + ::Kokkos::fence(); + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + + // cudaSafeCall(cudaFree(cu_stol_of)); + // cudaSafeCall(cudaFree(cu_Fbg_of)); + } + + inline af::flex_double + add_noise(simtbx::Kokkos::kokkos_detector & kdt) { + // put raw pixels into a temporary hold + af::flex_double tmp_hold_pixels = kdt.get_raw_pixels(); + std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size; + af::flex_double panel_pixels = af::flex_double(af::flex_grid<>(kdt.m_slow_dim_size,kdt.m_fast_dim_size), af::init_functor_null()); + for (std::size_t panel_id = 0; panel_id < kdt.m_panel_count; panel_id++) { + double* dst_beg = panel_pixels.begin(); + double* dst_end = panel_pixels.end(); + double* dstptr = dst_beg; + for (double *srcptr = &(tmp_hold_pixels[panel_id*panel_size]); dstptr != dst_end; ){ + *dstptr++ = *srcptr++; + } // get the pixels from one panel on GPU memory into panel_pixels for add_noise + SIM.add_noise(panel_pixels); + dstptr = dst_beg; + for (double *srcptr = &(tmp_hold_pixels[panel_id*panel_size]); 
dstptr != dst_end; ){ + *srcptr++ = *dstptr++; + } // return the pixels from SIM.raw_pixels to temporary array + } + ::Kokkos::fence(); + return tmp_hold_pixels; + } + + inline void + allocate() { + //cudaSafeCall(cudaSetDevice(SIM.device_Id)); + + // water_size not defined in class, CLI argument, defaults to 0 + double water_size = 0.0; + // missing constants + double water_F = 2.57; + double water_MW = 18.0; + + // make sure we are normalizing with the right number of sub-steps + int nb_steps = SIM.phisteps * SIM.mosaic_domains * SIM.oversample * SIM.oversample; + double nb_subpixel_size = SIM.pixel_size / SIM.oversample; + + //create transfer arguments to device space + m_subpixel_size = nb_subpixel_size; //check for conflict? + m_steps = nb_steps; //check for conflict? + + // presumably thickness and attenuation can be migrated to the gpu detector class XXX FIXME + //cu_detector_thick = SIM.detector_thick; + //cu_detector_mu = SIM.detector_attnlen; // synonyms + //cu_distance = SIM.distance; // distance and close distance, detector properties? 
XXX FIXME + //cu_close_distance = SIM.close_distance; + + m_water_size = water_size; + m_water_F = water_F; + m_water_MW = water_MW; + + //const int vector_length = 4; + + kokkostbx::transfer_double2kokkos(m_beam_vector, SIM.beam_vector, m_vector_length); + kokkostbx::transfer_double2kokkos(m_spindle_vector, SIM.spindle_vector, m_vector_length); + kokkostbx::transfer_double2kokkos(m_a0, SIM.a0, m_vector_length); + kokkostbx::transfer_double2kokkos(m_b0, SIM.b0, m_vector_length); + kokkostbx::transfer_double2kokkos(m_c0, SIM.c0, m_vector_length); + + // cudaSafeCall(cudaMalloc((void ** )&cu_beam_vector, sizeof(*cu_beam_vector) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_beam_vector, SIM.beam_vector, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_spindle_vector, sizeof(*cu_spindle_vector) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_spindle_vector, SIM.spindle_vector, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_a0, sizeof(*cu_a0) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_a0, SIM.a0, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_b0, sizeof(*cu_b0) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_b0, SIM.b0, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_c0, sizeof(*cu_c0) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_c0, SIM.c0, vector_length)); + + // Unitize polar vector before sending it to the GPU. + // Optimization do it only once here rather than multiple time per pixel in the GPU. 
+ double polar_vector_unitized[4]; + cpu_unitize(SIM.polar_vector, polar_vector_unitized); + kokkostbx::transfer_double2kokkos(m_polar_vector, polar_vector_unitized, m_vector_length); + + int sources_count = SIM.sources; + kokkostbx::transfer_double2kokkos(m_source_X, SIM.source_X, sources_count); + kokkostbx::transfer_double2kokkos(m_source_Y, SIM.source_Y, sources_count); + kokkostbx::transfer_double2kokkos(m_source_Z, SIM.source_Z, sources_count); + kokkostbx::transfer_double2kokkos(m_source_I, SIM.source_I, sources_count); + kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, sources_count); + + int mosaic_domains_count = SIM.mosaic_domains; + kokkostbx::transfer_double2kokkos(m_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9); + + // cudaSafeCall(cudaMalloc((void ** )&cu_polar_vector, sizeof(*cu_polar_vector) * vector_length)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_polar_vector, polar_vector_unitized, vector_length)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_X, sizeof(*cu_source_X) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_X, SIM.source_X, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_Y, sizeof(*cu_source_Y) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Y, SIM.source_Y, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_Z, sizeof(*cu_source_Z) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_Z, SIM.source_Z, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_I, sizeof(*cu_source_I) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_I, SIM.source_I, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_source_lambda, sizeof(*cu_source_lambda) * sources_count)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_lambda, SIM.source_lambda, sources_count)); + + // cudaSafeCall(cudaMalloc((void ** )&cu_mosaic_umats, 
sizeof(*cu_mosaic_umats) * mosaic_domains_count * 9)); + // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_mosaic_umats, SIM.mosaic_umats, mosaic_domains_count * 9)); + }; + }; } // Kokkos } // simtbx diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index 0a91f33e59..7c1fa9eb21 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -1,3 +1,5 @@ +#ifndef SIMTBX_KOKKOS_SIMULATION_KERNELS_H +#define SIMTBX_KOKKOS_SIMULATION_KERNELS_H #include #include #include @@ -792,3 +794,4 @@ void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int ove floatimage(pixIdx) += Ibg*r_e_sqr*fluence*amorphous_molecules/steps; }); // end of pixIdx loop } +#endif // SIMTBX_KOKKOS_SIMULATION_KERNELS_H diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py index 95f8755643..dbb94cee5e 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -211,9 +211,9 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg self.SIM.xtal_shape = shapetype.Gauss self.SIM.interpolate = 0 # allocate GPU arrays - self.gpu_simulation = get_exascale("exascale_api",params.context)(nanoBragg = self.SIM) + self.gpu_simulation = get_exascale("exascale_api_small_whitelist",params.context)(nanoBragg = self.SIM) self.gpu_simulation.allocate() - self.gpu_detector = get_exascale("gpu_detector",params.context)( + self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)( deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM) self.gpu_detector.each_image_allocate() # self.gpu_detector.show_summary() From 79de5b7cda1af7acd4221e0130f7dc444038add5 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Tue, 9 Apr 2024 15:59:59 -0700 Subject: [PATCH 2/4] Debug case for non-working test. The script tests/tst_memory_policy.py fails with a cuda illegal access. 
The intention is to get help from NESAP to get a functional test. --- simtbx/kokkos/detector.cpp | 135 +++++++++++++ simtbx/kokkos/detector.h | 82 +------- simtbx/kokkos/simulation.cpp | 173 +++++++++++++++++ simtbx/kokkos/simulation.h | 89 +-------- simtbx/kokkos/simulation_kernels.h | 300 +++++++++++++++++++++++++++++ simtbx/tests/tst_memory_policy.py | 56 +++++- 6 files changed, 664 insertions(+), 171 deletions(-) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index 233b97c7ff..c4ef795d72 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -92,5 +92,140 @@ namespace simtbx { namespace Kokkos { } } + template<> + void kokkos_detector::hello(){ + SCITBX_EXAMINE("small small small"); + } + template<> + void kokkos_detector::hello(){ + SCITBX_EXAMINE("large large large"); + } + + template<> void + kokkos_detector::each_image_allocate() { + resize(m_rangemap, m_total_pixel_count); + resize(m_omega_reduction, m_total_pixel_count); + resize(m_max_I_x_reduction, m_total_pixel_count); + resize(m_max_I_y_reduction, m_total_pixel_count); + resize(m_floatimage, m_total_pixel_count); + + resize(m_maskimage, m_total_pixel_count); + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + + // metrology.show(); + + // printf(" rangemap size:%d\n", m_rangemap.span()); + // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); + // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); + // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); + // printf(" maskimage size:%d\n", 
m_maskimage.span()); + // printf(" floatimage size:%d\n", m_floatimage.span()); + // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); + // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); + // printf(" odet_vector size:%d\n", m_odet_vector.span()); + // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); + // printf(" distance size:%d\n", m_distance.span()); + // printf(" Xbeam size:%d\n", m_Xbeam.span()); + // printf(" Ybeam size:%d\n", m_Ybeam.span()); + + // print_view(m_fdet_vector); + // print_view(m_odet_vector, 1, 3); + + // printf("DONE.\n"); + } + template<> void + kokkos_detector::each_image_allocate() { + resize(m_maskimage, m_total_pixel_count); + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + } + + template<> + void + kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } + + template<> + void + kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + resize(m_rangemap, m_active_pixel_size); + resize(m_omega_reduction, m_active_pixel_size); + resize(m_max_I_x_reduction, m_active_pixel_size); + resize(m_max_I_y_reduction, m_active_pixel_size); + resize(m_floatimage, m_active_pixel_size); + 
resize(m_accumulate_floatimage, m_active_pixel_size); + fence(); + } + + template<> af::shared + kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { + hello(); + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = selection.size(); + vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + auto temp = m_accumulate_floatimage; + + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + template<> af::shared + kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { + SCITBX_CHECK_POINT; + hello(); + //return the data array for the multipanel detector case, but only for whitelist pixels + + std::size_t output_pixel_size = selection.size(); + //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + //auto temp = m_accumulate_floatimage; + + //parallel_for("get_active_pixel_selection2", + // range_policy(0, output_pixel_size), + // KOKKOS_LAMBDA (const int i) { + // active_pixel_results( i ) = temp( i ); + //}); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + SCITBX_CHECK_POINT; + kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results); + SCITBX_CHECK_POINT; + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + } // Kokkos } // 
simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index c7149eaeb4..74d286ffbe 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -41,10 +41,8 @@ struct packed_metrology{ af::sharedYbeam; }; -struct large_array_policy { -}; -struct small_whitelist_policy { -}; +struct large_array_policy {}; +struct small_whitelist_policy {}; template struct kokkos_detector @@ -63,7 +61,7 @@ struct kokkos_detector //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); //af::flex_double get_raw_pixels(); //void set_active_pixels_on_GPU(af::shared); - //af::shared get_whitelist_raw_pixels(af::shared); + af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -157,46 +155,7 @@ struct kokkos_detector return view_floatimage; }; - inline void - each_image_allocate() { - resize(m_rangemap, m_total_pixel_count); - resize(m_omega_reduction, m_total_pixel_count); - resize(m_max_I_x_reduction, m_total_pixel_count); - resize(m_max_I_y_reduction, m_total_pixel_count); - - resize(m_maskimage, m_total_pixel_count); - resize(m_floatimage, m_total_pixel_count); - - kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); - kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); - kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); - kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); - kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); - kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); - kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); - fence(); - - // metrology.show(); - - // printf(" rangemap size:%d\n", m_rangemap.span()); - // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); - // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); - // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); - // printf(" maskimage size:%d\n", m_maskimage.span()); - // printf(" 
floatimage size:%d\n", m_floatimage.span()); - // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); - // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); - // printf(" odet_vector size:%d\n", m_odet_vector.span()); - // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); - // printf(" distance size:%d\n", m_distance.span()); - // printf(" Xbeam size:%d\n", m_Xbeam.span()); - // printf(" Ybeam size:%d\n", m_Ybeam.span()); - - // print_view(m_fdet_vector); - // print_view(m_odet_vector, 1, 3); - - // printf("DONE.\n"); - } + void each_image_allocate(); inline void scale_in_place(const double& factor){ @@ -206,6 +165,8 @@ struct kokkos_detector }); } + void set_active_pixels_on_GPU(af::shared active_pixel_list_value); + inline void write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { //only implement the monolithic detector case, one panel @@ -242,37 +203,8 @@ struct kokkos_detector return output_array; } - inline void - set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - inline af::shared - get_whitelist_raw_pixels(af::shared selection) { - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + void hello(); - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared 
output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } }; diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index b4fd96861b..cf2d79dd4e 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -4,6 +4,7 @@ #include "simtbx/kokkos/simulation_kernels.h" #include "kokkostbx/kokkos_utils.h" #include "scitbx/array_family/flex_types.h" +#include "simtbx/kokkos/detector.h" #define THREADS_PER_BLOCK_X 128 #define THREADS_PER_BLOCK_Y 1 @@ -42,5 +43,177 @@ namespace Kokkos { } */ + template<> void + exascale_api::add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_unified.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. 
*/ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, 
m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + + template<> void + exascale_api::add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_memory_policy.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. 
*/ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_low_memory_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + 
SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + } // Kokkos } // simtbx diff --git a/simtbx/kokkos/simulation.h b/simtbx/kokkos/simulation.h index 9b9d6a6552..9b69b4ce95 100644 --- a/simtbx/kokkos/simulation.h +++ b/simtbx/kokkos/simulation.h @@ -86,12 +86,12 @@ struct exascale_api { simtbx::Kokkos::kokkos_detector &, af::shared const ); */ - /*void add_energy_multichannel_mask_allpanel(af::shared const, + void add_energy_multichannel_mask_allpanel(af::shared const, simtbx::Kokkos::kokkos_energy_channels &, simtbx::Kokkos::kokkos_detector &, af::shared const, af::shared const - ); */ + ); //void add_background(simtbx::Kokkos::kokkos_detector &, int const&); //af::flex_double add_noise(simtbx::Kokkos::kokkos_detector &); @@ -330,91 +330,6 @@ struct exascale_api { add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); } - inline void - add_energy_multichannel_mask_allpanel( - af::shared const ichannels, - 
simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared const active_pixel_list, - af::shared const weight - ){ - //SCITBX_CHECK_POINT; // tested by tst_unified.py - kdt.set_active_pixels_on_GPU(active_pixel_list); - - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - - SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs - the simulation where to look for structure factors. If -1, skip this source wavelength. */ - SCITBX_ASSERT(SIM.sources == weight.size()); - - for (int ictr = 0; ictr < SIM.sources; ++ictr){ - if (ichannels[ictr] < 0) continue; // the ichannel array - //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], - // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], - // SIM.source_I[ictr], SIM.source_lambda[ictr]); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; - - vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); - vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); - vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); - vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); - vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); - double weighted_I = SIM.source_I[ictr] * weight[ictr]; - kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); - 
kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); - kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); - - // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) - diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; - - debranch_maskall_Kernel( - kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), - SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, - SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - kdt.m_sdet_vector, - kdt.m_fdet_vector, - kdt.m_odet_vector, - kdt.m_pix0_vector, - kdt.m_distance, - m_beam_vector, - SIM.dmin, SIM.phisteps, 1, - c_source_X, c_source_Y, - c_source_Z, - c_source_I, c_source_lambda, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - kdt.m_active_pixel_list, - diffuse, - // return arrays: - kdt.m_floatimage, - kdt.m_omega_reduction, - kdt.m_max_I_x_reduction, - kdt.m_max_I_y_reduction, - kdt.m_rangemap); - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - }// loop over channels - } - inline void add_background(simtbx::Kokkos::kokkos_detector & kdt, int const& override_source) { // cudaSafeCall(cudaSetDevice(SIM.device_Id)); diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index 7c1fa9eb21..dd78e00242 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -643,6 +643,306 @@ 
void debranch_maskall_Kernel(int npanels, int spixels, int fpixels, int total_pi }); } +void debranch_maskall_low_memory_Kernel(int npanels, int spixels, int fpixels, int total_pixels, + int oversample, int point_pixel, + CUDAREAL pixel_size, CUDAREAL subpixel_size, int steps, + CUDAREAL detector_thickstep, int detector_thicksteps, CUDAREAL detector_thick, CUDAREAL detector_mu, + const view_1d_t sdet_vector, const view_1d_t fdet_vector, + const view_1d_t odet_vector, const view_1d_t pix0_vector, + const vector_cudareal_t close_distance, + const vector_cudareal_t beam_vector, + CUDAREAL dmin, int phisteps, int sources, + const vector_cudareal_t source_X, const vector_cudareal_t source_Y, + const vector_cudareal_t source_Z, + const vector_cudareal_t source_I, const vector_cudareal_t source_lambda, + int mosaic_domains, crystal_orientation_t crystal_orientation, + CUDAREAL Na, CUDAREAL Nb, CUDAREAL Nc, CUDAREAL V_cell, + CUDAREAL water_size, CUDAREAL water_F, CUDAREAL water_MW, CUDAREAL r_e_sqr, + CUDAREAL fluence, CUDAREAL Avogadro, CUDAREAL spot_scale, int integral_form, + CUDAREAL default_F, + const vector_cudareal_t Fhkl, + const hklParams FhklParams, + int nopolar, + const vector_cudareal_t polar_vector, CUDAREAL polarization, CUDAREAL fudge, + const vector_size_t pixel_lookup, + simtbx::Kokkos::diffuse_api diffuse, + vector_float_t floatimage /*out*/, + vector_float_t omega_reduction /*out*/, vector_float_t max_I_x_reduction /*out*/, + vector_float_t max_I_y_reduction /*out*/, vector_bool_t rangemap) { + + + const int s_h_min = FhklParams.h_min; + const int s_k_min = FhklParams.k_min; + const int s_l_min = FhklParams.l_min; + const int s_h_range = FhklParams.h_range; + const int s_k_range = FhklParams.k_range; + const int s_l_range = FhklParams.l_range; + const int s_h_max = s_h_min + s_h_range - 1; + const int s_k_max = s_k_min + s_k_range - 1; + const int s_l_max = s_l_min + s_l_range - 1; + + // set up diffuse scattering if needed + vector_mat3_t laue_mats = 
vector_mat3_t("laue_mats",24); + vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace",3); + vector_vec3_t dG_dgam = vector_vec3_t("dG_dgam",3); + int num_laue_mats = 0; + int dhh = 0, dkk = 0, dll = 0; + KOKKOS_MAT3 rotate_principal_axes = diffuse.rotate_principal_axes; // (1,0,0,0,1,0,0,0,1); + int laue_group_num = diffuse.laue_group_num; + int stencil_size = diffuse.stencil_size; + KOKKOS_MAT3 anisoG = diffuse.anisoG; // (300.,0,0,0,100.,0,0,0,300.); + KOKKOS_MAT3 anisoU = diffuse.anisoU; // (0.48,0,0,0,0.16,0,0,0,0.16); + KOKKOS_MAT3 Bmat_realspace(1.,0,0,0,1.,0,0,0,1.); // Placeholder + KOKKOS_MAT3 anisoG_local; + CUDAREAL anisoG_determ = 0; + KOKKOS_MAT3 anisoU_local; + bool use_diffuse=diffuse.enable; + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + vector_mat3_t diffuse_scale_mat3 = vector_mat3_t("diffuse_scale_mat3",1); + + if (use_diffuse){ + anisoG_local = anisoG; + anisoU_local = anisoU; + + if (laue_group_num < 1 || laue_group_num >14 ){ + throw std::string("Laue group number not in range 1-14"); + } + + Kokkos::parallel_reduce("prepare diffuse mats", 1, KOKKOS_LAMBDA (const int& i, int& num_laue_mats_temp){ + num_laue_mats_temp = gen_laue_mats(laue_group_num, laue_mats, rotate_principal_axes); + // KOKKOS_MAT3 rotate_principal_axes; + // rotate_principal_axes << 0.70710678, -0.70710678, 0., 0.70710678, 0.70710678, 0., 0., 0., 1.; + + KOKKOS_MAT3 Amatrix(diffuse.a0[0],diffuse.a0[1],diffuse.a0[2],diffuse.b0[0],diffuse.b0[1],diffuse.b0[2], + diffuse.c0[0],diffuse.c0[1],diffuse.c0[2]); + KOKKOS_MAT3 Ainv = Amatrix.inverse()*1.e-10; + CUDAREAL reciprocal_space_volume = 8*M_PI*M_PI*M_PI*Ainv.determinant(); + CUDAREAL _tmpfac = M_PI * 0.63 / fudge; + CUDAREAL diffuse_scale = reciprocal_space_volume * sqrt(_tmpfac*_tmpfac*_tmpfac); + + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + diffuse_scale_mat3(0)(0,0) = diffuse_scale; + + for ( int iL = 0; iL < num_laue_mats_temp; iL++ ){ + laue_mats(iL) 
= Ainv * laue_mats(iL); + } + const KOKKOS_MAT3 Ginv = anisoG_local.inverse(); + const KOKKOS_MAT3 dG = Bmat_realspace * Ginv; + for (int i_gam=0; i_gam<3; i_gam++){ + dG_dgam(i_gam)[i_gam] = 1; + KOKKOS_MAT3 temp_dgam; + temp_dgam(i_gam, 0) = dG_dgam(i_gam)[0]; + temp_dgam(i_gam, 1) = dG_dgam(i_gam)[1]; + temp_dgam(i_gam, 2) = dG_dgam(i_gam)[2]; + dG_trace(i_gam) = (Ginv*temp_dgam).trace(); + } + }, num_laue_mats); + anisoG_determ = anisoG_local.determinant(); + dhh = dkk = dll = stencil_size; // Limits of stencil for diffuse calc + } + KOKKOS_VEC3 dHH(dhh,dkk,dll); + // end of diffuse setup + +// Implementation notes. This kernel is aggressively debranched, therefore the assumptions are: +// 1) mosaicity non-zero positive +// 2) xtal shape is "Gauss" i.e. 3D spheroid. +// 3) No bounds check for access to the structure factor array. +// 4) No check for Flatt=0. + const CUDAREAL dmin_r = (dmin > 0.0) ? 1/dmin : 0.0; + + // add background from something amorphous, precalculate scaling + const CUDAREAL F_bg = water_F; + const CUDAREAL I_bg = F_bg * F_bg * r_e_sqr * fluence * water_size * water_size * water_size * 1e6 * Avogadro / water_MW; + const CUDAREAL I_factor = r_e_sqr * spot_scale * fluence / steps; + + Kokkos::parallel_for("debranch_maskall", total_pixels, KOKKOS_LAMBDA(const int& pixIdx) { + + vec3 polar_vector_tmp {polar_vector(1), polar_vector(2), polar_vector(3)}; + + // position in pixel array + const int j = pixel_lookup(pixIdx);//pixIdx: index into pixel subset; j: index into the data. 
+ const int i_panel = j / (fpixels*spixels); // the panel number + const int j_panel = j % (fpixels*spixels); // the pixel number within the panel + const int fpixel = j_panel % fpixels; + const int spixel = j_panel / fpixels; + const int outIdx = pixIdx; + + // reset photon count for this pixel + CUDAREAL I = I_bg; + CUDAREAL omega_sub_reduction = 0.0; + CUDAREAL max_I_x_sub_reduction = 0.0; + CUDAREAL max_I_y_sub_reduction = 0.0; + CUDAREAL polar = 0.0; + if (nopolar) { + polar = 1.0; + } + + // add this now to avoid problems with skipping later + // move this to the bottom to avoid accessing global device memory. floatimage[j] = I_bg; + // loop over sub-pixels + int subS, subF; + for (subS = 0; subS < oversample; ++subS) { // Y voxel + for (subF = 0; subF < oversample; ++subF) { // X voxel + // absolute mm position on detector (relative to its origin) + CUDAREAL Fdet = subpixel_size * (fpixel * oversample + subF) + subpixel_size / 2.0; // X voxel + CUDAREAL Sdet = subpixel_size * (spixel * oversample + subS) + subpixel_size / 2.0; // Y voxel + // Fdet = pixel_size*fpixel; + // Sdet = pixel_size*spixel; + + max_I_x_sub_reduction = Fdet; + max_I_y_sub_reduction = Sdet; + + int thick_tic; + for (thick_tic = 0; thick_tic < detector_thicksteps; ++thick_tic) { + // assume "distance" is to the front of the detector sensor layer + CUDAREAL Odet = thick_tic * detector_thickstep; // Z Orthagonal voxel. 
+ + // construct detector subpixel position in 3D space + // pixel_X = distance; + // pixel_Y = Sdet-Ybeam; + // pixel_Z = Fdet-Xbeam; + vec3 pixel_pos; + pixel_pos += Fdet * fdet_vector(i_panel); + pixel_pos += Sdet * sdet_vector(i_panel); + pixel_pos += Odet * odet_vector(i_panel); + pixel_pos += pix0_vector(i_panel); + + // construct the diffracted-beam unit vector to this sub-pixel + CUDAREAL airpath_r = 1 / pixel_pos.length(); + vec3 diffracted = pixel_pos.get_unit_vector(); + + // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta) + CUDAREAL omega_pixel = pixel_size * pixel_size * airpath_r * airpath_r * close_distance(i_panel) * airpath_r; + // option to turn off obliquity effect, inverse-square-law only + if (point_pixel) { + omega_pixel = airpath_r * airpath_r; + } + + // now calculate detector thickness effects + CUDAREAL capture_fraction = 1.0; + if (detector_thick > 0.0 && detector_mu> 0.0) { + // inverse of effective thickness increase + CUDAREAL parallax = odet_vector(i_panel).dot(diffracted); + capture_fraction = exp(-thick_tic * detector_thickstep / detector_mu / parallax) + - exp(-(thick_tic + 1) * detector_thickstep / detector_mu / parallax); + } + + // loop over sources now + int source; + for (source = 0; source < sources; ++source) { + + // retrieve stuff from cache + vec3 incident = {-source_X(source), -source_Y(source), -source_Z(source)}; + CUDAREAL lambda = source_lambda(source); + CUDAREAL source_fraction = source_I(source); + + // construct the incident beam unit vector while recovering source distance + // TODO[Giles]: Optimization! We can unitize the source vectors before passing them in. 
+ incident.normalize(); + + // construct the scattering vector for this pixel + vec3 scattering = (diffracted - incident) / lambda; + CUDAREAL stol = 0.5 * scattering.length(); + + // rough cut to speed things up when we aren't using whole detector + if (dmin > 0.0 && stol > 0.0) { + // use reciprocal of (dmin > 0.5 / stol) + if (dmin_r <= 2 * stol) { + continue; + } + } + + // polarization factor + if (!nopolar) { + // need to compute polarization factor + polar = polarization_factor(polarization, incident, diffracted, polar_vector_tmp); + } else { + polar = 1.0; + } + + // sweep over phi angles + for (int phi_tic = 0; phi_tic < phisteps; ++phi_tic) { + // enumerate mosaic domains + for (int mos_tic = 0; mos_tic < mosaic_domains; ++mos_tic) { + // apply mosaic rotation after phi rotation + auto a = crystal_orientation(phi_tic, mos_tic, 0); + auto b = crystal_orientation(phi_tic, mos_tic, 1); + auto c = crystal_orientation(phi_tic, mos_tic, 2); + + // construct fractional Miller indicies + CUDAREAL h = a.dot(scattering); + CUDAREAL k = b.dot(scattering); + CUDAREAL l = c.dot(scattering); + + // round off to nearest whole index + int h0 = ceil(h - 0.5); + int k0 = ceil(k - 0.5); + int l0 = ceil(l - 0.5); + + // structure factor of the lattice (paralelpiped crystal) + // F_latt = sin(M_PI*s_Na*h)*sin(M_PI*s_Nb*k)*sin(M_PI*s_Nc*l)/sin(M_PI*h)/sin(M_PI*k)/sin(M_PI*l); + + CUDAREAL I_latt = 1.0; // Shape transform for the crystal. + CUDAREAL hrad_sqr = 0.0; + + // handy radius in reciprocal space, squared + hrad_sqr = (h - h0) * (h - h0) * Na * Na + (k - k0) * (k - k0) * Nb * Nb + (l - l0) * (l - l0) * Nc * Nc; + // fudge the radius so that volume and FWHM are similar to square_xtal spots + double my_arg = hrad_sqr / 0.63 * fudge; + { + CUDAREAL F_latt = Na * Nb * Nc * exp(-(my_arg)); + I_latt = F_latt * F_latt; + } + + // new code for diffuse. 
+ if (use_diffuse) { + KOKKOS_VEC3 H_vec(h,k,l); + KOKKOS_VEC3 H0(h0,k0,l0); + CUDAREAL step_diffuse_param[6]; + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + calc_diffuse_at_hkl(H_vec,H0,dHH,s_h_min,s_k_min,s_l_min,s_h_max,s_k_max, + s_l_max,s_h_range,s_k_range,s_l_range,diffuse_scale_mat3(0),Fhkl,num_laue_mats,laue_mats, + anisoG_local,dG_trace,anisoG_determ,anisoU_local,dG_dgam,false,&I_latt,step_diffuse_param); + } + // end s_use_diffuse outer + + // structure factor of the unit cell + CUDAREAL F_cell = default_F; + //F_cell = quickFcell_ldg(s_hkls, s_h_max, s_h_min, s_k_max, s_k_min, s_l_max, s_l_min, h0, k0, l0, s_h_range, s_k_range, s_l_range, default_F, Fhkl); + if ( + h0 < s_h_min || + k0 < s_k_min || + l0 < s_l_min || + h0 > s_h_max || + k0 > s_k_max || + l0 > s_l_max + ) { + F_cell = 0.; + } else { + const int hkl_index = (h0-s_h_min)*s_k_range*s_l_range + (k0-s_k_min)*s_l_range + (l0-s_l_min); + F_cell = Fhkl[hkl_index]; + } + + // now we have the structure factor for this pixel + + // convert amplitudes into intensity (photons per steradian) + I += F_cell * F_cell * I_latt * source_fraction * capture_fraction * omega_pixel; + omega_sub_reduction += omega_pixel; + } // end of mosaic loop + } // end of phi loop + } // end of source loop + } // end of detector thickness loop + } // end of sub-pixel y loop + } // end of sub-pixel x loop + const double photons = I_bg + I_factor * polar * I; + floatimage( outIdx ) = photons; + //omega_reduction( outIdx ) = omega_sub_reduction; // shared contention + //max_I_x_reduction( outIdx ) = max_I_x_sub_reduction; + //max_I_y_reduction( outIdx ) = max_I_y_sub_reduction; + //rangemap( outIdx ) = true; + }); +} + // __global__ void nanoBraggSpotsInitCUDAKernel(int spixels, int fpixesl, float * floatimage, float * omega_reduction, // float * max_I_x_reduction, // float * max_I_y_reduction, bool * rangemap); diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py 
index dbb94cee5e..177d6e5a97 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -233,10 +233,12 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg per_image_scale_factor = self.domains_per_crystal # 1.0 self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU self.reset_pythony_beams(self.SIM) + print("AAA") self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels) + print("BBB") -def get_whitelist_from_refls(prefix,SIM): - image_size = len(SIM.raw_pixels) +def get_whitelist_from_refls(prefix,SIM=None): + #image_size = len(SIM.raw_pixels) from dials.array_family import flex from dxtbx.model.experiment_list import ExperimentListFactory experiments = ExperimentListFactory.from_json_file("%s_imported.expt"%prefix, check_format = False) @@ -353,8 +355,9 @@ def run_all(params): free_gpu_before = get_gpu_memory()[0] del SWCs free_gpu_after = get_gpu_memory()[0] - print((free_gpu_after - free_gpu_before)/NTRIALS,"free") - assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB + old_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(old_memory_use,"free") + assert old_memory_use >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size #accomplish the same with compile-time polymorphism @@ -378,16 +381,51 @@ def run_all(params): free_gpu_before = get_gpu_memory()[0] del SWCs free_gpu_after = get_gpu_memory()[0] - print((free_gpu_after - free_gpu_before)/NTRIALS,"free") - assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB + new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(new_memory_use,"free") + assert old_memory_use > 4.*new_memory_use # new policy cuts down at least 
4-fold on memory, actual value ** - return +def run_subset_for_NESAP_debug(params): + # make the dxtbx objects + BEAM = basic_beam() + DETECTOR = basic_detector() + CRYSTAL = basic_crystal() + SF_model = amplitudes(CRYSTAL) + # Famp = SF_model.Famp # simple uniform amplitudes + SF_model.random_structure(CRYSTAL) + SF_model.ersatz_correct_to_P1() + whitelist_pixels = flex.size_t((11929, + 351293,351294,351295,352828,352829,352830,352831,354364,354365, + 354366,354367,355900,355901,355902,355903,357436,357437,357438, + 357439,352383,352384,352385,353919,353920,353921,355455,355456, + 355457,356991,356992,356993,354120,354121,354122,355656,355657)) + + NTRIALS=5 + #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size + #accomplish the same with compile-time polymorphism + #have side-by-side test in same python script + # Reproduce whitelist sims with small-memory mechanism + SWCs=[] + for x in range(NTRIALS): + print("Whitelist-only iteration with small memory",x) + SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) + SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) + #produce an output image file for intermediate debugging + working_raw_pixels = flex.double(image_size) # blank array + working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values) + working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus())) + + free_gpu_before = get_gpu_memory()[0] + del SWCs + free_gpu_after = get_gpu_memory()[0] + new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(new_memory_use,"free") if __name__=="__main__": params,options = parse_input() # Initialize based on GPU context gpu_instance_type = get_exascale("gpu_instance", params.context) gpu_instance = gpu_instance_type(deviceId = 0) - - run_all(params) + #run_all(params) + run_subset_for_NESAP_debug(params) 
print("OK") From 4cf3974b201c78ddc5cc31e231a78f00f597a96d Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Sat, 20 Apr 2024 18:56:46 -0700 Subject: [PATCH 3/4] Minimal changes for working tst_memory_policy.py Memory savings achieved through code specialization, for the case where pixel values are simulated on a small whitelist. Specializations are not yet optimal, as there is still a lot of code duplication. Changes give ~4.5x reduction in memory footprint, but no success yet in resizing the array m_accumulate_floatimage. Attempts so far lead to a CUDA memory allocation error. --- simtbx/kokkos/detector.cpp | 97 +++++------------------------- simtbx/kokkos/detector.h | 46 +++++++++++--- simtbx/kokkos/kokkos_ext.cpp | 2 +- simtbx/kokkos/simulation.cpp | 3 +- simtbx/kokkos/simulation_kernels.h | 8 +++ simtbx/tests/tst_memory_policy.py | 28 +++------ 6 files changed, 71 insertions(+), 113 deletions(-) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index c4ef795d72..238ca56881 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -93,16 +93,16 @@ namespace simtbx { namespace Kokkos { } template<> - void kokkos_detector::hello(){ - SCITBX_EXAMINE("small small small"); + std::string kokkos_detector::hello(){ + return("small small small"); } template<> - void kokkos_detector::hello(){ - SCITBX_EXAMINE("large large large"); + std::string kokkos_detector::hello(){ + return("large large large"); } template<> void - kokkos_detector::each_image_allocate() { + kokkos_detector::each_image_allocate(const std::size_t& n_pixels) { resize(m_rangemap, m_total_pixel_count); resize(m_omega_reduction, m_total_pixel_count); resize(m_max_I_x_reduction, m_total_pixel_count); @@ -140,9 +140,17 @@ namespace simtbx { namespace Kokkos { // printf("DONE.\n"); } + template<> void - kokkos_detector::each_image_allocate() { - resize(m_maskimage, m_total_pixel_count); + kokkos_detector::each_image_allocate(const std::size_t& n_pixels) {
SCITBX_ASSERT(n_pixels > 0); + resize(m_rangemap, n_pixels); + resize(m_omega_reduction, n_pixels); + resize(m_max_I_x_reduction, n_pixels); + resize(m_max_I_y_reduction, n_pixels); + resize(m_floatimage, n_pixels); + + resize(m_maskimage, n_pixels); kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); @@ -152,80 +160,5 @@ namespace simtbx { namespace Kokkos { kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); fence(); } - - template<> - void - kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - template<> - void - kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - resize(m_rangemap, m_active_pixel_size); - resize(m_omega_reduction, m_active_pixel_size); - resize(m_max_I_x_reduction, m_active_pixel_size); - resize(m_max_I_y_reduction, m_active_pixel_size); - resize(m_floatimage, m_active_pixel_size); - resize(m_accumulate_floatimage, m_active_pixel_size); - fence(); - } - - template<> af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - hello(); - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); - - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - 
- auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - template<> af::shared - kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { - SCITBX_CHECK_POINT; - hello(); - //return the data array for the multipanel detector case, but only for whitelist pixels - - std::size_t output_pixel_size = selection.size(); - //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - //auto temp = m_accumulate_floatimage; - - //parallel_for("get_active_pixel_selection2", - // range_policy(0, output_pixel_size), - // KOKKOS_LAMBDA (const int i) { - // active_pixel_results( i ) = temp( i ); - //}); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - SCITBX_CHECK_POINT; - kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results); - SCITBX_CHECK_POINT; - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } - } // Kokkos } // simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index 74d286ffbe..dc1f276a2d 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -23,6 +23,7 @@ using vec3 = kokkostbx::vector3; using mat3 = kokkostbx::matrix3; using Kokkos::fence; + namespace simtbx { namespace Kokkos { namespace af = scitbx::af; @@ -45,8 +46,7 @@ struct large_array_policy {}; struct small_whitelist_policy {}; template -struct kokkos_detector -{ +struct kokkos_detector{ inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");}; //kokkos_detector(int const&, 
const simtbx::nanoBragg::nanoBragg& nB); //kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &); @@ -56,12 +56,12 @@ struct kokkos_detector std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl; metrology.show(); } - //void each_image_allocate(); + void each_image_allocate(const std::size_t&); //void scale_in_place(const double&); //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); //af::flex_double get_raw_pixels(); //void set_active_pixels_on_GPU(af::shared); - af::shared get_whitelist_raw_pixels(af::shared); + //af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -155,8 +155,6 @@ struct kokkos_detector return view_floatimage; }; - void each_image_allocate(); - inline void scale_in_place(const double& factor){ auto local_accumulate_floatimage = m_accumulate_floatimage; @@ -165,8 +163,6 @@ struct kokkos_detector }); } - void set_active_pixels_on_GPU(af::shared active_pixel_list_value); - inline void write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { //only implement the monolithic detector case, one panel @@ -203,11 +199,41 @@ struct kokkos_detector return output_array; } - void hello(); + inline void + set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } -}; + inline af::shared + get_whitelist_raw_pixels(af::shared selection) { + printf("algorithm: %20s selection size %10d\n",hello().c_str(), selection.size()); + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = 
selection.size(); + vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + auto temp = m_accumulate_floatimage; + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + + std::string hello(); +}; } // Kokkos } // simtbx #endif // SIMTBX_KOKKOS_DETECTOR_H diff --git a/simtbx/kokkos/kokkos_ext.cpp b/simtbx/kokkos/kokkos_ext.cpp index 3ef9f4e2ce..ca6d2dc096 100644 --- a/simtbx/kokkos/kokkos_ext.cpp +++ b/simtbx/kokkos/kokkos_ext.cpp @@ -87,6 +87,7 @@ namespace simtbx { namespace Kokkos { .def("show_summary",&simtbx::Kokkos::kokkos_detector::show_summary) .def("each_image_allocate", &simtbx::Kokkos::kokkos_detector::each_image_allocate, + ( arg_("n_pixels")=0 ), "Allocate large pixel arrays") .def("scale_in_place", &simtbx::Kokkos::kokkos_detector::scale_in_place, "Multiply by a scale factor on the GPU") @@ -95,7 +96,6 @@ namespace simtbx { namespace Kokkos { .def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector::get_raw_pixels, "return multipanel detector raw pixels as a flex array") .def("get_whitelist_raw_pixels", - (af::shared (simtbx::Kokkos::kokkos_detector::*)(af::shared)) &simtbx::Kokkos::kokkos_detector::get_whitelist_raw_pixels, "return only those raw pixels requested by the whitelist selection, as a 1D flex array") .def("each_image_free", &simtbx::Kokkos::kokkos_detector::each_image_free) diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index cf2d79dd4e..65d3979e86 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -211,7 +211,8 @@ namespace Kokkos { //don't want to free 
the kec data when the nanoBragg goes out of scope, so switch the pointer // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + //for the small_whitelist specialization, have a special version of add_array() that specifies size + add_array_limit(kdt.m_accumulate_floatimage, kdt.m_floatimage, kdt.m_floatimage.span()); }// loop over channels } diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index dd78e00242..580a184b04 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -955,6 +955,14 @@ void add_array( view_1d_t lhs, const view_1d_t rhs ) { }); } +template +void add_array_limit( view_1d_t lhs, const view_1d_t rhs, const std::size_t& limit ) { + Kokkos::parallel_for("add_arrays", limit, KOKKOS_LAMBDA(const int& i) { + lhs( i ) = lhs( i ) + (T)rhs( i ); + rhs( i ) = 0; + }); +} + void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int override_source, CUDAREAL pixel_size, int spixels, int fpixels, int detector_thicksteps, CUDAREAL detector_thickstep, CUDAREAL detector_attnlen, diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py index 177d6e5a97..fe1629314c 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -215,7 +215,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg self.gpu_simulation.allocate() self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)( deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM) - self.gpu_detector.each_image_allocate() + + self.gpu_detector.each_image_allocate(n_pixels = whitelist_pixels.size() ) # self.gpu_detector.show_summary() assert sources @@ -233,9 +234,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg per_image_scale_factor = self.domains_per_crystal # 1.0 self.gpu_detector.scale_in_place(per_image_scale_factor) # 
apply scale directly on GPU self.reset_pythony_beams(self.SIM) - print("AAA") - self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels) - print("BBB") + whitelist_idx = flex.size_t(range(whitelist_pixels.size())) + self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_idx) def get_whitelist_from_refls(prefix,SIM=None): #image_size = len(SIM.raw_pixels) @@ -347,7 +347,7 @@ def run_all(params): # Now reproduce whitelist sims showing accumulation of large persistent memory SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration",x) + print("\nWhitelist-only iteration",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) @@ -365,7 +365,7 @@ def run_all(params): # Reproduce whitelist sims with small-memory mechanism SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration with small memory",x) + print("\nWhitelist-only iteration with small memory",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) #produce an output image file for intermediate debugging @@ -407,25 +407,15 @@ def run_subset_for_NESAP_debug(params): # Reproduce whitelist sims with small-memory mechanism SWCs=[] for x in range(NTRIALS): - print("Whitelist-only iteration with small memory",x) + print("\n Whitelist-only iteration with small memory",x) SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) - #produce an output image file for intermediate debugging - working_raw_pixels = flex.double(image_size) # blank array - 
working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values) - working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus())) - - free_gpu_before = get_gpu_memory()[0] - del SWCs - free_gpu_after = get_gpu_memory()[0] - new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS - print(new_memory_use,"free") if __name__=="__main__": params,options = parse_input() # Initialize based on GPU context gpu_instance_type = get_exascale("gpu_instance", params.context) gpu_instance = gpu_instance_type(deviceId = 0) - #run_all(params) - run_subset_for_NESAP_debug(params) + run_all(params) + #run_subset_for_NESAP_debug(params) print("OK") From fa0295b3344da50d1575a10f39e67d7a6a9a6645 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Wed, 22 May 2024 10:07:31 -0700 Subject: [PATCH 4/4] Remove debugging output to conserve stdout size. --- simtbx/kokkos/detector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index dc1f276a2d..85529fdc5c 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -208,7 +208,7 @@ struct kokkos_detector{ inline af::shared get_whitelist_raw_pixels(af::shared selection) { - printf("algorithm: %20s selection size %10d\n",hello().c_str(), selection.size()); + //printf("algorithm: %20s selection size %10d\n",hello().c_str(), selection.size()); //return the data array for the multipanel detector case, but only for whitelist pixels vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);