diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.impl.h b/src/vt/collective/reduce/allreduce/rabenseifner.impl.h
index b699576218..39c732c62f 100644
--- a/src/vt/collective/reduce/allreduce/rabenseifner.impl.h
+++ b/src/vt/collective/reduce/allreduce/rabenseifner.impl.h
@@ -267,10 +267,10 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::scatterReduceIter() {
   auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_;
   vt_debug_print(
     terse, allreduce,
-    "[{}] Rabenseifer Part2 (step {}): Sending to Node {} starting with idx = {} and "
+    "Rabenseifner Part2 (step {}): Sending to Node {} starting with idx = {} and "
     "count "
     "{} \n",
-    this_node_, scatter_step_, dest, s_index_[scatter_step_],
+    scatter_step_, dest, s_index_[scatter_step_],
     s_count_[scatter_step_]
   );
 
@@ -310,9 +310,9 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::scatterReduceIterHandler(
 
   vt_debug_print(
     terse, allreduce,
-    "[{}] Rabenseifner Part2 (step {}): scatter_mask_= {} nprocs_pof2_ = {}: "
+    "Rabenseifner Part2 (step {}): scatter_mask_= {} nprocs_pof2_ = {}: "
     "idx = {} from {}\n",
-    this_node_, msg->step_, scatter_mask_, nprocs_pof2_, r_index_[msg->step_],
+    msg->step_, scatter_mask_, nprocs_pof2_, r_index_[msg->step_],
     theContext()->getFromNodeCurrentTask()
   );
 
@@ -382,10 +382,10 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::gatherIter() {
 
   vt_debug_print(
     terse, allreduce,
-    "[{}] Rabenseifner Part3 (step {}): Sending to Node {} starting with idx = {} and "
+    "Rabenseifner Part3 (step {}): Sending to Node {} starting with idx = {} and "
     "count "
     "{} \n",
-    this_node_, gather_step_, dest, r_index_[gather_step_],
+    gather_step_, dest, r_index_[gather_step_],
     r_count_[gather_step_]
   );
 
@@ -413,8 +413,8 @@ template <
 void Rabenseifner<DataT, Op, ObjT, finalHandler>::gatherIterHandler(
   AllreduceRbnMsg<DataT>* msg) {
   vt_debug_print(
-    terse, allreduce, "[{}] Rabenseifner Part3 (step {}): Received idx = {} from {}\n",
-    this_node_, msg->step_, s_index_[msg->step_],
+    terse, allreduce, "Rabenseifner Part3 (step {}): Received idx = {} from {}\n",
+    msg->step_, s_index_[msg->step_],
     theContext()->getFromNodeCurrentTask()
   );
 
@@ -456,7 +456,7 @@ template <
 void Rabenseifner<DataT, Op, ObjT, finalHandler>::sendToExcludedNodes() {
   if (is_part_of_adjustment_group_ and is_even_) {
     vt_debug_print(
-      terse, allreduce, "[{}] Rabenseifner Part4: Sending to Node {} \n", this_node_,
+      terse, allreduce, "Rabenseifner Part4: Sending to Node {} \n",
       this_node_ + 1
     );
     proxy_[this_node_ + 1]
diff --git a/src/vt/configs/arguments/args.cc b/src/vt/configs/arguments/args.cc
index ae2f7f84ef..c43659fd6a 100644
--- a/src/vt/configs/arguments/args.cc
+++ b/src/vt/configs/arguments/args.cc
@@ -376,6 +376,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) {
   auto ddp = "Enable debug_context = \"" debug_pp(context) "\"";
   auto dep = "Enable debug_epoch = \"" debug_pp(epoch) "\"";
   auto dfp = "Enable debug_replay = \"" debug_pp(replay) "\"";
+  auto dgp = "Enable debug_allreduce = \"" debug_pp(allreduce) "\"";
 
   auto r1 = app.add_option("--vt_debug_level", appConfig.vt_debug_level, rq);
 
@@ -413,6 +414,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) {
   auto dd = app.add_flag("--vt_debug_context", appConfig.vt_debug_context, ddp);
   auto de = app.add_flag("--vt_debug_epoch", appConfig.vt_debug_epoch, dep);
   auto df = app.add_flag("--vt_debug_replay", appConfig.vt_debug_replay, dfp);
+  auto dg = app.add_flag("--vt_debug_allreduce", appConfig.vt_debug_allreduce, dgp);
 
   auto debugGroup = "Debug Print Configuration (must be compile-time enabled)";
   r->group(debugGroup);
@@ -450,6 +452,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) {
   dd->group(debugGroup);
   de->group(debugGroup);
   df->group(debugGroup);
+  dg->group(debugGroup);
 
   auto dbq = "Always flush VT runtime prints";
   auto eb = app.add_flag("--vt_debug_print_flush", appConfig.vt_debug_print_flush, dbq);
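Note on the args.cc hunks above: per the option group label in the same function ("Debug Print Configuration (must be compile-time enabled)"), the new --vt_debug_allreduce flag only emits output when the allreduce debug category is compiled into VT. At runtime it pairs with the existing --vt_debug_level option from the first hunk; the prints added in rabenseifner.impl.h are registered at the terse level, so a hypothetical invocation of the perf test below would look like:

    ./allreduce --vt_debug_level=terse --vt_debug_allreduce

(binary name illustrative).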
diff --git a/tests/perf/allreduce.cc b/tests/perf/allreduce.cc
index 4b94dae324..2f89677cf4 100644
--- a/tests/perf/allreduce.cc
+++ b/tests/perf/allreduce.cc
@@ -58,136 +58,105 @@
 using namespace vt;
 using namespace vt::tests::perf::common;
 
+static constexpr std::array<size_t, 8> const payloadSizes = {
+  64, 128, 2048, 16384, 32768, 524288, 1048576, 2097152};
+
 struct MyTest : PerfTestHarness {
-  void SetUp() override {
-    PerfTestHarness::SetUp();
-    data.resize(1 << 16);
-    for (auto& val : data) {
-      val = theContext()->getNode() + 1;
-    }
+  MyTest() {
+    DisableGlobalTimer();
   }
 
   std::vector<int32_t> data;
 };
 
 struct NodeObj {
-  explicit NodeObj(MyTest* test_obj, const std::string& name) : test_obj_(test_obj), timer_name_(name) { }
+  explicit NodeObj(MyTest* test_obj, const std::string& name)
+    : base_name_(name),
+      test_obj_(test_obj) {
+    for (auto const payload_size : payloadSizes) {
+      timer_names_[payload_size] =
+        fmt::format("{} {}", base_name_, payload_size);
+    }
+  }
 
   void initialize() {
     proxy_ = vt::theObjGroup()->getProxy<NodeObj>(this);
-    // data_["Node"] = theContext()->getNode();
   }
 
-  struct MyMsg : vt::Message { };
-
-  void recursiveDoubling(std::vector<int32_t> in) {
-    // std::string printer(1024, 0x0);
-    // printer.append(fmt::format("\n[{}]: recursiveDoubling done! ", theContext()->getNode()));
-
-    // for (int node = 0; node < theContext()->getNumNodes(); ++node) {
-    //   if (node == theContext()->getNode()) {
-    //     for (auto val : in) {
-    //       printer.append(fmt::format("{} ", val));
-    //     }
-
-    //     fmt::print("{}\n", printer);
-
-    //     theCollective()->barrier();
-    //   }
-    // }
-
-    // fmt::print("\n");
-    // const auto p = theContext()->getNumNodes();
-    // const auto expected = (p * (p + 1)) / 2;
-    // for (auto val : in) {
-    //   vtAssert(val == expected, "FAILURE!");
-    // }
-    test_obj_->StopTimer(timer_name_);
+  void handlerVec(std::vector<int32_t> vec) {
+    test_obj_->StopTimer(timer_names_.at(vec.size()));
   }
 
-  void newReduceComplete(std::vector<int32_t> in) {
-    // std::string printer(1024, 0x0);
-    // printer.append(fmt::format("\n[{}]: allreduce_rabenseifner done! ", theContext()->getNode()));
-
-    // for (int node = 0; node < theContext()->getNumNodes(); ++node) {
-    //   if (node == theContext()->getNode()) {
-
-    //     for (auto val : in) {
-    //       printer.append(fmt::format("{} ", val));
-    //     }
-
-    //     fmt::print("{}\n", printer);
-
-    //     theCollective()->barrier();
-    //   }
-    // }
-
-    // fmt::print("\n");
-    // const auto p = theContext()->getNumNodes();
-    // const auto expected = (p * (p + 1)) / 2;
-    // for (auto val : in) {
-    //   vtAssert(val == expected, "FAILURE!");
-    // }
-    test_obj_->StopTimer(timer_name_);
+#if KOKKOS_ENABLED_CHECKPOINT
+  template <typename Scalar>
+  void handlerView(Kokkos::View<Scalar*, Kokkos::HostSpace> view) {
+    test_obj_->StopTimer(timer_names_.at(view.extent(0)));
   }
+#endif // KOKKOS_ENABLED_CHECKPOINT
 
-  void reduceComplete(std::vector<int32_t> in) {
-    // fmt::print(
-    //   "[{}]: allreduce done! Results are ...\n", theContext()->getNode());
-    // for (auto val : in) {
-    //   fmt::print("{} ", val);
-    // }
-
-    // fmt::print("\n");
-    test_obj_->StopTimer(timer_name_);
-  }
 
-  std::string timer_name_ = {};
+  std::string base_name_ = {};
+  std::unordered_map<size_t, std::string> timer_names_ = {};
   MyTest* test_obj_ = nullptr;
   vt::objgroup::proxy::Proxy<NodeObj> proxy_ = {};
 };
 
 VT_PERF_TEST(MyTest, test_reduce) {
   auto grp_proxy =
-    vt::theObjGroup()->makeCollective<NodeObj>("test_allreduce", this, "Reduce -> Bcast");
+    vt::theObjGroup()->makeCollective<NodeObj>("test_allreduce", this, "Reduce -> Bcast vector");
+
+  for (auto payload_size : payloadSizes) {
+    data.resize(payload_size, theContext()->getNode() + 1);
+
+    theCollective()->barrier();
 
-  theCollective()->barrier();
-  StartTimer(grp_proxy[theContext()->getNode()].get()->timer_name_);
-  grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data);
+    StartTimer(grp_proxy[my_node_].get()->timer_names_.at(payload_size));
+    grp_proxy.allreduce<&NodeObj::handlerVec, collective::PlusOp>(data);
+  }
 }
 
 VT_PERF_TEST(MyTest, test_allreduce_rabenseifner) {
-  auto proxy =
-    vt::theObjGroup()->makeCollective<NodeObj>("test_allreduce_new", this, "Rabenseifner");
+  auto proxy = vt::theObjGroup()->makeCollective<NodeObj>(
+    "test_allreduce_rabenseifner", this, "Rabenseifner vector"
+  );
 
   using DataT = decltype(data);
   using Reducer = collective::reduce::allreduce::Rabenseifner<
-    DataT, collective::PlusOp, NodeObj, &NodeObj::newReduceComplete>;
+    DataT, collective::PlusOp, NodeObj, &NodeObj::handlerVec>;
 
   auto grp_proxy = vt::theObjGroup()->makeCollective<Reducer>(
     "allreduce_rabenseifner", proxy, num_nodes_, data);
   grp_proxy[my_node_].get()->proxy_ = grp_proxy;
 
-  theCollective()->barrier();
-  StartTimer(proxy[theContext()->getNode()].get()->timer_name_);
-  grp_proxy[my_node_].template invoke<&Reducer::allreduce>();
+  for (auto payload_size : payloadSizes) {
+    data.resize(payload_size, theContext()->getNode() + 1);
+
+    theCollective()->barrier();
+    StartTimer(proxy[my_node_].get()->timer_names_.at(payload_size));
+    grp_proxy[my_node_].template invoke<&Reducer::allreduce>();
+  }
 }
 
 VT_PERF_TEST(MyTest, test_allreduce_recursive_doubling) {
-  auto proxy =
-    vt::theObjGroup()->makeCollective<NodeObj>("test_allreduce_new_2", this, "Recursive doubling");
+  auto proxy = vt::theObjGroup()->makeCollective<NodeObj>(
+    "test_allreduce_recursive_doubling", this, "Recursive doubling vector"
+  );
 
   using DataT = decltype(data);
   using Reducer = collective::reduce::allreduce::RecursiveDoubling<
-    DataT, collective::PlusOp, NodeObj, &NodeObj::recursiveDoubling>;
+    DataT, collective::PlusOp, NodeObj, &NodeObj::handlerVec>;
 
   auto grp_proxy = vt::theObjGroup()->makeCollective<Reducer>(
     "allreduce_recursive_doubling", proxy, num_nodes_, data);
   grp_proxy[my_node_].get()->proxy_ = grp_proxy;
 
-  theCollective()->barrier();
-  StartTimer(proxy[theContext()->getNode()].get()->timer_name_);
-  grp_proxy[my_node_].template invoke<&Reducer::allreduce>();
+  for (auto payload_size : payloadSizes) {
+    data.resize(payload_size, theContext()->getNode() + 1);
+
+    theCollective()->barrier();
+    StartTimer(proxy[my_node_].get()->timer_names_.at(payload_size));
+    grp_proxy[my_node_].template invoke<&Reducer::allreduce>();
+  }
 }
 
 VT_PERF_TEST_MAIN()
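Design note on the per-payload timers in tests/perf/allreduce.cc: the allreduce completion handler receives only the reduced data, so the payload size recovered from vec.size() (or view.extent(0)) is the only state shared between the StartTimer call site and the handler; that is why the single timer_name_ member is replaced by a timer_names_ map keyed by size. A minimal standalone sketch of that pattern (PerPayloadTimers and timerFor are illustrative names, not part of the patch):

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct PerPayloadTimers {
      std::unordered_map<std::size_t, std::string> names;

      // Pre-compute one timer name per payload size, as NodeObj's constructor does.
      PerPayloadTimers(std::string const& base, std::vector<std::size_t> const& sizes) {
        for (auto s : sizes) {
          names[s] = base + " " + std::to_string(s);
        }
      }

      // A completion handler only sees the result, so its size is the lookup key.
      std::string const& timerFor(std::vector<std::int32_t> const& result) const {
        return names.at(result.size());
      }
    };

    int main() {
      PerPayloadTimers timers("Rabenseifner vector", {64, 128, 2048});
      std::vector<std::int32_t> result(128, 0);
      std::printf("%s\n", timers.timerFor(result).c_str()); // prints "Rabenseifner vector 128"
      return 0;
    }

One constraint this keying relies on: the entries of payloadSizes must be distinct, since two payloads of the same size would collide on the same timer.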