From 968579f630de9c129f0fac9ee56171af7c81c8f6 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 1 May 2023 10:49:41 -0700 Subject: [PATCH 1/4] initial checkpoint rework --- src/vt/messaging/message/smart_ptr.h | 2 +- src/vt/serialization/sizer.h | 2 +- .../vrt/collection/collection_builder.impl.h | 2 +- src/vt/vrt/collection/manager.cc | 24 ---- src/vt/vrt/collection/manager.h | 73 ++++++------ src/vt/vrt/collection/manager.impl.h | 109 +++++++++++++----- .../vrt/collection/param/construct_params.h | 27 +++++ src/vt/vrt/proxy/collection_elm_proxy.h | 13 ++- src/vt/vrt/proxy/collection_elm_proxy.impl.h | 108 +++++++++++++++++ src/vt/vrt/proxy/collection_proxy.h | 13 +++ src/vt/vrt/proxy/collection_proxy.impl.h | 92 +++++++++++++++ src/vt/vrt/vrt_common.h | 9 ++ tests/unit/lb/test_lb_data_comm.cc | 2 +- 13 files changed, 380 insertions(+), 96 deletions(-) create mode 100644 src/vt/vrt/proxy/collection_elm_proxy.impl.h diff --git a/src/vt/messaging/message/smart_ptr.h b/src/vt/messaging/message/smart_ptr.h index 4bb315ab20..96dee5378e 100644 --- a/src/vt/messaging/message/smart_ptr.h +++ b/src/vt/messaging/message/smart_ptr.h @@ -236,7 +236,7 @@ struct MsgSharedPtr final { template < typename SerializerT, typename = std::enable_if_t< - std::is_same::value + checkpoint::is_footprinter::value > > void serialize(SerializerT& s) { diff --git a/src/vt/serialization/sizer.h b/src/vt/serialization/sizer.h index c8cd26b0e6..e266b0e894 100644 --- a/src/vt/serialization/sizer.h +++ b/src/vt/serialization/sizer.h @@ -69,7 +69,7 @@ struct MsgSizer< > { static std::size_t get(MsgT* msg) { auto& msg_ref = *msg; - return ::checkpoint::getSize(msg_ref); + return ::checkpoint::getSize(msg_ref); } }; diff --git a/src/vt/vrt/collection/collection_builder.impl.h b/src/vt/vrt/collection/collection_builder.impl.h index 28004bd2c7..dda1388632 100644 --- a/src/vt/vrt/collection/collection_builder.impl.h +++ b/src/vt/vrt/collection/collection_builder.impl.h @@ -58,7 +58,7 @@ std::tuple CollectionManager::makeCollection( auto const is_collective = po.collective_; // Generate a new proxy for this new collection - auto const proxy_bits = makeCollectionProxy(is_collective, is_migratable); + auto const proxy_bits = makeCollectionProxy(is_collective, is_migratable, po.proxy_bits_); po.proxy_bits_ = proxy_bits; if (not is_collective) { diff --git a/src/vt/vrt/collection/manager.cc b/src/vt/vrt/collection/manager.cc index 95c1ef5a95..04cacaa5a0 100644 --- a/src/vt/vrt/collection/manager.cc +++ b/src/vt/vrt/collection/manager.cc @@ -95,30 +95,6 @@ void CollectionManager::schedule(ActionType action) { theSched()->enqueue(action); } -VirtualProxyType CollectionManager::makeCollectionProxy( - bool is_collective, bool is_migratable -) { - VirtualIDType const new_id = is_collective ? - next_collective_id_++ : - next_rooted_id_++; - - auto const this_node = theContext()->getNode(); - bool const is_collection = true; - - // Create the new proxy with the `new_dist_id` - auto const proxy = VirtualProxyBuilder::createProxy( - new_id, this_node, is_collection, is_migratable, is_collective - ); - - vt_debug_print( - verbose, vrt_coll, - "makeCollectionProxy: node={}, new_dist_id={}, proxy={:x}\n", - this_node, new_id, proxy - ); - - return proxy; -} - /*static*/ void CollectionManager::computeReduceStamp(CollectionStampMsg* msg) { theCollection()->reduce_stamp_[msg->proxy_] = msg->getVal(); } diff --git a/src/vt/vrt/collection/manager.h b/src/vt/vrt/collection/manager.h index f31dd7119e..9659bd9e17 100644 --- a/src/vt/vrt/collection/manager.h +++ b/src/vt/vrt/collection/manager.h @@ -360,10 +360,14 @@ struct CollectionManager * * \param[in] is_collective whether the collection is collective * \param[in] is_migratable whether the collection is migratable + * \param[in] request_match an input proxy which we would like to use, if + * there are no existing conflicts. no_vrt_proxy indicates no + * request. * * \return the collection proxy bits */ - VirtualProxyType makeCollectionProxy(bool is_collective, bool is_migratable); + template + VirtualProxyType makeCollectionProxy(bool is_collective, bool is_migratable, VirtualProxyType request_match); /** * \brief Query the current index context of the running handler @@ -1613,6 +1617,16 @@ struct CollectionManager template IndexT getRange(VirtualProxyType proxy); + /** + * \brief Get the whether the collection has dynamic membership + * + * \param[in] proxy the proxy of the collection + * + * \return the dynamic membership state + */ + template + bool getDynamicMembership(VirtualProxyType proxy); + /** * \brief Get the local indices that are currently on this node * @@ -1685,54 +1699,47 @@ struct CollectionManager ); /** - * \internal \struct RestoreMigrateMsg + * \internal \struct MigrateRequestMsg * - * \brief Migrate elements to restore where it belongs based on the checkpoint + * \brief Migrate local element, potentially requested by remote location */ template < typename ColT, - typename MsgT = vt::Message, - typename IdxT = typename ColT::IndexType + typename MsgT = vt::Message > - struct RestoreMigrateMsg : MsgT { - RestoreMigrateMsg() = default; - RestoreMigrateMsg( - NodeType in_to_node, IdxT in_idx, CollectionProxyWrapType in_proxy - ) : to_node_(in_to_node), - idx_(in_idx), - proxy_(in_proxy) + struct MigrateRequestMsg : MsgT { + using ElmT = VrtElmProxy; + + MigrateRequestMsg() = default; + MigrateRequestMsg( + ElmT proxy_elm, NodeType to_node + ) : to_node_(to_node), + proxy_elm_(proxy_elm) { } NodeType to_node_ = uninitialized_destination; - IdxT idx_; - CollectionProxyWrapType proxy_; + ElmT proxy_elm_; }; /** - * \internal \struct RestoreMigrateColMsg - * - * \brief Migrate collection element to restore where it belongs + * \brief Migrate a remote proxy element to a node, by messaging that + * node to initiate a migration. Immediately returns a rooted epoch + * containing the request message. */ - template - struct RestoreMigrateColMsg - : RestoreMigrateMsg, IdxT> - { - RestoreMigrateColMsg() = default; - RestoreMigrateColMsg( - NodeType in_to_node, IdxT in_idx, CollectionProxyWrapType in_proxy - ) : RestoreMigrateMsg, IdxT>( - in_to_node, in_idx, in_proxy - ) - { } - }; + template + EpochType requestMigrateDeferred( + VrtElmProxy proxy_elem, NodeType destination + ); /** - * \internal \brief Migrate element to restore location from checkpoint - * - * \param[in] msg the migrate message + * \brief Migrate a remote proxy element to a node, by messaging that + * node to initiate a migration. Returns after migration is complete. */ template - static void migrateToRestoreLocation(RestoreMigrateMsg* msg); + void requestMigrate( + VrtElmProxy proxy_elem, NodeType destination + ); + /** * \brief Restore the collection (collective) from file on top of an existing diff --git a/src/vt/vrt/collection/manager.impl.h b/src/vt/vrt/collection/manager.impl.h index c43e92354c..976509d4af 100644 --- a/src/vt/vrt/collection/manager.impl.h +++ b/src/vt/vrt/collection/manager.impl.h @@ -1416,6 +1416,50 @@ void CollectionManager::insertMetaCollection( }; } +template +VirtualProxyType CollectionManager::makeCollectionProxy( + bool is_collective, bool is_migratable, VirtualProxyType requested +) { + + if(requested != no_vrt_proxy){ + auto conflicting_holder = findColHolder(requested); + + if(conflicting_holder == nullptr){ + VirtualIDType const req_id = VirtualProxyBuilder::getVirtualID(requested); + + if(is_collective) next_collective_id_ = std::max(next_collective_id_, req_id+1); + else next_rooted_id_ = std::max(next_rooted_id_, req_id+1); + + vt_debug_print( + verbose, vrt_coll, + "makeCollectionProxy: node={}, new_dist_id={}, proxy={:x} (by request)\n", + theContext()->getNode(), req_id, requested + ); + return requested; + } // else ignore request, make as normal + }; + + VirtualIDType const new_id = is_collective ? + next_collective_id_++ : + next_rooted_id_++; + + auto const this_node = theContext()->getNode(); + bool const is_collection = true; + + // Create the new proxy with the `new_dist_id` + auto const proxy = VirtualProxyBuilder::createProxy( + new_id, this_node, is_collection, is_migratable, is_collective + ); + + vt_debug_print( + verbose, vrt_coll, + "makeCollectionProxy: node={}, new_dist_id={}, proxy={:x}\n", + this_node, new_id, proxy + ); + + return proxy; +} + template void CollectionManager::constructGroup(VirtualProxyType const& proxy) { /* @@ -2129,6 +2173,12 @@ IndexT CollectionManager::getRange(VirtualProxyType proxy) { return col_holder->bounds; } +template +bool CollectionManager::getDynamicMembership(VirtualProxyType proxy) { + auto col_holder = findColHolder(proxy); + return col_holder->has_dynamic_membership_; +} + template std::set CollectionManager::getLocalIndices( CollectionProxyWrapType proxy @@ -2241,32 +2291,42 @@ void CollectionManager::checkpointToFile( namespace detail { template -inline void restoreOffHomeElement( - CollectionManager::RestoreMigrateColMsg* msg, ColT* +inline void MigrateRequestHandler ( + CollectionManager::MigrateRequestMsg* msg, ColT* ) { - auto idx = msg->idx_; auto node = msg->to_node_; - auto proxy = msg->proxy_; - theCollection()->migrate(proxy(idx), node); + auto proxy_elm = msg->proxy_elm_; + theCollection()->migrate(proxy_elm, node); } } /* end namespace detail */ template -/*static*/ void CollectionManager::migrateToRestoreLocation( - RestoreMigrateMsg* msg +EpochType CollectionManager::requestMigrateDeferred( + VrtElmProxy proxy_elem, NodeType destination ) { - auto idx = msg->idx_; - auto node = msg->to_node_; - auto proxy = msg->proxy_; - if (proxy(idx).tryGetLocalPtr() != nullptr) { - theCollection()->migrate(proxy(idx), node); - } else { - proxy(idx).template send< - RestoreMigrateColMsg, detail::restoreOffHomeElement - >(node, idx, proxy); - } + auto ep = theTerm()->makeEpochRooted( + "Request element migration", term::UseDS{true} + ); + theMsg()->pushEpoch(ep); + + proxy_elem.template send< + MigrateRequestMsg, detail::MigrateRequestHandler + >(proxy_elem, destination); + + theMsg()->popEpoch(ep); + theTerm()->finishedEpoch(ep); + return ep; +} + +template +void CollectionManager::requestMigrate( + VrtElmProxy proxy_elem, NodeType destination +) { + auto ep = requestMigrateDeferred(proxy_elem, destination); + vt::runSchedulerThrough(ep); } + template void CollectionManager::restoreFromFileInPlace( CollectionProxyWrapType proxy, typename ColT::IndexType range, @@ -2292,25 +2352,14 @@ void CollectionManager::restoreFromFileInPlace( metadata_file_name ); + //Everyone shuffles any elements not where their data is runInEpochCollective([&]{ for (auto&& elm : directory->elements_) { auto idx = elm.idx_; auto file_name = elm.file_name_; if (proxy(idx).tryGetLocalPtr() == nullptr) { - auto mapped_node = getMappedNode(proxy, idx); - vtAssertExpr(mapped_node != uninitialized_destination); - auto this_node = theContext()->getNode(); - - using MsgType = RestoreMigrateMsg; - auto msg = makeMessage(this_node, idx, proxy); - if (mapped_node != this_node) { - theMsg()->sendMsg>( - mapped_node, msg - ); - } else { - migrateToRestoreLocation(msg.get()); - } + requestMigrateDeferred(proxy(idx), theContext()->getNode()); } } }); diff --git a/src/vt/vrt/collection/param/construct_params.h b/src/vt/vrt/collection/param/construct_params.h index 579a188c53..c5e46691a2 100644 --- a/src/vt/vrt/collection/param/construct_params.h +++ b/src/vt/vrt/collection/param/construct_params.h @@ -142,6 +142,22 @@ struct ConstructParams { ); } +protected: + friend CollectionProxy; + /** + * \brief Specify the proxy bits to attempt to use for this collection, + * if no conflicts are found. Helpful when restoring from checkpoints. + * + * \param[in] in_proxy_bits_ the proxy bits to request + */ + ThisType&& proxyBits(VirtualProxyType in_proxy_bits){ + proxy_bits_ = in_proxy_bits; + migratable_ = VirtualProxyBuilder::isMigratable(in_proxy_bits); + return std::move(*this); + } + +public: + /** * \brief Specify the bounds for the collection. If it doesn't have dynamic * membership this whole range will be constructed. @@ -392,6 +408,17 @@ struct ConstructParams { "Must have valid bounds or exactly one bulk insert or use list insertion" ); } + if(proxy_bits_ != no_vrt_proxy){ + using Bits = VirtualProxyBuilder; + vtAssert( + Bits::isCollection(proxy_bits_) == true and + Bits::isMigratable(proxy_bits_) == migratable_ and + Bits::isRemote(proxy_bits_) == collective_ and + ( collective_ or + Bits::getVirtualNode(proxy_bits_) == theContext()->getNode()), + "Requested proxy bits must match requested collection properties" + ); + } } public: diff --git a/src/vt/vrt/proxy/collection_elm_proxy.h b/src/vt/vrt/proxy/collection_elm_proxy.h index 25aa587e96..53cb14807e 100644 --- a/src/vt/vrt/proxy/collection_elm_proxy.h +++ b/src/vt/vrt/proxy/collection_elm_proxy.h @@ -83,19 +83,22 @@ struct VrtElmProxy : ProxyCollectionElmTraits { other.elm_proxy_ == this->elm_proxy_; } - template - void serialize(SerializerT& s) { - ProxyCollectionElmTraits::serialize(s); - } + friend struct CollectionManager; template friend std::ostream& operator<<( std::ostream& os, VrtElmProxy const& vrt ); - friend struct CollectionManager; + template + void serialize(DefaultSerializer& s); + template + void serialize(CheckpointSerializer& s); + template + std::unique_ptr deserializeToElm(Ser& s); }; + template std::ostream& operator<<( std::ostream& os, VrtElmProxy const& vrt diff --git a/src/vt/vrt/proxy/collection_elm_proxy.impl.h b/src/vt/vrt/proxy/collection_elm_proxy.impl.h new file mode 100644 index 0000000000..35ced2c17a --- /dev/null +++ b/src/vt/vrt/proxy/collection_elm_proxy.impl.h @@ -0,0 +1,108 @@ +/* +//@HEADER +// ***************************************************************************** +// +// collection_proxy.impl.h +// DARMA/vt => Virtual Transport +// +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#if !defined INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_H +#define INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_H + +#include "vt/config.h" +#include "vt/vrt/collection/manager.h" +#include "vt/vrt/proxy/collection_elm_proxy.h" + +namespace vt { namespace vrt { namespace collection { + +//Standard serialize, just pass along to base. +template +template +void VrtElmProxy::serialize(DefaultSerializer& s) { + ProxyCollectionElmTraits::serialize(s); +} + +//Checkpoint serialize, actually serialize the element itself. +template +template +void VrtElmProxy::serialize(CheckpointSerializer& s) { + ProxyCollectionElmTraits::serialize(s); + + //Make sure proxies within the element don't also try recovering + auto elm_serializer = checkpoint::withoutTrait(s); + + auto local_elm_ptr = this->tryGetLocalPtr(); + if(local_elm_ptr != nullptr){ + local_elm_ptr | elm_serializer; + } else { + //The element is somewhere else so we'll need to request a migration to here. + vtAssert(!s.isUnpacking(), "Must serialize elements from the node they are at"); + + //Avoid delaying the serializer though, we want to enable asynchronous progress. + std::unique_ptr new_elm_ptr; + new_elm_ptr | elm_serializer; + + auto ep = theCollection()->requestMigrateDeferred(*this, theContext()->getNode()); + + theTerm()->addAction(ep, [*this, new_elm_ptr = std::move(new_elm_ptr)]{ + auto local_elm_ptr = *this.tryGetLocalPtr(); + assert(local_elm_ptr != nullptr); + local_elm_ptr = std::move(new_elm_ptr); + }); + } +} + +//Deserialize without placing values into the runtime, +//just return the element pointer. +template +template +std::unique_ptr +VrtElmProxy::deserializeToElm(Ser& s) { + //Still have to hit data in order. + ProxyCollectionElmTraits::serialize(s); + + //Make sure proxies within the element don't also try recovering + auto elm_serializer = checkpoint::withoutTrait(s); + + std::unique_ptr elm; + elm | elm_serializer; + return elm; +} + +}}} /* end namespace vt::vrt::collection */ + diff --git a/src/vt/vrt/proxy/collection_proxy.h b/src/vt/vrt/proxy/collection_proxy.h index 9e932e0bfc..8b16209d67 100644 --- a/src/vt/vrt/proxy/collection_proxy.h +++ b/src/vt/vrt/proxy/collection_proxy.h @@ -114,6 +114,19 @@ struct CollectionProxy : ProxyCollectionTraits { * This must be called on every process */ void setFocusedSubPhase(SubphaseType subphase); + + + //Serialize normally + template + using DefaultSerializer = vt::vrt::DefaultSerializer; + template + void serialize(typename DefaultSerializer::type& s); + + //Serialize for checkpoint/recovery + template + using CheckpointSerializer = vt::vrt::CheckpointSerializer; + template + void serialize(typename CheckpointSerializer::type& s); }; }}} /* end namespace vt::vrt::collection */ diff --git a/src/vt/vrt/proxy/collection_proxy.impl.h b/src/vt/vrt/proxy/collection_proxy.impl.h index d32162e00a..4702b99daa 100644 --- a/src/vt/vrt/proxy/collection_proxy.impl.h +++ b/src/vt/vrt/proxy/collection_proxy.impl.h @@ -106,6 +106,98 @@ CollectionProxy::setFocusedSubPhase(SubphaseType subphase) { balance::CollectionLBData::setFocusedSubPhase(this->getProxy(), subphase); } +template +template +void CollectionProxy::serialize( + typename DefaultSerializer::type& s +){ + ProxyCollectionTraits::serialize(s); +} + +template +template +void CollectionProxy::serialize( + typename CheckpointSerializer::type& s +){ + + //Save the proxy info we had before, to detect when + //we need to create the proxy while unpacking. + VirtualProxyType oldProxy = this->getProxy(); + vtAssert(oldProxy != no_vrt_proxy || !s.isPacking(), + "Proxy must be instantiated to checkpoint"); + + //Serialize parent class info, which handles proxy bits. + ProxyCollectionTraits::serialize(s); + VirtualProxyType proxy = this->getProxy(); + + std::string label; + if(!s.isUnpacking()) label = vt::theCollection()->getLabel(proxy); + s | label; + + if(s.isUnpacking() && oldProxy == proxy) { + vtAssert(label == vt::theCollection()->getLabel(proxy), + "Checkpointed proxy and deserialize target labels do not match!"); + } + + + std::set localElmSet; + if(!s.isUnpacking()) localElmSet = vt::theCollection()->getLocalIndices(*this); + s | localElmSet; + + IndexT bounds; + if(!s.isUnpacking()) bounds = vt::theCollection()->getRange(proxy); + s | bounds; + + bool isDynamic; + if(!s.isUnpacking()) + isDynamic = vt::theCollection()->getDynamicMembership(proxy); + s | isDynamic; + + //TODO: magistrate's virtualized serialization support may enable checkpointing + //mapper objects. Pre-registered functions could be doable as well. + + //TODO: chkpt location manager so we don't have to message back and forth so much? + //auto lm = theLocMan()->getCollectionLM(proxy.getProxy()); + + + + //If unpacking, we may need to make the collection to unpack into. + if(s.isUnpacking() && oldProxy != proxy){ + vtAssert(oldProxy == no_vrt_proxy, + "Checkpointed proxy and deserialize target do not match!"); + + //The checkpointed proxy doesn't exist, we need to create it + //First get all of the element unique_ptrs to hand off to the constructor + std::vector> localElms; + for(auto& idx : localElmSet){ + //Tell elements not to try verifying placement/migrating. + localElms.emplace_back(std::make_tuple(idx, + std::move(ElmProxyType::deserializeToElm(s)))); + } + + bool is_collective = !VirtualProxyBuilder::isRemote(proxy); + vtAssert(is_collective, "VT requires collective collections to enable \ + list inserting and custom constructing, cannot recover a rooted \ + collection."); + + vt::makeCollection(label) + .bounds(bounds) + .dynamicMembership(isDynamic) + .proxyBits(proxy) + .listInsertHere(std::move(localElms)) + .deferWithEpoch([=](VirtualProxyType assigned_proxy){ + vtAssert(assigned_proxy == proxy, "Proxy must be assigned as expected (for now)"); + }); + } else { + //We're serializing/deserializing in-place + //Serialize each element itself, the elements will handle + //requesting migrations as needed. + for(auto& elm_idx : localElmSet){ + s | (*this)(elm_idx); + } + } +} + }}} /* end namespace vt::vrt::collection */ #endif /*INCLUDED_VT_VRT_PROXY_COLLECTION_PROXY_IMPL_H*/ diff --git a/src/vt/vrt/vrt_common.h b/src/vt/vrt/vrt_common.h index add9cc019d..af81f323f6 100644 --- a/src/vt/vrt/vrt_common.h +++ b/src/vt/vrt/vrt_common.h @@ -44,12 +44,21 @@ #if !defined INCLUDED_VT_VRT_VRT_COMMON_H #define INCLUDED_VT_VRT_VRT_COMMON_H +#include + #include "vt/config.h" namespace vt { namespace vrt { static constexpr NodeType const default_collection_reduce_root_node = 0; +struct CheckpointTrait {}; +template +using CheckpointSerializer = typename checkpoint::with_traits; + +template +using DefaultSerializer = typename checkpoint::without_traits; + }} /* end namespace vt::vrt */ #endif /*INCLUDED_VT_VRT_VRT_COMMON_H*/ diff --git a/tests/unit/lb/test_lb_data_comm.cc b/tests/unit/lb/test_lb_data_comm.cc index 42c17f35e0..ec8d3ef7da 100644 --- a/tests/unit/lb/test_lb_data_comm.cc +++ b/tests/unit/lb/test_lb_data_comm.cc @@ -757,7 +757,7 @@ TEST_F(TestLBDataComm, test_lb_data_comm_handler_to_handler_send) { EXPECT_TRUE(found); // Suppress warnings about that method being "declared but never referenced" from Intel icpc and Nvidia nvcc - (void)&ReduceMsg::serialize; + (void)&ReduceMsg::serialize>; } } /* end anon namespace */ From ffef26255cf7c3b8fab3ef652cc36a65727d9379 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Tue, 16 May 2023 08:50:17 -0700 Subject: [PATCH 2/4] Checkpoint bugfixes, more asynchrony in recovery, support checkpointing objgroups --- src/vt/elm/elm_lb_data.h | 4 +- src/vt/objgroup/proxy/proxy_objgroup.h | 22 +++++ .../vrt/collection/collection_builder.impl.h | 4 +- src/vt/vrt/collection/manager.h | 19 +---- src/vt/vrt/collection/manager.impl.h | 14 ++-- src/vt/vrt/collection/types/migratable.impl.h | 1 + src/vt/vrt/proxy/collection_elm_proxy.h | 17 ++-- src/vt/vrt/proxy/collection_elm_proxy.impl.h | 81 ++++++++++++------- src/vt/vrt/proxy/collection_proxy.h | 10 +-- src/vt/vrt/proxy/collection_proxy.impl.h | 46 +++++------ src/vt/vrt/vrt_common.h | 6 +- tests/unit/lb/test_lb_data_comm.cc | 2 +- 12 files changed, 118 insertions(+), 108 deletions(-) diff --git a/src/vt/elm/elm_lb_data.h b/src/vt/elm/elm_lb_data.h index 1ebc07c6c3..51a02b33e0 100644 --- a/src/vt/elm/elm_lb_data.h +++ b/src/vt/elm/elm_lb_data.h @@ -47,6 +47,7 @@ #include "vt/elm/elm_id.h" #include "vt/elm/elm_comm.h" #include "vt/timing/timing.h" +#include "vt/vrt/vrt_common.h" namespace vt { namespace vrt { namespace collection { namespace balance { @@ -103,7 +104,8 @@ struct ElementLBData { void serialize(Serializer& s) { s | cur_time_started_; s | cur_time_; - s | cur_phase_; + if(!s.hasTraits(vt::vrt::CheckpointInternalTrait())) + s | cur_phase_; s | phase_timings_; s | phase_comm_; s | cur_subphase_; diff --git a/src/vt/objgroup/proxy/proxy_objgroup.h b/src/vt/objgroup/proxy/proxy_objgroup.h index b235bdcf95..731716677e 100644 --- a/src/vt/objgroup/proxy/proxy_objgroup.h +++ b/src/vt/objgroup/proxy/proxy_objgroup.h @@ -60,6 +60,8 @@ #include "vt/messaging/pending_send.h" #include "vt/utils/fntraits/fntraits.h" +#include "vt/vrt/vrt_common.h" + namespace vt { namespace objgroup { namespace proxy { /** @@ -365,7 +367,27 @@ struct Proxy { template void serialize(Serializer& s) { + auto old_proxy = proxy_; s | proxy_; + + using vt::vrt::CheckpointTrait; + using vt::vrt::CheckpointInternalTrait; + + if constexpr(s.hasTraits(CheckpointTrait())){ + vtAssert(old_proxy != no_obj_group, "ObjGroups must be pre-instantiated to be checkpointed or restored"); + vtAssert(old_proxy == proxy_, "The proxy ID bits of this ObjGroup do not match the ID found in the checkpoint!" \ + " Varying IDs is not yet supported."); + auto objPtr = get(); + + bool null = objPtr == nullptr; + s | null; + + if(!null){ + auto newS = s.withoutTraits(CheckpointTrait()) + .withTraits(CheckpointInternalTrait()); + newS | *objPtr; + } + } } private: diff --git a/src/vt/vrt/collection/collection_builder.impl.h b/src/vt/vrt/collection/collection_builder.impl.h index dda1388632..d15f38c47c 100644 --- a/src/vt/vrt/collection/collection_builder.impl.h +++ b/src/vt/vrt/collection/collection_builder.impl.h @@ -193,10 +193,10 @@ void CollectionManager::makeCollectionImpl(param::ConstructParams& po) { makeCollectionElement(proxy, idx, this_node, std::move(c)); } - if (global_constructed_elms != 0) { + //if (global_constructed_elms != 0) { // Construct a underlying group for the collection constructGroup(proxy); - } + //} } template diff --git a/src/vt/vrt/collection/manager.h b/src/vt/vrt/collection/manager.h index 2633c13b1c..44f1404874 100644 --- a/src/vt/vrt/collection/manager.h +++ b/src/vt/vrt/collection/manager.h @@ -62,6 +62,7 @@ #include "vt/vrt/collection/dispatch/registry.h" #include "vt/vrt/collection/listener/listen_events.h" #include "vt/vrt/proxy/collection_proxy.h" +#include "vt/vrt/proxy/collection_elm_proxy.h" #include "vt/topos/mapping/mapping_headers.h" #include "vt/messaging/message.h" #include "vt/messaging/pending_send.h" @@ -1640,23 +1641,6 @@ struct CollectionManager * * \brief Migrate local element, potentially requested by remote location */ - template < - typename ColT, - typename MsgT = vt::Message - > - struct MigrateRequestMsg : MsgT { - using ElmT = VrtElmProxy; - - MigrateRequestMsg() = default; - MigrateRequestMsg( - ElmT proxy_elm, NodeType to_node - ) : to_node_(to_node), - proxy_elm_(proxy_elm) - { } - - NodeType to_node_ = uninitialized_destination; - ElmT proxy_elm_; - }; /** * \brief Migrate a remote proxy element to a node, by messaging that @@ -1834,6 +1818,7 @@ struct CollectionManager #include "vt/vrt/collection/types/base.impl.h" #include "vt/rdmahandle/manager.collection.impl.h" #include "vt/vrt/proxy/collection_proxy.impl.h" +#include "vt/vrt/proxy/collection_elm_proxy.impl.h" #include "vt/context/runnable_context/lb_data.impl.h" #include "vt/context/runnable_context/collection.impl.h" diff --git a/src/vt/vrt/collection/manager.impl.h b/src/vt/vrt/collection/manager.impl.h index c0ca804bdc..cf5d23223c 100644 --- a/src/vt/vrt/collection/manager.impl.h +++ b/src/vt/vrt/collection/manager.impl.h @@ -2217,26 +2217,24 @@ void CollectionManager::checkpointToFile( namespace detail { template inline void MigrateRequestHandler ( - CollectionManager::MigrateRequestMsg* msg, ColT* + ColT*, VrtElmProxy proxy_elm, NodeType dest ) { - auto node = msg->to_node_; - auto proxy_elm = msg->proxy_elm_; - theCollection()->migrate(proxy_elm, node); + theCollection()->migrate(proxy_elm, dest); } } /* end namespace detail */ template EpochType CollectionManager::requestMigrateDeferred( - VrtElmProxy proxy_elem, NodeType destination + VrtElmProxy proxy_elm, NodeType destination ) { auto ep = theTerm()->makeEpochRooted( "Request element migration", term::UseDS{true} ); theMsg()->pushEpoch(ep); - proxy_elem.template send< - MigrateRequestMsg, detail::MigrateRequestHandler - >(proxy_elem, destination); + proxy_elm.template send>( + proxy_elm, destination + ); theMsg()->popEpoch(ep); theTerm()->finishedEpoch(ep); diff --git a/src/vt/vrt/collection/types/migratable.impl.h b/src/vt/vrt/collection/types/migratable.impl.h index e6016aef59..01a119c2af 100644 --- a/src/vt/vrt/collection/types/migratable.impl.h +++ b/src/vt/vrt/collection/types/migratable.impl.h @@ -45,6 +45,7 @@ #define INCLUDED_VT_VRT_COLLECTION_TYPES_MIGRATABLE_IMPL_H #include "vt/config.h" +#include "vt/vrt/vrt_common.h" #include "vt/vrt/collection/types/migratable.h" namespace vt { namespace vrt { namespace collection { diff --git a/src/vt/vrt/proxy/collection_elm_proxy.h b/src/vt/vrt/proxy/collection_elm_proxy.h index 53cb14807e..bd8011f4ac 100644 --- a/src/vt/vrt/proxy/collection_elm_proxy.h +++ b/src/vt/vrt/proxy/collection_elm_proxy.h @@ -50,6 +50,7 @@ #include "vt/vrt/collection/send/sendable.h" #include "vt/vrt/collection/insert/insertable.h" #include "vt/vrt/proxy/base_elm_proxy.h" +#include "vt/vrt/vrt_common.h" #include @@ -90,12 +91,16 @@ struct VrtElmProxy : ProxyCollectionElmTraits { std::ostream& os, VrtElmProxy const& vrt ); - template - void serialize(DefaultSerializer& s); - template - void serialize(CheckpointSerializer& s); - template - std::unique_ptr deserializeToElm(Ser& s); + + + template * = nullptr> + void serialize(SerT& s); + + template * = nullptr> + void serialize(SerT& s); + + template + std::unique_ptr deserializeToElm(SerT& s); }; diff --git a/src/vt/vrt/proxy/collection_elm_proxy.impl.h b/src/vt/vrt/proxy/collection_elm_proxy.impl.h index 35ced2c17a..775ab73dc3 100644 --- a/src/vt/vrt/proxy/collection_elm_proxy.impl.h +++ b/src/vt/vrt/proxy/collection_elm_proxy.impl.h @@ -41,8 +41,8 @@ //@HEADER */ -#if !defined INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_H -#define INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_H +#if !defined INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_IMPL_H +#define INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_IMPL_H #include "vt/config.h" #include "vt/vrt/collection/manager.h" @@ -52,57 +52,76 @@ namespace vt { namespace vrt { namespace collection { //Standard serialize, just pass along to base. template -template -void VrtElmProxy::serialize(DefaultSerializer& s) { +template *> +void VrtElmProxy::serialize(SerT& s) { ProxyCollectionElmTraits::serialize(s); } //Checkpoint serialize, actually serialize the element itself. template -template -void VrtElmProxy::serialize(CheckpointSerializer& s) { +template *> +void VrtElmProxy::serialize(SerT& s) { ProxyCollectionElmTraits::serialize(s); - //Make sure proxies within the element don't also try recovering - auto elm_serializer = checkpoint::withoutTrait(s); - auto local_elm_ptr = this->tryGetLocalPtr(); - if(local_elm_ptr != nullptr){ - local_elm_ptr | elm_serializer; - } else { - //The element is somewhere else so we'll need to request a migration to here. - vtAssert(!s.isUnpacking(), "Must serialize elements from the node they are at"); - - //Avoid delaying the serializer though, we want to enable asynchronous progress. - std::unique_ptr new_elm_ptr; - new_elm_ptr | elm_serializer; + vtAssert(local_elm_ptr != nullptr || s.isUnpacking(), "Must serialize/size elements from the node they are at"); + + //Traits for nested serialize/deserialize + using CheckpointlessTraits = typename SerT::Traits::without::with; + + //Weird nested serialization to enable asynchronous deserializing w/o changing semantics. + if(!(s.isPacking() || s.isUnpacking())){ + int size = checkpoint::getSize(*local_elm_ptr); + s.contiguousBytes(nullptr, 1, size); + } else if(s.isPacking()){ + auto serialized_elm = checkpoint::serialize(*local_elm_ptr); + int size = serialized_elm->getSize(); + s | size; + s.contiguousBytes(serialized_elm->getBuffer(), 1, size); + } else if(s.isUnpacking()){ + int size; + s | size; - auto ep = theCollection()->requestMigrateDeferred(*this, theContext()->getNode()); + auto buf = std::make_unique(size); + s.contiguousBytes(buf.get(), 1, size); + + if(local_elm_ptr != nullptr){ + checkpoint::deserializeInPlace(buf.get(), local_elm_ptr); + } else { + //The element is somewhere else so we'll need to request a migration to here. + auto ep = theCollection()->requestMigrateDeferred(*this, theContext()->getNode()); - theTerm()->addAction(ep, [*this, new_elm_ptr = std::move(new_elm_ptr)]{ - auto local_elm_ptr = *this.tryGetLocalPtr(); - assert(local_elm_ptr != nullptr); - local_elm_ptr = std::move(new_elm_ptr); - }); + theTerm()->addActionUnique(ep, std::move([elm_proxy = *this, buffer = std::move(buf)]{ + auto elm_ptr = elm_proxy.tryGetLocalPtr(); + assert(elm_ptr != nullptr); + checkpoint::deserializeInPlace(buffer.get(), elm_ptr); + })); + } } } //Deserialize without placing values into the runtime, //just return the element pointer. template -template +template std::unique_ptr -VrtElmProxy::deserializeToElm(Ser& s) { - //Still have to hit data in order. +VrtElmProxy::deserializeToElm(SerT& s) { + //Still have to hit data in the same order. ProxyCollectionElmTraits::serialize(s); - //Make sure proxies within the element don't also try recovering - auto elm_serializer = checkpoint::withoutTrait(s); + int size; + s | size; + auto buf = std::make_unique(size); + s.contiguousBytes(buf.get(), 1, size); + + std::unique_ptr elm(new ColT()); + + using CheckpointlessTraits = typename SerT::Traits::without::with; + checkpoint::deserializeInPlace(buf.get(), elm.get()); - std::unique_ptr elm; - elm | elm_serializer; return elm; } }}} /* end namespace vt::vrt::collection */ +#endif /*INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_IMPL_H*/ diff --git a/src/vt/vrt/proxy/collection_proxy.h b/src/vt/vrt/proxy/collection_proxy.h index 8b16209d67..5f3efce6bd 100644 --- a/src/vt/vrt/proxy/collection_proxy.h +++ b/src/vt/vrt/proxy/collection_proxy.h @@ -118,15 +118,7 @@ struct CollectionProxy : ProxyCollectionTraits { //Serialize normally template - using DefaultSerializer = vt::vrt::DefaultSerializer; - template - void serialize(typename DefaultSerializer::type& s); - - //Serialize for checkpoint/recovery - template - using CheckpointSerializer = vt::vrt::CheckpointSerializer; - template - void serialize(typename CheckpointSerializer::type& s); + void serialize(SerializerT& s); }; }}} /* end namespace vt::vrt::collection */ diff --git a/src/vt/vrt/proxy/collection_proxy.impl.h b/src/vt/vrt/proxy/collection_proxy.impl.h index 4702b99daa..7022ea636e 100644 --- a/src/vt/vrt/proxy/collection_proxy.impl.h +++ b/src/vt/vrt/proxy/collection_proxy.impl.h @@ -107,27 +107,23 @@ CollectionProxy::setFocusedSubPhase(SubphaseType subphase) { } template -template +template void CollectionProxy::serialize( - typename DefaultSerializer::type& s + SerializerT& s ){ - ProxyCollectionTraits::serialize(s); -} - -template -template -void CollectionProxy::serialize( - typename CheckpointSerializer::type& s -){ - //Save the proxy info we had before, to detect when - //we need to create the proxy while unpacking. + //we need to create the proxy while unpacking a checkpoint. VirtualProxyType oldProxy = this->getProxy(); - vtAssert(oldProxy != no_vrt_proxy || !s.isPacking(), - "Proxy must be instantiated to checkpoint"); - //Serialize parent class info, which handles proxy bits. ProxyCollectionTraits::serialize(s); + + //If not a checkpoint, we only serialize our reference info. + if constexpr( !s.hasTraits(CheckpointTrait()) ) { + return; + } + + vtAssert(oldProxy != no_vrt_proxy || !s.isPacking(), + "Proxy must be instantiated to checkpoint"); VirtualProxyType proxy = this->getProxy(); std::string label; @@ -159,8 +155,6 @@ void CollectionProxy::serialize( //TODO: chkpt location manager so we don't have to message back and forth so much? //auto lm = theLocMan()->getCollectionLM(proxy.getProxy()); - - //If unpacking, we may need to make the collection to unpack into. if(s.isUnpacking() && oldProxy != proxy){ vtAssert(oldProxy == no_vrt_proxy, @@ -168,32 +162,28 @@ void CollectionProxy::serialize( //The checkpointed proxy doesn't exist, we need to create it //First get all of the element unique_ptrs to hand off to the constructor - std::vector> localElms; + std::vector>> localElms; for(auto& idx : localElmSet){ //Tell elements not to try verifying placement/migrating. localElms.emplace_back(std::make_tuple(idx, - std::move(ElmProxyType::deserializeToElm(s)))); + std::move(ElmProxyType().deserializeToElm(s)))); } - bool is_collective = !VirtualProxyBuilder::isRemote(proxy); - vtAssert(is_collective, "VT requires collective collections to enable \ - list inserting and custom constructing, cannot recover a rooted \ - collection."); - vt::makeCollection(label) .bounds(bounds) .dynamicMembership(isDynamic) .proxyBits(proxy) .listInsertHere(std::move(localElms)) - .deferWithEpoch([=](VirtualProxyType assigned_proxy){ - vtAssert(assigned_proxy == proxy, "Proxy must be assigned as expected (for now)"); + .deferWithEpoch([=](CollectionProxy assigned_proxy){ + vtAssert(assigned_proxy.getProxy() == proxy, "Proxy must be assigned as expected (for now)"); }); } else { - //We're serializing/deserializing in-place + //We're serializing or in-place deserializing //Serialize each element itself, the elements will handle //requesting migrations as needed. for(auto& elm_idx : localElmSet){ - s | (*this)(elm_idx); + auto elm = (*this)(elm_idx); + s | elm; } } } diff --git a/src/vt/vrt/vrt_common.h b/src/vt/vrt/vrt_common.h index af81f323f6..e7ef9f3ecb 100644 --- a/src/vt/vrt/vrt_common.h +++ b/src/vt/vrt/vrt_common.h @@ -53,11 +53,7 @@ namespace vt { namespace vrt { static constexpr NodeType const default_collection_reduce_root_node = 0; struct CheckpointTrait {}; -template -using CheckpointSerializer = typename checkpoint::with_traits; - -template -using DefaultSerializer = typename checkpoint::without_traits; +struct CheckpointInternalTrait {}; }} /* end namespace vt::vrt */ diff --git a/tests/unit/lb/test_lb_data_comm.cc b/tests/unit/lb/test_lb_data_comm.cc index 20062eba72..9fa555fadf 100644 --- a/tests/unit/lb/test_lb_data_comm.cc +++ b/tests/unit/lb/test_lb_data_comm.cc @@ -759,7 +759,7 @@ TEST_F(TestLBDataComm, test_lb_data_comm_handler_to_handler_send) { EXPECT_TRUE(found); // Suppress warnings about that method being "declared but never referenced" from Intel icpc and Nvidia nvcc - (void)&ReduceMsg::serialize>; + (void)&ReduceMsg::serialize; } } /* end anon namespace */ From 6f427032cc6c150afd0e321ed83b9503d981088e Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 5 Jun 2023 17:07:52 -0700 Subject: [PATCH 3/4] Add some access utilities for gathering proxy info --- src/vt/objgroup/manager.h | 10 ++++++++++ src/vt/objgroup/manager.impl.h | 5 +++++ src/vt/vrt/proxy/collection_elm_proxy.h | 5 ++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/vt/objgroup/manager.h b/src/vt/objgroup/manager.h index 0080182e5e..0a28371845 100644 --- a/src/vt/objgroup/manager.h +++ b/src/vt/objgroup/manager.h @@ -369,6 +369,16 @@ struct ObjGroupManager : runtime::component::Component { template ProxyElmType proxyElm(ObjT* obj); + /** + * \brief Get the group that a proxy element is a member of. + * + * \param[in] proxy_elm an element of an object group + * + * \return de-indexed proxy to the object group + */ + template + ProxyType proxyGroup(ProxyElmType proxy_elm); + /** * \brief Get object group label * diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index 936d2bc40b..edfa7f4c34 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -332,6 +332,11 @@ typename ObjGroupManager::ProxyElmType ObjGroupManager::proxyElm(ObjT* obj return getProxy(obj).operator()(theContext()->getNode()); } +template +typename ObjGroupManager::ProxyType ObjGroupManager::proxyGroup(ProxyElmType proxy_elm) { + return ProxyType(proxy_elm.getProxy()); +} + template std::string ObjGroupManager::getLabel(ObjGroupManager::ProxyType proxy) const { auto const proxy_bits = proxy.getProxy(); diff --git a/src/vt/vrt/proxy/collection_elm_proxy.h b/src/vt/vrt/proxy/collection_elm_proxy.h index bd8011f4ac..e4b475b49d 100644 --- a/src/vt/vrt/proxy/collection_elm_proxy.h +++ b/src/vt/vrt/proxy/collection_elm_proxy.h @@ -46,6 +46,7 @@ #include "vt/config.h" #include "vt/vrt/collection/proxy_traits/proxy_elm_traits.h" +#include "vt/vrt/collection/proxy_builder/elm_proxy_builder.h" #include "vt/vrt/collection/manager.fwd.h" #include "vt/vrt/collection/send/sendable.h" #include "vt/vrt/collection/insert/insertable.h" @@ -91,7 +92,9 @@ struct VrtElmProxy : ProxyCollectionElmTraits { std::ostream& os, VrtElmProxy const& vrt ); - + IndexT getIndex(){ + return this->getElementProxy().getIndex(); + } template * = nullptr> void serialize(SerT& s); From e8057e936d9ac05c348deacb73bb72febc9c95f0 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Thu, 15 Feb 2024 14:49:26 -0800 Subject: [PATCH 4/4] Quickfix: system headers --- src/vt/configs/error/stack_out.h | 1 + src/vt/utils/compress/decompressor_base.h | 1 + 2 files changed, 2 insertions(+) diff --git a/src/vt/configs/error/stack_out.h b/src/vt/configs/error/stack_out.h index ffade6e2b2..f4140c2b8a 100644 --- a/src/vt/configs/error/stack_out.h +++ b/src/vt/configs/error/stack_out.h @@ -48,6 +48,7 @@ #include #include #include +#include namespace vt { namespace debug { namespace stack { diff --git a/src/vt/utils/compress/decompressor_base.h b/src/vt/utils/compress/decompressor_base.h index c0d76506b1..5f400ca678 100644 --- a/src/vt/utils/compress/decompressor_base.h +++ b/src/vt/utils/compress/decompressor_base.h @@ -46,6 +46,7 @@ #include #include +#include namespace vt { namespace util { namespace compress {