Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resilience features #2149

Draft
wants to merge 6 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/vt/configs/error/stack_out.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include <string>
#include <tuple>
#include <vector>
#include <cstdint>

namespace vt { namespace debug { namespace stack {

Expand Down
4 changes: 3 additions & 1 deletion src/vt/elm/elm_lb_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "vt/elm/elm_id.h"
#include "vt/elm/elm_comm.h"
#include "vt/timing/timing.h"
#include "vt/vrt/vrt_common.h"

namespace vt { namespace vrt { namespace collection { namespace balance {

Expand Down Expand Up @@ -103,7 +104,8 @@ struct ElementLBData {
void serialize(Serializer& s) {
s | cur_time_started_;
s | cur_time_;
s | cur_phase_;
if(!s.hasTraits(vt::vrt::CheckpointInternalTrait()))
s | cur_phase_;
s | phase_timings_;
s | phase_comm_;
s | cur_subphase_;
Expand Down
2 changes: 1 addition & 1 deletion src/vt/messaging/message/smart_ptr.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ struct MsgSharedPtr final {
template <
typename SerializerT,
typename = std::enable_if_t<
std::is_same<SerializerT, checkpoint::Footprinter>::value
checkpoint::is_footprinter<SerializerT>::value
>
>
void serialize(SerializerT& s) {
Expand Down
10 changes: 10 additions & 0 deletions src/vt/objgroup/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,16 @@ struct ObjGroupManager : runtime::component::Component<ObjGroupManager> {
template <typename ObjT>
ProxyElmType<ObjT> proxyElm(ObjT* obj);

/**
* \brief Get the group that a proxy element is a member of.
*
* The returned group proxy is constructed from the proxy bits held by
* \c proxy_elm, i.e. the element's node index is dropped.
*
* \param[in] proxy_elm an element of an object group
*
* \return de-indexed proxy to the object group
*/
template <typename ObjT>
ProxyType<ObjT> proxyGroup(ProxyElmType<ObjT> proxy_elm);

/**
* \brief Get object group label
*
Expand Down
5 changes: 5 additions & 0 deletions src/vt/objgroup/manager.impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,11 @@ typename ObjGroupManager::ProxyElmType<ObjT> ObjGroupManager::proxyElm(ObjT* obj
return getProxy<ObjT>(obj).operator()(theContext()->getNode());
}

/**
 * \brief Build the group-level proxy that a proxy element belongs to
 *
 * \param[in] proxy_elm an element of an object group
 *
 * \return a group proxy constructed from the element's proxy bits
 */
template <typename ObjT>
typename ObjGroupManager::ProxyType<ObjT>
ObjGroupManager::proxyGroup(ProxyElmType<ObjT> proxy_elm) {
  // The element carries the bits of its owning group; wrap them back up as a
  // group proxy.
  auto const group_bits = proxy_elm.getProxy();
  return ProxyType<ObjT>(group_bits);
}

template <typename ObjT>
std::string ObjGroupManager::getLabel(ObjGroupManager::ProxyType<ObjT> proxy) const {
auto const proxy_bits = proxy.getProxy();
Expand Down
22 changes: 22 additions & 0 deletions src/vt/objgroup/proxy/proxy_objgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
#include "vt/messaging/pending_send.h"
#include "vt/utils/fntraits/fntraits.h"

#include "vt/vrt/vrt_common.h"

namespace vt { namespace objgroup { namespace proxy {

/**
Expand Down Expand Up @@ -428,7 +430,27 @@ struct Proxy {

/**
 * \brief Serialize the proxy bits and, for serializers carrying
 * \c CheckpointTrait, the object returned by \c get()
 *
 * The proxy bits are always serialized. When the serializer reports
 * \c CheckpointTrait, the bits held before serialization must be a valid
 * objgroup ID and must equal the bits after serialization; a null flag is
 * then serialized, followed by the object itself (if non-null) through a
 * serializer with \c CheckpointTrait removed and \c CheckpointInternalTrait
 * added.
 *
 * \param[in] s the serializer
 */
template <typename Serializer>
void serialize(Serializer& s) {
  using vt::vrt::CheckpointInternalTrait;
  using vt::vrt::CheckpointTrait;

  // Snapshot the bits before the serializer touches them so they can be
  // compared against whatever ends up in proxy_ afterwards.
  auto old_proxy = proxy_;
  s | proxy_;

  if constexpr(s.hasTraits(CheckpointTrait())){
    vtAssert(
      old_proxy != no_obj_group,
      "ObjGroups must be pre-instantiated to be checkpointed or restored"
    );
    vtAssert(
      old_proxy == proxy_,
      "The proxy ID bits of this ObjGroup do not match the ID found in the checkpoint!"
      " Varying IDs is not yet supported."
    );

    auto obj = get();

    // Record whether there is an object to follow in the stream.
    bool is_null = (obj == nullptr);
    s | is_null;

    if (is_null) {
      return;
    }

    // Swap the outer checkpoint trait for the internal one before
    // serializing the object itself.
    auto internal_s = s.withoutTraits(CheckpointTrait())
                       .withTraits(CheckpointInternalTrait());
    internal_s | *obj;
  }
}

private:
Expand Down
2 changes: 1 addition & 1 deletion src/vt/serialization/sizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ struct MsgSizer<
> {
/**
 * \brief Compute the serialized size of a message
 *
 * \param[in] msg pointer to the message to size; dereferenced unconditionally
 *
 * \return the size reported by \c ::checkpoint::getSize for the message
 */
static std::size_t get(MsgT* msg) {
  auto& msg_ref = *msg;
  // Single return: the message type is deduced from the argument. The
  // previous explicit-template-argument call was left behind as a second,
  // unreachable return statement.
  return ::checkpoint::getSize(msg_ref);
}
};

Expand Down
1 change: 1 addition & 0 deletions src/vt/utils/compress/decompressor_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

#include <string>
#include <cstdlib>
#include <cstdint>

namespace vt { namespace util { namespace compress {

Expand Down
6 changes: 3 additions & 3 deletions src/vt/vrt/collection/collection_builder.impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ std::tuple<EpochType, VirtualProxyType> CollectionManager::makeCollection(
auto const is_collective = po.collective_;

// Generate a new proxy for this new collection
auto const proxy_bits = makeCollectionProxy(is_collective, is_migratable);
auto const proxy_bits = makeCollectionProxy<typename ColT::IndexType>(is_collective, is_migratable, po.proxy_bits_);
po.proxy_bits_ = proxy_bits;

if (not is_collective) {
Expand Down Expand Up @@ -193,10 +193,10 @@ void CollectionManager::makeCollectionImpl(param::ConstructParams<ColT>& po) {
makeCollectionElement<ColT>(proxy, idx, this_node, std::move(c));
}

if (global_constructed_elms != 0) {
//if (global_constructed_elms != 0) {
// Construct an underlying group for the collection
constructGroup<ColT>(proxy);
}
//}
}

template <typename ColT, typename Callable>
Expand Down
24 changes: 0 additions & 24 deletions src/vt/vrt/collection/manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,30 +95,6 @@ void CollectionManager::schedule(ActionType action) {
theSched()->enqueue(action);
}

/**
 * \brief Allocate the next virtual ID and build collection proxy bits from it
 *
 * \param[in] is_collective whether the collection is collective; selects
 * which ID counter is consumed
 * \param[in] is_migratable whether the collection is migratable
 *
 * \return the newly created collection proxy bits
 */
VirtualProxyType CollectionManager::makeCollectionProxy(
  bool is_collective, bool is_migratable
) {
  // Collective and rooted collections draw from separate counters; whichever
  // one is used is post-incremented.
  VirtualIDType new_id;
  if (is_collective) {
    new_id = next_collective_id_++;
  } else {
    new_id = next_rooted_id_++;
  }

  auto const this_node = theContext()->getNode();

  // Create the new proxy from `new_id` on this node
  auto const proxy = VirtualProxyBuilder::createProxy(
    new_id, this_node, /*is_collection=*/true, is_migratable, is_collective
  );

  vt_debug_print(
    verbose, vrt_coll,
    "makeCollectionProxy: node={}, new_dist_id={}, proxy={:x}\n",
    this_node, new_id, proxy
  );

  return proxy;
}

/**
* \brief Record the stamp value delivered by a \c CollectionStampMsg
*
* Stores the value returned by \c msg->getVal() in the \c reduce_stamp_ entry
* keyed by \c msg->proxy_ on the object returned by \c theCollection().
*
* \param[in] msg the message carrying the proxy and the stamp value
*/
/*static*/ void CollectionManager::computeReduceStamp(CollectionStampMsg* msg) {
theCollection()->reduce_stamp_[msg->proxy_] = msg->getVal();
}
Expand Down
45 changes: 37 additions & 8 deletions src/vt/vrt/collection/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
#include "vt/vrt/collection/dispatch/registry.h"
#include "vt/vrt/collection/listener/listen_events.h"
#include "vt/vrt/proxy/collection_proxy.h"
#include "vt/vrt/proxy/collection_elm_proxy.h"
#include "vt/topos/mapping/mapping_headers.h"
#include "vt/messaging/message.h"
#include "vt/messaging/pending_send.h"
Expand Down Expand Up @@ -352,10 +353,14 @@ struct CollectionManager
*
* \param[in] is_collective whether the collection is collective
* \param[in] is_migratable whether the collection is migratable
* \param[in] request_match an input proxy which we would like to use, if
* there are no existing conflicts. no_vrt_proxy indicates no
* request.
*
* \return the collection proxy bits
*/
VirtualProxyType makeCollectionProxy(bool is_collective, bool is_migratable);
template<typename IndexT>
VirtualProxyType makeCollectionProxy(bool is_collective, bool is_migratable, VirtualProxyType request_match);

/**
* \brief Query the current index context of the running handler
Expand Down Expand Up @@ -1550,6 +1555,16 @@ struct CollectionManager
template <typename ColT, typename IndexT = typename ColT::IndexType>
IndexT getRange(VirtualProxyType proxy);

/**
* \brief Get whether the collection has dynamic membership
*
* \param[in] proxy the proxy of the collection
*
* \return the dynamic membership state
*/
template <typename ColT, typename IndexT = typename ColT::IndexType>
bool getDynamicMembership(VirtualProxyType proxy);

/**
* \brief Get the local indices that are currently on this node
*
Expand Down Expand Up @@ -1622,18 +1637,31 @@ struct CollectionManager
);

/**
* \internal \brief Migrate element to restore location from checkpoint
* \internal \struct MigrateRequestMsg
*
* \param[in] node the node
* \param[in] idx the element index
* \param[in] proxy the collection proxy
* \brief Migrate local element, potentially requested by remote location
*/

/**
* \brief Migrate a remote proxy element to a node, by messaging that
* node to initiate a migration. Immediately returns a rooted epoch
* containing the request message.
*
* \param[in] proxy_elem the collection element to migrate
* \param[in] destination the node the element should be migrated to
*
* \return the rooted epoch containing the request message
*/
template <typename ColT>
EpochType requestMigrateDeferred(
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elem, NodeType destination
);

/**
* \brief Migrate a remote proxy element to a node, by messaging that
* node to initiate a migration. Returns after migration is complete.
*/
template <typename ColT>
static void migrateToRestoreLocation(
NodeType node, typename ColT::IndexType idx,
CollectionProxyWrapType<ColT> proxy
void requestMigrate(
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elem, NodeType destination
);


/**
* \brief Restore the collection (collective) from file on top of an existing
* collection. Migrates collection elements to the rank saved from the
Expand Down Expand Up @@ -1790,6 +1818,7 @@ struct CollectionManager
#include "vt/vrt/collection/types/base.impl.h"
#include "vt/rdmahandle/manager.collection.impl.h"
#include "vt/vrt/proxy/collection_proxy.impl.h"
#include "vt/vrt/proxy/collection_elm_proxy.impl.h"
#include "vt/context/runnable_context/lb_data.impl.h"
#include "vt/context/runnable_context/collection.impl.h"

Expand Down
Loading