Add --vt_lb_data_retention: minimum number of historical LB data phases to keep.

Summary of what this patch does:
  * AppConfig gains `uint32_t vt_lb_data_retention` (default 0), serialized
    with the other LB data settings and exposed as the CLI option
    `--vt_lb_data_retention`.
  * LBManager::setLoadModel() computes the retention window as the larger of
    the load model's getNumPastPhasesNeeded() and the configured
    vt_lb_data_retention, stores it in min_hist_lb_data_, and forwards it to
    NodeLBData through setMinLBDataHistory().
  * LBManager::finishedLB() passes that stored value to startIterCleanup().
  * NodeLBData::addNodeLBData() uses its own min_hist_lb_data_ for
    releaseLBDataFromUnneededPhases() when it is non-zero, and otherwise falls
    back to the load model's getNumPastPhasesNeeded().

NOTE(review): the indentation/alignment of the context lines below is
reconstructed, because the original whitespace was lost before this patch text
was captured. Apply with `git apply --ignore-whitespace`, or regenerate the
patch from the branch.
NOTE(review): the template arguments of the `statistics_writer_` and
`lb_data_writer_` context lines were also lost and cannot be recovered from the
patch itself; restore them from the target headers before applying.

diff --git a/src/vt/configs/arguments/app_config.h b/src/vt/configs/arguments/app_config.h
index 00224cbfe8..788b8d04eb 100644
--- a/src/vt/configs/arguments/app_config.h
+++ b/src/vt/configs/arguments/app_config.h
@@ -145,6 +145,7 @@ struct AppConfig {
   bool vt_lb_keep_last_elm = false;
   bool vt_lb_data = false;
   bool vt_lb_data_compress = true;
+  uint32_t vt_lb_data_retention = 0;
   std::string vt_lb_data_dir = "vt_lb_data";
   std::string vt_lb_data_file = "data.%p.json";
   std::string vt_lb_data_dir_in = "vt_lb_data_in";
@@ -317,6 +318,7 @@ struct AppConfig {
       | vt_lb_interval
       | vt_lb_data
       | vt_lb_data_compress
+      | vt_lb_data_retention
       | vt_lb_data_dir
       | vt_lb_data_file
       | vt_lb_data_dir_in
diff --git a/src/vt/configs/arguments/args.cc b/src/vt/configs/arguments/args.cc
index bc5d1e5825..fcb94c4f8f 100644
--- a/src/vt/configs/arguments/args.cc
+++ b/src/vt/configs/arguments/args.cc
@@ -469,6 +469,7 @@ void addLbArgs(CLI::App& app, AppConfig& appConfig) {
   auto lb_keep_last_elm = "Do not migrate last element in collection";
   auto lb_data = "Enable load balancing data";
   auto lb_data_comp = "Compress load balancing data output with brotli";
+  auto lb_data_hist = "Minimal number of historical LB data phases to retain";
   auto lb_data_dir = "Load balancing data output directory";
   auto lb_data_file = "Load balancing data output file name";
   auto lb_data_dir_in = "Load balancing data input directory";
@@ -490,6 +491,7 @@ void addLbArgs(CLI::App& app, AppConfig& appConfig) {
   auto wl = app.add_flag("--vt_lb_keep_last_elm", appConfig.vt_lb_keep_last_elm, lb_keep_last_elm);
   auto ww = app.add_flag("--vt_lb_data", appConfig.vt_lb_data, lb_data);
   auto xz = app.add_flag("--vt_lb_data_compress", appConfig.vt_lb_data_compress, lb_data_comp);
+  auto dr = app.add_option("--vt_lb_data_retention", appConfig.vt_lb_data_retention, lb_data_hist)->capture_default_str();
   auto wx = app.add_option("--vt_lb_data_dir", appConfig.vt_lb_data_dir, lb_data_dir)->capture_default_str();
   auto wy = app.add_option("--vt_lb_data_file", appConfig.vt_lb_data_file, lb_data_file)->capture_default_str();
   auto xx = app.add_option("--vt_lb_data_dir_in", appConfig.vt_lb_data_dir_in, lb_data_dir_in)->capture_default_str();
@@ -517,6 +519,7 @@ void addLbArgs(CLI::App& app, AppConfig& appConfig) {
   xx->group(debugLB);
   xy->group(debugLB);
   xz->group(debugLB);
+  dr->group(debugLB);
   yx->group(debugLB);
   yy->group(debugLB);
   yz->group(debugLB);
diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc
index 45a369f8c1..905d67ae0c 100644
--- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc
+++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc
@@ -72,6 +72,8 @@
 #include "vt/vrt/collection/manager.h"
 #include "vt/utils/json/json_appender.h"
 
+#include <algorithm>
+
 namespace vt { namespace vrt { namespace collection { namespace balance {
 
 /*static*/ std::unique_ptr<LBManager> LBManager::construct() {
@@ -159,6 +161,8 @@ LBType LBManager::decideLBToRun(PhaseType phase, bool try_file) {
 void LBManager::setLoadModel(std::shared_ptr<LoadModel> model) {
   model_ = model;
   auto nlb_data = theNodeLBData();
+  min_hist_lb_data_ = std::max<uint32_t>(model->getNumPastPhasesNeeded(), theConfig()->vt_lb_data_retention);
+  nlb_data->setMinLBDataHistory(min_hist_lb_data_);
   model_->setLoads(nlb_data->getNodeLoad(), nlb_data->getNodeComm());
 }
 
@@ -465,7 +469,7 @@ void LBManager::finishedLB(PhaseType phase) {
     "finishedLB\n"
   );
 
-  theNodeLBData()->startIterCleanup(phase, model_->getNumPastPhasesNeeded());
+  theNodeLBData()->startIterCleanup(phase, min_hist_lb_data_);
   theNodeLBData()->outputLBDataForPhase(phase);
 
   destroyLB();
diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h
index 04890bafc2..fb874746ab 100644
--- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h
+++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h
@@ -293,6 +293,8 @@ struct LBManager : runtime::component::Component {
   std::unique_ptr statistics_writer_ = nullptr;
   /// Whether the LB statistics directory has been created
   bool created_lbstats_dir_ = false;
+  /// The number of phases of historical LB data to hold
+  uint32_t min_hist_lb_data_ = 0;
 };
 
 void makeGraphSymmetric(
diff --git a/src/vt/vrt/collection/balance/node_lb_data.cc b/src/vt/vrt/collection/balance/node_lb_data.cc
index bc363f32ae..6f7e9cbd18 100644
--- a/src/vt/vrt/collection/balance/node_lb_data.cc
+++ b/src/vt/vrt/collection/balance/node_lb_data.cc
@@ -331,7 +331,11 @@ void NodeLBData::addNodeLBData(
   in->updatePhase(1);
 
   auto model = theLBManager()->getLoadModel();
-  in->releaseLBDataFromUnneededPhases(phase, model->getNumPastPhasesNeeded());
+  if (min_hist_lb_data_ > 0) {
+    in->releaseLBDataFromUnneededPhases(phase, min_hist_lb_data_);
+  } else {
+    in->releaseLBDataFromUnneededPhases(phase, model->getNumPastPhasesNeeded());
+  }
 }
 
 VirtualProxyType NodeLBData::getCollectionProxyForElement(
diff --git a/src/vt/vrt/collection/balance/node_lb_data.h b/src/vt/vrt/collection/balance/node_lb_data.h
index 5de834e6d3..52e8907c1e 100644
--- a/src/vt/vrt/collection/balance/node_lb_data.h
+++ b/src/vt/vrt/collection/balance/node_lb_data.h
@@ -249,6 +249,13 @@ struct NodeLBData : runtime::component::Component {
    */
   LBDataHolder* getLBData() { return lb_data_.get(); }
 
+  /**
+   * \internal \brief Set the minimum number of phases of historical LB data which should be held
+   *
+   * \param[in] hist_len the minimum number of phases of LB data to hold
+   */
+  void setMinLBDataHistory(uint32_t hist_len) { min_hist_lb_data_ = hist_len; }
+
   template <typename SerializerT>
   void serialize(SerializerT& s) {
     s | proxy_
@@ -292,6 +299,8 @@ struct NodeLBData : runtime::component::Component {
   std::unique_ptr lb_data_writer_ = nullptr;
   /// The struct that holds all the LB data
   std::unique_ptr<LBDataHolder> lb_data_ = nullptr;
+  /// The minimum number of phases of historical LB data to hold
+  uint32_t min_hist_lb_data_ = 0;
 };
 
 }}}} /* end namespace vt::vrt::collection::balance */