Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1560 Change LB arguments #1561

Draft
wants to merge 15 commits into
base: 1359-lb-args-docs
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/vt/configs/arguments/app_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ struct AppConfig {
std::string vt_lb_stats_file = "stats.%p.json";
std::string vt_lb_stats_dir_in = "vt_lb_stats_in";
std::string vt_lb_stats_file_in = "stats.%p.json";
bool vt_help_lb_args = false;

bool vt_no_detect_hang = false;
bool vt_print_no_progress = true;
Expand Down Expand Up @@ -308,6 +309,7 @@ struct AppConfig {
| vt_lb_stats_file
| vt_lb_stats_dir_in
| vt_lb_stats_file_in
| vt_help_lb_args

| vt_no_detect_hang
| vt_print_no_progress
Expand Down
8 changes: 7 additions & 1 deletion src/vt/configs/arguments/args.cc
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ void ArgConfig::addLbArgs(CLI::App& app) {
* Flags for enabling load balancing and configuring it
*/
auto lb = "Enable load balancing";
auto lb_args = "Arguments pass to LB: \"x=0 y=1 test=2\"";
auto lb_args = "Arguments pass to LB: \"x=0 y=1\"; try --vt_help_lb_args";
auto lb_quiet = "Silence load balancing output";
auto lb_file_name = "LB specification file to read";
auto lb_show_spec = "Show LB specification during startup";
Expand Down Expand Up @@ -393,6 +393,12 @@ void ArgConfig::addLbArgs(CLI::App& app) {
xx->group(debugLB);
xy->group(debugLB);
xz->group(debugLB);

// help options deliberately omitted from the debugLB group above so that
// they appear grouped with --vt_help when --vt_help is used
auto help_lb_args = "Print help for --vt_lb_args";
auto h1 = app.add_flag("--vt_help_lb_args", config_.vt_help_lb_args, help_lb_args);
(void) h1;
}

void ArgConfig::addDiagnosticArgs(CLI::App& app) {
Expand Down
33 changes: 27 additions & 6 deletions src/vt/runtime/runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,21 @@ Runtime::Runtime(
arg_config_->parse(/*out*/ argc, /*out*/ argv);
int exit_code = std::get<0>(result);

if (getAppConfig()->vt_help_lb_args) {
// Help for the LB arguments was requested via --vt_help_lb_args.
int rank = 0;
MPI_Comm_rank(initial_communicator_, &rank);

if (rank == 0) {
// Help requested
vt::debug::preConfigRef()->colorize_output = true;
vrt::collection::balance::LBManager::printLBArgsHelp(getAppConfig()->vt_lb_name);
}
if (exit_code == -1) {
exit_code = 0;
}
}

if (exit_code not_eq -1) {
// Help requested or invalid argument(s).
MPI_Comm comm = initial_communicator_;
Expand All @@ -169,14 +184,20 @@ Runtime::Runtime(
// exit code of 0 -> 'help'
std::ostream& out = exit_code == 0 ? std::cout : std::cerr;

out << "--- VT INITIALIZATION ABORT ---" << "\n\n"
<< msg << "\n"
<< "--- VT INITIALIZATION ABORT ---" << "\n"
<< std::flush;
if (exit_code != 0) {
out << "--- VT INITIALIZATION ABORT ---" << "\n";
}
out << "\n" << msg << "\n";
if (exit_code != 0) {
out << "--- VT INITIALIZATION ABORT ---" << "\n";
}
out << std::flush;
}

// Even in interop mode, still abort MPI on bad args.
MPI_Abort(comm, exit_code);
if (exit_code != 0) {
// Even in interop mode, still abort MPI on bad args.
MPI_Abort(comm, exit_code);
}
MPI_Finalize();

std::_Exit(exit_code); // no return
Expand Down
70 changes: 53 additions & 17 deletions src/vt/vrt/collection/balance/greedylb/greedylb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,21 +72,61 @@ void GreedyLB::init(objgroup::proxy::Proxy<GreedyLB> in_proxy) {
proxy = scatter_proxy = in_proxy;
}

/**
 * \brief Help text for every key GreedyLB accepts in its LB arguments
 *
 * The keys of the returned map are the argument names; each mapped string
 * lists the accepted values, the default, and a description of the argument.
 * inputParams() builds its list of allowed keys from this map, so a key added
 * here is automatically accepted there.
 *
 * \return map from argument key to its help string
 */
/*static*/ std::unordered_map<std::string, std::string>
GreedyLB::getInputKeysWithHelp() {
  // Deliberately not const: a const local can only be copied, never moved,
  // when it is returned and copy elision does not apply.
  std::unordered_map<std::string, std::string> keys_help = {
    {
      "I_tolerance",
      R"(
Values: <double>
Default: 0.05
Description:
If the imbalance metric, I, is greater than I_tolerance, the load balancer
will run.
)"
    },
    {
      "threshold",
      R"(
Values: <double>
Default: 0.5
Description:
The load threshold of objects to consider for potential migration on each
rank. All objects over threshold * average_load on each rank will be
considered.
)"
    },
    {
      "data_dist",
      R"(
Values: {scatter, bcast, pt2pt}
Default: scatter
Description:
How to distribute the migration decisions after the centralized LB runs.
)"
    }
  };
  return keys_help;
}

void GreedyLB::inputParams(balance::SpecEntry* spec) {
std::vector<std::string> allowed{"min", "max", "auto", "strategy"};
auto keys_help = getInputKeysWithHelp();

std::vector<std::string> allowed;
for (auto&& elm : keys_help) {
allowed.push_back(elm.first);
}
spec->checkAllowedKeys(allowed);
min_threshold = spec->getOrDefault<double>("min", greedy_threshold_p);
max_threshold = spec->getOrDefault<double>("max", greedy_max_threshold_p);
auto_threshold = spec->getOrDefault<bool>("auto", greedy_auto_threshold_p);
I_tolerance = spec->getOrDefault<double>("I_tolerance", I_tolerance);
this_threshold = spec->getOrDefault<double>("threshold", this_threshold);

balance::LBArgsEnumConverter<DataDistStrategy> strategy_converter_(
"strategy", "DataDistStrategy", {
balance::LBArgsEnumConverter<DataDistStrategy> data_dist_converter_(
"data_dist", "DataDistStrategy", {
{DataDistStrategy::scatter, "scatter"},
{DataDistStrategy::pt2pt, "pt2pt"},
{DataDistStrategy::bcast, "bcast"}
}
);
strat_ = strategy_converter_.getFromSpec(spec, strat_);
data_dist_ = data_dist_converter_.getFromSpec(spec, data_dist_);
}

void GreedyLB::runLB() {
Expand All @@ -103,19 +143,15 @@ void GreedyLB::loadStats() {
this_load_begin = this_load;

if (avg_load > 0.0000000001) {
should_lb = I > greedy_tolerance;
}

if (auto_threshold) {
this_threshold = std::min(std::max(1.0f - I, min_threshold), max_threshold);
should_lb = I > I_tolerance;
}

if (this_node == 0) {
vt_print(
lb,
"loadStats: load={:.2f}, total={:.2f}, avg={:.2f}, I={:.2f},"
"should_lb={}, auto={}, threshold={}\n",
this_load, total_load, avg_load, I, should_lb, auto_threshold,
"should_lb={}, I_tolerance={}, threshold={}\n",
this_load, total_load, avg_load, I, should_lb, I_tolerance,
this_threshold
);
fflush(stdout);
Expand Down Expand Up @@ -289,7 +325,7 @@ void GreedyLB::transferObjs(std::vector<GreedyProc>&& in_load) {
}
}

if (strat_ == DataDistStrategy::scatter) {
if (data_dist_ == DataDistStrategy::scatter) {
std::size_t max_bytes = max_recs * sizeof(GreedyLBTypes::ObjIDType);
vt_debug_print(
normal, lb,
Expand All @@ -307,15 +343,15 @@ void GreedyLB::transferObjs(std::vector<GreedyProc>&& in_load) {
}
}
);
} else if (strat_ == DataDistStrategy::pt2pt) {
} else if (data_dist_ == DataDistStrategy::pt2pt) {
for (NodeType n = 0; n < theContext()->getNumNodes(); n++) {
vtAssert(
node_transfer.size() == static_cast<size_t>(theContext()->getNumNodes()),
"Must contain all nodes"
);
proxy[n].send<GreedySendMsg, &GreedyLB::recvObjs>(node_transfer[n]);
}
} else if (strat_ == DataDistStrategy::bcast) {
} else if (data_dist_ == DataDistStrategy::bcast) {
proxy.broadcast<GreedyBcastMsg, &GreedyLB::recvObjsBcast>(node_transfer);
}
}
Expand Down
13 changes: 5 additions & 8 deletions src/vt/vrt/collection/balance/greedylb/greedylb.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ struct GreedyLB : BaseLB {
void runLB() override;
void inputParams(balance::SpecEntry* spec) override;

static std::unordered_map<std::string, std::string> getInputKeysWithHelp();

private:
double getAvgLoad() const;
double getMaxLoad() const;
Expand All @@ -109,17 +111,12 @@ struct GreedyLB : BaseLB {
static objgroup::proxy::Proxy<GreedyLB> scatter_proxy;

private:
double this_threshold = 0.0f;
double I_tolerance = 0.05f;
double this_threshold = 0.5f;
LoadType this_load_begin = 0.0f;
ObjSampleType load_over;
objgroup::proxy::Proxy<GreedyLB> proxy = {};

// Parameters read from LB spec file
double max_threshold = 0.0f;
double min_threshold = 0.0f;
bool auto_threshold = true;

DataDistStrategy strat_ = DataDistStrategy::scatter;
DataDistStrategy data_dist_ = DataDistStrategy::scatter;
};

}}}} /* end namespace vt::vrt::collection::lb */
Expand Down
8 changes: 2 additions & 6 deletions src/vt/vrt/collection/balance/greedylb/greedylb_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,8 @@

namespace vt { namespace vrt { namespace collection { namespace lb {

static constexpr NodeType const greedy_root = 0;
static constexpr int32_t const greedy_bin_size = 10;
static constexpr bool const greedy_auto_threshold_p = true;
static constexpr double const greedy_tolerance = 0.05f;
static constexpr double const greedy_threshold_p = 0.3f;
static constexpr double const greedy_max_threshold_p = 1.004f;
static constexpr NodeType const greedy_root = 0;
static constexpr int32_t const greedy_bin_size = 10;

}}}} /* end namespace vt::vrt::collection::lb */

Expand Down
73 changes: 56 additions & 17 deletions src/vt/vrt/collection/balance/hierarchicallb/hierlb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,60 @@ void HierarchicalLB::init(objgroup::proxy::Proxy<HierarchicalLB> in_proxy) {
proxy = in_proxy;
}

/**
 * \brief Help text for every key HierarchicalLB accepts in its LB arguments
 *
 * The keys of the returned map are the argument names; each mapped string
 * lists the accepted values, the default, and a description of the argument.
 * inputParams() builds its list of allowed keys from this map, so a key added
 * here is automatically accepted there.
 *
 * \return map from argument key to its help string
 */
/*static*/ std::unordered_map<std::string, std::string>
HierarchicalLB::getInputKeysWithHelp() {
  // Deliberately not const: a const local can only be copied, never moved,
  // when it is returned and copy elision does not apply.
  std::unordered_map<std::string, std::string> keys_help = {
    {
      "I_tolerance",
      R"(
Values: <double>
Default: 0.05
Description:
If the imbalance metric, I, is greater than I_tolerance, the load balancer
will run.
)"
    },
    {
      "threshold",
      R"(
Values: <double>
Default: 0.5
Description:
The load threshold of objects to consider for potential migration on each
rank. All objects over threshold * average_load on each rank will be
considered.
)"
    },
    {
      "object_selection",
      R"(
Values: {LoadOverLessThan, LoadOverGreaterThan, LoadOverOneEach}
Default: LoadOverLessThan
Description:
Select the strategy for which objects to select for rebalancing that are over
the selected threshold:
- LoadOverLessThan -- pick the smallest objects on the node
- LoadOverGreaterThan -- pick the largest objects on the node
- LoadOverOneEach -- pick objects from all the sample bins across the
range of sizes
)"
    }
  };
  return keys_help;
}

void HierarchicalLB::inputParams(balance::SpecEntry* spec) {
std::vector<std::string> allowed{"min", "max", "auto", "strategy"};
auto keys_help = getInputKeysWithHelp();

std::vector<std::string> allowed;
for (auto&& elm : keys_help) {
allowed.push_back(elm.first);
}
spec->checkAllowedKeys(allowed);
min_threshold = spec->getOrDefault<double>("min", hierlb_threshold_p);
max_threshold = spec->getOrDefault<double>("max", hierlb_max_threshold_p);
auto_threshold = spec->getOrDefault<bool>("auto", hierlb_auto_threshold_p);
this_threshold = spec->getOrDefault<double>("threshold", this_threshold);
I_tolerance = spec->getOrDefault<double>("I_tolerance", I_tolerance);

std::string extract = spec->getOrDefault<std::string>(
"strategy", "LoadOverLessThan"
"object_selection", "LoadOverLessThan"
);
if (extract.compare("LoadOverLessThan") == 0) {
extract_strategy = HeapExtractEnum::LoadOverLessThan;
Expand All @@ -94,7 +139,7 @@ void HierarchicalLB::inputParams(balance::SpecEntry* spec) {
}
}

void HierarchicalLB::setupTree(double const threshold) {
void HierarchicalLB::setupTree() {
vtAssert(
tree_setup == false,
"Tree must not already be set up when is this called"
Expand All @@ -103,12 +148,10 @@ void HierarchicalLB::setupTree(double const threshold) {
auto const& this_node = theContext()->getNode();
auto const& num_nodes = theContext()->getNumNodes();

this_threshold = threshold;

vt_debug_print(
terse, hierlb,
"HierarchicalLB: setupTree: threshold={}\n",
threshold
this_threshold
);

for (NodeType node = 0; node < hierlb_nary; node++) {
Expand Down Expand Up @@ -196,19 +239,15 @@ void HierarchicalLB::loadStats() {
this_load_begin = this_load;

if (avg_load > 0.0000000001) {
should_lb = I > hierlb_tolerance;
}

if (auto_threshold) {
this_threshold = std::min(std::max(1.0f - I, min_threshold), max_threshold);
should_lb = I > I_tolerance;
}

if (this_node == 0) {
vt_print(
hierlb,
"loadStats: load={:.2f}, total={:.2f}, avg={:.2f}, I={:.2f},"
"should_lb={}, auto={}, threshold={}\n",
this_load, total_load, avg_load, I, should_lb, auto_threshold,
"should_lb={}, I_tolerance={}, threshold={}\n",
this_load, total_load, avg_load, I, should_lb, I_tolerance,
this_threshold
);
fflush(stdout);
Expand Down Expand Up @@ -698,7 +737,7 @@ void HierarchicalLB::clearObj(ObjSampleType& objs) {
}

void HierarchicalLB::runLB() {
setupTree(min_threshold);
setupTree();

auto cb = vt::theCB()->makeBcast<
HierarchicalLB, SetupDoneMsg, &HierarchicalLB::setupDone
Expand Down
Loading