Skip to content

Commit

Permalink
Merge pull request #299 from NVIDIA/am/fix-nemo-launcher
Browse files (browse the repository at this point in the history)
Fix Nemo Launcher cmd generation, add tests
  • Loading branch information
amaslenn authored Nov 12, 2024
2 parents 95b3681 + cdb76bb commit def60ed
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 6 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -53,14 +53,14 @@ def gen_exec_command(self, tr: TestRun) -> str:
f"Local clone of git repo {tdef.python_executable.git_repo} does not exist. "
"Please ensure to run installation before running the test."
)
repo_path = Path.cwd() # dry-run compatibility
repo_path = self.system.install_path / tdef.python_executable.git_repo.repo_name # dry-run compatibility
venv_path = tdef.python_executable.venv_path
if not venv_path:
logging.warning(
f"The virtual environment for git repo {tdef.python_executable.git_repo} does not exist. "
"Please ensure to run installation before running the test."
)
venv_path = repo_path # dry-run compatibility
venv_path = self.system.install_path / tdef.python_executable.venv_name # dry-run compatibility
py_bin = (venv_path / "bin" / "python").absolute()
self.final_cmd_args.update(
{
Expand Down Expand Up @@ -97,7 +97,8 @@ def _prepare_environment(self, cmd_args: Dict[str, str], extra_env_vars: Dict[st
self.final_env_vars = self._override_env_vars(self.system.global_env_vars, extra_env_vars)

overriden_cmd_args = self._override_cmd_args(self.default_cmd_args, cmd_args)
self.final_cmd_args = {k: self._handle_special_keys(k, v) for k, v in overriden_cmd_args.items()}
overriden_cmd_args.pop("launcher_script", None)
self.final_cmd_args = {k: self._handle_special_keys(k, v) for k, v in sorted(overriden_cmd_args.items())}

for key, value in self.final_env_vars.items():
self.final_cmd_args[f"env_vars.{key}"] = value
Expand Down
24 changes: 24 additions & 0 deletions tests/ref_data/nemo-launcher.sbatch
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022-venv/bin/python \
__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts/main.py \
cluster.gpus_per_node=null \
numa_mapping.enable=True \
stages=["training"] \
training.exp_manager.create_checkpoint_callback=False \
training.model.data.data_impl=mock \
training.model.data.data_prefix=[] \
training.model.global_batch_size=128 \
training.model.micro_batch_size=2 \
training.model.pipeline_model_parallel_size=4 \
training.model.tensor_model_parallel_size=4 \
training.run.name=run \
training.run.time_limit=3:00:00 \
training.trainer.enable_checkpointing=False \
training.trainer.log_every_n_steps=1 \
training.trainer.max_steps=20 \
training.trainer.val_check_interval=10 \
training=gpt3/40b_improved \
cluster.partition=main \
training.trainer.num_nodes=1 \
container=nvcr.io/nvidia/nemo:24.05.01 \
base_results_dir=__OUTPUT_DIR__/output \
launcher_scripts_path=__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts
34 changes: 31 additions & 3 deletions tests/test_acceptance.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -27,13 +27,16 @@
from cloudai.schema.test_template.jax_toolbox.slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy
from cloudai.schema.test_template.jax_toolbox.template import JaxToolbox
from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
from cloudai.schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
from cloudai.schema.test_template.nemo_launcher.template import NeMoLauncher
from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from cloudai.schema.test_template.sleep.template import Sleep
from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
from cloudai.systems import SlurmSystem
from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition
from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition
from cloudai.test_definitions.nccl import NCCLCmdArgs, NCCLTestDefinition
from cloudai.test_definitions.nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
from cloudai.test_definitions.sleep import SleepCmdArgs, SleepTestDefinition
from cloudai.test_definitions.ucc import UCCCmdArgs, UCCTestDefinition

Expand Down Expand Up @@ -91,7 +94,9 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]:
return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path)


@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook"])
@pytest.fixture(
params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook", "nemo-launcher"]
)
def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]:
if request.param == "ucc":
tr = partial_tr(
Expand Down Expand Up @@ -211,6 +216,25 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr])

return (tr, f"{request.param}.sbatch", "grok.run")
elif request.param == "nemo-launcher":
tr = partial_tr(
name="nemo-launcher",
test=Test(
test_definition=NeMoLauncherTestDefinition(
name="nemo-launcher",
description="nemo-launcher",
test_template_name="nemo-launcher",
cmd_args=NeMoLauncherCmdArgs(),
),
test_template=NeMoLauncher(slurm_system, name="nemo-launcher"),
),
)
tr.test.test_template.command_gen_strategy = NeMoLauncherSlurmCommandGenStrategy(
slurm_system, tr.test.test_definition.cmd_args_dict
)
tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name")

return (tr, "nemo-launcher.sbatch", None)

raise ValueError(f"Unknown test: {request.param}")

Expand All @@ -221,10 +245,14 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s
tr = test_req[0]

sbatch_script = tr.test.test_template.gen_exec_command(tr).split()[-1]
ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip()
if "nemo-launcher" in test_req[1]:
sbatch_script = slurm_system.output_path / "generated_command.sh"
ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path.parent))
else:
ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name")

curr = Path(sbatch_script).read_text().strip()
ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip()
ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name")

assert curr == ref

Expand Down

0 comments on commit def60ed

Please sign in to comment.