Skip to content

Commit

Permalink
Add nemo-run to tests/test_acceptance.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
TaekyungHeo committed Nov 18, 2024
1 parent fb5187b commit d01fd6d
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 1 deletion.
11 changes: 11 additions & 0 deletions tests/ref_data/nemo-run.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Reference Slurm batch script for the "nemo-run" acceptance-test case.
# __JOB_NAME__ and __OUTPUT_DIR__ are placeholder tokens; presumably the test
# harness substitutes them before comparing against generated output
# (TODO confirm against tests/test_acceptance.py).
#
# Scheduler directives: single node (-N 1), stdout/stderr written under the
# output directory, submitted to the "main" partition.
#SBATCH --job-name=__JOB_NAME__
#SBATCH -N 1
#SBATCH --output=__OUTPUT_DIR__/stdout.txt
#SBATCH --error=__OUTPUT_DIR__/stderr.txt
#SBATCH --partition=main

# Export the first line of `scontrol show hostname` for the job's node list
# as SLURM_JOB_MASTER_NODE.
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)


# Launch `nemo llm pretrain` with the llama_3b factory on one trainer node,
# via srun with --mpi=pmix and the nvcr.io/nvidia/nemo:24.09 container image.
# NOTE(review): --container-image is presumably provided by a Slurm container
# plugin, and -y presumably auto-confirms prompts; verify both.
srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 nemo llm pretrain --factory llama_3b -y trainer.num_nodes=1
34 changes: 33 additions & 1 deletion tests/test_acceptance.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
from cloudai.schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
from cloudai.schema.test_template.nemo_launcher.template import NeMoLauncher
from cloudai.schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
from cloudai.schema.test_template.nemo_run.template import NeMoRun
from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from cloudai.schema.test_template.sleep.template import Sleep
from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
Expand All @@ -37,6 +39,7 @@
from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition
from cloudai.test_definitions.nccl import NCCLCmdArgs, NCCLTestDefinition
from cloudai.test_definitions.nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
from cloudai.test_definitions.nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition
from cloudai.test_definitions.sleep import SleepCmdArgs, SleepTestDefinition
from cloudai.test_definitions.ucc import UCCCmdArgs, UCCTestDefinition

Expand Down Expand Up @@ -95,7 +98,17 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]:


@pytest.fixture(
params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook", "nemo-launcher"]
params=[
"ucc",
"nccl",
"sleep",
"gpt-pre-test",
"gpt-no-hook",
"grok-pre-test",
"grok-no-hook",
"nemo-launcher",
"nemo-run",
]
)
def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]:
if request.param == "ucc":
Expand Down Expand Up @@ -235,6 +248,25 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name")

return (tr, "nemo-launcher.sbatch", None)
elif request.param == "nemo-run":
tr = partial_tr(
name="nemo-run",
test=Test(
test_definition=NeMoRunTestDefinition(
name="nemo-run",
description="nemo-run",
test_template_name="nemo-run",
cmd_args=NeMoRunCmdArgs(task="pretrain", recipe_name="llama_3b"),
),
test_template=NeMoRun(slurm_system, name="nemo-run"),
),
)
tr.test.test_template.command_gen_strategy = NeMoRunSlurmCommandGenStrategy(
slurm_system, tr.test.test_definition.cmd_args_dict
)
tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name")

return (tr, "nemo-run.sbatch", None)

raise ValueError(f"Unknown test: {request.param}")

Expand Down

0 comments on commit d01fd6d

Please sign in to comment.