From 7f27d55862d3177dccde35f3b5711bc79b80ac1e Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 12:19:21 +0200 Subject: [PATCH 01/21] Installation works --- conf/new/test/nemo-vfm.toml | 22 +++++++++++++++++ src/cloudai/__init__.py | 4 ++++ .../generic_slurm_container/template.py | 21 ++++++++++++++++ .../generic_slurm_container.py | 24 +++++++++++++++++++ 4 files changed, 71 insertions(+) create mode 100644 conf/new/test/nemo-vfm.toml create mode 100644 src/cloudai/schema/test_template/generic_slurm_container/template.py create mode 100644 src/cloudai/test_definitions/generic_slurm_container.py diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm.toml new file mode 100644 index 00000000..0cafe628 --- /dev/null +++ b/conf/new/test/nemo-vfm.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm" +description = "Nemo VFM" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index fd394f24..e5d1c48a 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,6 +57,7 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy +from .schema.test_template.generic_slurm_container.template import GenericSlurmContainerTT from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -98,6 +99,7 @@ SleepTestDefinition, UCCTestDefinition, ) +from .test_definitions.generic_slurm_container import SlurmContainerTestDefinition Registry().add_runner("slurm", SlurmRunner) Registry().add_runner("kubernetes", KubernetesRunner) @@ -165,6 +167,7 @@ Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition) Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition) Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition) +Registry().add_test_definition("GenericSlurmContainer", SlurmContainerTestDefinition) Registry().add_test_template("ChakraReplay", ChakraReplay) Registry().add_test_template("NcclTest", NcclTest) @@ -174,6 +177,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) +Registry().add_test_template("GenericSlurmContainer", 
GenericSlurmContainerTT) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/generic_slurm_container/template.py new file mode 100644 index 00000000..30d94048 --- /dev/null +++ b/src/cloudai/schema/test_template/generic_slurm_container/template.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cloudai import TestTemplate + + +class GenericSlurmContainerTT(TestTemplate): + pass diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py new file mode 100644 index 00000000..b8d86196 --- /dev/null +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -0,0 +1,24 @@ +from typing import Optional + +from cloudai import CmdArgs, Installable, TestDefinition +from cloudai.installer.installables import DockerImage + + +class SlurmContainerCmdArgs(CmdArgs): + docker_image_url: str + + +class SlurmContainerTestDefinition(TestDefinition): + cmd_args: SlurmContainerCmdArgs + + _docker_image: Optional[DockerImage] = None + + @property + def docker_image(self) -> DockerImage: + if not self._docker_image: + self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) + return self._docker_image + + @property + def installables(self) -> list[Installable]: + return [self.docker_image] From 9c4611df6e32f2ad43e39fd04018641296b9682e Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 12:25:27 +0200 Subject: [PATCH 02/21] dry-run works, but doesn't generate anything --- conf/new/test_scenario/nemo-vfm.toml | 22 +++++++++++++++++++ src/cloudai/__init__.py | 6 +++++ .../slurm_command_gen_strategy.py | 21 ++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 conf/new/test_scenario/nemo-vfm.toml create mode 100644 src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml new file mode 100644 index 00000000..5bc15157 --- /dev/null +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm" + +[[Tests]] +id = "Tests.1" +test_name = "nemo-vfm" +num_nodes = "2" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index e5d1c48a..33de7111 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,6 +57,9 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy +from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( + GenericSlurmContainerCommandGenStrategy, +) from .schema.test_template.generic_slurm_container.template import GenericSlurmContainerTT from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy @@ -150,6 +153,9 @@ Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayReportGenerationStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy) +Registry().add_strategy( + CommandGenStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerCommandGenStrategy +) Registry().add_installer("slurm", SlurmInstaller) Registry().add_installer("standalone", StandaloneInstaller) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py new file mode 100644 index 00000000..acd7f95a --- /dev/null +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy + + +class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): + pass From af3df566ba4a28691da340bda7f12569a036cbcc Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 14:24:39 +0200 Subject: [PATCH 03/21] Install repo and prepare for real runs --- conf/new/test/nemo-vfm.toml | 14 +++++++++++ src/cloudai/__init__.py | 4 ++-- src/cloudai/installer/slurm_installer.py | 10 ++++++++ .../slurm_command_gen_strategy.py | 20 +++++++++++++++- .../generic_slurm_container.py | 23 +++++++++++++++++-- 5 files changed, 66 insertions(+), 5 deletions(-) diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm.toml index 0cafe628..358d2e6e 100644 --- a/conf/new/test/nemo-vfm.toml +++ b/conf/new/test/nemo-vfm.toml @@ -20,3 +20,17 @@ test_template_name = "GenericSlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" +# docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" + +[extra_cmd_args] +"bash" = '-c "pwd ; whoami ; torchrun --help"' +# "bash" = '-c "cd ${DIR} ; python -u nemo/collections/multimodal/vfm/train.py --yes $*"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0 " +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index 33de7111..3fdee20d 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -133,7 +133,7 @@ Registry().add_strategy( JobIdRetrievalStrategy, [SlurmSystem], - [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep], + [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainerTT], SlurmJobIdRetrievalStrategy, ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) @@ -146,7 +146,7 @@ Registry().add_strategy( JobStatusRetrievalStrategy, [SlurmSystem], - [ChakraReplay, UCCTest, NeMoLauncher, Sleep], + [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainerTT], DefaultJobStatusRetrievalStrategy, ) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy) diff --git a/src/cloudai/installer/slurm_installer.py b/src/cloudai/installer/slurm_installer.py index f7904381..638a8efc 100644 --- a/src/cloudai/installer/slurm_installer.py +++ b/src/cloudai/installer/slurm_installer.py @@ -118,6 +118,8 @@ def install_one(self, item: Installable) -> InstallStatusResult: if isinstance(item, DockerImage): res = self._install_docker_image(item) return InstallStatusResult(res.success, res.message) + elif isinstance(item, GitRepo): + return self._install_one_git_repo(item) elif isinstance(item, PythonExecutable): return self._install_python_executable(item) @@ -139,6 +141,8 @@ def uninstall_one(self, item: Installable) -> InstallStatusResult: return InstallStatusResult(res.success, res.message) elif isinstance(item, PythonExecutable): return self._uninstall_python_executable(item) + elif isinstance(item, GitRepo): + return self._uninstall_git_repo(item) return InstallStatusResult(False, f"Unsupported item type: {type(item)}") @@ -148,6 +152,12 @@ def is_installed_one(self, item: Installable) -> InstallStatusResult: if res.success and res.docker_image_path: item.installed_path = 
res.docker_image_path return InstallStatusResult(res.success, res.message) + elif isinstance(item, GitRepo): + repo_path = item.installed_path if item.installed_path else self.system.install_path / item.repo_name + if repo_path.exists(): + item.installed_path = repo_path + return InstallStatusResult(True) + return InstallStatusResult(False, f"Git repository {item.git_url} not cloned") elif isinstance(item, PythonExecutable): return self._is_python_executable_installed(item) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index acd7f95a..332f5606 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -14,8 +14,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any, cast + +from cloudai import TestRun from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy +from cloudai.test_definitions.generic_slurm_container import SlurmContainerTestDefinition class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): - pass + def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: + tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) + slurm_args["image_path"] = tdef.docker_image.installed_path + # slurm_args["container_mounts"] = "" # TBD + cmd = super().generate_srun_prefix(slurm_args, tr) + + # cmd = ["pip", "install", "-e", ".", "\n"] + cmd + return cmd + + def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: + srun_command_parts: list[str] = [] + if tr.test.extra_cmd_args: + srun_command_parts.append(tr.test.extra_cmd_args) + + return srun_command_parts diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index b8d86196..002a7d00 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -1,17 +1,20 @@ from typing import Optional from cloudai import CmdArgs, Installable, TestDefinition -from cloudai.installer.installables import DockerImage +from cloudai.installer.installables import DockerImage, GitRepo class SlurmContainerCmdArgs(CmdArgs): docker_image_url: str + repository_url: str + repository_commit_hash: str class SlurmContainerTestDefinition(TestDefinition): cmd_args: SlurmContainerCmdArgs _docker_image: Optional[DockerImage] = None + _git_repo: Optional[GitRepo] = None @property def docker_image(self) -> DockerImage: @@ -19,6 +22,22 @@ def docker_image(self) -> DockerImage: self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) return self._docker_image + @property + def git_repo(self) -> GitRepo: + if not self._git_repo: + self._python_executable = GitRepo( + git_url=self.cmd_args.repository_url, commit_hash=self.cmd_args.repository_commit_hash + ) + + return self._python_executable + @property def installables(self) -> list[Installable]: - return [self.docker_image] + return [self.docker_image, self.git_repo] + + @property + def extra_args_str(self) -> str: + parts = [] + for k, v in self.extra_cmd_args.items(): + parts.append(f"{k} {v}" if v else k) + return " ".join(parts) From cef9c159333da1bae94c46d0c34cc74f1f95ddff Mon 
Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 15:28:48 +0200 Subject: [PATCH 04/21] Finalize single node run test --- conf/new/test/nemo-vfm.toml | 12 +++++------- conf/new/test_scenario/nemo-vfm.toml | 4 ++-- src/cloudai/installer/slurm_installer.py | 2 +- src/cloudai/runner/slurm/slurm_runner.py | 1 + .../slurm_command_gen_strategy.py | 5 ++--- .../test_definitions/generic_slurm_container.py | 4 ++-- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm.toml index 358d2e6e..3d82f495 100644 --- a/conf/new/test/nemo-vfm.toml +++ b/conf/new/test/nemo-vfm.toml @@ -14,23 +14,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -name = "nemo-vfm" -description = "Nemo VFM" +name = "nemo-vfm-single" +description = "Nemo VFM for single node" test_template_name = "GenericSlurmContainer" [cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" -# docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" [extra_cmd_args] -"bash" = '-c "pwd ; whoami ; torchrun --help"' -# "bash" = '-c "cd ${DIR} ; python -u nemo/collections/multimodal/vfm/train.py --yes $*"' +"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . ; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1"' [extra_env_vars] "WANDB_PROJECT" = "vfm" "WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0 " +"NVTE_FUSED_ATTN" = "0" "CUDA_DEVICE_MAX_CONNECTIONS" = "1" "PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index 5bc15157..8daea380 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -18,5 +18,5 @@ name = "nemo-vfm" [[Tests]] id = "Tests.1" -test_name = "nemo-vfm" -num_nodes = "2" +test_name = "nemo-vfm-single" +num_nodes = 1 diff --git a/src/cloudai/installer/slurm_installer.py b/src/cloudai/installer/slurm_installer.py index 638a8efc..8d542f28 100644 --- a/src/cloudai/installer/slurm_installer.py +++ b/src/cloudai/installer/slurm_installer.py @@ -153,7 +153,7 @@ def is_installed_one(self, item: Installable) -> InstallStatusResult: item.installed_path = res.docker_image_path return InstallStatusResult(res.success, res.message) elif isinstance(item, GitRepo): - repo_path = item.installed_path if item.installed_path else self.system.install_path / item.repo_name + repo_path = self.system.install_path / item.repo_name if repo_path.exists(): item.installed_path = repo_path return InstallStatusResult(True) diff --git a/src/cloudai/runner/slurm/slurm_runner.py b/src/cloudai/runner/slurm/slurm_runner.py index 3023e9d0..3726e252 100644 --- a/src/cloudai/runner/slurm/slurm_runner.py +++ b/src/cloudai/runner/slurm/slurm_runner.py @@ -68,4 +68,5 @@ def _submit_test(self, tr: TestRun) -> SlurmJob: stderr=stderr, message="Failed to retrieve job ID from command output.", ) + 
logging.info(f"Submitted slurm job: {job_id}") return SlurmJob(self.mode, self.system, tr, job_id) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 332f5606..4e06c8a8 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -25,10 +25,9 @@ class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - # slurm_args["container_mounts"] = "" # TBD - cmd = super().generate_srun_prefix(slurm_args, tr) + slurm_args["container_mounts"] = f"{tdef.git_repo.installed_path.absolute()}:/work" - # cmd = ["pip", "install", "-e", ".", "\n"] + cmd + cmd = super().generate_srun_prefix(slurm_args, tr) return cmd def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index 002a7d00..b7edaea2 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -25,11 +25,11 @@ def docker_image(self) -> DockerImage: @property def git_repo(self) -> GitRepo: if not self._git_repo: - self._python_executable = GitRepo( + self._git_repo = GitRepo( git_url=self.cmd_args.repository_url, commit_hash=self.cmd_args.repository_commit_hash ) - return self._python_executable + return self._git_repo @property def installables(self) -> list[Installable]: From 465a08acf37d455ae72aebd4b770774b9fa8101d Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Thu, 7 Nov 2024 10:49:08 +0200 Subject: [PATCH 05/21] Use abs path --- conf/new/test/{nemo-vfm.toml => nemo-vfm-single.toml} | 2 +- conf/new/test_scenario/nemo-vfm.toml | 5 +++++ .../generic_slurm_container/slurm_command_gen_strategy.py | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) rename conf/new/test/{nemo-vfm.toml => nemo-vfm-single.toml} (95%) diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm-single.toml similarity index 95% rename from conf/new/test/nemo-vfm.toml rename to conf/new/test/nemo-vfm-single.toml index 3d82f495..a9f36806 100644 --- a/conf/new/test/nemo-vfm.toml +++ b/conf/new/test/nemo-vfm-single.toml @@ -24,7 +24,7 @@ repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . ; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1"' +"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . 
; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1 trainer.max_steps=500"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index 8daea380..ecab0e53 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -20,3 +20,8 @@ name = "nemo-vfm" id = "Tests.1" test_name = "nemo-vfm-single" num_nodes = 1 + +[[Tests]] +id = "Tests.2" +test_name = "nemo-vfm-single" +num_nodes = 2 diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 4e06c8a8..470f5ab4 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import Any, cast from cloudai import TestRun @@ -25,7 +26,8 @@ class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - slurm_args["container_mounts"] = f"{tdef.git_repo.installed_path.absolute()}:/work" + repo_path = tdef.git_repo.installed_path or Path.cwd() + slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work" cmd = super().generate_srun_prefix(slurm_args, tr) return cmd From e73eb98ee96bb310c058f5a8405a05ea18e4001e Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 7 Nov 2024 06:11:37 -0800 Subject: [PATCH 06/21] Update for nemo-vfm training --- conf/new/test/nemo-vfm-single.toml | 8 +++++--- conf/new/test_scenario/nemo-vfm.toml | 10 ++++++---- .../slurm_command_gen_strategy.py | 5 +++-- .../test_definitions/generic_slurm_container.py | 14 +++++++++++++- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/conf/new/test/nemo-vfm-single.toml b/conf/new/test/nemo-vfm-single.toml index a9f36806..ba10a5e4 100644 --- a/conf/new/test/nemo-vfm-single.toml +++ b/conf/new/test/nemo-vfm-single.toml @@ -19,12 +19,14 @@ description = "Nemo VFM for single node" test_template_name = "GenericSlurmContainer" [cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "7e9490ad83439a2db96a4af557aed32a9ce72ef7" # main branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . 
; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1 trainer.max_steps=500"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; python -u nemo/collections/multimodal/vfm/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index ecab0e53..1ba879d5 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -20,8 +20,10 @@ name = "nemo-vfm" id = "Tests.1" test_name = "nemo-vfm-single" num_nodes = 1 +time_limit = "01:00:00" -[[Tests]] -id = "Tests.2" -test_name = "nemo-vfm-single" -num_nodes = 2 +# [[Tests]] +# id = "Tests.2" +# test_name = "nemo-vfm-single" +# num_nodes = 2 +# time_limit = "01:00:00" diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 470f5ab4..304e14ee 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -27,10 +27,11 @@ def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[ tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path repo_path = tdef.git_repo.installed_path or Path.cwd() - slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work" + mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() + slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,/lustre:/lustre/,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().generate_srun_prefix(slurm_args, tr) - return cmd + return cmd + ["--no-container-mount-home"] def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: srun_command_parts: list[str] = [] diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index b7edaea2..5ec099e8 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -8,6 +8,8 @@ class SlurmContainerCmdArgs(CmdArgs): docker_image_url: str repository_url: str repository_commit_hash: str + mcore_vfm_repo: str + mcore_vfm_commit_hash: str class SlurmContainerTestDefinition(TestDefinition): @@ -15,6 +17,7 @@ class SlurmContainerTestDefinition(TestDefinition): _docker_image: Optional[DockerImage] = None _git_repo: Optional[GitRepo] = None + _mcore_git_repo: Optional[GitRepo] = None @property def docker_image(self) -> DockerImage: @@ -31,9 +34,18 @@ def git_repo(self) -> GitRepo: return self._git_repo + @property + def 
mcore_vfm_git_repo(self) -> GitRepo: + if not self._mcore_git_repo: + self._mcore_git_repo = GitRepo( + git_url=self.cmd_args.mcore_vfm_repo, commit_hash=self.cmd_args.mcore_vfm_commit_hash + ) + + return self._mcore_git_repo + @property def installables(self) -> list[Installable]: - return [self.docker_image, self.git_repo] + return [self.docker_image, self.git_repo, self.mcore_vfm_git_repo] @property def extra_args_str(self) -> str: From 7b661e66262da772a630c97efdd1eff4b00bc241 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Fri, 8 Nov 2024 03:29:33 -0800 Subject: [PATCH 07/21] Update configs --- ...-vfm-single.toml => nemo-vfm-ditllama28b_8k.toml} | 10 +++++----- conf/new/test_scenario/nemo-vfm.toml | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) rename conf/new/test/{nemo-vfm-single.toml => nemo-vfm-ditllama28b_8k.toml} (63%) diff --git a/conf/new/test/nemo-vfm-single.toml b/conf/new/test/nemo-vfm-ditllama28b_8k.toml similarity index 63% rename from conf/new/test/nemo-vfm-single.toml rename to conf/new/test/nemo-vfm-ditllama28b_8k.toml index ba10a5e4..a35ce35c 100644 --- a/conf/new/test/nemo-vfm-single.toml +++ b/conf/new/test/nemo-vfm-ditllama28b_8k.toml @@ -14,19 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -name = "nemo-vfm-single" -description = "Nemo VFM for single node" +name = "nemo-vfm-ditllama28b_8k" +description = "Nemo VFM factory=mock_ditllama28b_8k" test_template_name = "GenericSlurmContainer" [cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "7e9490ad83439a2db96a4af557aed32a9ce72ef7" # main branch +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; python -u nemo/collections/multimodal/vfm/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index 1ba879d5..f290fe1a 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -18,12 +18,12 @@ name = "nemo-vfm" [[Tests]] id = "Tests.1" -test_name = "nemo-vfm-single" +test_name = "nemo-vfm-ditllama28b_8k" 
num_nodes = 1 time_limit = "01:00:00" -# [[Tests]] -# id = "Tests.2" -# test_name = "nemo-vfm-single" -# num_nodes = 2 -# time_limit = "01:00:00" +[[Tests]] +id = "Tests.2" +test_name = "nemo-vfm-ditllama28b_8k" +num_nodes = 8 +time_limit = "01:00:00" From 8d630efd6d9754809d6fd58951f8d79cd9ef4553 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Fri, 8 Nov 2024 04:18:17 -0800 Subject: [PATCH 08/21] Add more configs --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 36 +++++++++++++++++++ conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 36 +++++++++++++++++++ .../test/nemo-vfm-mock_ditllama28b_65k.toml | 36 +++++++++++++++++++ conf/new/test_scenario/nemo-vfm.toml | 22 +++++++++--- 4 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 conf/new/test/nemo-vfm-mock_dit7b_65k.toml create mode 100644 conf/new/test/nemo-vfm-mock_dit7b_8k.toml create mode 100644 conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml new file mode 100644 index 00000000..bc1bd453 --- /dev/null +++ b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm-mock_dit7b_65k" +description = "Nemo VFM factory=mock_mock_dit7b_65k" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch + +[extra_cmd_args] +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0" +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml new file mode 100644 index 00000000..208cb60a --- /dev/null +++ b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm-mock_dit7b_8k" +description = "Nemo VFM factory=mock_mock_dit7b_8k" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch + +[extra_cmd_args] +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0" +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml new file mode 100644 index 00000000..a27e18e3 --- /dev/null +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "nemo-vfm-mock_ditllama28b_65k" +description = "Nemo VFM factory=mock_mock_ditllama28b_65k" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch + +[extra_cmd_args] +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0" +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index f290fe1a..1a695dbb 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -17,13 +17,25 @@ name = "nemo-vfm" [[Tests]] -id = "Tests.1" -test_name = "nemo-vfm-ditllama28b_8k" -num_nodes = 1 +id = "Tests.mock.dit7b_8k" +test_name = "nemo-vfm-mock_dit7b_8k" +num_nodes = 8 +time_limit = "01:00:00" + +[[Tests]] +id = "Tests.mock.dit7b_65k" +test_name = "nemo-vfm-mock_dit7b_65k" +num_nodes = 8 +time_limit = "01:00:00" + +[[Tests]] +id = "Tests.mock.ditllama28b_8k" +test_name = "nemo-vfm-mock_ditllama28b_8k" +num_nodes = 8 time_limit = "01:00:00" [[Tests]] -id = "Tests.2" -test_name = "nemo-vfm-ditllama28b_8k" +id = "Tests.mock.ditllama28b_65k" +test_name = "nemo-vfm-mock_ditllama28b_65k" num_nodes = 8 time_limit = "01:00:00" From 2398218ffa7648c666318fe1d1024f9d811b7f59 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Fri, 8 Nov 2024 15:24:55 +0200 Subject: [PATCH 09/21] Fixes and tuning --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 4 ++-- conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 4 ++-- conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml | 4 ++-- ...-ditllama28b_8k.toml => nemo-vfm-mock_ditllama28b_8k.toml} | 2 +- .../generic_slurm_container/slurm_command_gen_strategy.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) rename conf/new/test/{nemo-vfm-ditllama28b_8k.toml => nemo-vfm-mock_ditllama28b_8k.toml} (98%) diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml index bc1bd453..deb12387 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml @@ -15,7 +15,7 @@ # limitations under the License. 
name = "nemo-vfm-mock_dit7b_65k" -description = "Nemo VFM factory=mock_mock_dit7b_65k" +description = "Nemo VFM factory=mock_dit7b_65k" test_template_name = "GenericSlurmContainer" [cmd_args] @@ -26,7 +26,7 @@ mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml index 208cb60a..02f45c13 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml @@ -15,7 +15,7 @@ # limitations under the License. 
name = "nemo-vfm-mock_dit7b_8k" -description = "Nemo VFM factory=mock_mock_dit7b_8k" +description = "Nemo VFM factory=mock_dit7b_8k" test_template_name = "GenericSlurmContainer" [cmd_args] @@ -26,7 +26,7 @@ mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml index a27e18e3..fec9c947 100644 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml @@ -15,7 +15,7 @@ # limitations under the License. 
name = "nemo-vfm-mock_ditllama28b_65k" -description = "Nemo VFM factory=mock_mock_ditllama28b_65k" +description = "Nemo VFM factory=mock_ditllama28b_65k" test_template_name = "GenericSlurmContainer" [cmd_args] @@ -26,7 +26,7 @@ mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test/nemo-vfm-ditllama28b_8k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml similarity index 98% rename from conf/new/test/nemo-vfm-ditllama28b_8k.toml rename to conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml index a35ce35c..9ff187f2 100644 --- a/conf/new/test/nemo-vfm-ditllama28b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name = "nemo-vfm-ditllama28b_8k" +name = "nemo-vfm-mock_ditllama28b_8k" description = "Nemo VFM factory=mock_ditllama28b_8k" test_template_name = "GenericSlurmContainer" diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 304e14ee..895924a2 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -28,7 +28,7 @@ def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[ slurm_args["image_path"] = tdef.docker_image.installed_path repo_path = tdef.git_repo.installed_path or Path.cwd() mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() - slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,/lustre:/lustre/,{mcore_vfm_path.absolute()}:/opt/megatron-lm" + slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().generate_srun_prefix(slurm_args, tr) return cmd + ["--no-container-mount-home"] From 4f03e0251f2cf55ba12181767b40b1b056bcd059 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:02:05 +0200 Subject: [PATCH 10/21] Add csv report generation for GenericSlurmContainer --- src/cloudai/__init__.py | 7 ++ .../report_generator/report_generator.py | 14 ++-- .../report_generation_strategy.py | 64 +++++++++++++++++++ 3 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index 3fdee20d..eb22d844 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,6 +57,9 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy +from .schema.test_template.generic_slurm_container.report_generation_strategy import ( + GenericSlurmContainerReportGenerationStrategy, +) from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( GenericSlurmContainerCommandGenStrategy, ) @@ -126,7 +129,11 @@ Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy) Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy) +Registry().add_strategy( + ReportGenerationStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerReportGenerationStrategy +) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy) + Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [UCCTest], UCCTestGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmCommandGenStrategy) diff --git a/src/cloudai/report_generator/report_generator.py b/src/cloudai/report_generator/report_generator.py index 3c8a7e2a..d2b149d2 100644 --- 
a/src/cloudai/report_generator/report_generator.py +++ b/src/cloudai/report_generator/report_generator.py @@ -70,7 +70,13 @@ def _generate_test_report(self, directory_path: Path, tr: TestRun) -> None: tr (TestRun): The test run object. """ for subdir in directory_path.iterdir(): - if subdir.is_dir() and tr.test.test_template.can_handle_directory(subdir): - tr.test.test_template.generate_report(tr.test.name, subdir, tr.sol) - else: - logging.warning(f"Skipping directory '{subdir}' for test '{tr.test.name}'") + if not subdir.is_dir(): + logging.debug(f"Skipping file '{subdir}', not a directory.") + continue + if not tr.test.test_template.can_handle_directory(subdir): + logging.warning( + f"Skipping '{subdir}', can't handle with strategy={tr.test.test_template.report_generation_strategy}." + ) + continue + + tr.test.test_template.generate_report(tr.test.name, subdir, tr.sol) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py new file mode 100644 index 00000000..9151e3e7 --- /dev/null +++ b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import re +from pathlib import Path +from typing import Optional + +from cloudai import ReportGenerationStrategy + + +class GenericSlurmContainerReportGenerationStrategy(ReportGenerationStrategy): + def can_handle_directory(self, directory_path: Path) -> bool: + stdout_path = directory_path / "stdout.txt" + if stdout_path.exists(): + with stdout_path.open("r") as file: + if re.search( + r"Training epoch \d+, iteration \d+/\d+ | lr: [\d.]+ | global_batch_size: \d+ | global_step: \d+ | " + r"reduced_train_loss: [\d.]+ | train_step_timing in s: [\d.]+", + file.read(), + ): + return True + return False + + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: + stdout_path = directory_path / "stdout.txt" + if not stdout_path.is_file(): + return + + # Training epoch 0, iteration 1/9 | lr: 0.0001 | global_batch_size: 256 | global_step: 1 | reduced_train_loss: 3.616 | train_step_timing in s: 112.7 | consumed_samples: 512 + # parse data from stdout.txt and save into csv file + with stdout_path.open("r") as file: + lines = file.readlines() + with open(directory_path / "report.csv", "w") as csv_file: + csv_file.write( + "epoch,iteration,lr,global_batch_size,global_step,reduced_train_loss,train_step_timing,consumed_samples\n" + ) + for line in lines: + pattern = ( + r"Training epoch (\d+), iteration (\d+)/\d+ \| lr: ([\d.]+) \| global_batch_size: (\d+) \| " + r"global_step: (\d+) \| reduced_train_loss: ([\d.]+) \| train_step_timing in s: ([\d.]+)" + ) + if " | consumed_samples:" in line: + pattern = ( + r"Training epoch (\d+), iteration (\d+)/\d+ \| lr: ([\d.]+) \| global_batch_size: (\d+) \| " + r"global_step: (\d+) \| reduced_train_loss: ([\d.]+) \| train_step_timing in s: ([\d.]+) " + r"\| consumed_samples: (\d+)" + ) + + match = re.match(pattern, line) + if match: + csv_file.write(",".join(match.groups()) + "\n") From 4a53e9c0f26db46f4cb2731216417879cee1a02a Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:29:31 +0200 Subject: [PATCH 11/21] Make linters happy and tests pass --- conf/common/system/example_slurm_cluster.toml | 2 +- src/cloudai/__init__.py | 15 ++++++++------ .../report_generator/report_generator.py | 3 ++- .../report_generation_strategy.py | 4 ++-- .../slurm_command_gen_strategy.py | 2 ++ .../generic_slurm_container/template.py | 4 +++- .../generic_slurm_container.py | 20 +++++++++++++++++++ tests/test_init.py | 8 ++++++-- 8 files changed, 45 insertions(+), 13 deletions(-) diff --git a/conf/common/system/example_slurm_cluster.toml b/conf/common/system/example_slurm_cluster.toml index ddf2f210..ff795171 100644 --- a/conf/common/system/example_slurm_cluster.toml +++ b/conf/common/system/example_slurm_cluster.toml @@ -17,7 +17,7 @@ name = "example-cluster" scheduler = "slurm" -install_path = "./install" +install_path = "./install_dir" output_path = "./results" default_partition = "partition_1" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index eb22d844..eeaa022b 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -63,7 +63,7 @@ from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( GenericSlurmContainerCommandGenStrategy, ) -from .schema.test_template.generic_slurm_container.template import GenericSlurmContainerTT +from .schema.test_template.generic_slurm_container.template import GenericSlurmContainer from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from 
.schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -130,7 +130,10 @@ Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy) Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy) Registry().add_strategy( - ReportGenerationStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerReportGenerationStrategy + ReportGenerationStrategy, + [SlurmSystem], + [GenericSlurmContainer], + GenericSlurmContainerReportGenerationStrategy, ) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy) @@ -140,7 +143,7 @@ Registry().add_strategy( JobIdRetrievalStrategy, [SlurmSystem], - [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainerTT], + [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainer], SlurmJobIdRetrievalStrategy, ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) @@ -153,7 +156,7 @@ Registry().add_strategy( JobStatusRetrievalStrategy, [SlurmSystem], - [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainerTT], + [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainer], DefaultJobStatusRetrievalStrategy, ) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy) @@ -161,7 +164,7 @@ Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy) Registry().add_strategy( - CommandGenStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerCommandGenStrategy + CommandGenStrategy, [SlurmSystem], [GenericSlurmContainer], GenericSlurmContainerCommandGenStrategy ) Registry().add_installer("slurm", SlurmInstaller) @@ -190,7 +193,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) -Registry().add_test_template("GenericSlurmContainer", GenericSlurmContainerTT) +Registry().add_test_template("GenericSlurmContainer", GenericSlurmContainer) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/report_generator/report_generator.py b/src/cloudai/report_generator/report_generator.py index d2b149d2..9d7ef563 100644 --- a/src/cloudai/report_generator/report_generator.py +++ b/src/cloudai/report_generator/report_generator.py @@ -75,7 +75,8 @@ def _generate_test_report(self, directory_path: Path, tr: TestRun) -> None: continue if not tr.test.test_template.can_handle_directory(subdir): logging.warning( - f"Skipping '{subdir}', can't handle with strategy={tr.test.test_template.report_generation_strategy}." + f"Skipping '{subdir}', can't handle with " + f"strategy={tr.test.test_template.report_generation_strategy}."
) continue diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py index 9151e3e7..82cc00a9 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py @@ -22,6 +22,8 @@ class GenericSlurmContainerReportGenerationStrategy(ReportGenerationStrategy): + """Report generation strategy for a generic Slurm container test.""" + def can_handle_directory(self, directory_path: Path) -> bool: stdout_path = directory_path / "stdout.txt" if stdout_path.exists(): @@ -39,8 +41,6 @@ def generate_report(self, test_name: str, directory_path: Path, sol: Optional[fl if not stdout_path.is_file(): return - # Training epoch 0, iteration 1/9 | lr: 0.0001 | global_batch_size: 256 | global_step: 1 | reduced_train_loss: 3.616 | train_step_timing in s: 112.7 | consumed_samples: 512 - # parse data from stdout.txt and save into csv file with stdout_path.open("r") as file: lines = file.readlines() with open(directory_path / "report.csv", "w") as csv_file: diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 895924a2..76dab8ca 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -23,6 +23,8 @@ class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): + """Command generation strategy for generic Slurm container tests.""" + def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/generic_slurm_container/template.py index 30d94048..f8b8cb30 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/template.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/template.py @@ -17,5 +17,7 @@ from cloudai import TestTemplate -class GenericSlurmContainerTT(TestTemplate): +class GenericSlurmContainer(TestTemplate): + """Generic Slurm container test template.""" + pass diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index 5ec099e8..f4ca6a2b 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -1,3 +1,19 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional from cloudai import CmdArgs, Installable, TestDefinition @@ -5,6 +21,8 @@ class SlurmContainerCmdArgs(CmdArgs): + """Command line arguments for a generic Slurm container test.""" + docker_image_url: str repository_url: str repository_commit_hash: str @@ -13,6 +31,8 @@ class SlurmContainerCmdArgs(CmdArgs): class SlurmContainerTestDefinition(TestDefinition): + """Test definition for a generic Slurm container test.""" + cmd_args: SlurmContainerCmdArgs _docker_image: Optional[DockerImage] = None diff --git a/tests/test_init.py b/tests/test_init.py index 410e154b..e6f8e2bb 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,11 +18,13 @@ from cloudai import ( CommandGenStrategy, + GenericSlurmContainer, GradingStrategy, JobIdRetrievalStrategy, JsonGenStrategy, Registry, ReportGenerationStrategy, + SlurmContainerTestDefinition, ) from cloudai.installer.slurm_installer import SlurmInstaller from cloudai.installer.standalone_installer import StandaloneInstaller @@ -127,12 +129,13 @@ def test_strategies(key: tuple, value: type): def test_test_templates(): test_templates = Registry().test_templates_map - assert len(test_templates) == 8 + assert len(test_templates) == 9 assert test_templates["ChakraReplay"] == ChakraReplay assert test_templates["NcclTest"] == NcclTest assert test_templates["NeMoLauncher"] == NeMoLauncher assert test_templates["Sleep"] == Sleep assert test_templates["UCCTest"] == UCCTest + assert test_templates["GenericSlurmContainer"] == GenericSlurmContainer def test_installers(): @@ -144,12 +147,13 @@ def test_installers(): def test_definitions(): test_defs = Registry().test_definitions_map - assert len(test_defs) == 8 + assert len(test_defs) == 9 assert test_defs["UCCTest"] == UCCTestDefinition assert test_defs["NcclTest"] == NCCLTestDefinition assert test_defs["ChakraReplay"] == ChakraReplayTestDefinition assert test_defs["Sleep"] == SleepTestDefinition assert test_defs["NeMoLauncher"] == NeMoLauncherTestDefinition + assert test_defs["GenericSlurmContainer"] == SlurmContainerTestDefinition def test_definitions_matches_templates(): From 0379c5a58191436d8add9ebaeb449427b1997b80 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:34:32 +0200 Subject: [PATCH 12/21] Rely on pyproject for running dev tools --- .github/workflows/ci.yml | 6 +++--- pyproject.toml | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 207d7be5..1077797a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,13 +23,13 @@ jobs: run: pip install -r requirements-dev.txt - name: Run ruff linter - run: ruff check . + run: ruff check - name: Run ruff formatter - run: ruff format --check --diff . + run: ruff format --check --diff - name: Run pyright - run: pyright . 
+ run: pyright - name: Run vulture check run: vulture src/ tests/ diff --git a/pyproject.toml b/pyproject.toml index a6442964..4ef950cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,3 +100,6 @@ min_confidence = 100 [tool.coverage.report] exclude_also = ["@abstractmethod"] + +[tool.pyright] +include = ["src", "tests"] From 4d1fbba4c8ae6b9b41cdf932d7db2bd8ac19df83 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:08:08 +0100 Subject: [PATCH 13/21] Simplify naming --- src/cloudai/__init__.py | 22 +++++++++---------- .../report_generation_strategy.py | 2 +- .../slurm_command_gen_strategy.py | 4 ++-- .../generic_slurm_container/template.py | 2 +- ..._slurm_container.py => slurm_container.py} | 0 tests/test_init.py | 4 ++-- 6 files changed, 16 insertions(+), 18 deletions(-) rename src/cloudai/test_definitions/{generic_slurm_container.py => slurm_container.py} (100%) diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index eeaa022b..fe695281 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -58,12 +58,12 @@ from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy from .schema.test_template.generic_slurm_container.report_generation_strategy import ( - GenericSlurmContainerReportGenerationStrategy, + SlurmContainerReportGenerationStrategy, ) from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( - GenericSlurmContainerCommandGenStrategy, + SlurmContainerCommandGenStrategy, ) -from .schema.test_template.generic_slurm_container.template import GenericSlurmContainer +from .schema.test_template.generic_slurm_container.template import SlurmContainer from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -105,7 +105,7 @@ SleepTestDefinition, UCCTestDefinition, ) -from .test_definitions.generic_slurm_container import SlurmContainerTestDefinition +from .test_definitions.slurm_container import SlurmContainerTestDefinition Registry().add_runner("slurm", SlurmRunner) Registry().add_runner("kubernetes", KubernetesRunner) @@ -132,8 +132,8 @@ Registry().add_strategy( ReportGenerationStrategy, [SlurmSystem], - [GenericSlurmContainer], - GenericSlurmContainerReportGenerationStrategy, + [SlurmContainer], + SlurmContainerReportGenerationStrategy, ) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy) @@ -143,7 +143,7 @@ Registry().add_strategy( JobIdRetrievalStrategy, [SlurmSystem], - [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainer], + [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, SlurmContainer], SlurmJobIdRetrievalStrategy, ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) @@ -156,16 +156,14 @@ Registry().add_strategy( JobStatusRetrievalStrategy, [SlurmSystem], - [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainer], + [ChakraReplay, UCCTest, NeMoLauncher, Sleep, SlurmContainer], DefaultJobStatusRetrievalStrategy, ) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy) 
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayReportGenerationStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy) -Registry().add_strategy( - CommandGenStrategy, [SlurmSystem], [GenericSlurmContainer], GenericSlurmContainerCommandGenStrategy -) +Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [SlurmContainer], SlurmContainerCommandGenStrategy) Registry().add_installer("slurm", SlurmInstaller) Registry().add_installer("standalone", StandaloneInstaller) @@ -193,7 +191,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) -Registry().add_test_template("GenericSlurmContainer", GenericSlurmContainer) +Registry().add_test_template("GenericSlurmContainer", SlurmContainer) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py index 82cc00a9..c7c4554d 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py @@ -21,7 +21,7 @@ from cloudai import ReportGenerationStrategy -class GenericSlurmContainerReportGenerationStrategy(ReportGenerationStrategy): +class SlurmContainerReportGenerationStrategy(ReportGenerationStrategy): """Report generation strategy for a generic Slurm container test.""" def can_handle_directory(self, directory_path: Path) -> bool: diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 76dab8ca..dd43aa18 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -19,10 +19,10 @@ from cloudai import TestRun from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy -from cloudai.test_definitions.generic_slurm_container import SlurmContainerTestDefinition +from cloudai.test_definitions.slurm_container import SlurmContainerTestDefinition -class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): +class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for generic Slurm container tests.""" def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/generic_slurm_container/template.py index f8b8cb30..9e49eb35 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/template.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/template.py @@ -17,7 +17,7 @@ from cloudai import TestTemplate -class GenericSlurmContainer(TestTemplate): +class SlurmContainer(TestTemplate): """Generic Slurm container test template.""" pass diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/slurm_container.py similarity index 100% rename from src/cloudai/test_definitions/generic_slurm_container.py rename to 
src/cloudai/test_definitions/slurm_container.py diff --git a/tests/test_init.py b/tests/test_init.py index e6f8e2bb..5ca6c5f2 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,12 +18,12 @@ from cloudai import ( CommandGenStrategy, - GenericSlurmContainer, GradingStrategy, JobIdRetrievalStrategy, JsonGenStrategy, Registry, ReportGenerationStrategy, + SlurmContainer, SlurmContainerTestDefinition, ) from cloudai.installer.slurm_installer import SlurmInstaller @@ -135,7 +135,7 @@ def test_test_templates(): assert test_templates["NeMoLauncher"] == NeMoLauncher assert test_templates["Sleep"] == Sleep assert test_templates["UCCTest"] == UCCTest - assert test_templates["GenericSlurmContainer"] == GenericSlurmContainer + assert test_templates["GenericSlurmContainer"] == SlurmContainer def test_installers(): From 30df9c439adcf9c1f2e5fd8e4f918a61e9f34b56 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:19:08 +0100 Subject: [PATCH 14/21] Simplify naming and after-merge updates --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 2 +- conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 2 +- .../test/nemo-vfm-mock_ditllama28b_65k.toml | 2 +- .../new/test/nemo-vfm-mock_ditllama28b_8k.toml | 2 +- src/cloudai/__init__.py | 18 +++++++++--------- .../report_generation_strategy.py | 0 .../slurm_command_gen_strategy.py | 6 +++--- .../template.py | 0 .../strategy/slurm_command_gen_strategy.py | 4 ++-- tests/test_init.py | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) rename src/cloudai/schema/test_template/{generic_slurm_container => slurm_container}/report_generation_strategy.py (100%) rename src/cloudai/schema/test_template/{generic_slurm_container => slurm_container}/slurm_command_gen_strategy.py (90%) rename src/cloudai/schema/test_template/{generic_slurm_container => slurm_container}/template.py (100%) diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml index deb12387..e8fe485f 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_dit7b_65k" description = "Nemo VFM factory=mock_dit7b_65k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml index 02f45c13..1d7bcd5b 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_dit7b_8k" description = "Nemo VFM factory=mock_dit7b_8k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml index fec9c947..6f78c14b 100644 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_ditllama28b_65k" description = "Nemo VFM factory=mock_ditllama28b_65k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml index 9ff187f2..2083ad69 100644 --- 
a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_ditllama28b_8k" description = "Nemo VFM factory=mock_ditllama28b_8k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index fe695281..fc4dc7a0 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,13 +57,6 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy -from .schema.test_template.generic_slurm_container.report_generation_strategy import ( - SlurmContainerReportGenerationStrategy, -) -from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( - SlurmContainerCommandGenStrategy, -) -from .schema.test_template.generic_slurm_container.template import SlurmContainer from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -88,6 +81,13 @@ from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy from .schema.test_template.sleep.template import Sleep +from .schema.test_template.slurm_container.report_generation_strategy import ( + SlurmContainerReportGenerationStrategy, +) +from .schema.test_template.slurm_container.slurm_command_gen_strategy import ( + SlurmContainerCommandGenStrategy, +) +from .schema.test_template.slurm_container.template import SlurmContainer from .schema.test_template.ucc_test.grading_strategy import UCCTestGradingStrategy from .schema.test_template.ucc_test.report_generation_strategy import UCCTestReportGenerationStrategy from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy @@ -181,7 +181,7 @@ Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition) Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition) Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition) -Registry().add_test_definition("GenericSlurmContainer", SlurmContainerTestDefinition) +Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition) Registry().add_test_template("ChakraReplay", ChakraReplay) Registry().add_test_template("NcclTest", NcclTest) @@ -191,7 +191,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) -Registry().add_test_template("GenericSlurmContainer", SlurmContainer) +Registry().add_test_template("SlurmContainer", SlurmContainer) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/slurm_container/report_generation_strategy.py similarity index 100% rename from 
src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py rename to src/cloudai/schema/test_template/slurm_container/report_generation_strategy.py diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py similarity index 90% rename from src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py rename to src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index dd43aa18..162febdf 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -25,15 +25,15 @@ class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for generic Slurm container tests.""" - def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: + def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path repo_path = tdef.git_repo.installed_path or Path.cwd() mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" - cmd = super().generate_srun_prefix(slurm_args, tr) - return cmd + ["--no-container-mount-home"] + cmd = super().gen_srun_prefix(slurm_args, tr) + return cmd + ["--no-container-mount-home "] def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: srun_command_parts: list[str] = [] diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/slurm_container/template.py similarity index 100% rename from src/cloudai/schema/test_template/generic_slurm_container/template.py rename to src/cloudai/schema/test_template/slurm_container/template.py diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index ee8a463a..910db56c 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -195,11 +195,11 @@ def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str: def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun ) -> str: - srun_command_parts = self.gen_srun_prefix(slurm_args) + srun_command_parts = self.gen_srun_prefix(slurm_args, tr) test_command_parts = self.generate_test_command(env_vars, cmd_args, tr) return " ".join(srun_command_parts + test_command_parts) - def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: + def gen_srun_prefix(self, slurm_args: Dict[str, Any], tr: TestRun) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] if slurm_args.get("image_path"): srun_command_parts.append(f'--container-image={slurm_args["image_path"]}') diff --git a/tests/test_init.py b/tests/test_init.py index 5ca6c5f2..07ca9268 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -135,7 +135,7 @@ def test_test_templates(): assert test_templates["NeMoLauncher"] == NeMoLauncher assert test_templates["Sleep"] == Sleep assert 
test_templates["UCCTest"] == UCCTest - assert test_templates["GenericSlurmContainer"] == SlurmContainer + assert test_templates["SlurmContainer"] == SlurmContainer def test_installers(): @@ -153,7 +153,7 @@ def test_definitions(): assert test_defs["ChakraReplay"] == ChakraReplayTestDefinition assert test_defs["Sleep"] == SleepTestDefinition assert test_defs["NeMoLauncher"] == NeMoLauncherTestDefinition - assert test_defs["GenericSlurmContainer"] == SlurmContainerTestDefinition + assert test_defs["SlurmContainer"] == SlurmContainerTestDefinition def test_definitions_matches_templates(): From d57d76e897005b859b56f30b7d28023506906b16 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:28:24 +0100 Subject: [PATCH 15/21] Add acceptance test for slurm_container --- .../slurm_command_gen_strategy.py | 2 +- tests/test_acceptance.py | 33 ++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 162febdf..7b4fe63b 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -33,7 +33,7 @@ def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().gen_srun_prefix(slurm_args, tr) - return cmd + ["--no-container-mount-home "] + return cmd + ["--no-container-mount-home"] def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: srun_command_parts: list[str] = [] diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index d1e57782..38bbb39a 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -29,12 +29,15 @@ from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy from cloudai.schema.test_template.sleep.template import Sleep +from cloudai.schema.test_template.slurm_container.slurm_command_gen_strategy import SlurmContainerCommandGenStrategy +from cloudai.schema.test_template.slurm_container.template import SlurmContainer from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy from cloudai.systems import SlurmSystem from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition from cloudai.test_definitions.nccl import NCCLCmdArgs, NCCLTestDefinition from cloudai.test_definitions.sleep import SleepCmdArgs, SleepTestDefinition +from cloudai.test_definitions.slurm_container import SlurmContainerCmdArgs, SlurmContainerTestDefinition from cloudai.test_definitions.ucc import UCCCmdArgs, UCCTestDefinition SLURM_TEST_SCENARIOS = [ @@ -91,7 +94,9 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook"]) +@pytest.fixture( + params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook", "slurm_container"] +) def test_req(request, 
slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -211,6 +216,32 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "grok.run") + elif request.param == "slurm_container": + tr = partial_tr( + name="slurm_container", + test=Test( + test_definition=SlurmContainerTestDefinition( + name="slurm_container", + description="slurm_container", + test_template_name="slurm_container", + cmd_args=SlurmContainerCmdArgs( + docker_image_url="https://docker/url", + repository_url="https://repo/url", + repository_commit_hash="commit_hash", + mcore_vfm_repo="https://mcore_vfm/repo", + mcore_vfm_commit_hash="mcore_vfm_commit_hash", + ), + extra_cmd_args={"bash": '-c "pwd ; ls"'}, + ), + test_template=SlurmContainer(slurm_system, name="slurm_container"), + ), + ) + tr.test.test_template.command_gen_strategy = SlurmContainerCommandGenStrategy( + slurm_system, tr.test.test_definition.cmd_args_dict + ) + tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + + return (tr, "slurm_container.sbatch", None) raise ValueError(f"Unknown test: {request.param}") From 5d1b81c87e14e99d31289b2c571e780f7bd473da Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:11:59 +0100 Subject: [PATCH 16/21] Add missing ref file --- tests/ref_data/slurm_container.sbatch | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/ref_data/slurm_container.sbatch diff --git a/tests/ref_data/slurm_container.sbatch b/tests/ref_data/slurm_container.sbatch new file mode 100644 index 00000000..b959d148 --- /dev/null +++ b/tests/ref_data/slurm_container.sbatch @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --job-name=__JOB_NAME__ +#SBATCH -N 1 +#SBATCH --output=__OUTPUT_DIR__/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --partition=main + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + + +srun --mpi=pmix --container-image=https://docker/url --container-mounts=/Users/andreyma/workspace/nvidia/cloudai:/work,/Users/andreyma/workspace/nvidia/cloudai:/opt/megatron-lm --no-container-mount-home bash -c "pwd ; ls" \ No newline at end of file From d83c8d386b194b17c6de19c31f8c1e6d08dd85a9 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:25:24 +0100 Subject: [PATCH 17/21] Use system-wide root for __OUTPUT_DIR__ template --- .../slurm_container/slurm_command_gen_strategy.py | 6 ++++-- tests/ref_data/gpt-no-hook.sbatch | 6 +++--- tests/ref_data/gpt-pre-test.sbatch | 10 +++++----- tests/ref_data/grok-no-hook.sbatch | 6 +++--- tests/ref_data/grok-pre-test.sbatch | 10 +++++----- tests/ref_data/nccl.sbatch | 4 ++-- tests/ref_data/sleep.sbatch | 4 ++-- tests/ref_data/slurm_container.sbatch | 6 +++--- tests/ref_data/ucc.sbatch | 4 ++-- tests/test_acceptance.py | 2 +- 10 files changed, 30 insertions(+), 28 deletions(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 7b4fe63b..5863d06f 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -28,8 +28,10 @@ class 
SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - repo_path = tdef.git_repo.installed_path or Path.cwd() - mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() + repo_path = tdef.git_repo.installed_path or self.system.install_path / tdef.git_repo.repo_name + mcore_vfm_path = ( + tdef.mcore_vfm_git_repo.installed_path or self.system.install_path / tdef.mcore_vfm_git_repo.repo_name + ) slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().gen_srun_prefix(slurm_args, tr) diff --git a/tests/ref_data/gpt-no-hook.sbatch b/tests/ref_data/gpt-no-hook.sbatch index f01e9222..77999bda 100644 --- a/tests/ref_data/gpt-no-hook.sbatch +++ b/tests/ref_data/gpt-no-hook.sbatch @@ -15,8 +15,8 @@ echo "Loading container with srun command" --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + --container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/gpt-pre-test.sbatch b/tests/ref_data/gpt-pre-test.sbatch index c0f6114f..d21f0ed7 100644 --- a/tests/ref_data/gpt-pre-test.sbatch +++ b/tests/ref_data/gpt-pre-test.sbatch @@ -8,8 +8,8 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" -srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0) PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" @@ -19,9 +19,9 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + 
--container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh fi diff --git a/tests/ref_data/grok-no-hook.sbatch b/tests/ref_data/grok-no-hook.sbatch index 7e7adfc2..8d008611 100644 --- a/tests/ref_data/grok-no-hook.sbatch +++ b/tests/ref_data/grok-no-hook.sbatch @@ -15,8 +15,8 @@ echo "Loading container with srun command" --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + --container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/grok-pre-test.sbatch b/tests/ref_data/grok-pre-test.sbatch index 51730bd7..7d88745a 100644 --- a/tests/ref_data/grok-pre-test.sbatch +++ b/tests/ref_data/grok-pre-test.sbatch @@ -8,8 +8,8 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" -srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0) PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" @@ -19,9 +19,9 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + 
--container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh fi diff --git a/tests/ref_data/nccl.sbatch b/tests/ref_data/nccl.sbatch index dc179ba9..2a9f57b4 100644 --- a/tests/ref_data/nccl.sbatch +++ b/tests/ref_data/nccl.sbatch @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) diff --git a/tests/ref_data/sleep.sbatch b/tests/ref_data/sleep.sbatch index 9262001b..1ce9ca32 100644 --- a/tests/ref_data/sleep.sbatch +++ b/tests/ref_data/sleep.sbatch @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) diff --git a/tests/ref_data/slurm_container.sbatch b/tests/ref_data/slurm_container.sbatch index b959d148..6479402b 100644 --- a/tests/ref_data/slurm_container.sbatch +++ b/tests/ref_data/slurm_container.sbatch @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun --mpi=pmix --container-image=https://docker/url --container-mounts=/Users/andreyma/workspace/nvidia/cloudai:/work,/Users/andreyma/workspace/nvidia/cloudai:/opt/megatron-lm --no-container-mount-home bash -c "pwd ; ls" \ No newline at end of file +srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/install/url__commit_hash:/work,__OUTPUT_DIR__/install/repo__mcore_vfm_commit_hash:/opt/megatron-lm --no-container-mount-home bash -c "pwd ; ls" diff --git a/tests/ref_data/ucc.sbatch b/tests/ref_data/ucc.sbatch index a9f9e686..a3f5fca8 100644 --- a/tests/ref_data/ucc.sbatch +++ b/tests/ref_data/ucc.sbatch @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 38bbb39a..0711831b 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -255,7 +255,7 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s curr = Path(sbatch_script).read_text().strip() ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip() - ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name") + ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path.parent)).replace("__JOB_NAME__", "job_name") assert curr == ref From c634ad1365b66aed4bea9a7d1fa9c4fba5d26199 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:28:09 +0100 Subject: [PATCH 
18/21] Make ruff happy --- .../test_template/slurm_container/slurm_command_gen_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 5863d06f..6763debc 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pathlib import Path from typing import Any, cast from cloudai import TestRun From 0c8a909e2085ab2d0f4aebefb71607c5d515496d Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:47:40 +0100 Subject: [PATCH 19/21] Make container mounts more configurable --- .../slurm_container/slurm_command_gen_strategy.py | 6 +----- src/cloudai/test_definitions/slurm_container.py | 9 +++++++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 6763debc..23a22958 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -27,11 +27,7 @@ class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - repo_path = tdef.git_repo.installed_path or self.system.install_path / tdef.git_repo.repo_name - mcore_vfm_path = ( - tdef.mcore_vfm_git_repo.installed_path or self.system.install_path / tdef.mcore_vfm_git_repo.repo_name - ) - slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" + slurm_args["container_mounts"] = ",".join(tdef.container_mounts(self.system.install_path)) cmd = super().gen_srun_prefix(slurm_args, tr) return cmd + ["--no-container-mount-home"] diff --git a/src/cloudai/test_definitions/slurm_container.py b/src/cloudai/test_definitions/slurm_container.py index f4ca6a2b..e84d3b47 100644 --- a/src/cloudai/test_definitions/slurm_container.py +++ b/src/cloudai/test_definitions/slurm_container.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import Optional from cloudai import CmdArgs, Installable, TestDefinition @@ -63,6 +64,14 @@ def mcore_vfm_git_repo(self) -> GitRepo: return self._mcore_git_repo + def container_mounts(self, root: Path) -> list[str]: + repo_path = self.git_repo.installed_path or root / self.git_repo.repo_name + mcore_vfm_path = self.mcore_vfm_git_repo.installed_path or root / self.mcore_vfm_git_repo.repo_name + return [ + f"{repo_path.absolute()}:/work", + f"{mcore_vfm_path.absolute()}:/opt/megatron-lm", + ] + @property def installables(self) -> list[Installable]: return [self.docker_image, self.git_repo, self.mcore_vfm_git_repo] From 63973e356bf560e291a4ea9f0145e11928506657 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 13 Nov 2024 12:07:23 +0100 Subject: [PATCH 20/21] Remove config examples --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 36 ---------------- conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 36 ---------------- .../test/nemo-vfm-mock_ditllama28b_65k.toml | 36 ---------------- .../test/nemo-vfm-mock_ditllama28b_8k.toml | 36 ---------------- conf/new/test_scenario/nemo-vfm.toml | 41 ------------------- 5 files changed, 185 deletions(-) delete mode 100644 conf/new/test/nemo-vfm-mock_dit7b_65k.toml delete mode 100644 conf/new/test/nemo-vfm-mock_dit7b_8k.toml delete mode 100644 conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml delete mode 100644 conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml delete mode 100644 conf/new/test_scenario/nemo-vfm.toml diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml deleted file mode 100644 index e8fe485f..00000000 --- a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_dit7b_65k" -description = "Nemo VFM factory=mock_dit7b_65k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml deleted file mode 100644 index 1d7bcd5b..00000000 --- a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_dit7b_8k" -description = "Nemo VFM factory=mock_dit7b_8k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml deleted file mode 100644 index 6f78c14b..00000000 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_ditllama28b_65k" -description = "Nemo VFM factory=mock_ditllama28b_65k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml deleted file mode 100644 index 2083ad69..00000000 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_ditllama28b_8k" -description = "Nemo VFM factory=mock_ditllama28b_8k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml deleted file mode 100644 index 1a695dbb..00000000 --- a/conf/new/test_scenario/nemo-vfm.toml +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm" - -[[Tests]] -id = "Tests.mock.dit7b_8k" -test_name = "nemo-vfm-mock_dit7b_8k" -num_nodes = 8 -time_limit = "01:00:00" - -[[Tests]] -id = "Tests.mock.dit7b_65k" -test_name = "nemo-vfm-mock_dit7b_65k" -num_nodes = 8 -time_limit = "01:00:00" - -[[Tests]] -id = "Tests.mock.ditllama28b_8k" -test_name = "nemo-vfm-mock_ditllama28b_8k" -num_nodes = 8 -time_limit = "01:00:00" - -[[Tests]] -id = "Tests.mock.ditllama28b_65k" -test_name = "nemo-vfm-mock_ditllama28b_65k" -num_nodes = 8 -time_limit = "01:00:00" From 82d3d24778fb538d88405e96ae0047efd8aa0e42 Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Fri, 15 Nov 2024 15:04:36 -0500 Subject: [PATCH 21/21] Use absolute path for NeMo launcher repository --- .../nemo_launcher/slurm_command_gen_strategy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py index 2ca4392d..61e608c6 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py @@ -47,7 +47,11 @@ def gen_exec_command(self, tr: TestRun) -> str: ) self.final_cmd_args["cluster.gpus_per_node"] = self.system.gpus_per_node or "null" - repo_path = tdef.python_executable.git_repo.installed_path + repo_path = ( + tdef.python_executable.git_repo.installed_path.absolute() + if tdef.python_executable.git_repo.installed_path is not None + else None + ) if not repo_path: logging.warning( f"Local clone of git repo {tdef.python_executable.git_repo} does not exist. "